9124: GEDCOM doesn't accept CR as a line terminator
Original code used readline() when file was in binary mode; this works only if file contains '\n', true only for CRLF and LF line endings. Switched to file text mode with correct encoding and universal newline support.
This commit is contained in:
parent
16e2ed4f54
commit
be6715cd99
@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport")
|
||||
|
||||
#------------------------------------------------------------------------
|
||||
#
|
||||
# GRAMPS modules
|
||||
# Gramps modules
|
||||
#
|
||||
#------------------------------------------------------------------------
|
||||
from gramps.gen.const import GRAMPS_LOCALE as glocale
|
||||
@ -65,29 +65,33 @@ def importData(database, filename, user):
|
||||
if DbMixin not in database.__class__.__bases__:
|
||||
database.__class__.__bases__ = (DbMixin,) + \
|
||||
database.__class__.__bases__
|
||||
|
||||
try:
|
||||
ifile = open(filename, "rb")
|
||||
except IOError:
|
||||
return
|
||||
|
||||
# Opening in utf-8 with universal newline to allow cr, lf, and crlf
|
||||
# If the file is really UTF16 or a varient, the next block code will
|
||||
# not find anything even if it is there, but this is ok since it
|
||||
# won't be ANSEL, or is inconsistent...
|
||||
with open(filename, "r", encoding='utf-8', errors='replace',
|
||||
newline=None) as ifile:
|
||||
ansel = False
|
||||
gramps = False
|
||||
for index in range(50):
|
||||
# Treat the file as though it is UTF-8 since this is the more modern
|
||||
# option; and anyway it doesn't really matter as we are only trying to
|
||||
# detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
|
||||
# ignore anything that can't be translated.
|
||||
# Treat the file as though it is UTF-8 since this is the more
|
||||
# modern option; and anyway it doesn't really matter as we are
|
||||
# only trying to detect a CHAR or SOUR line which is only
|
||||
# 7-bit ASCII anyway, and we ignore anything that can't be
|
||||
# translated.
|
||||
line = ifile.readline()
|
||||
line = line.decode(encoding='utf-8', errors='replace')
|
||||
line = line.split()
|
||||
if len(line) == 0:
|
||||
break
|
||||
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
|
||||
if len(line) > 2 and line[1][0:4] == 'CHAR' \
|
||||
and line[2] == "ANSEL":
|
||||
ansel = True
|
||||
if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS":
|
||||
if len(line) > 2 and line[1][0:4] == 'SOUR' \
|
||||
and line[2] == "GRAMPS":
|
||||
gramps = True
|
||||
ifile.close()
|
||||
except IOError:
|
||||
return
|
||||
|
||||
if not gramps and ansel and user.uistate:
|
||||
top = Glade()
|
||||
|
@ -94,7 +94,7 @@ import codecs
|
||||
from xml.parsers.expat import ParserCreate
|
||||
from collections import defaultdict
|
||||
import string
|
||||
from io import StringIO
|
||||
from io import StringIO, TextIOWrapper
|
||||
from urllib.parse import urlparse
|
||||
|
||||
#------------------------------------------------------------------------
|
||||
@ -1251,41 +1251,41 @@ class BaseReader(object):
|
||||
|
||||
class UTF8Reader(BaseReader):
|
||||
|
||||
def __init__(self, ifile, __add_msg):
|
||||
BaseReader.__init__(self, ifile, 'utf8', __add_msg)
|
||||
def __init__(self, ifile, __add_msg, enc):
|
||||
BaseReader.__init__(self, ifile, enc, __add_msg)
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.ifile.seek(0)
|
||||
data = self.ifile.read(3)
|
||||
if data != b"\xef\xbb\xbf":
|
||||
self.ifile.seek(0)
|
||||
if enc == 'UTF_8_SIG':
|
||||
self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig',
|
||||
errors='replace', newline=None)
|
||||
else:
|
||||
self.ifile = TextIOWrapper(ifile, encoding='utf_8',
|
||||
errors='replace', newline=None)
|
||||
|
||||
def readline(self):
|
||||
line = self.ifile.readline()
|
||||
line = line.decode(self.enc, errors='replace')
|
||||
return line.translate(STRIP_DICT)
|
||||
|
||||
class UTF16Reader(BaseReader):
|
||||
|
||||
def __init__(self, ifile, __add_msg):
|
||||
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
|
||||
BaseReader.__init__(self, new_file, '', __add_msg)
|
||||
BaseReader.__init__(self, ifile, 'UTF16', __add_msg)
|
||||
self.ifile = TextIOWrapper(ifile, encoding='utf_16',
|
||||
errors='replace', newline=None)
|
||||
self.reset()
|
||||
|
||||
def readline(self):
|
||||
line = self.ifile.readline()
|
||||
line = line.decode('utf8', errors='replace')
|
||||
return line.translate(STRIP_DICT)
|
||||
|
||||
class AnsiReader(BaseReader):
|
||||
|
||||
def __init__(self, ifile, __add_msg):
|
||||
BaseReader.__init__(self, ifile, 'latin1', __add_msg)
|
||||
self.ifile = TextIOWrapper(ifile, encoding='latin1',
|
||||
errors='replace', newline=None)
|
||||
|
||||
def readline(self):
|
||||
line = self.ifile.readline()
|
||||
line = line.decode(self.enc, errors='replace')
|
||||
if line.translate(DEL_AND_C1) != line:
|
||||
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
|
||||
return line.translate(STRIP_DICT)
|
||||
@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader):
|
||||
|
||||
def __init__(self, ifile, __add_msg):
|
||||
BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
|
||||
self.ifile = TextIOWrapper(ifile, encoding='cp1252',
|
||||
errors='replace', newline=None)
|
||||
|
||||
def readline(self):
|
||||
line = self.ifile.readline()
|
||||
line = line.decode(self.enc, errors='replace')
|
||||
return line.translate(STRIP_DICT)
|
||||
|
||||
class AnselReader(BaseReader):
|
||||
@ -1565,10 +1566,16 @@ class AnselReader(BaseReader):
|
||||
return ans
|
||||
|
||||
def __init__(self, ifile, __add_msg):
|
||||
BaseReader.__init__(self, ifile, "", __add_msg)
|
||||
BaseReader.__init__(self, ifile, "ANSEL", __add_msg)
|
||||
# In theory, we should have been able to skip the encode/decode from
|
||||
# ascii. But this way allows us to use pythons universal newline
|
||||
self.ifile = TextIOWrapper(ifile, encoding='ascii',
|
||||
errors='surrogateescape', newline=None)
|
||||
|
||||
def readline(self):
|
||||
return self.__ansel_to_unicode(self.ifile.readline())
|
||||
line = self.ifile.readline()
|
||||
linebytes = line.encode(encoding='ascii', errors='surrogateescape')
|
||||
return self.__ansel_to_unicode(linebytes)
|
||||
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback):
|
||||
|
||||
if enc == "ANSEL":
|
||||
rdr = AnselReader(ifile, self.__add_msg)
|
||||
elif enc in ("UTF-8", "UTF8"):
|
||||
rdr = UTF8Reader(ifile, self.__add_msg)
|
||||
elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"):
|
||||
rdr = UTF8Reader(ifile, self.__add_msg, enc)
|
||||
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
|
||||
rdr = UTF16Reader(ifile, self.__add_msg)
|
||||
elif enc in ("CP1252", "WINDOWS-1252"):
|
||||
@ -7774,26 +7781,33 @@ class GedcomStageOne(object):
|
||||
def __detect_file_decoder(self, input_file):
|
||||
"""
|
||||
Detects the file encoding of the file by looking for a BOM
|
||||
(byte order marker) in the GEDCOM file. If we detect a UTF-16
|
||||
encoded file, we must connect to a wrapper using the codecs
|
||||
package.
|
||||
(byte order marker) in the GEDCOM file. If we detect a UTF-16 or
|
||||
UTF-8-BOM encoded file, we choose appropriate decoders. If no BOM
|
||||
is detected, we return in UTF-8 mode it is the more modern option;
|
||||
and anyway it doesn't really matter as we are only looking for GEDCOM
|
||||
keywords which are only 7-bit ASCII anyway.
|
||||
In any case, we Always return the file in text mode with transparent
|
||||
newline (CR, LF, or CRLF).
|
||||
"""
|
||||
line = input_file.read(2)
|
||||
if line == b"\xef\xbb":
|
||||
input_file.read(1)
|
||||
self.enc = "UTF8"
|
||||
return input_file
|
||||
self.enc = "utf_8_sig"
|
||||
return TextIOWrapper(input_file, encoding='utf_8_sig',
|
||||
errors='replace', newline=None)
|
||||
elif line == b"\xff\xfe" or line == b"\xfe\xff":
|
||||
self.enc = "UTF16"
|
||||
input_file.seek(0)
|
||||
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
|
||||
return TextIOWrapper(input_file, encoding='utf_16',
|
||||
errors='replace', newline=None)
|
||||
elif not line :
|
||||
raise GedcomError(self.__EMPTY_GED)
|
||||
elif line[0] == b"\x00" or line[1] == b"\x00":
|
||||
elif line == b"\x30\x00" or line == b"\x00\x30":
|
||||
raise GedcomError(self.__BAD_UTF16)
|
||||
else:
|
||||
input_file.seek(0)
|
||||
return input_file
|
||||
return TextIOWrapper(input_file, encoding='utf-8',
|
||||
errors='replace', newline=None)
|
||||
|
||||
def parse(self):
|
||||
"""
|
||||
@ -7804,12 +7818,8 @@ class GedcomStageOne(object):
|
||||
reader = self.__detect_file_decoder(self.ifile)
|
||||
|
||||
for line in reader:
|
||||
# Treat the file as though it is UTF-8 since this will be right if a
|
||||
# BOM was detected; it is the more modern option; and anyway it
|
||||
# doesn't really matter as we are only trying to detect a CHAR line
|
||||
# which is only 7-bit ASCII anyway, and we ignore anything that
|
||||
# can't be translated.
|
||||
line = line.decode(encoding='utf-8', errors='replace')
|
||||
# Scan for a few items, keep counts. Also look for actual CHAR
|
||||
# Keyword to figure out actual encodeing for non-unicode file types
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
@ -7840,6 +7850,7 @@ class GedcomStageOne(object):
|
||||
LOG.debug("parse pcnt %d" % self.pcnt)
|
||||
LOG.debug("parse famc %s" % dict(self.famc))
|
||||
LOG.debug("parse fams %s" % dict(self.fams))
|
||||
self.ifile = reader # need this to keep python from autoclosing file
|
||||
|
||||
def get_famc_map(self):
|
||||
"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user