3082, 4439, 7134, 8279 Various fixes for GEDCOM import.

0003082: 1/4 and 1/2 ANSEL characters not supported on importing ANSEL
GEDCOM
0004439: [Info]: characters ignored on a Gedcom encoded ANSI (cp1252
West Europe, USA)
0007134: Failure importing ANSEL encoded gedcom file.
0008279: GEDCOM import fails for ANSI file incorrectly opened with the
utf8 locale

Fixed GEDCOM import not working properly for Python3; other problems
also corected, including fixing the inability to import Windows CP1252
coded files. Also more consistent fix for 8014. Ensured any error
messages are not lost. Fixed a few ANSEL characters that were not
translated.
This commit is contained in:
kulath 2015-03-24 16:36:12 +00:00
parent 57a367e8c4
commit 4b8ca4a824
2 changed files with 212 additions and 79 deletions

View File

@ -76,15 +76,31 @@ def importData(database, filename, user):
database.__class__.__bases__ = (DbMixin,) + \
database.__class__.__bases__
try:
ifile = open(filename, "r")
except IOError:
return
if sys.version_info[0] < 3:
try:
ifile = open(filename, "rU")
except IOError:
return
else:
try:
ifile = open(filename, "rb")
except IOError:
return
# print("file opened")
ansel = False
gramps = False
for index in range(50):
line = ifile.readline().split()
# Treat the file as though it is UTF-8 since this is the more modern
# option; and anyway it doesn't really matter as we are only trying to
# detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
# ignore anything that can't be translated.
line = ifile.readline()
if sys.version_info[0] < 3:
line = unicode(line, encoding='utf-8', errors='replace')
else:
line = line.decode(encoding='utf-8', errors='replace')
line = line.split()
if len(line) == 0:
break
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":

View File

@ -578,6 +578,10 @@ LDS_STATUS = {
# table for skipping illegal control chars in GEDCOM import
# Only 09, 0A, 0D are allowed.
STRIP_DICT = dict.fromkeys(list(range(9))+list(range(11, 13))+list(range(14, 32)))
# The C1 Control characters are not treated in Latin-1 (ISO-8859-1) as
# undefined, but if they have been used, the file is probably supposed to be
# cp1252
DEL_AND_C1 = dict.fromkeys(list(range(0x7F, 0x9F)))
#-------------------------------------------------------------------------
#
@ -689,7 +693,7 @@ class GedcomDateParser(DateParser):
#-------------------------------------------------------------------------
class Lexer(object):
def __init__(self, ifile):
def __init__(self, ifile, __add_msg):
self.ifile = ifile
self.current_list = []
self.eof = False
@ -700,6 +704,7 @@ class Lexer(object):
TOKEN_CONT : self.__fix_token_cont,
TOKEN_CONC : self.__fix_token_conc,
}
self.__add_msg = __add_msg
def readline(self):
if len(self.current_list) <= 1 and not self.eof:
@ -738,6 +743,7 @@ class Lexer(object):
self.eof = True
return
original_line = line
try:
# According to the GEDCOM 5.5 standard,
# Chapter 1 subsection Grammar
@ -771,6 +777,13 @@ class Lexer(object):
tag = line[0]
line_value = line[2]
except:
problem = _("Line ignored ")
text = original_line.rstrip('\n\r')
prob_width = 66
problem = problem.ljust(prob_width)[0:(prob_width-1)]
text = text.replace("\n", "\n".ljust(prob_width + 22))
message = "%s %s" % (problem, text)
self.__add_msg(message)
continue
token = TOKENS.get(tag, TOKEN_UNKNOWN)
@ -1234,27 +1247,29 @@ class GedInfoParser(object):
#
#-------------------------------------------------------------------------
class BaseReader(object):
def __init__(self, ifile, encoding):
def __init__(self, ifile, encoding, __add_msg):
self.ifile = ifile
self.enc = encoding
self.__add_msg = __add_msg
def reset(self):
self.ifile.seek(0)
def readline(self):
if sys.version_info[0] < 3:
line = unicode(self.ifile.readline(),
encoding=self.enc,
errors='replace')
else:
line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
return line.translate(STRIP_DICT)
raise NotImplemented
def report_error(self, problem, line):
line = line.rstrip('\n\r')
prob_width = 66
problem = problem.ljust(prob_width)[0:(prob_width-1)]
text = line.replace("\n", "\n".ljust(prob_width + 22))
message = "%s %s" % (problem, text)
self.__add_msg(message)
class UTF8Reader(BaseReader):
def __init__(self, ifile):
BaseReader.__init__(self, ifile, 'utf8')
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'utf8', __add_msg)
self.reset()
def reset(self):
@ -1275,23 +1290,61 @@ class UTF8Reader(BaseReader):
class UTF16Reader(BaseReader):
def __init__(self, ifile):
def __init__(self, ifile, __add_msg):
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
BaseReader.__init__(self, new_file, 'utf16')
BaseReader.__init__(self, new_file, '', __add_msg)
self.reset()
def readline(self):
l = self.ifile.readline()
if l.strip():
return l
line = self.ifile.readline()
if sys.version_info[0] < 3:
line = unicode(line,
encoding='utf8',
errors='replace')
if line.strip():
return line.translate(STRIP_DICT)
else:
line = self.ifile.readline()
line = unicode(line,
encoding='utf8',
errors='replace')
return line.translate(STRIP_DICT)
else:
return self.ifile.readline()
line = line.decode('utf8', errors='replace')
return line.translate(STRIP_DICT)
class AnsiReader(BaseReader):
def __init__(self, ifile):
BaseReader.__init__(self, ifile, 'latin1')
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'latin1', __add_msg)
def readline(self):
line = self.ifile.readline()
if sys.version_info[0] < 3:
line = unicode(line,
encoding=self.enc,
errors='replace')
else:
line = line.decode(self.enc, errors='replace')
if line.translate(DEL_AND_C1) != line:
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
return line.translate(STRIP_DICT)
class CP1252Reader(BaseReader):
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
def readline(self):
line = self.ifile.readline()
if sys.version_info[0] < 3:
line = unicode(line,
encoding=self.enc,
errors='replace')
else:
line = line.decode(self.enc, errors='replace')
return line.translate(STRIP_DICT)
class AnselReader(BaseReader):
"""
ANSEL to Unicode Conversion
@ -1311,7 +1364,8 @@ class AnselReader(BaseReader):
TODO: should we allow TAB, as a Gramps extension?
"""
__printable_ascii = list(map(chr, list(range(32, 127)))) # note: up thru 126
__use_ASCII = list(map(chr, [10, 27, 29 , 30, 31])) + __printable_ascii
# LF CR Esc GS RS US
__use_ASCII = list(map(chr, [10, 13, 27, 29 , 30, 31])) + __printable_ascii
# mappings of single byte ANSEL codes to unicode
__onebyte = {
@ -1324,9 +1378,11 @@ class AnselReader(BaseReader):
b'\xB4' : '\u00fe', b'\xB5' : '\u00e6', b'\xB6' : '\u0153',
b'\xB7' : '\u02ba', b'\xB8' : '\u0131', b'\xB9' : '\u00a3',
b'\xBA' : '\u00f0', b'\xBC' : '\u01a1', b'\xBD' : '\u01b0',
b'\xBE' : '\u25a1', b'\xBF' : '\u25a0',
b'\xC0' : '\u00b0', b'\xC1' : '\u2113', b'\xC2' : '\u2117',
b'\xC3' : '\u00a9', b'\xC4' : '\u266f', b'\xC5' : '\u00bf',
b'\xC6' : '\u00a1', b'\xC7' : '\u00df', b'\xC8' : '\u20ac',
b'\xCD' : '\u0065', b'\xCE' : '\u006f', b'\xCF' : '\u00df',
}
# combining forms (in ANSEL, they precede the modified ASCII character
@ -1347,6 +1403,7 @@ class AnselReader(BaseReader):
b'\xF3' : '\u0324', b'\xF4' : '\u0325', b'\xF5' : '\u0333',
b'\xF6' : '\u0332', b'\xF7' : '\u0326', b'\xF8' : '\u031c',
b'\xF9' : '\u032e', b'\xFA' : '\ufe22', b'\xFB' : '\ufe23',
b'\xFC' : '\u0338',
b'\xFE' : '\u0313',
}
@ -1504,52 +1561,94 @@ class AnselReader(BaseReader):
b'\xF9\x48' : '\u1e2a', b'\xF9\x68' : '\u1e2b',
}
@staticmethod
def __ansel_to_unicode(s):
def __ansel_to_unicode(self, s):
""" Convert an ANSEL encoded string to unicode """
buff = StringIO()
while s:
if ord(s[0]) < 128:
if s[0] in AnselReader.__use_ASCII:
head = s[0]
else:
# substitute space for disallowed (control) chars
head = ' '
s = s[1:]
else:
if s[0:2] in AnselReader.__twobyte:
head = AnselReader.__twobyte[s[0:2]]
s = s[2:]
elif s[0] in AnselReader.__onebyte:
head = AnselReader.__onebyte[s[0]]
s = s[1:]
elif s[0] in AnselReader.__acombiners:
c = AnselReader.__acombiners[s[0]]
# always consume the combiner
s = s[1:]
next = s[0]
if next in AnselReader.__printable_ascii:
# consume next as well
s = s[1:]
# unicode: combiner follows base-char
head = next + c
else:
# just drop the unexpected combiner
continue
else:
head = '\ufffd' # "Replacement Char"
s = s[1:]
buff.write(head.encode("utf-8"))
error = ""
if sys.version_info[0] < 3:
while s:
if ord(s[0]) < 128:
if s[0] in AnselReader.__use_ASCII:
head = s[0]
else:
# substitute space for disallowed (control) chars
error += " (%#X)" % ord(s[0])
head = ' '
s = s[1:]
else:
if s[0:2] in AnselReader.__twobyte:
head = AnselReader.__twobyte[s[0:2]]
s = s[2:]
elif s[0] in AnselReader.__onebyte:
head = AnselReader.__onebyte[s[0]]
s = s[1:]
elif s[0] in AnselReader.__acombiners:
c = AnselReader.__acombiners[s[0]]
# always consume the combiner
s = s[1:]
next = s[0]
if next in AnselReader.__printable_ascii:
# consume next as well
s = s[1:]
# unicode: combiner follows base-char
head = next + c
else:
# just drop the unexpected combiner
error += " (%#X)" % ord(s[0])
continue
else:
error += " (%#X)" % ord(s[0])
head = '\ufffd' # "Replacement Char"
s = s[1:]
buff.write(head.encode("utf-8"))
ans = unicode(buff.getvalue(), "utf-8")
else:
ans = buff.getvalue().decode("utf-8")
while s:
if s[0] < 128:
if chr(s[0]) in AnselReader.__use_ASCII:
head = chr(s[0])
else:
# substitute space for disallowed (control) chars
error += " (%#X)" % s[0]
head = ' '
s = s[1:]
else:
if s[0:2] in AnselReader.__twobyte:
head = AnselReader.__twobyte[s[0:2]]
s = s[2:]
elif bytes([s[0]]) in AnselReader.__onebyte:
head = AnselReader.__onebyte[bytes([s[0]])]
s = s[1:]
elif bytes([s[0]]) in AnselReader.__acombiners:
c = AnselReader.__acombiners[bytes([s[0]])]
# always consume the combiner
s = s[1:]
next_byte = s[0]
if next_byte < 128 and chr(next_byte) in AnselReader.__printable_ascii:
# consume next as well
s = s[1:]
# unicode: combiner follows base-char
head = chr(next_byte) + c
else:
# just drop the unexpected combiner
error += " (%#X)" % s[0]
continue
else:
error += " (%#X)" % s[0]
head = '\ufffd' # "Replacement Char"
s = s[1:]
buff.write(head)
ans = buff.getvalue()
if error:
# e.g. Illegal character (oxAB) (0xCB)... 1 NOTE xyz?pqr?lmn
self.report_error(_("Illegal character%s") % error, ans)
buff.close()
return ans
def __init__(self, ifile):
BaseReader.__init__(self, ifile, "")
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, "", __add_msg)
def readline(self):
return self.__ansel_to_unicode(self.ifile.readline())
@ -2651,15 +2750,17 @@ class GedcomParser(UpdateCallback):
enc = stage_one.get_encoding()
if enc == "ANSEL":
rdr = AnselReader(ifile)
rdr = AnselReader(ifile, self.__add_msg)
elif enc in ("UTF-8", "UTF8"):
rdr = UTF8Reader(ifile)
elif enc in ("UTF-16", "UTF16", "UNICODE"):
rdr = UTF16Reader(ifile)
rdr = UTF8Reader(ifile, self.__add_msg)
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
rdr = UTF16Reader(ifile, self.__add_msg)
elif enc in ("CP1252", "WINDOWS-1252"):
rdr = CP1252Reader(ifile, self.__add_msg)
else:
rdr = AnsiReader(ifile)
rdr = AnsiReader(ifile, self.__add_msg)
self.lexer = Lexer(rdr)
self.lexer = Lexer(rdr, self.__add_msg)
self.filename = filename
self.backoff = False
@ -7163,8 +7264,13 @@ class GedcomParser(UpdateCallback):
sattr.set_value(line.data)
self.def_src.add_attribute(sattr)
elif line.token == TOKEN_FORM:
if line.data != "LINEAGE-LINKED":
self.__add_msg(_("GEDCOM form not supported"), line, state)
if line.data == "LINEAGE-LINKED":
pass
elif line.data.upper() == "LINEAGE-LINKED":
# Allow Lineage-Linked etc. though it should be in uppercase
self.__add_msg(_("GEDCOM FORM should be in uppercase"), line, state)
else:
self.__add_msg(_("GEDCOM FORM not supported"), line, state)
if self.use_def_src:
sattr = SrcAttribute()
sattr.set_type(_('GEDCOM form'))
@ -7709,7 +7815,7 @@ class GedcomStageOne(object):
input_file.read(1)
self.enc = "UTF8"
return input_file
elif line == b"\xff\xfe":
elif line == b"\xff\xfe" or line == b"\xfe\xff":
self.enc = "UTF16"
input_file.seek(0)
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
@ -7730,25 +7836,33 @@ class GedcomStageOne(object):
reader = self.__detect_file_decoder(self.ifile)
for line in reader:
# Treat the file as though it is UTF-8 since this will be right if a
# BOM was detected; it is the more modern option; and anyway it
# doesn't really matter as we are only trying to detect a CHAR line
# which is only 7-bit ASCII anyway, and we ignore anything that
# can't be translated.
if sys.version_info[0] < 3:
line = unicode(line, encoding='utf-8', errors='replace')
else:
line = line.decode(encoding='utf-8', errors='replace')
line = line.strip()
if not line:
continue
self.lcnt += 1
data = line.split(None, 2) + ['']
try:
data = line.split(None, 2) + ['']
(level, key, value) = data[:3]
level = int(level)
key = conv_to_unicode(key.strip())
value = conv_to_unicode(value.strip())
key = key.strip()
value = value.strip()
except:
LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt)
continue
if level == 0 and key[0] == '@':
if value == ("FAM", "FAMILY") :
if value in ("FAM", "FAMILY") :
current_family_id = key.strip()[1:-1]
elif value == ("INDI", "INDIVIDUAL"):
elif value in ("INDI", "INDIVIDUAL"):
self.pcnt += 1
elif key in ("HUSB", "HUSBAND", "WIFE") and \
self.__is_xref_value(value):
@ -7758,6 +7872,9 @@ class GedcomStageOne(object):
elif key == 'CHAR' and not self.enc:
assert(isinstance(value, STRTYPE))
self.enc = value
LOG.debug("parse pcnt %d" % self.pcnt)
LOG.debug("parse famc %s" % dict(self.famc))
LOG.debug("parse fams %s" % dict(self.fams))
def get_famc_map(self):
"""