3082, 4439, 7134, 8279 Various fixes for GEDCOM import.

0003082: 1/4 and 1/2 ANSEL characters not supported on importing ANSEL GEDCOM. 0004439: [Info]: characters ignored on a Gedcom encoded ANSI (cp1252 West Europe, USA). 0007134: Failure importing ANSEL encoded gedcom file. 0008279: GEDCOM import fails for ANSI file incorrectly opened with the utf8 locale. Fixed GEDCOM import not working properly for Python3; other problems also corrected, including fixing the inability to import Windows CP1252 coded files. Also more consistent fix for 8014. Ensured any error messages are not lost. Fixed a few ANSEL characters that were not translated.
2015-03-24 16:36:12 +00:00
parent 57a367e8c4
commit 4b8ca4a824
2 changed files with 212 additions and 79 deletions
@@ -76,15 +76,31 @@ def importData(database, filename, user):
        database.__class__.__bases__ = (DbMixin,) +  \
                                        database.__class__.__bases__

-    try:
-        ifile = open(filename, "r")
-    except IOError:
-        return
+    if sys.version_info[0] < 3:
+        try:
+            ifile = open(filename, "rU")
+        except IOError:
+            return
+    else:
+        try:
+            ifile = open(filename, "rb")
+        except IOError:
+            return

+#    print("file opened")
    ansel = False
    gramps = False
    for index in range(50):
-        line = ifile.readline().split()
+        # Treat the file as though it is UTF-8 since this is the more modern
+        # option; and anyway it doesn't really matter as we are only trying to
+        # detect a CHAR or SOUR line which is only 7-bit ASCII anyway,  and we
+        # ignore anything that can't be translated.
+        line = ifile.readline()
+        if sys.version_info[0] < 3:
+            line = unicode(line, encoding='utf-8', errors='replace')
+        else:
+            line = line.decode(encoding='utf-8', errors='replace')
+        line = line.split()
        if len(line) == 0:
            break
        if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
@@ -578,6 +578,10 @@ LDS_STATUS = {
 # table for skipping illegal control chars in GEDCOM import
 # Only 09, 0A, 0D are allowed.
 STRIP_DICT = dict.fromkeys(list(range(9))+list(range(11, 13))+list(range(14, 32)))
+# Latin-1 (ISO-8859-1) does not treat the C1 control characters as undefined,
+# so they decode without error; but if they have been used, the file is
+# probably supposed to be cp1252
+DEL_AND_C1 = dict.fromkeys(list(range(0x7F, 0x9F)))

 #-------------------------------------------------------------------------
 #
@@ -689,7 +693,7 @@ class GedcomDateParser(DateParser):
 #-------------------------------------------------------------------------
 class Lexer(object):

-    def __init__(self, ifile):
+    def __init__(self, ifile, __add_msg):
        self.ifile = ifile
        self.current_list = []
        self.eof = False
@@ -700,6 +704,7 @@ class Lexer(object):
            TOKEN_CONT : self.__fix_token_cont,
            TOKEN_CONC : self.__fix_token_conc,
            }
+        self.__add_msg = __add_msg

    def readline(self):
        if len(self.current_list) <= 1 and not self.eof:
@@ -738,6 +743,7 @@ class Lexer(object):
                self.eof = True
                return

+            original_line = line
            try:
                # According to the GEDCOM 5.5 standard,
                # Chapter 1 subsection Grammar
@@ -771,6 +777,13 @@ class Lexer(object):
                    tag = line[0]
                    line_value = line[2]
            except:
+                problem = _("Line ignored ")
+                text = original_line.rstrip('\n\r')
+                prob_width = 66
+                problem = problem.ljust(prob_width)[0:(prob_width-1)]
+                text = text.replace("\n", "\n".ljust(prob_width + 22))
+                message = "%s              %s" % (problem, text)
+                self.__add_msg(message)
                continue

            token = TOKENS.get(tag, TOKEN_UNKNOWN)
@@ -1234,27 +1247,29 @@ class GedInfoParser(object):
 #
 #-------------------------------------------------------------------------
 class BaseReader(object):
-    def __init__(self, ifile, encoding):
+    def __init__(self, ifile, encoding, __add_msg):
        self.ifile = ifile
        self.enc = encoding
+        self.__add_msg = __add_msg

    def reset(self):
        self.ifile.seek(0)

    def readline(self):
-        if sys.version_info[0] < 3:
-            line = unicode(self.ifile.readline(), 
-                           encoding=self.enc,
-                           errors='replace')
-        else:
-            line = self.ifile.readline()
-            line = line.decode(self.enc, errors='replace')
-        return line.translate(STRIP_DICT)
+        raise NotImplemented
+
+    def report_error(self, problem, line):
+        line = line.rstrip('\n\r')
+        prob_width = 66
+        problem = problem.ljust(prob_width)[0:(prob_width-1)]
+        text = line.replace("\n", "\n".ljust(prob_width + 22))
+        message = "%s               %s" % (problem, text)
+        self.__add_msg(message)

 class UTF8Reader(BaseReader):

-    def __init__(self, ifile):
-        BaseReader.__init__(self, ifile, 'utf8')
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, 'utf8', __add_msg)
        self.reset()

    def reset(self):
@@ -1275,23 +1290,61 @@ class UTF8Reader(BaseReader):

 class UTF16Reader(BaseReader):

-    def __init__(self, ifile):
+    def __init__(self, ifile, __add_msg):
        new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
-        BaseReader.__init__(self, new_file, 'utf16')
+        BaseReader.__init__(self, new_file, '', __add_msg)
        self.reset()

    def readline(self):
-        l = self.ifile.readline()
-        if l.strip():
-            return l
+        line = self.ifile.readline()
+        if sys.version_info[0] < 3:
+            line = unicode(line,
+                           encoding='utf8',
+                           errors='replace')
+            if line.strip():
+                return line.translate(STRIP_DICT)
+            else:
+                line = self.ifile.readline()
+                line = unicode(line,
+                               encoding='utf8',
+                               errors='replace')
+                return line.translate(STRIP_DICT)
        else:
-            return self.ifile.readline()
+            line = line.decode('utf8', errors='replace')
+            return line.translate(STRIP_DICT)

 class AnsiReader(BaseReader):

-    def __init__(self, ifile):
-        BaseReader.__init__(self, ifile, 'latin1')
-    
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, 'latin1', __add_msg)
+   
+    def readline(self):
+        line = self.ifile.readline()
+        if sys.version_info[0] < 3:
+            line = unicode(line, 
+                           encoding=self.enc,
+                           errors='replace')
+        else:
+            line = line.decode(self.enc, errors='replace')
+        if line.translate(DEL_AND_C1) != line:
+            self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
+        return line.translate(STRIP_DICT)
+
+class CP1252Reader(BaseReader):
+
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
+   
+    def readline(self):
+        line = self.ifile.readline()
+        if sys.version_info[0] < 3:
+            line = unicode(line, 
+                           encoding=self.enc,
+                           errors='replace')
+        else:
+            line = line.decode(self.enc, errors='replace')
+        return line.translate(STRIP_DICT)
+
 class AnselReader(BaseReader):
    """
    ANSEL to Unicode Conversion
@@ -1311,7 +1364,8 @@ class AnselReader(BaseReader):
    TODO: should we allow TAB, as a Gramps extension?
    """
    __printable_ascii = list(map(chr, list(range(32, 127)))) # note: up thru 126
-    __use_ASCII = list(map(chr, [10, 27, 29 , 30, 31])) + __printable_ascii
+    #                            LF  CR  Esc GS   RS  US
+    __use_ASCII = list(map(chr, [10, 13, 27, 29 , 30, 31])) + __printable_ascii
    
    # mappings of single byte ANSEL codes to unicode
    __onebyte = {
@@ -1324,9 +1378,11 @@ class AnselReader(BaseReader):
         b'\xB4' : '\u00fe',   b'\xB5' : '\u00e6',   b'\xB6' : '\u0153',   
         b'\xB7' : '\u02ba',   b'\xB8' : '\u0131',   b'\xB9' : '\u00a3',   
         b'\xBA' : '\u00f0',   b'\xBC' : '\u01a1',   b'\xBD' : '\u01b0',   
+         b'\xBE' : '\u25a1',   b'\xBF' : '\u25a0',
         b'\xC0' : '\u00b0',   b'\xC1' : '\u2113',   b'\xC2' : '\u2117',   
         b'\xC3' : '\u00a9',   b'\xC4' : '\u266f',   b'\xC5' : '\u00bf',   
         b'\xC6' : '\u00a1',   b'\xC7' : '\u00df',   b'\xC8' : '\u20ac',  
+         b'\xCD' : '\u0065',   b'\xCE' : '\u006f',   b'\xCF' : '\u00df',
        }
    
    # combining forms (in ANSEL, they precede the modified ASCII character
@@ -1347,6 +1403,7 @@ class AnselReader(BaseReader):
         b'\xF3' : '\u0324',   b'\xF4' : '\u0325',   b'\xF5' : '\u0333',   
         b'\xF6' : '\u0332',   b'\xF7' : '\u0326',   b'\xF8' : '\u031c',   
         b'\xF9' : '\u032e',   b'\xFA' : '\ufe22',   b'\xFB' : '\ufe23',   
+         b'\xFC' : '\u0338',
         b'\xFE' : '\u0313',  
       }
    
@@ -1504,52 +1561,94 @@ class AnselReader(BaseReader):
         b'\xF9\x48' : '\u1e2a',   b'\xF9\x68' : '\u1e2b',  
       }

-    @staticmethod
-    def __ansel_to_unicode(s):
+    def __ansel_to_unicode(self, s):
        """ Convert an ANSEL encoded string to unicode """
    
        buff = StringIO()
-        while s:
-            if ord(s[0]) < 128:
-                if s[0] in AnselReader.__use_ASCII:
-                    head = s[0]
-                else:
-                    # substitute space for disallowed (control) chars
-                    head = ' '
-                s = s[1:]
-            else:
-                if s[0:2] in AnselReader.__twobyte:
-                    head = AnselReader.__twobyte[s[0:2]]
-                    s = s[2:]
-                elif s[0] in AnselReader.__onebyte:
-                    head = AnselReader.__onebyte[s[0]]
-                    s = s[1:]
-                elif s[0] in AnselReader.__acombiners:
-                    c =  AnselReader.__acombiners[s[0]]
-                    # always consume the combiner
-                    s = s[1:]
-                    next = s[0]
-                    if next in AnselReader.__printable_ascii:
-                        # consume next as well
-                        s = s[1:]
-                        # unicode: combiner follows base-char
-                        head = next + c
-                    else:
-                        # just drop the unexpected combiner
-                        continue 
-                else:
-                    head = '\ufffd' # "Replacement Char"
-                    s = s[1:]
-            buff.write(head.encode("utf-8"))
+        error = ""
        if sys.version_info[0] < 3:
+            while s:
+                if ord(s[0]) < 128:
+                    if s[0] in AnselReader.__use_ASCII:
+                        head = s[0]
+                    else:
+                        # substitute space for disallowed (control) chars
+                        error += " (%#X)" % ord(s[0])
+                        head = ' '
+                    s = s[1:]
+                else:
+                    if s[0:2] in AnselReader.__twobyte:
+                        head = AnselReader.__twobyte[s[0:2]]
+                        s = s[2:]
+                    elif s[0] in AnselReader.__onebyte:
+                        head = AnselReader.__onebyte[s[0]]
+                        s = s[1:]
+                    elif s[0] in AnselReader.__acombiners:
+                        c =  AnselReader.__acombiners[s[0]]
+                        # always consume the combiner
+                        s = s[1:]
+                        next = s[0]
+                        if next in AnselReader.__printable_ascii:
+                            # consume next as well
+                            s = s[1:]
+                            # unicode: combiner follows base-char
+                            head = next + c
+                        else:
+                            # just drop the unexpected combiner
+                            error += " (%#X)" % ord(s[0])
+                            continue 
+                    else:
+                        error += " (%#X)" % ord(s[0])
+                        head = '\ufffd' # "Replacement Char"
+                        s = s[1:]
+                buff.write(head.encode("utf-8"))
            ans = unicode(buff.getvalue(), "utf-8")
        else:
-            ans = buff.getvalue().decode("utf-8")
+            while s:
+                if s[0] < 128:
+                    if chr(s[0]) in AnselReader.__use_ASCII:
+                        head = chr(s[0])
+                    else:
+                        # substitute space for disallowed (control) chars
+                        error += " (%#X)" % s[0]
+                        head = ' '
+                    s = s[1:]
+                else:
+                    if s[0:2] in AnselReader.__twobyte:
+                        head = AnselReader.__twobyte[s[0:2]]
+                        s = s[2:]
+                    elif bytes([s[0]]) in AnselReader.__onebyte:
+                        head = AnselReader.__onebyte[bytes([s[0]])]
+                        s = s[1:]
+                    elif bytes([s[0]]) in AnselReader.__acombiners:
+                        c =  AnselReader.__acombiners[bytes([s[0]])]
+                        # always consume the combiner
+                        s = s[1:]
+                        next_byte = s[0]
+                        if next_byte < 128 and chr(next_byte) in AnselReader.__printable_ascii:
+                            # consume next as well
+                            s = s[1:]
+                            # unicode: combiner follows base-char
+                            head = chr(next_byte) + c
+                        else:
+                            # just drop the unexpected combiner
+                            error += " (%#X)" % s[0]
+                            continue 
+                    else:
+                        error += " (%#X)" % s[0]
+                        head = '\ufffd' # "Replacement Char"
+                        s = s[1:]
+                buff.write(head)
+            ans = buff.getvalue()
+
+        if error:
+            # e.g. Illegal character (0XAB) (0XCB)... 1 NOTE xyz?pqr?lmn
+            self.report_error(_("Illegal character%s") % error, ans)
        buff.close()
        return ans

-    def __init__(self, ifile):
-        BaseReader.__init__(self, ifile, "")
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, "", __add_msg)

    def readline(self):
        return self.__ansel_to_unicode(self.ifile.readline())
@@ -2651,15 +2750,17 @@ class GedcomParser(UpdateCallback):
        enc = stage_one.get_encoding()

        if enc == "ANSEL":
-            rdr = AnselReader(ifile)
+            rdr = AnselReader(ifile, self.__add_msg)
        elif enc in ("UTF-8", "UTF8"):
-            rdr = UTF8Reader(ifile)
-        elif enc in ("UTF-16", "UTF16", "UNICODE"):
-            rdr = UTF16Reader(ifile)
+            rdr = UTF8Reader(ifile, self.__add_msg)
+        elif enc in ("UTF-16LE", "UTF-16BE",  "UTF16", "UNICODE"):
+            rdr = UTF16Reader(ifile, self.__add_msg)
+        elif enc in ("CP1252", "WINDOWS-1252"):
+            rdr = CP1252Reader(ifile, self.__add_msg)
        else:
-            rdr = AnsiReader(ifile)
+            rdr = AnsiReader(ifile, self.__add_msg)

-        self.lexer = Lexer(rdr)
+        self.lexer = Lexer(rdr, self.__add_msg)
        self.filename = filename
        self.backoff = False

@@ -7163,8 +7264,13 @@ class GedcomParser(UpdateCallback):
                    sattr.set_value(line.data)
                    self.def_src.add_attribute(sattr)
            elif line.token == TOKEN_FORM:
-                if line.data != "LINEAGE-LINKED":
-                    self.__add_msg(_("GEDCOM form not supported"), line, state)
+                if line.data == "LINEAGE-LINKED":
+                    pass
+                elif line.data.upper() == "LINEAGE-LINKED":
+                    # Allow Lineage-Linked etc. though it should be in uppercase
+                    self.__add_msg(_("GEDCOM FORM should be in uppercase"), line, state)
+                else:
+                    self.__add_msg(_("GEDCOM FORM not supported"), line, state)
                if self.use_def_src:
                    sattr = SrcAttribute()
                    sattr.set_type(_('GEDCOM form'))
@@ -7709,7 +7815,7 @@ class GedcomStageOne(object):
            input_file.read(1)
            self.enc = "UTF8"
            return input_file
-        elif line == b"\xff\xfe":
+        elif line == b"\xff\xfe" or line == b"\xfe\xff":
            self.enc = "UTF16"
            input_file.seek(0)
            return codecs.EncodedFile(input_file, 'utf8', 'utf16')
@@ -7730,25 +7836,33 @@ class GedcomStageOne(object):
        reader = self.__detect_file_decoder(self.ifile)

        for line in reader:
+            # Treat the file as though it is UTF-8 since this will be right if a
+            # BOM was detected; it is the more modern option; and anyway it
+            # doesn't really matter as we are only trying to detect a CHAR line
+            # which is only 7-bit ASCII anyway,  and we ignore anything that
+            # can't be translated.
+            if sys.version_info[0] < 3:
+                line = unicode(line, encoding='utf-8', errors='replace')
+            else:
+                line = line.decode(encoding='utf-8', errors='replace')
            line = line.strip()
            if not line:
                continue
            self.lcnt += 1

-            data = line.split(None, 2) + ['']
            try:
+                data = line.split(None, 2) + ['']
                (level, key, value) = data[:3]
                level = int(level)
-                key = conv_to_unicode(key.strip())
-                value = conv_to_unicode(value.strip())
+                key = key.strip()
+                value = value.strip()
            except:
-                LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt)
                continue

            if level == 0 and key[0] == '@':
-                if value == ("FAM", "FAMILY") :
+                if value in ("FAM", "FAMILY") :
                    current_family_id = key.strip()[1:-1]
-                elif value == ("INDI", "INDIVIDUAL"):
+                elif value in ("INDI", "INDIVIDUAL"):
                    self.pcnt += 1
            elif key in ("HUSB", "HUSBAND", "WIFE") and \
                 self.__is_xref_value(value):
@@ -7758,6 +7872,9 @@ class GedcomStageOne(object):
            elif key == 'CHAR' and not self.enc:
                assert(isinstance(value, STRTYPE))
                self.enc = value
+        LOG.debug("parse pcnt %d" % self.pcnt)
+        LOG.debug("parse famc %s" % dict(self.famc))
+        LOG.debug("parse fams %s" % dict(self.fams))

    def get_famc_map(self):
        """