Faster GEDCOM imports, source lists

svn: r448
2001-10-06 18:25:31 +00:00
parent 17ab259c4a
commit ebe4af348d
18 changed files with 738 additions and 560 deletions
--- a/src/plugins/ReadGedcom.py
+++ b/src/plugins/ReadGedcom.py
@ -48,6 +48,11 @@ glade_file = None
 clear_data = 0
 is_ftw = 0

+def nocnv(s):  # default line converter (bound to _cnv below): returns the text unchanged
+    return s
+
+_cnv = nocnv
+
 photo_types = [ "jpeg", "bmp", "pict", "pntg", "tpic", "png", "gif",
                "tiff", "pcx" ]

@ -69,6 +74,8 @@ lineRE = re.compile(r"\s*(\d+)\s+(\S+)\s*(.*)$")
 headRE = re.compile(r"\s*(\d+)\s+HEAD")
 nameRegexp = re.compile(r"([\S\s]*\S)?\s*/([^/]+)?/\s*,?\s*([\S]+)?")

+placemap = {}
+
 #-------------------------------------------------------------------------
 #
 #
@ -96,7 +103,9 @@ def find_file(fullname,altpath):
 #
 #
 #-------------------------------------------------------------------------
+
 def importData(database, filename):
+
    global callback
    global topDialog
    global glade_file
@ -125,12 +134,17 @@ def importData(database, filename):
        GnomeErrorDialog(_("%s could not be opened\n") % filename)
        return

+    import time
+    t1 = time.time()
    g.parse_gedcom_file()
+    t2 = time.time()
+    print t2-t1

    statusTop.get_widget("close").set_sensitive(1)

    utils.modified()
-    callback(1)
+    if callback:
+        callback(1)

 #-------------------------------------------------------------------------
 #
@ -183,9 +197,9 @@ class GedcomParser:
        self.dir_path = os.path.dirname(file)
        self.localref = 0

-        f = open(file,"r")
-        self.lines = f.readlines()
-        f.close()
+        self.f = open(file,"r")
+        self.index = 0
+        self.backoff = 0

        self.file_obj = window.get_widget("file")
        self.encoding_obj = window.get_widget("encoding")
@ -200,8 +214,6 @@ class GedcomParser:
        self.error_text_obj.set_word_wrap(0)
        
        self.update(self.file_obj,file)
-        
-	self.index = 0
        self.code = 0

    #---------------------------------------------------------------------
@ -220,24 +232,27 @@ class GedcomParser:
    #
    #---------------------------------------------------------------------
    def get_next(self):
-        line = string.replace(self.lines[self.index],'\r','')
-        if self.code == ANSEL:
-            line = latin_ansel.ansel_to_latin(line)
-        elif self.code == UNICODE:
-            line = latin_utf8.utf8_to_latin(line)
-	match = lineRE.match(line)
-        if not match:
-            msg = _("Warning: line %d was not understood, so it was ignored.") % self.index
-            self.error_text_obj.insert_defaults(msg)
-            msg = "\n\t%s\n" % self.lines[self.index-1]
-            self.error_text_obj.insert_defaults(msg)
-            self.error_count = self.error_count + 1
-            self.update(self.errors_obj,str(self.error_count))
-            match = lineRE.match("999 XXX XXX")
-        
-        self.index = self.index + 1
-    	return match.groups()
-
+        if self.backoff == 0:
+            self.text = _cnv(string.strip(self.f.readline()))
+            self.index = self.index + 1
+            l = string.split(self.text, None, 2)
+            ln = len(l)
+            try:
+                if ln == 2:
+                    self.groups = (int(l[0]),l[1],"")
+                else:
+                    self.groups = (int(l[0]),l[1],l[2])
+            except:
+                msg = _("Warning: line %d was not understood, so it was ignored.") % self.index
+                self.error_text_obj.insert_defaults(msg)
+                msg = "\n\t%s\n" % self.text
+                self.error_text_obj.insert_defaults(msg)
+                self.error_count = self.error_count + 1
+                self.update(self.errors_obj,str(self.error_count))
+                self.groups = (999, "XXX", "XXX")
+        self.backoff = 0
+        return self.groups
+            
    #---------------------------------------------------------------------
    #
    #
@ -246,7 +261,7 @@ class GedcomParser:
    def barf(self,level):
        msg = _("Warning: line %d was not understood, so it was ignored.") % self.index
        self.error_text_obj.insert_defaults(msg)
-        msg = "\n\t%s\n" % self.lines[self.index-1]
+        msg = "\n\t%s\n" % self.text
        self.error_text_obj.insert_defaults(msg)
        self.error_count = self.error_count + 1
        self.update(self.errors_obj,str(self.error_count))
@ -268,7 +283,7 @@ class GedcomParser:
    #
    #---------------------------------------------------------------------
     def backup(self):
-        self.index = self.index - 1
+        self.backoff = 1  # makes the next get_next() return the cached self.groups instead of reading a line

    #---------------------------------------------------------------------
    #
@ -296,7 +311,8 @@ class GedcomParser:

        if matches[1] != "TRLR":
 	    self.barf(0)
-
+        self.f.close()
+        
    #---------------------------------------------------------------------
    #
    #
@ -325,7 +341,7 @@ class GedcomParser:
    def parse_submitter(self):
 	matches = self.get_next()

-        if matches[2] != "SUBN":
+        if matches[2] != "SUBM":
            self.backup()
 	    return
        else:
@ -414,7 +430,6 @@ class GedcomParser:
                    noteobj.set(text + self.parse_continue_data(1))
                self.parse_note_data(1)
            elif matches[2] == "OBJE":
-                print "OBJE",matches[1]
                self.ignore_sub_junk(1)
 	    elif matches[1] == "TRLR":
                self.backup()
@ -505,6 +520,11 @@ class GedcomParser:
                        child.setMainFamily(None)
                    child.addAltFamily(self.family,mrel,frel)
 	    elif matches[1] == "NCHI" or matches[1] == "RIN" or matches[1] == "SUBM":  
+                a = Attribute()
+                a.setType("Number of Children")
+                a.setValue(matches[2])
+                self.family.addAttribute(a)
+	    elif matches[1] == "RIN" or matches[1] == "SUBM":  
                pass
            elif matches[1] == "REFN" or matches[1] == "CHAN":
                self.ignore_sub_junk(2)
@ -601,7 +621,9 @@ class GedcomParser:
                        self.nmap[matches[2]] = noteobj
                        self.person.setNoteObj(noteobj)
 	    elif matches[1] == "SEX":
-                if matches[2][0] == "M":
+                if matches[2] == '':
+                    self.person.setGender(Person.unknown)
+                elif matches[2][0] == "M":
                    self.person.setGender(Person.male)
                else:
                    self.person.setGender(Person.female)
@ -838,7 +860,7 @@ class GedcomParser:
            elif matches[1] == "SOUR":
                source_ref = SourceRef()
                source_ref.setBase(self.db.findSource(matches[2],self.smap))
-                address.setSourceRef(source_ref)
+                address.addSourceRef(source_ref)
                self.parse_source_reference(source_ref,level+1)
            elif matches[1] == "PLAC":
                address.setStreet(matches[2])
@ -937,7 +959,7 @@ class GedcomParser:
                else:
                    source_ref.setBase(self.db.findSource(matches[2],self.smap))
                    self.parse_source_reference(source_ref,level+1)
-                event.setSourceRef(source_ref)
+                event.addSourceRef(source_ref)
            elif matches[1] == "FAMC":
                family = self.db.findFamily(matches[2],self.fmap)
                if event.getName() == "Birth":
@ -953,11 +975,8 @@ class GedcomParser:
                    event.setDescription(val)
                    self.ignore_sub_junk(level+1)
                else:
-                    place = None
-                    for p in self.db.getPlaceMap().values():
-                        if val == p.get_title():
-                            place = p
-                            break
+                    if placemap.has_key(val):
+                        place = placemap[val]
                    else:
                        place = Place()
                        place.set_title(matches[2])
@ -1028,14 +1047,11 @@ class GedcomParser:
                else:
                    source_ref.setBase(self.db.findSource(matches[2],self.smap))
                    self.parse_source_reference(source_ref,level+1)
-                event.setSourceRef(source_ref)
+                event.addSourceRef(source_ref)
            elif matches[1] == "PLAC":
                val = matches[2]
-                place = None
-                for p in self.db.getPlaceMap().values():
-                    if val == p.get_title():
-                        place = p
-                        break
+                if placemap.has_key(val):
+                    place = placemap[val]
                else:
                    place = Place()
                    place.set_title(matches[2])
@ -1145,7 +1161,7 @@ class GedcomParser:
            elif matches[1] == "SOUR":
                source_ref = SourceRef()
                source_ref.setBase(self.db.findSource(matches[2],self.smap))
-                name.setSourceRef(source_ref)
+                name.addSourceRef(source_ref)
                self.parse_source_reference(source_ref,level+1)
            elif matches[1][0:4] == "NOTE":
                if matches[2] and matches[2][0] != "@":
@ -1164,10 +1180,10 @@ class GedcomParser:

     def parse_header_head(self):
-        """validiates that this is a valid GEDCOM file"""
+        """Check that the next line read is a GEDCOM HEAD record; raise BadFile if not."""
-        line = string.replace(self.lines[self.index],'\r','')
+        line = string.replace(self.f.readline(),'\r','')
 	match = headRE.search(line)
         if not match:
-	    raise GedcomParser.BadFile, self.lines[self.index]
+	    raise GedcomParser.BadFile, line
         self.index = self.index + 1

    #---------------------------------------------------------------------
@ -1177,6 +1193,7 @@ class GedcomParser:
    #---------------------------------------------------------------------
    def parse_header_source(self):
        global is_ftw
+        global _cnv
        
        while 1:
 	    matches = self.get_next()
@ -1211,8 +1228,10 @@ class GedcomParser:
                if matches[2] == "UNICODE" or matches[2] == "UTF-8" or \
                   matches[2] == "UTF8":
                    self.code = UNICODE
+                    _cnv = latin_utf8.utf8_to_latin
                elif matches[2] == "ANSEL":
                    self.code = ANSEL
+                    _cnv = latin_ansel.ansel_to_latin
                self.ignore_sub_junk(2)
                self.update(self.encoding_obj,matches[2])
   	    elif matches[1] == "GEDC":
@ -1598,3 +1617,13 @@ def readData(database,active_person,cb):
 from Plugins import register_import

 register_import(readData,_("Import from GEDCOM"))
+
+if __name__ == "__main__":  # script entry: profile an import of the file named in sys.argv[1]
+    import profile
+    import sys
+    global db  # has no effect at module level; db is already a module global here
+
+    glade_file = "plugins/gedcomimport.glade"
+    
+    db = RelDataBase()
+    profile.run('importData(db,sys.argv[1])')  # profile.run executes this statement string under the profiler