2007-02-20 Don Allingham <don@gramps-project.org>
* src/GrampsDbUtils/_GedcomParse.py: Use the FAM->INDI mappings discovered in the StageOne parsing to fix missing relationships in Ancestry.com generated GEDCOM files. svn: r8198
This commit is contained in:
@@ -1,3 +1,8 @@
|
|||||||
|
2007-02-20 Don Allingham <don@gramps-project.org>
|
||||||
|
* src/GrampsDbUtils/_GedcomParse.py: Use the FAM->INDI mappings
|
||||||
|
discovered in the StageOne parsing to fix missing relationships
|
||||||
|
in Ancestry.com generated GEDCOM files.
|
||||||
|
|
||||||
2007-02-20 Benny Malengier <bm@cage.ugent.be>
|
2007-02-20 Benny Malengier <bm@cage.ugent.be>
|
||||||
* gramps_main.py : register_stock_icons
|
* gramps_main.py : register_stock_icons
|
||||||
* src/images: delete old icons
|
* src/images: delete old icons
|
||||||
|
@@ -269,31 +269,41 @@ class StageOne:
|
|||||||
self.famc = {}
|
self.famc = {}
|
||||||
self.fams = {}
|
self.fams = {}
|
||||||
self.enc = ""
|
self.enc = ""
|
||||||
|
self.pcnt = 0
|
||||||
|
self.lcnt = 0
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
current = ""
|
current = ""
|
||||||
for line in self.ifile:
|
for line in self.ifile:
|
||||||
|
self.lcnt +=1
|
||||||
data = line.split(None,2) + ['']
|
data = line.split(None,2) + ['']
|
||||||
|
try:
|
||||||
(level, key, value) = data[:3]
|
(level, key, value) = data[:3]
|
||||||
value = value.strip()
|
value = value.strip()
|
||||||
level = int(level)
|
level = int(level)
|
||||||
|
except:
|
||||||
|
print line
|
||||||
|
sys.exit(1)
|
||||||
key = key.strip()
|
key = key.strip()
|
||||||
|
|
||||||
if level == 0 and value == "FAM":
|
if level == 0:
|
||||||
|
if value == "FAM":
|
||||||
current = key.strip()
|
current = key.strip()
|
||||||
current = current[1:-1]
|
current = current[1:-1]
|
||||||
|
elif value == "INDI":
|
||||||
|
self.pcnt += 1
|
||||||
elif key in ("HUSB", "WIFE") and value and value[0] == '@':
|
elif key in ("HUSB", "WIFE") and value and value[0] == '@':
|
||||||
value = value[1:-1]
|
value = value[1:-1]
|
||||||
if self.fams.has_key(current):
|
if self.fams.has_key(value):
|
||||||
self.fams[current].append(value)
|
self.fams[value].append(current)
|
||||||
else:
|
else:
|
||||||
self.fams[current] = [value]
|
self.fams[value] = [current]
|
||||||
elif key == "CHIL" and value and value[0] == '@':
|
elif key == "CHIL" and value and value[0] == '@':
|
||||||
value = value[1:-1]
|
value = value[1:-1]
|
||||||
if self.famc.has_key(current):
|
if self.famc.has_key(value):
|
||||||
self.famc[current].append(value)
|
self.famc[value].append(current)
|
||||||
else:
|
else:
|
||||||
self.famc[current] = [value]
|
self.famc[value] = [current]
|
||||||
elif key == 'CHAR':
|
elif key == 'CHAR':
|
||||||
self.enc = value
|
self.enc = value
|
||||||
|
|
||||||
@@ -306,93 +316,11 @@ class StageOne:
|
|||||||
def get_encoding(self):
|
def get_encoding(self):
|
||||||
return self.enc
|
return self.enc
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
def get_person_count(self):
|
||||||
#
|
return self.pcnt
|
||||||
#
|
|
||||||
#
|
|
||||||
#-------------------------------------------------------------------------
|
|
||||||
class NoteParser:
|
|
||||||
"""
|
|
||||||
Performs the first pass of a GEDCOM file parse.
|
|
||||||
"""
|
|
||||||
def __init__(self, ifile, broken, override):
|
|
||||||
if override:
|
|
||||||
if override == 1:
|
|
||||||
self.cnv = ansel_to_utf8
|
|
||||||
elif override == 2:
|
|
||||||
self.cnv = latin_to_utf8
|
|
||||||
else:
|
|
||||||
self.cnv = nocnv
|
|
||||||
else:
|
|
||||||
for index in range(50):
|
|
||||||
line = ifile.readline().split()
|
|
||||||
if len(line) > 2 and line[1] == 'CHAR':
|
|
||||||
if line[2] == "ANSEL":
|
|
||||||
self.cnv = ansel_to_utf8
|
|
||||||
elif line[2] in ["UNICODE","UTF-8","UTF8"]:
|
|
||||||
self.cnv = nocnv
|
|
||||||
else:
|
|
||||||
self.cnv = latin_to_utf8
|
|
||||||
|
|
||||||
self.name_map = {}
|
def get_line_count(self):
|
||||||
|
return self.lcnt
|
||||||
self.count = 0
|
|
||||||
self.person_count = 0
|
|
||||||
self.trans = None
|
|
||||||
self.groups = None
|
|
||||||
|
|
||||||
ifile.seek(0)
|
|
||||||
innote = False
|
|
||||||
noteobj = RelLib.Note()
|
|
||||||
|
|
||||||
for line in ifile:
|
|
||||||
try:
|
|
||||||
text = line.translate(TRANS_TABLE, DEL_CHARS)
|
|
||||||
except:
|
|
||||||
text = line
|
|
||||||
|
|
||||||
try:
|
|
||||||
text = self.cnv(text)
|
|
||||||
except:
|
|
||||||
text = text.translate(TRANS_TABLE2)
|
|
||||||
|
|
||||||
self.count += 1
|
|
||||||
if innote:
|
|
||||||
|
|
||||||
match = CONT_RE.match(text)
|
|
||||||
if match:
|
|
||||||
noteobj.append("\n" + match.groups()[0])
|
|
||||||
continue
|
|
||||||
|
|
||||||
match = CONC_RE.match(text)
|
|
||||||
if match:
|
|
||||||
if broken:
|
|
||||||
noteobj.append(" " + match.groups()[0])
|
|
||||||
else:
|
|
||||||
noteobj.append(match.groups()[0])
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Here we have finished parsing CONT/CONC tags for the NOTE
|
|
||||||
# and ignored the rest of the tags (SOUR,CHAN,REFN,RIN).
|
|
||||||
innote = False
|
|
||||||
match = NOTE_RE.match(text)
|
|
||||||
if match:
|
|
||||||
data = match.groups()[0]
|
|
||||||
noteobj = RelLib.Note()
|
|
||||||
self.name_map["@%s@" % data] = noteobj
|
|
||||||
noteobj.append(match.groups()[1])
|
|
||||||
innote = True
|
|
||||||
elif PERSON_RE.match(line):
|
|
||||||
self.person_count += 1
|
|
||||||
|
|
||||||
def get_map(self):
|
|
||||||
return self.name_map
|
|
||||||
|
|
||||||
def get_lines(self):
|
|
||||||
return self.count
|
|
||||||
|
|
||||||
def get_persons(self):
|
|
||||||
return self.person_count
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
@@ -408,13 +336,13 @@ class GedcomParser(UpdateCallback):
|
|||||||
SyntaxError = "Syntax Error"
|
SyntaxError = "Syntax Error"
|
||||||
BadFile = "Not a GEDCOM file"
|
BadFile = "Not a GEDCOM file"
|
||||||
|
|
||||||
def __init__(self, dbase, ifile, filename, callback, code_set, note_map,
|
def __init__(self, dbase, ifile, filename, callback, stage_one):
|
||||||
lines, people):
|
|
||||||
UpdateCallback.__init__(self, callback)
|
UpdateCallback.__init__(self, callback)
|
||||||
self.set_total(lines)
|
|
||||||
|
self.set_total(stage_one.get_line_count())
|
||||||
|
|
||||||
self.repo2id = {}
|
self.repo2id = {}
|
||||||
self.maxpeople = people
|
self.maxpeople = stage_one.get_person_count()
|
||||||
self.dbase = dbase
|
self.dbase = dbase
|
||||||
self.emapper = GedcomUtils.IdFinder(dbase.get_gramps_ids(EVENT_KEY),
|
self.emapper = GedcomUtils.IdFinder(dbase.get_gramps_ids(EVENT_KEY),
|
||||||
dbase.eprefix)
|
dbase.eprefix)
|
||||||
@@ -424,11 +352,15 @@ class GedcomParser(UpdateCallback):
|
|||||||
self.repo_count = 0
|
self.repo_count = 0
|
||||||
self.source_count = 0
|
self.source_count = 0
|
||||||
|
|
||||||
|
self.famc_map = stage_one.get_famc_map()
|
||||||
|
self.fams_map = stage_one.get_fams_map()
|
||||||
|
|
||||||
|
print self.fams_map
|
||||||
|
|
||||||
self.place_parser = GedcomUtils.PlaceParser()
|
self.place_parser = GedcomUtils.PlaceParser()
|
||||||
self.debug = False
|
self.debug = False
|
||||||
self.inline_srcs = {}
|
self.inline_srcs = {}
|
||||||
self.media_map = {}
|
self.media_map = {}
|
||||||
self.note_map = note_map
|
|
||||||
self.refn = {}
|
self.refn = {}
|
||||||
self.added = set()
|
self.added = set()
|
||||||
self.gedmap = GedcomInfoDB()
|
self.gedmap = GedcomInfoDB()
|
||||||
@@ -441,8 +373,8 @@ class GedcomParser(UpdateCallback):
|
|||||||
self.dir_path = os.path.dirname(filename)
|
self.dir_path = os.path.dirname(filename)
|
||||||
self.localref = 0
|
self.localref = 0
|
||||||
self.placemap = {}
|
self.placemap = {}
|
||||||
self.broken_conc_list = [ 'FamilyOrigins', 'FTW' ]
|
|
||||||
self.is_ftw = False
|
self.is_ftw = False
|
||||||
|
self.is_ancestry_com = False
|
||||||
|
|
||||||
self.pid_map = GedcomUtils.IdMapper(
|
self.pid_map = GedcomUtils.IdMapper(
|
||||||
self.dbase.id_trans,
|
self.dbase.id_trans,
|
||||||
@@ -850,13 +782,13 @@ class GedcomParser(UpdateCallback):
|
|||||||
self.lexer = Reader(ifile)
|
self.lexer = Reader(ifile)
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.backoff = False
|
self.backoff = False
|
||||||
self.override = code_set
|
self.override = False
|
||||||
|
#
|
||||||
if self.override != 0:
|
# if self.override != 0:
|
||||||
if self.override == 1:
|
# if self.override == 1:
|
||||||
self.lexer.set_charset_fn(ansel_to_utf8)
|
# self.lexer.set_charset_fn(ansel_to_utf8)
|
||||||
elif self.override == 2:
|
# elif self.override == 2:
|
||||||
self.lexer.set_charset_fn(latin_to_utf8)
|
# self.lexer.set_charset_fn(latin_to_utf8)
|
||||||
|
|
||||||
fullpath = os.path.normpath(os.path.abspath(filename))
|
fullpath = os.path.normpath(os.path.abspath(filename))
|
||||||
self.geddir = os.path.dirname(fullpath)
|
self.geddir = os.path.dirname(fullpath)
|
||||||
@@ -1293,6 +1225,11 @@ class GedcomParser(UpdateCallback):
|
|||||||
# set up the state for the parsing
|
# set up the state for the parsing
|
||||||
state = GedcomUtils.CurrentState(person=self.person, level=1)
|
state = GedcomUtils.CurrentState(person=self.person, level=1)
|
||||||
|
|
||||||
|
# Ancestry.com GEDCOM files are massively broken, not providing
|
||||||
|
# the FAMC and FAMS values for a person
|
||||||
|
if self.is_ancestry_com:
|
||||||
|
self.map_ancestry_com(line.token_text.strip())
|
||||||
|
|
||||||
# do the actual parsing
|
# do the actual parsing
|
||||||
self.parse_level(state, self.indi_parse_tbl, self.func_person_event)
|
self.parse_level(state, self.indi_parse_tbl, self.func_person_event)
|
||||||
|
|
||||||
@@ -3933,12 +3870,36 @@ class GedcomParser(UpdateCallback):
|
|||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
|
def map_ancestry_com(self, original_gid):
|
||||||
|
"""
|
||||||
|
GEDCOM files created by Ancestry.com for some reason do not include
|
||||||
|
the FAMC and FAMS mappings in the INDI record. If we don't fix this,
|
||||||
|
we end up with a bunch of broken family connections. The family
|
||||||
|
references the people, but the people do not reference the family.
|
||||||
|
|
||||||
|
To resolve this, we use the mappings acquired from the first pass
|
||||||
|
of the parsing. The StageOne parser will grab the mappings from the
|
||||||
|
family to the child on the first pass, and we can use them here.
|
||||||
|
|
||||||
|
We have to make sure we use the original person ID, since the StageOne
|
||||||
|
parser does not remap colliding IDs.
|
||||||
|
"""
|
||||||
|
for fams_id in self.fams_map.get(original_gid,[]):
|
||||||
|
mapped_id = self.fid_map[fams_id]
|
||||||
|
fams_handle = self.find_family_handle(mapped_id)
|
||||||
|
self.person.add_family_handle(fams_handle)
|
||||||
|
|
||||||
|
for famc_id in self.famc_map.get(original_gid,[]):
|
||||||
|
mapped_id = self.fid_map[famc_id]
|
||||||
|
famc_handle = self.find_family_handle(mapped_id)
|
||||||
|
self.person.add_parent_family_handle(famc_handle)
|
||||||
|
|
||||||
def parse_note(self, line, obj, level):
|
def parse_note(self, line, obj, level):
|
||||||
# reference to a named note defined elsewhere
|
# reference to a named note defined elsewhere
|
||||||
if line.token == TOKEN_RNOTE:
|
if line.token == TOKEN_RNOTE:
|
||||||
obj.add_note(line.data.strip())
|
obj.add_note(line.data.strip())
|
||||||
else:
|
else:
|
||||||
new_note = Note(line.data)
|
new_note = RelLib.Note(line.data)
|
||||||
self.dbase.commit_note(new_note,self.trans)
|
self.dbase.commit_note(new_note,self.trans)
|
||||||
obj.add_note(new_note.handle)
|
obj.add_note(new_note.handle)
|
||||||
self.skip_subordinate_levels(level+1)
|
self.skip_subordinate_levels(level+1)
|
||||||
@@ -4000,6 +3961,8 @@ class GedcomParser(UpdateCallback):
|
|||||||
self.lexer.set_broken_conc(self.gedsource.get_conc())
|
self.lexer.set_broken_conc(self.gedsource.get_conc())
|
||||||
if line.data == "FTW":
|
if line.data == "FTW":
|
||||||
self.is_ftw = True
|
self.is_ftw = True
|
||||||
|
if line.data == "Ancestry.com Family Trees":
|
||||||
|
self.is_ancestry_com = True
|
||||||
genby = line.data
|
genby = line.data
|
||||||
elif line.token == TOKEN_NAME:
|
elif line.token == TOKEN_NAME:
|
||||||
pass
|
pass
|
||||||
|
@@ -26,7 +26,7 @@ import os
|
|||||||
import gtk
|
import gtk
|
||||||
|
|
||||||
import Errors
|
import Errors
|
||||||
from _GedcomParse import GedcomParser, NoteParser, StageOne
|
from _GedcomParse import GedcomParser, StageOne
|
||||||
from QuestionDialog import ErrorDialog
|
from QuestionDialog import ErrorDialog
|
||||||
from bsddb import db
|
from bsddb import db
|
||||||
|
|
||||||
@@ -71,20 +71,20 @@ def importData(database, filename, callback=None, use_trans=False):
|
|||||||
|
|
||||||
def import2(database, filename, callback, code_set, use_trans):
|
def import2(database, filename, callback, code_set, use_trans):
|
||||||
# add some checking here
|
# add some checking here
|
||||||
|
import time
|
||||||
|
t = time.time()
|
||||||
try:
|
try:
|
||||||
ifile = open(filename,"rU")
|
ifile = open(filename,"rU")
|
||||||
np = StageOne(ifile)
|
np = StageOne(ifile)
|
||||||
np.parse()
|
np.parse()
|
||||||
print np.get_encoding()
|
print np.get_encoding()
|
||||||
ifile.seek(0)
|
ifile.seek(0)
|
||||||
np = NoteParser(ifile, False, code_set)
|
gedparse = GedcomParser(database, ifile, filename, callback, np)
|
||||||
ifile.seek(0)
|
|
||||||
gedparse = GedcomParser(database, ifile, filename, callback, code_set,
|
|
||||||
np.get_map(), np.get_lines(),np.get_persons())
|
|
||||||
except IOError, msg:
|
except IOError, msg:
|
||||||
ErrorDialog(_("%s could not be opened\n") % filename, str(msg))
|
ErrorDialog(_("%s could not be opened\n") % filename, str(msg))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
if database.get_number_of_people() == 0:
|
if database.get_number_of_people() == 0:
|
||||||
use_trans = False
|
use_trans = False
|
||||||
|
|
||||||
@@ -107,6 +107,7 @@ def import2(database, filename, callback, code_set, use_trans):
|
|||||||
except Errors.GedcomError, msg:
|
except Errors.GedcomError, msg:
|
||||||
ErrorDialog(_('Error reading GEDCOM file'), str(msg))
|
ErrorDialog(_('Error reading GEDCOM file'), str(msg))
|
||||||
return
|
return
|
||||||
|
print time.time()-t
|
||||||
|
|
||||||
def import_from_string(database, text, callback, code_set, use_trans):
|
def import_from_string(database, text, callback, code_set, use_trans):
|
||||||
# add some checking here
|
# add some checking here
|
||||||
|
@@ -88,7 +88,6 @@
|
|||||||
|
|
||||||
<child>
|
<child>
|
||||||
<widget class="GtkTable" id="table15">
|
<widget class="GtkTable" id="table15">
|
||||||
<property name="border_width">12</property>
|
|
||||||
<property name="visible">True</property>
|
<property name="visible">True</property>
|
||||||
<property name="n_rows">6</property>
|
<property name="n_rows">6</property>
|
||||||
<property name="n_columns">8</property>
|
<property name="n_columns">8</property>
|
||||||
|
Reference in New Issue
Block a user