2007-02-20 Don Allingham <don@gramps-project.org>
* src/GrampsDbUtils/_GedcomParse.py: Use the FAM->INDI mappings discovered in the StageOne parsing to fix missing relationships in Ancestry.com generated GEDCOM files. svn: r8198
This commit is contained in:
parent
70e12c490e
commit
c7e9961040
@ -1,3 +1,8 @@
|
||||
2007-02-20 Don Allingham <don@gramps-project.org>
|
||||
* src/GrampsDbUtils/_GedcomParse.py: Use the FAM->INDI mappings
|
||||
discovered in the StageOne parsing to fix missing relationships
|
||||
in Ancestry.com generated GEDCOM files.
|
||||
|
||||
2007-02-20 Benny Malengier <bm@cage.ugent.be>
|
||||
* gramps_main.py : register_stock_icons
|
||||
* src/images: delete old icons
|
||||
|
@ -269,31 +269,41 @@ class StageOne:
|
||||
self.famc = {}
|
||||
self.fams = {}
|
||||
self.enc = ""
|
||||
self.pcnt = 0
|
||||
self.lcnt = 0
|
||||
|
||||
def parse(self):
|
||||
current = ""
|
||||
for line in self.ifile:
|
||||
self.lcnt +=1
|
||||
data = line.split(None,2) + ['']
|
||||
(level, key, value) = data[:3]
|
||||
value = value.strip()
|
||||
level = int(level)
|
||||
try:
|
||||
(level, key, value) = data[:3]
|
||||
value = value.strip()
|
||||
level = int(level)
|
||||
except:
|
||||
print line
|
||||
sys.exit(1)
|
||||
key = key.strip()
|
||||
|
||||
if level == 0 and value == "FAM":
|
||||
current = key.strip()
|
||||
current = current[1:-1]
|
||||
if level == 0:
|
||||
if value == "FAM":
|
||||
current = key.strip()
|
||||
current = current[1:-1]
|
||||
elif value == "INDI":
|
||||
self.pcnt += 1
|
||||
elif key in ("HUSB", "WIFE") and value and value[0] == '@':
|
||||
value = value[1:-1]
|
||||
if self.fams.has_key(current):
|
||||
self.fams[current].append(value)
|
||||
if self.fams.has_key(value):
|
||||
self.fams[value].append(current)
|
||||
else:
|
||||
self.fams[current] = [value]
|
||||
self.fams[value] = [current]
|
||||
elif key == "CHIL" and value and value[0] == '@':
|
||||
value = value[1:-1]
|
||||
if self.famc.has_key(current):
|
||||
self.famc[current].append(value)
|
||||
if self.famc.has_key(value):
|
||||
self.famc[value].append(current)
|
||||
else:
|
||||
self.famc[current] = [value]
|
||||
self.famc[value] = [current]
|
||||
elif key == 'CHAR':
|
||||
self.enc = value
|
||||
|
||||
@ -306,93 +316,11 @@ class StageOne:
|
||||
def get_encoding(self):
|
||||
return self.enc
|
||||
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
class NoteParser:
|
||||
"""
|
||||
Performs the first pass of a GEDCOM file parse.
|
||||
"""
|
||||
def __init__(self, ifile, broken, override):
|
||||
if override:
|
||||
if override == 1:
|
||||
self.cnv = ansel_to_utf8
|
||||
elif override == 2:
|
||||
self.cnv = latin_to_utf8
|
||||
else:
|
||||
self.cnv = nocnv
|
||||
else:
|
||||
for index in range(50):
|
||||
line = ifile.readline().split()
|
||||
if len(line) > 2 and line[1] == 'CHAR':
|
||||
if line[2] == "ANSEL":
|
||||
self.cnv = ansel_to_utf8
|
||||
elif line[2] in ["UNICODE","UTF-8","UTF8"]:
|
||||
self.cnv = nocnv
|
||||
else:
|
||||
self.cnv = latin_to_utf8
|
||||
def get_person_count(self):
|
||||
return self.pcnt
|
||||
|
||||
self.name_map = {}
|
||||
|
||||
self.count = 0
|
||||
self.person_count = 0
|
||||
self.trans = None
|
||||
self.groups = None
|
||||
|
||||
ifile.seek(0)
|
||||
innote = False
|
||||
noteobj = RelLib.Note()
|
||||
|
||||
for line in ifile:
|
||||
try:
|
||||
text = line.translate(TRANS_TABLE, DEL_CHARS)
|
||||
except:
|
||||
text = line
|
||||
|
||||
try:
|
||||
text = self.cnv(text)
|
||||
except:
|
||||
text = text.translate(TRANS_TABLE2)
|
||||
|
||||
self.count += 1
|
||||
if innote:
|
||||
|
||||
match = CONT_RE.match(text)
|
||||
if match:
|
||||
noteobj.append("\n" + match.groups()[0])
|
||||
continue
|
||||
|
||||
match = CONC_RE.match(text)
|
||||
if match:
|
||||
if broken:
|
||||
noteobj.append(" " + match.groups()[0])
|
||||
else:
|
||||
noteobj.append(match.groups()[0])
|
||||
continue
|
||||
|
||||
# Here we have finished parsing CONT/CONC tags for the NOTE
|
||||
# and ignored the rest of the tags (SOUR,CHAN,REFN,RIN).
|
||||
innote = False
|
||||
match = NOTE_RE.match(text)
|
||||
if match:
|
||||
data = match.groups()[0]
|
||||
noteobj = RelLib.Note()
|
||||
self.name_map["@%s@" % data] = noteobj
|
||||
noteobj.append(match.groups()[1])
|
||||
innote = True
|
||||
elif PERSON_RE.match(line):
|
||||
self.person_count += 1
|
||||
|
||||
def get_map(self):
|
||||
return self.name_map
|
||||
|
||||
def get_lines(self):
|
||||
return self.count
|
||||
|
||||
def get_persons(self):
|
||||
return self.person_count
|
||||
def get_line_count(self):
|
||||
return self.lcnt
|
||||
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
@ -408,13 +336,13 @@ class GedcomParser(UpdateCallback):
|
||||
SyntaxError = "Syntax Error"
|
||||
BadFile = "Not a GEDCOM file"
|
||||
|
||||
def __init__(self, dbase, ifile, filename, callback, code_set, note_map,
|
||||
lines, people):
|
||||
def __init__(self, dbase, ifile, filename, callback, stage_one):
|
||||
UpdateCallback.__init__(self, callback)
|
||||
self.set_total(lines)
|
||||
|
||||
self.set_total(stage_one.get_line_count())
|
||||
|
||||
self.repo2id = {}
|
||||
self.maxpeople = people
|
||||
self.maxpeople = stage_one.get_person_count()
|
||||
self.dbase = dbase
|
||||
self.emapper = GedcomUtils.IdFinder(dbase.get_gramps_ids(EVENT_KEY),
|
||||
dbase.eprefix)
|
||||
@ -424,11 +352,15 @@ class GedcomParser(UpdateCallback):
|
||||
self.repo_count = 0
|
||||
self.source_count = 0
|
||||
|
||||
self.famc_map = stage_one.get_famc_map()
|
||||
self.fams_map = stage_one.get_fams_map()
|
||||
|
||||
print self.fams_map
|
||||
|
||||
self.place_parser = GedcomUtils.PlaceParser()
|
||||
self.debug = False
|
||||
self.inline_srcs = {}
|
||||
self.media_map = {}
|
||||
self.note_map = note_map
|
||||
self.refn = {}
|
||||
self.added = set()
|
||||
self.gedmap = GedcomInfoDB()
|
||||
@ -441,8 +373,8 @@ class GedcomParser(UpdateCallback):
|
||||
self.dir_path = os.path.dirname(filename)
|
||||
self.localref = 0
|
||||
self.placemap = {}
|
||||
self.broken_conc_list = [ 'FamilyOrigins', 'FTW' ]
|
||||
self.is_ftw = False
|
||||
self.is_ancestry_com = False
|
||||
|
||||
self.pid_map = GedcomUtils.IdMapper(
|
||||
self.dbase.id_trans,
|
||||
@ -850,13 +782,13 @@ class GedcomParser(UpdateCallback):
|
||||
self.lexer = Reader(ifile)
|
||||
self.filename = filename
|
||||
self.backoff = False
|
||||
self.override = code_set
|
||||
|
||||
if self.override != 0:
|
||||
if self.override == 1:
|
||||
self.lexer.set_charset_fn(ansel_to_utf8)
|
||||
elif self.override == 2:
|
||||
self.lexer.set_charset_fn(latin_to_utf8)
|
||||
self.override = False
|
||||
#
|
||||
# if self.override != 0:
|
||||
# if self.override == 1:
|
||||
# self.lexer.set_charset_fn(ansel_to_utf8)
|
||||
# elif self.override == 2:
|
||||
# self.lexer.set_charset_fn(latin_to_utf8)
|
||||
|
||||
fullpath = os.path.normpath(os.path.abspath(filename))
|
||||
self.geddir = os.path.dirname(fullpath)
|
||||
@ -1293,6 +1225,11 @@ class GedcomParser(UpdateCallback):
|
||||
# set up the state for the parsing
|
||||
state = GedcomUtils.CurrentState(person=self.person, level=1)
|
||||
|
||||
# Ancestry.com GEDCOM files are massively broken, not providing
|
||||
# the FAMC and FAMS values for a person
|
||||
if self.is_ancestry_com:
|
||||
self.map_ancestry_com(line.token_text.strip())
|
||||
|
||||
# do the actual parsing
|
||||
self.parse_level(state, self.indi_parse_tbl, self.func_person_event)
|
||||
|
||||
@ -3933,12 +3870,36 @@ class GedcomParser(UpdateCallback):
|
||||
|
||||
###############################################################################
|
||||
|
||||
def map_ancestry_com(self, original_gid):
|
||||
"""
|
||||
GEDCOM files created by Ancestry.com for some reason do not include
|
||||
the FAMC and FAMS mappings in the INDI record. If we don't fix this,
|
||||
we end up with a bunch of broken family connections. The family
|
||||
references the people, but the people do not reference the family.
|
||||
|
||||
To resolve this, we use the mappings acquired from the first pass
|
||||
of the parsing. The StageOne parser will grab the mappings from the
|
||||
family to the child on the first pass, and we can use them here.
|
||||
|
||||
We have to make sure we use the original person ID, since the StageOne
|
||||
parser does not remap colliding IDs.
|
||||
"""
|
||||
for fams_id in self.fams_map.get(original_gid,[]):
|
||||
mapped_id = self.fid_map[fams_id]
|
||||
fams_handle = self.find_family_handle(mapped_id)
|
||||
self.person.add_family_handle(fams_handle)
|
||||
|
||||
for famc_id in self.famc_map.get(original_gid,[]):
|
||||
mapped_id = self.fid_map[famc_id]
|
||||
famc_handle = self.find_family_handle(mapped_id)
|
||||
self.person.add_parent_family_handle(famc_handle)
|
||||
|
||||
def parse_note(self, line, obj, level):
|
||||
# reference to a named note defined elsewhere
|
||||
if line.token == TOKEN_RNOTE:
|
||||
obj.add_note(line.data.strip())
|
||||
else:
|
||||
new_note = Note(line.data)
|
||||
new_note = RelLib.Note(line.data)
|
||||
self.dbase.commit_note(new_note,self.trans)
|
||||
obj.add_note(new_note.handle)
|
||||
self.skip_subordinate_levels(level+1)
|
||||
@ -4000,6 +3961,8 @@ class GedcomParser(UpdateCallback):
|
||||
self.lexer.set_broken_conc(self.gedsource.get_conc())
|
||||
if line.data == "FTW":
|
||||
self.is_ftw = True
|
||||
if line.data == "Ancestry.com Family Trees":
|
||||
self.is_ancestry_com = True
|
||||
genby = line.data
|
||||
elif line.token == TOKEN_NAME:
|
||||
pass
|
||||
|
@ -26,7 +26,7 @@ import os
|
||||
import gtk
|
||||
|
||||
import Errors
|
||||
from _GedcomParse import GedcomParser, NoteParser, StageOne
|
||||
from _GedcomParse import GedcomParser, StageOne
|
||||
from QuestionDialog import ErrorDialog
|
||||
from bsddb import db
|
||||
|
||||
@ -71,20 +71,20 @@ def importData(database, filename, callback=None, use_trans=False):
|
||||
|
||||
def import2(database, filename, callback, code_set, use_trans):
|
||||
# add some checking here
|
||||
import time
|
||||
t = time.time()
|
||||
try:
|
||||
ifile = open(filename,"rU")
|
||||
np = StageOne(ifile)
|
||||
np.parse()
|
||||
print np.get_encoding()
|
||||
ifile.seek(0)
|
||||
np = NoteParser(ifile, False, code_set)
|
||||
ifile.seek(0)
|
||||
gedparse = GedcomParser(database, ifile, filename, callback, code_set,
|
||||
np.get_map(), np.get_lines(),np.get_persons())
|
||||
gedparse = GedcomParser(database, ifile, filename, callback, np)
|
||||
except IOError, msg:
|
||||
ErrorDialog(_("%s could not be opened\n") % filename, str(msg))
|
||||
return
|
||||
|
||||
|
||||
if database.get_number_of_people() == 0:
|
||||
use_trans = False
|
||||
|
||||
@ -107,6 +107,7 @@ def import2(database, filename, callback, code_set, use_trans):
|
||||
except Errors.GedcomError, msg:
|
||||
ErrorDialog(_('Error reading GEDCOM file'), str(msg))
|
||||
return
|
||||
print time.time()-t
|
||||
|
||||
def import_from_string(database, text, callback, code_set, use_trans):
|
||||
# add some checking here
|
||||
|
@ -88,7 +88,6 @@
|
||||
|
||||
<child>
|
||||
<widget class="GtkTable" id="table15">
|
||||
<property name="border_width">12</property>
|
||||
<property name="visible">True</property>
|
||||
<property name="n_rows">6</property>
|
||||
<property name="n_columns">8</property>
|
||||
|
Loading…
x
Reference in New Issue
Block a user