From 32e6699bffab302b67da9fd6afa8fde64d83327a Mon Sep 17 00:00:00 2001 From: Don Allingham Date: Fri, 9 Mar 2007 06:58:45 +0000 Subject: [PATCH] 2007-03-08 Don Allingham * src/GrampsDbUtils/_GedcomParse.py * src/GrampsDbUtils/_GedcomChar.py * src/GrampsDbUtils/_GedcomLex.py svn: r8286 --- ChangeLog | 5 +++ src/GrampsDbUtils/_GedcomChar.py | 16 +++++--- src/GrampsDbUtils/_GedcomLex.py | 20 +++++----- src/GrampsDbUtils/_GedcomParse.py | 64 ++++++++++++++++++------------- 4 files changed, 62 insertions(+), 43 deletions(-) diff --git a/ChangeLog b/ChangeLog index f8dbc94e4..3282877d5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2007-03-08 Don Allingham + * src/GrampsDbUtils/_GedcomParse.py + * src/GrampsDbUtils/_GedcomChar.py + * src/GrampsDbUtils/_GedcomLex.py + 2007-03-06 Brian Matherly * src/plugins/AncestorChart.py: Deleted - just an old version of AncestorChart2.py diff --git a/src/GrampsDbUtils/_GedcomChar.py b/src/GrampsDbUtils/_GedcomChar.py index 086187f21..a30682aa5 100644 --- a/src/GrampsDbUtils/_GedcomChar.py +++ b/src/GrampsDbUtils/_GedcomChar.py @@ -19,6 +19,7 @@ # from ansel_utf8 import ansel_to_utf8 +import codecs class BaseReader: def __init__(self, ifile, encoding): @@ -53,13 +54,16 @@ class UTF8Reader(BaseReader): class UTF16Reader(BaseReader): def __init__(self, ifile): - BaseReader.__init__(self, ifile, 'utf16') + new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16') + BaseReader.__init__(self, new_file, 'utf16') + self.reset() - def reset(self): - self.ifile.seek(0) - data = self.ifile.read(2) - if data != "\xff\xfe": - self.ifile.seek(0) + def readline(self): + l = self.ifile.readline() + if l.strip(): + return l + else: + return self.ifile.readline() class AnsiReader(BaseReader): diff --git a/src/GrampsDbUtils/_GedcomLex.py b/src/GrampsDbUtils/_GedcomLex.py index e2aa210bc..8a3d365f6 100644 --- a/src/GrampsDbUtils/_GedcomLex.py +++ b/src/GrampsDbUtils/_GedcomLex.py @@ -18,8 +18,6 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: _ReadGedcom.py 8032 2007-02-03 17:11:05Z hippy $ - "Import from GEDCOM" __revision__ = "$Revision: $" @@ -131,9 +129,9 @@ class GedLine: Line Number, Level, Token Value, Token Text, and Data - Data is dependent on the context the Token Value. For most of tokens, this is - just a text string. However, for certain tokens where we know the context, we - can provide some value. The current parsed tokens are: + Data is dependent on the context the Token Value. For most of tokens, + this is just a text string. However, for certain tokens where we know + the context, we can provide some value. The current parsed tokens are: TOKEN_DATE - RelLib.Date TOKEN_SEX - RelLib.Person gender item @@ -142,13 +140,13 @@ class GedLine: def __init__(self, data): """ - If the level is 0, then this is a top level instance. In this case, we may - find items in the form of: + If the level is 0, then this is a top level instance. In this case, + we may find items in the form of: @ID@ - If this is not the top level, we check the MAP_DATA array to see if there is - a conversion function for the data. + If this is not the top level, we check the MAP_DATA array to see if + there is a conversion function for the data. """ self.line = data[4] self.level = data[0] @@ -369,8 +367,8 @@ class Reader: except: continue - data = (level, tokens.get(line[1], TOKEN_UNKNOWN), line[2], line[1], - self.index) + token = tokens.get(line[1], TOKEN_UNKNOWN) + data = (level, token, line[2], line[1], self.index) func = self.func_map.get(data[1]) if func: diff --git a/src/GrampsDbUtils/_GedcomParse.py b/src/GrampsDbUtils/_GedcomParse.py index 264c9f0e0..06244d653 100644 --- a/src/GrampsDbUtils/_GedcomParse.py +++ b/src/GrampsDbUtils/_GedcomParse.py @@ -93,6 +93,7 @@ import os import sys import re import time +import codecs from gettext import gettext as _ #------------------------------------------------------------------------ @@ -227,7 +228,6 @@ for _val in familyConstantEvents.keys(): # regular expressions # #------------------------------------------------------------------------- -INT_RE = re.compile(r"\s*(\d+)\s*$") NOTE_RE = re.compile(r"\s*\d+\s+\@(\S+)\@\s+NOTE(.*)$") CONT_RE = re.compile(r"\s*\d+\s+CONT\s?(.*)$") CONC_RE = re.compile(r"\s*\d+\s+CONC\s?(.*)$") @@ -245,15 +245,24 @@ class StageOne: def parse(self): current = "" - - line = self.ifile.read(3) + + line = self.ifile.read(2) if line == "\xef\xbb": self.ifile.read(1) self.enc = "UTF8" + self.reader = self.ifile + elif line == "\xff\xfe": + self.enc = "UTF16" + self.ifile.seek(0) + self.reader = codecs.EncodedFile(self.ifile, 'utf8', 'utf16') else: self.ifile.seek(0) + self.reader = self.ifile - for line in self.ifile: + for line in self.reader: + line = line.strip() + if not line: + continue self.lcnt += 1 data = line.split(None, 2) + [''] @@ -267,6 +276,7 @@ class StageOne: key = key.strip() except: LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt) + continue if level == 0 and key[0] == '@': if value == ("FAM", "FAMILY") : @@ -908,8 +918,8 @@ class GedcomParser(UpdateCallback): def __find_from_handle(self, gramps_id, table): """ Finds a handle corresponding the the specified GRAMPS ID. The passed - table contains the mapping. If the value is found, we return it, otherwise - we create a new handle, store it, and return it. + table contains the mapping. If the value is found, we return it, + otherwise we create a new handle, store it, and return it. """ intid = table.get(gramps_id) if not intid: @@ -1011,8 +1021,8 @@ class GedcomParser(UpdateCallback): already used (is in the db), we return the item in the db. Otherwise, we create a new repository, assign the handle and GRAMPS ID. - Some GEDCOM "flavors" destroy the specification, and declare the repository - inline instead of in a object. + Some GEDCOM "flavors" destroy the specification, and declare the + repository inline instead of in a object. """ repository = RelLib.Repository() if not gramps_id: @@ -1038,8 +1048,8 @@ class GedcomParser(UpdateCallback): already used (is in the db), we return the item in the db. Otherwise, we create a new repository, assign the handle and GRAMPS ID. - Some GEDCOM "flavors" destroy the specification, and declare the repository - inline instead of in a object. + Some GEDCOM "flavors" destroy the specification, and declare the + repository inline instead of in a object. """ note = RelLib.Note() if not gramps_id: @@ -1262,9 +1272,9 @@ class GedcomParser(UpdateCallback): def __parse_level(self, state, __map, default): """ - Loops trough the current GEDCOM level level, calling the appropriate functions - associated with the TOKEN. If no matching function for the token is found, the - default function is called instead. + Loops trough the current GEDCOM level level, calling the appropriate + functions associated with the TOKEN. If no matching function for the + token is found, the default function is called instead. """ while True: line = self.__get_next_line() @@ -1322,7 +1332,8 @@ class GedcomParser(UpdateCallback): """ # find the person - self.person = self.__find_or_create_person(self.pid_map[line.token_text]) + real_id = self.pid_map[line.token_text] + self.person = self.__find_or_create_person(real_id) # set up the state for the parsing state = GedcomUtils.CurrentState(person=self.person, level=1) @@ -1377,7 +1388,7 @@ class GedcomParser(UpdateCallback): @type state: CurrentState """ event_ref = self.__build_event_pair(state, RelLib.EventType.CUSTOM, - self.event_parse_tbl, line.data) + self.event_parse_tbl, line.data) state.person.add_event_ref(event_ref) def __skip_record(self, line, state): @@ -1597,8 +1608,8 @@ class GedcomParser(UpdateCallback): def __person_birt(self, line, state): """ Parses GEDCOM BIRT tag into a GRAMPS birth event. Additional work - must be done, since additional handling must be done by GRAMPS to set this up - as a birth reference event. + must be done, since additional handling must be done by GRAMPS to set + this up as a birth reference event. n BIRT [Y|] {1:1} +1 <> {0:1} p.* @@ -1642,8 +1653,8 @@ class GedcomParser(UpdateCallback): def __person_deat(self, line, state): """ Parses GEDCOM DEAT tag into a GRAMPS birth event. Additional work - must be done, since additional handling must be done by GRAMPS to set this up - as a death reference event. + must be done, since additional handling must be done by GRAMPS to set + this up as a death reference event. n DEAT [Y|] {1:1} +1 <> {0:1} p.* @@ -2147,8 +2158,8 @@ class GedcomParser(UpdateCallback): def __lds_form(self, line, state): """ - Parses the FORM tag thate defines the place structure for a place. This - tag, if found, will override any global place structure. + Parses the FORM tag thate defines the place structure for a place. + This tag, if found, will override any global place structure. @param line: The current line in GedLine format @type line: GedLine @@ -2384,8 +2395,8 @@ class GedcomParser(UpdateCallback): def __person_asso_type(self, line, state): """ Parses the INDI.ASSO.TYPE tag. GRAMPS only supports the ASSO tag when - the tag represents an INDI. So if the data is not INDI, we set the ignore - flag, so that we ignore the record. + the tag represents an INDI. So if the data is not INDI, we set the + ignore flag, so that we ignore the record. @param line: The current line in GedLine format @type line: GedLine @@ -2942,8 +2953,8 @@ class GedcomParser(UpdateCallback): def __event_place(self, line, state): """ Parse the place portion of a event. A special case has to be made for - Family Tree Maker, which violates the GEDCOM spec. It uses the PLAC field - to store the description or value assocated with the event. + Family Tree Maker, which violates the GEDCOM spec. It uses the PLAC + field to store the description or value assocated with the event. n PLAC {1:1} +1 FORM {0:1} @@ -3644,7 +3655,8 @@ class GedcomParser(UpdateCallback): def __repo_ref_medi(self, line, state): name = line.data - mtype = MEDIA_MAP.get(name.lower(), (RelLib.SourceMediaType.CUSTOM, name)) + mtype = MEDIA_MAP.get(name.lower(), + (RelLib.SourceMediaType.CUSTOM, name)) state.repo_ref.set_media_type(mtype) def __repo_ref_note(self, line, state):