From 45d29dc2a6a3ebfc121e3228a2fffdbe94597030 Mon Sep 17 00:00:00 2001
From: Don Allingham <don@gramps-project.org>
Date: Thu, 8 Feb 2007 06:09:46 +0000
Subject: [PATCH] 2007-02-07  Don Allingham  <don@gramps-project.org> 	*
 src/GrampsDbUtils/_GedcomLex.py: start of a lexical analyzer 	*
 src/GrampsDbUtils/_GedTokens.py: additional tokens

svn: r8076
---
 ChangeLog                       |   4 +
 src/GrampsDbUtils/_GedTokens.py |   2 +
 src/GrampsDbUtils/_GedcomLex.py | 326 ++++++++++++++++++++++++++++++++
 3 files changed, 332 insertions(+)
 create mode 100644 src/GrampsDbUtils/_GedcomLex.py

diff --git a/ChangeLog b/ChangeLog
index 9bcb1a4ef..8ccab9c25 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2007-02-07  Don Allingham  <don@gramps-project.org>
+	* src/GrampsDbUtils/_GedcomLex.py: start of a lexical analyzer
+	* src/GrampsDbUtils/_GedTokens.py: additional tokens
+
 2007-02-07  Zsolt Foldvari  <zfoldvar@users.sourceforge.net>
 	* src/GrampsDbUtils/_WriteGedcom.py (export_data): more complete SUBM
 	record on export.
diff --git a/src/GrampsDbUtils/_GedTokens.py b/src/GrampsDbUtils/_GedTokens.py
index fedef9858..a4e6ae0f1 100644
--- a/src/GrampsDbUtils/_GedTokens.py
+++ b/src/GrampsDbUtils/_GedTokens.py
@@ -131,6 +131,8 @@ TOKEN_CONL = 111
 TOKEN_AGE  = 112
 TOKEN_AGNC = 113
 TOKEN_RESN = 114
+TOKEN_ID = 115
+TOKEN_GEVENT = 116
 
 tokens = {
     "HEAD"         : TOKEN_HEAD,    "MEDI"         : TOKEN_MEDI,
diff --git a/src/GrampsDbUtils/_GedcomLex.py b/src/GrampsDbUtils/_GedcomLex.py
new file mode 100644
index 000000000..3b7e7b6f0
--- /dev/null
+++ b/src/GrampsDbUtils/_GedcomLex.py
@@ -0,0 +1,326 @@
+#
+# Gramps - a GTK+/GNOME based genealogy program
+#
+# Copyright (C) 2000-2006  Donald N. Allingham
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+# $Id: _ReadGedcom.py 8032 2007-02-03 17:11:05Z hippy $
+
+"Import from GEDCOM"
+
+#-------------------------------------------------------------------------
+#
+# standard python modules
+#
+#-------------------------------------------------------------------------
+import re
+import string
+from gettext import gettext as _
+
+#-------------------------------------------------------------------------
+#
+# GRAMPS modules
+#
+#-------------------------------------------------------------------------
+from ansel_utf8 import ansel_to_utf8
+
+from _GedcomInfo import *
+from _GedTokens import *
+
+#-------------------------------------------------------------------------
+#
+# latin/utf8 conversions
+#
+#-------------------------------------------------------------------------
+
+def _empty_func(a,b):
+    return
+
+def utf8_to_latin(s):
+    return s.encode('iso-8859-1','replace')
+
+def latin_to_utf8(s):
+    if type(s) == unicode:
+        return s
+    else:
+        return unicode(s,'iso-8859-1')
+
+def nocnv(s):
+    return unicode(s)
+
+#-------------------------------------------------------------------------
+#
+# constants
+#
+#-------------------------------------------------------------------------
+ANSEL = 1
+UNICODE = 2
+UPDATE = 25
+
+_transtable = string.maketrans('','')
+_delc = _transtable[0:8] + _transtable[10:31]
+_transtable2 = _transtable[0:128] + ('?' * 128)
+
+ged2gramps = {}
+for _val in personalConstantEvents.keys():
+    _key = personalConstantEvents[_val]
+    if _key != "":
+        ged2gramps[_key] = _val
+
+for _val in familyConstantEvents.keys():
+    _key = familyConstantEvents[_val]
+    if _key != "":
+        ged2gramps[_key] = _val
+
+
+#-------------------------------------------------------------------------
+#
+# GedLine
+#
+#-------------------------------------------------------------------------
+
+intRE       = re.compile(r"\s*(\d+)\s*$")
+nameRegexp  = re.compile(r"/?([^/]*)(/([^/]*)(/([^/]*))?)?")
+snameRegexp = re.compile(r"/([^/]*)/([^/]*)")
+modRegexp   = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$")
+calRegexp   = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$")
+rangeRegexp = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$")
+spanRegexp  = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$")
+intRegexp   = re.compile(r"\s*INT\s+([^(]+)\((.*)\)$")
+
+_calendar_map = {
+    "FRENCH R" : RelLib.Date.CAL_FRENCH,
+    "JULIAN"   : RelLib.Date.CAL_JULIAN,
+    "HEBREW"   : RelLib.Date.CAL_HEBREW,
+}
+
+_quality_map = {
+    'CAL' : RelLib.Date.QUAL_CALCULATED,
+    'INT' : RelLib.Date.QUAL_CALCULATED,
+    'EST' : RelLib.Date.QUAL_ESTIMATED,
+}
+
+_sex_map = {
+    'F' : RelLib.Person.FEMALE,
+    'M' : RelLib.Person.MALE,
+}
+
+class GedLine:
+
+    def __init__(self, data):
+        self.line = data[4]
+        self.level = data[0]
+        self.token = data[1]
+        self.token_text = data[3]
+        self.data = data[2]
+
+        if self.level == 0:
+            if self.token_text and self.token_text[0] == '@' and self.token_text[-1] == '@':
+                self.token = TOKEN_ID
+                self.token_text = self.token_text[1:-1]
+        else:
+            f = MAP_DATA.get(self.token)
+            if f:
+                f(self)
+
+    def do_nothing(self):
+        pass
+
+    def calc_sex(self):
+        self.data = _sex_map.get(self.data,RelLib.Person.UNKNOWN)
+
+    def calc_date(self):
+        self.data = extract_date(self.data)
+
+    def calc_unknown(self):
+        token = ged2gramps.get(self.token_text)
+        if token:
+            self.token = TOKEN_GEVENT
+            self.data = token
+        
+    def __repr__(self):
+        return "%d: %d (%d:%s) %s" % (self.line, self.level, self.token, 
+                                      self.token_text, self.data)
+
+MAP_DATA = {
+    TOKEN_UNKNOWN : GedLine.calc_unknown,
+    TOKEN_DATE    : GedLine.calc_date,
+    TOKEN_SEX     : GedLine.calc_sex,
+    }
+
+
+
+
+#-------------------------------------------------------------------------
+#
+# extract_date
+#
+#-------------------------------------------------------------------------
+import RelLib
+import DateHandler
+
+def extract_date(text):
+    dateobj = RelLib.Date()
+    try:
+        # extract out the MOD line
+        match = modRegexp.match(text)
+        if match:
+            (mod, text) = match.groups()
+            qual = _quality_map.get(mod, RelLib.Date.QUAL_NONE)
+        else:
+            qual = RelLib.Date.QUAL_NONE
+
+        # parse the range if we match, if so, return
+        match = rangeRegexp.match(text)
+        if match:
+            (cal1,data1,cal2,data2) = match.groups()
+
+            cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN)
+                    
+            start = DateHandler.parser.parse(data1)
+            stop =  DateHandler.parser.parse(data2)
+            dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_RANGE, cal,
+                        start.get_start_date() + stop.get_start_date())
+            dateobj.set_quality(qual)
+            return dateobj
+
+        # parse a span if we match
+        match = spanRegexp.match(text)
+        if match:
+            (cal1,data1,cal2,data2) = match.groups()
+
+            cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN)
+                    
+            start = DateHandler.parser.parse(data1)
+            stop =  DateHandler.parser.parse(data2)
+            dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_SPAN, cal,
+                        start.get_start_date() + stop.get_start_date())
+            dateobj.set_quality(qual)
+            return dateobj
+        
+        match = calRegexp.match(text)
+        if match:
+            (abt,cal,data) = match.groups()
+            dateobj = self.dp.parse("%s %s" % (abt, data))
+            dateobj.set_calendar(_calendar_map.get(cal, RelLib.Date.CAL_GREGORIAN))
+            dateobj.set_quality(qual)
+            return dateobj
+        else:
+            dval = DateHandler.parser.parse(text)
+            dval.set_quality(qual)
+            return dval
+    except IOError:
+        return self.dp.set_text(text)
+
+#-------------------------------------------------------------------------
+#
+# Reader - serves as the lexical analysis engine
+#
+#-------------------------------------------------------------------------
+class Reader:
+
+    def __init__(self,name):
+        self.f = open(name,'rU')
+        self.current_list = []
+        self.eof = False
+        self.cnv = None
+        self.broken_conc = False
+        self.cnt = 0
+        self.index = 0
+
+    def set_charset_fn(self,cnv):
+        self.cnv = cnv
+
+    def set_broken_conc(self,broken):
+        self.broken_conc = broken
+
+    def readline(self):
+        if len(self.current_list) <= 1 and not self.eof:
+            self.readahead()
+        try:
+            return GedLine(self.current_list.pop())
+        except:
+            return None
+
+    def readahead(self):
+        while len(self.current_list) < 5:
+            old_line = self.f.readline()
+            self.index += 1
+            line = old_line.strip('\r\n')
+            if not line:
+                if line != old_line:
+                    continue
+                else:
+                    self.f.close()
+                    self.eof = True
+                    break
+            if self.cnv:
+                try:
+                    line = self.cnv(line)
+                except:
+                    line = self.cnv(line.translate(_transtable2))
+
+            line = line.split(None,2) + ['']
+
+            try:
+                val = line[2].translate(_transtable,_delc)
+            except IndexError:
+                msg = _("Invalid GEDCOM syntax at line %d was ignored.") % self.index
+                continue
+                
+            try:
+                level = int(line[0])
+            except:
+                level = 0
+
+            data = (level,tokens.get(line[1],TOKEN_UNKNOWN), val,
+                    line[1], self.index)
+
+            if data[1] == TOKEN_CONT:
+                l = self.current_list[0]
+                self.current_list[0] = (l[0],l[1],l[2]+'\n'+data[2],l[3],l[4])
+            elif data[1] == TOKEN_CONC:
+                l = self.current_list[0]
+                if self.broken_conc:
+                    new_value = u"%s %s" % (l[2],data[2])
+                else:
+                    new_value = l[2] + data[2]
+                self.current_list[0] = (l[0],l[1],new_value,l[3],l[4])
+            else:
+                self.current_list.insert(0,data)
+
+if __name__ == "__main__":
+    import sys
+
+    def run():
+        print "Reading", sys.argv[1]
+        a = Reader(sys.argv[1])
+        while True:
+            line = a.readline()
+            #print line
+            if not line: break
+
+    import Utils
+    Utils.profile(run)
+
+    print extract_date("20 JAN 2000")
+    print extract_date("EST 20 JAN 2000")
+    print extract_date("CAL 20 JAN 2000")
+    print extract_date("ABT 20 JAN 2000")
+    print extract_date("INT 20 JAN 2000")
+    print extract_date("BET 20 JAN 2000 AND FEB 2000")
+    print extract_date("FROM 20 JAN 2000 TO FEB 2000")