2007-02-24  Don Allingham <don@gramps-project.org>

* src/DisplayTabs/_NoteModel.py: added
* src/DisplayTabs/_NoteTab.py: support new list
* src/GrampsDbUtils/_GedcomParse.py: enhancements to parsing
* src/GrampsDbUtils/_ReadGedcom.py: handle encoding properly
* src/GrampsDbUtils/_GedcomChar.py: new encoding interface
* src/GrampsDbUtils/_GedcomLex.py: cleanup

svn: r8231
src/GrampsDbUtils/_GedcomChar.py  (new file, 76 lines)
@@ -0,0 +1,76 @@
#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2000-2005 Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

from ansel_utf8 import ansel_to_utf8

class BaseReader:
    def __init__(self, ifile, encoding):
        self.ifile = ifile
        self.enc = encoding

    def reset(self):
        self.ifile.seek(0)

    def readline(self):
        return unicode(self.ifile.readline(),
                       encoding=self.enc,
                       errors='replace').strip('\n\r')

class UTF8Reader(BaseReader):

    def __init__(self, ifile):
        BaseReader.__init__(self, ifile, 'utf8')

    def reset(self):
        self.ifile.seek(0)
        data = self.ifile.read(3)
        if data != "\xef\xbb\xbf":
            self.ifile.seek(0)

    def readline(self):
        return unicode(self.ifile.readline(),
                       encoding=self.enc,
                       errors='replace').strip('\n\r')

class UTF16Reader(BaseReader):

    def __init__(self, ifile):
        BaseReader.__init__(self, ifile, 'utf16')

    def reset(self):
        self.ifile.seek(0)
        data = self.ifile.read(2)
        if data != "\xff\xfe":
            self.ifile.seek(0)

class AnsiReader(BaseReader):

    def __init__(self, ifile):
        BaseReader.__init__(self, ifile, 'latin1')

class AnselReader(BaseReader):

    def __init__(self, ifile):
        BaseReader.__init__(self, ifile, "")

    def readline(self):
        return ansel_to_utf8(self.ifile.readline().strip('\n\r'))
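A minimal usage sketch of the new reader interface (not part of the commit): each reader wraps an already-open file object and hands back decoded unicode lines, with reset() skipping a byte-order mark where one applies. The file name below is hypothetical, and the snippet is Python 2 like the rest of the module.

# Sketch only: exercising the new _GedcomChar readers by hand.
ifile = open("example.ged", "rU")      # hypothetical GEDCOM file
reader = UTF8Reader(ifile)
reader.reset()                         # skips a leading UTF-8 BOM if present
while True:
    line = reader.readline()           # unicode, trailing newline stripped
    if not line:
        break
    print line
ifile.close()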
@@ -22,21 +22,22 @@

"Import from GEDCOM"

__revision__ = "$Revision: $"
__author__ = "Don Allingham"

#-------------------------------------------------------------------------
#
# standard python modules
#
#-------------------------------------------------------------------------

import re
import string
from gettext import gettext as _

#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
from ansel_utf8 import ansel_to_utf8

from _GedcomInfo import *
from _GedcomTokens import *
@@ -45,60 +46,25 @@ from DateHandler._DateParser import DateParser

#-------------------------------------------------------------------------
#
# latin/utf8 conversions
#
# constants #
#-------------------------------------------------------------------------

def utf8_to_latin(msg):
    """
    Converts a string from unicode to iso-8859-1. If any illegal characters
    are found, they are converted to ?

    @param msg: unicode string to convert
    @type level: unicode
    @return: Returns the string, converted to a ISO-8859-1 object
    @rtype: str
    """
    return msg.encode('iso-8859-1', 'replace')

def latin_to_utf8(s):
    if type(s) == unicode:
        return s
    else:
        return unicode(s,'iso-8859-1')

def nocnv(s):
    return unicode(s,errors='replace')

#-------------------------------------------------------------------------
#
# constants
#
#-------------------------------------------------------------------------
ANSEL = 1
UNICODE = 2
UPDATE = 25

_transtable = string.maketrans('','')
_delc = _transtable[0:8] + _transtable[10:31]
_transtable2 = _transtable[0:128] + ('?' * 128)

ged2gramps = {}
GED2GRAMPS = {}
for _val in personalConstantEvents.keys():
    _key = personalConstantEvents[_val]
    if _key != "":
        ged2gramps[_key] = _val
        GED2GRAMPS[_key] = _val

for _val in familyConstantEvents.keys():
    _key = familyConstantEvents[_val]
    if _key != "":
        ged2gramps[_key] = _val
        GED2GRAMPS[_key] = _val

ged2attr = {}
GED2ATTR = {}
for _val in personalConstantAttributes.keys():
    _key = personalConstantAttributes[_val]
    if _key != "":
        ged2attr[_key] = _val
        GED2ATTR[_key] = _val

#-------------------------------------------------------------------------
#
@@ -106,26 +72,24 @@ for _val in personalConstantAttributes.keys():
#
#-------------------------------------------------------------------------

intRE = re.compile(r"\s*(\d+)\s*$")
modRegexp = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$")
calRegexp = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$")
rangeRegexp = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$")
spanRegexp = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$")
intRegexp = re.compile(r"\s*INT\s+([^(]+)\((.*)\)$")
MOD = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$")
CAL = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$")
RANGE = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$")
SPAN = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$")

_calendar_map = {
CALENDAR_MAP = {
    "FRENCH R" : RelLib.Date.CAL_FRENCH,
    "JULIAN" : RelLib.Date.CAL_JULIAN,
    "HEBREW" : RelLib.Date.CAL_HEBREW,
    }

_quality_map = {
QUALITY_MAP = {
    'CAL' : RelLib.Date.QUAL_CALCULATED,
    'INT' : RelLib.Date.QUAL_CALCULATED,
    'EST' : RelLib.Date.QUAL_ESTIMATED,
    }

_sex_map = {
SEX_MAP = {
    'F' : RelLib.Person.FEMALE,
    'M' : RelLib.Person.MALE,
    }
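A short sketch (not part of the commit) of what the renamed date expressions pull out of a GEDCOM date value; the pattern is copied from RANGE above and the sample string is made up.

# Sketch only: RANGE splits a calendar-escaped "BET ... AND ..." value
# into (calendar1, date1, calendar2, date2).
import re

RANGE = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$")
match = RANGE.match("BET @#DJULIAN@ 1 JAN 1750 AND @#DJULIAN@ 20 FEB 1750")
if match:
    print match.groups()   # ('JULIAN', '1 JAN 1750', 'JULIAN', '20 FEB 1750')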
@@ -185,20 +149,21 @@ class GedLine:
        self.data = data[2]

        if self.level == 0:
            if self.token_text and self.token_text[0] == '@' and self.token_text[-1] == '@':
            if self.token_text and self.token_text[0] == '@' \
                    and self.token_text[-1] == '@':
                self.token = TOKEN_ID
                self.token_text = self.token_text[1:-1]
                self.data = self.data.strip()
        else:
            f = MAP_DATA.get(self.token)
            if f:
                f(self)
            func = MAP_DATA.get(self.token)
            if func:
                func(self)

    def calc_sex(self):
        """
        Converts the data field to a RelLib token indicating the gender
        """
        self.data = _sex_map.get(self.data.strip(),RelLib.Person.UNKNOWN)
        self.data = SEX_MAP.get(self.data.strip(), RelLib.Person.UNKNOWN)

    def calc_date(self):
        """
@@ -212,12 +177,12 @@ class GedLine:
        change the type from UNKNOWN to TOKEN_GEVENT (gedcom event), and
        the data is assigned to the associated GRAMPS EventType
        """
        token = ged2gramps.get(self.token_text)
        token = GED2GRAMPS.get(self.token_text)
        if token:
            self.token = TOKEN_GEVENT
            self.data = token
        else:
            token = ged2attr.get(self.token_text)
            token = GED2ATTR.get(self.token_text)
            if token:
                attr = RelLib.Attribute()
                attr.set_value(self.data)
@@ -226,10 +191,10 @@ class GedLine:
                self.data = attr

    def calc_note(self):
        d = self.data.strip()
        if len(d) > 2 and d[0] == '@' and d[-1] == '@':
        gid = self.data.strip()
        if len(gid) > 2 and gid[0] == '@' and gid[-1] == '@':
            self.token = TOKEN_RNOTE
            self.data = d[1:-1]
            self.data = gid[1:-1]

    def calc_nchi(self):
        attr = RelLib.Attribute()
@@ -245,10 +210,6 @@ class GedLine:
        self.data = attr
        self.token = TOKEN_ATTR

    def calc_lds(self):
        self.data = _
        self.token = TOKEN_ATTR

    def __repr__(self):
        return "%d: %d (%d:%s) %s" % (self.line, self.level, self.token,
                                      self.token_text, self.data)
@@ -276,7 +237,7 @@ MAP_DATA = {
#
#-------------------------------------------------------------------------

_dp = GedcomDateParser()
DATE_CNV = GedcomDateParser()

def extract_date(text):
    """
@@ -285,54 +246,55 @@ def extract_date(text):
    dateobj = RelLib.Date()
    try:
        # extract out the MOD line
        match = modRegexp.match(text)
        match = MOD.match(text)
        if match:
            (mod, text) = match.groups()
            qual = _quality_map.get(mod, RelLib.Date.QUAL_NONE)
            qual = QUALITY_MAP.get(mod, RelLib.Date.QUAL_NONE)
        else:
            qual = RelLib.Date.QUAL_NONE

        # parse the range if we match, if so, return
        match = rangeRegexp.match(text)
        match = RANGE.match(text)
        if match:
            (cal1,data1,cal2,data2) = match.groups()
            (cal1, data1, cal2, data2) = match.groups()

            cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN)
            cal = CALENDAR_MAP.get(cal1, RelLib.Date.CAL_GREGORIAN)

            start = _dp.parse(data1)
            stop = _dp.parse(data2)
            start = DATE_CNV.parse(data1)
            stop = DATE_CNV.parse(data2)
            dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_RANGE, cal,
                        start.get_start_date() + stop.get_start_date())
            dateobj.set_quality(qual)
            return dateobj

        # parse a span if we match
        match = spanRegexp.match(text)
        match = SPAN.match(text)
        if match:
            (cal1,data1,cal2,data2) = match.groups()
            (cal1, data1, cal2, data2) = match.groups()

            cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN)
            cal = CALENDAR_MAP.get(cal1, RelLib.Date.CAL_GREGORIAN)

            start = _dp.parse(data1)
            stop = _dp.parse(data2)
            start = DATE_CNV.parse(data1)
            stop = DATE_CNV.parse(data2)
            dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_SPAN, cal,
                        start.get_start_date() + stop.get_start_date())
            dateobj.set_quality(qual)
            return dateobj

        match = calRegexp.match(text)
        match = CAL.match(text)
        if match:
            (abt,cal,data) = match.groups()
            dateobj = _dp.parse("%s %s" % (abt, data))
            dateobj.set_calendar(_calendar_map.get(cal, RelLib.Date.CAL_GREGORIAN))
            (abt, cal, data) = match.groups()
            dateobj = DATE_CNV.parse("%s %s" % (abt, data))
            dateobj.set_calendar(CALENDAR_MAP.get(cal,
                                                  RelLib.Date.CAL_GREGORIAN))
            dateobj.set_quality(qual)
            return dateobj

        dateobj = _dp.parse(text)
        dateobj = DATE_CNV.parse(text)
        dateobj.set_quality(qual)
        return dateobj
    except IOError:
        return self.dp.set_text(text)
        return DATE_CNV.set_text(text)

#-------------------------------------------------------------------------
#
@@ -341,8 +303,8 @@ def extract_date(text):
#-------------------------------------------------------------------------
class Reader:

    def __init__(self, f):
        self.f = f
    def __init__(self, ifile):
        self.ifile = ifile
        self.current_list = []
        self.eof = False
        self.cnv = None
@@ -353,11 +315,7 @@ class Reader:
            TOKEN_CONC : self._fix_token_conc,
            }

    def set_charset_fn(self,cnv):
        print "Character set changed", cnv
        self.cnv = cnv

    def set_broken_conc(self,broken):
    def set_broken_conc(self, broken):
        self.func_map = {
            TOKEN_CONT : self._fix_token_cont,
            TOKEN_CONC : self._fix_token_broken_conc,
@@ -372,46 +330,39 @@ class Reader:
        return None

    def _fix_token_cont(self, data):
        l = self.current_list[0]
        new_value = l[2]+'\n'+data[2]
        self.current_list[0] = (l[0], l[1], new_value, l[3], l[4])
        line = self.current_list[0]
        new_value = line[2]+'\n'+data[2]
        self.current_list[0] = (line[0], line[1], new_value, line[3], line[4])

    def _fix_token_conc(self, data):
        l = self.current_list[0]
        new_value = l[2] + data[2]
        self.current_list[0] = (l[0], l[1], new_value, l[3], l[4])
        line = self.current_list[0]
        new_value = line[2] + data[2]
        self.current_list[0] = (line[0], line[1], new_value, line[3], line[4])

    def _fix_token_broken_conc(self, data):
        l = self.current_list[0]
        new_value = u"%s %s" % (l[2], data[2])
        self.current_list[0] = (l[0], l[1], new_value, l[3], l[4])
        line = self.current_list[0]
        new_value = u"%s %s" % (line[2], data[2])
        self.current_list[0] = (line[0], line[1], new_value, line[3], line[4])

    def readahead(self):
        while len(self.current_list) < 5:
            line = self.f.readline()
            line = self.ifile.readline()
            self.index += 1
            if not line:
                self.eof = True
                return

            if self.cnv:
                try:
                    line = self.cnv(line)
                except:
                    line = self.cnv(line.translate(_transtable2))
            else:
                line = unicode(line,errors='replace')
            line = line.split(None, 2) + ['']

            line = line.split(None,2) + ['']

            val = line[2].rstrip('\r\n')
            val = line[2]

            try:
                level = int(line[0])
            except:
                level = 0

            data = (level, tokens.get(line[1], TOKEN_UNKNOWN), val, line[1], self.index)
            data = (level, tokens.get(line[1], TOKEN_UNKNOWN), val, line[1],
                    self.index)

            func = self.func_map.get(data[1])
            if func:
@@ -419,25 +370,3 @@ class Reader:
        else:
            self.current_list.insert(0, data)

if __name__ == "__main__":
    import sys

    def run():
        print "Reading", sys.argv[1]
        a = Reader(sys.argv[1])
        while True:
            line = a.readline()
            print line
            if not line: break

#    import Utils
#    Utils.profile(run)
    run()

    print extract_date("20 JAN 2000")
    print extract_date("EST 20 JAN 2000")
    print extract_date("CAL 20 JAN 2000")
    print extract_date("ABT 20 JAN 2000")
    print extract_date("INT 20 JAN 2000")
    print extract_date("BET 20 JAN 2000 AND FEB 2000")
    print extract_date("FROM 20 JAN 2000 TO FEB 2000")
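For orientation (not part of the commit): the lexer queues five-element tuples of the form (level, token, value, token_text, line_number), and the _fix_token_* handlers above fold a CONT or CONC line into the value of the entry queued just before it. A made-up example:

# Sketch only: folding a CONC continuation into the previous entry,
# the same way _fix_token_conc does. Token values here are placeholders.
current_list = [(1, 'TOKEN_NOTE', u'He was born in a sma', 'NOTE', 42)]
conc = (2, 'TOKEN_CONC', u'll village', 'CONC', 43)

line = current_list[0]
new_value = line[2] + conc[2]
current_list[0] = (line[0], line[1], new_value, line[3], line[4])
print current_list[0][2]   # u'He was born in a small village'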
@@ -64,13 +64,11 @@ all tokens at the lower level.

For example:


1 BIRT
2 DATE 1 JAN 2000
2 UKNOWN TAG
3 NOTE DATA


The function parsing the individual at level 1, would encounter the BIRT tag.
It would look up the BIRT token in the table to see if a function as defined
for this TOKEN, and pass control to this function. This function would then
@@ -81,7 +79,6 @@ the level 2 parser, which would then encounter the "UKNOWN" tag. Since this is
not a valid token, it would not be in the table, and a function that would skip
all lines until the next level 2 token is found (in this case, skipping the
"3 NOTE DATA" line.

"""

__revision__ = "$Revision: $"
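A minimal sketch (not part of the commit) of the table-driven dispatch the docstring above describes: a known token is looked up in a map and handed to its handler, while an unknown token falls back to a handler that skips everything at deeper levels. All names here are made up for illustration; the real tables live in the parser below.

# Sketch only: table-driven token dispatch with a skip fallback.
def parse_birth(line):
    print "BIRT handler gets", line

def skip_unknown(line):
    print "unknown tag, skipping this level and everything below it"

LEVEL1_MAP = {
    'BIRT' : parse_birth,
}

def parse_level1(tag, line):
    func = LEVEL1_MAP.get(tag, skip_unknown)
    func(line)

parse_level1('BIRT', '1 BIRT')          # dispatches to parse_birth
parse_level1('UKNOWN', '2 UKNOWN TAG')  # falls back to the skip handler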
@@ -94,10 +91,8 @@ __author__ = "Don Allingham"
#-------------------------------------------------------------------------
import os
import re
import string
import time
from gettext import gettext as _
import copy

#------------------------------------------------------------------------
#
@@ -114,20 +109,19 @@ LOG = logging.getLogger(".GedcomImport")
#-------------------------------------------------------------------------
import Errors
import RelLib
from BasicUtils import NameDisplay
from BasicUtils import NameDisplay, UpdateCallback
import Utils
import Mime
import LdsUtils
from ansel_utf8 import ansel_to_utf8

from _GedcomInfo import *
from _GedcomTokens import *
from _GedcomLex import Reader
from _GedcomChar import *

import _GedcomUtils as GedcomUtils

from GrampsDb._GrampsDbConst import EVENT_KEY
from BasicUtils import UpdateCallback

try:
    import Config
@@ -145,53 +139,14 @@ ADDR_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)\s+(\d+)\s*(.*)')
ADDR2_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)\s+(\d+)')
ADDR3_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)')


TRUNC_MSG = _("Your GEDCOM file is corrupted. "
              "It appears to have been truncated.")

#-------------------------------------------------------------------------
#
# latin/utf8 conversions
#
#-------------------------------------------------------------------------


def latin_to_utf8(msg):
    """
    Converts a string from iso-8859-1 to unicode. If the string is already
    unicode, we do nothing.

    @param msg: string to convert
    @type level: str
    @return: Returns the string, converted to a unicode object
    @rtype: unicode
    """
    if type(msg) == unicode:
        return msg
    else:
        return unicode(msg, 'iso-8859-1')

def nocnv(msg):
    """
    Null operation that makes sure that a unicode string remains a unicode
    string

    @param msg: unicode to convert
    @type level: unicode
    @return: Returns the string, converted to a unicode object
    @rtype: unicode
    """
    return unicode(msg)

#-------------------------------------------------------------------------
#
# constants
#
#-------------------------------------------------------------------------
ANSEL = 1
UNICODE = 2
UPDATE = 25

TYPE_BIRTH = RelLib.ChildRefType()
TYPE_ADOPT = RelLib.ChildRefType(RelLib.ChildRefType.ADOPTED)
TYPE_FOSTER = RelLib.ChildRefType(RelLib.ChildRefType.FOSTER)
@@ -224,10 +179,6 @@ MIME_MAP = {
EVENT_FAMILY_STR = _("%(event_name)s of %(family)s")
EVENT_PERSON_STR = _("%(event_name)s of %(person)s")

TRANS_TABLE = string.maketrans('', '')
DEL_CHARS = TRANS_TABLE[0:8] + TRANS_TABLE[10:31]
TRANS_TABLE2 = TRANS_TABLE[0:128] + ('?' * 128)

FTW_BAD_PLACE = [
    RelLib.EventType.OCCUPATION,
    RelLib.EventType.RELIGION,
@@ -265,6 +216,7 @@ CONC_RE = re.compile(r"\s*\d+\s+CONC\s?(.*)$")
PERSON_RE = re.compile(r"\s*\d+\s+\@(\S+)\@\s+INDI(.*)$")

class StageOne:

    def __init__(self, ifile):
        self.ifile = ifile
        self.famc = {}
@@ -275,44 +227,47 @@ class StageOne:

    def parse(self):
        current = ""

        line = self.ifile.read(3)
        if line == "\xef\xbb":
            self.ifile.read(1)
            self.enc = "UTF8"
        else:
            self.ifile.seek(0)

        for line in self.ifile:
            self.lcnt +=1

            data = line.split(None,2) + ['']
            try:
                (level, key, value) = data[:3]
                value = value.strip()
                # convert the first value to an integer. We have to be a bit
                # careful here, since some GEDCOM files have garbage characters
                # at the front of the first file if they are unicode encoded.
                # So, if we have a failure to convert, check the last character
                # of the string, which shoul de a '0'
                try:
                    level = int(level)
                except:
                    level = int(level[-1])
                    level = 0
                key = key.strip()
            except:
                raise Errors.GedcomError("Corrupted file at line %d" % self.lcnt)

            if level == 0 and key[0] == '@':
                if value == "FAM":
                if value == ("FAM", "FAMILY") :
                    current = key.strip()
                    current = current[1:-1]
                elif value == "INDI":
                elif value == ("INDI", "INDIVIDUAL"):
                    self.pcnt += 1
            elif key in ("HUSB", "WIFE") and value and value[0] == '@':
            elif key in ("HUSB", "HUSBAND", "WIFE") and value and value[0] == '@':
                value = value[1:-1]
                if self.fams.has_key(value):
                    self.fams[value].append(current)
                else:
                    self.fams[value] = [current]
            elif key == "CHIL" and value and value[0] == '@':
            elif key in ("CHIL", "CHILD") and value and value[0] == '@':
                value = value[1:-1]
                if self.famc.has_key(value):
                    self.famc[value].append(current)
                else:
                    self.famc[value] = [current]
            elif key == 'CHAR':
            elif key == 'CHAR' and not self.enc:
                self.enc = value

    def get_famc_map(self):
@@ -322,7 +277,10 @@ class StageOne:
        return self.fams

    def get_encoding(self):
        return self.enc
        return self.enc.upper()

    def set_encoding(self, enc):
        self.enc = enc

    def get_person_count(self):
        return self.pcnt
@@ -806,16 +764,20 @@ class GedcomParser(UpdateCallback):
            data = cursor.next()
        cursor.close()

        self.lexer = Reader(ifile)
        enc = stage_one.get_encoding()

        if enc == "ANSEL":
            rdr = AnselReader(ifile)
        elif enc in ("UTF-8", "UTF8"):
            rdr = UTF8Reader(ifile)
        elif enc in ("UTF-16", "UTF16", "UNICODE"):
            rdr = UTF16Reader(ifile)
        else:
            rdr = AnsiReader(ifile)

        self.lexer = Reader(rdr)
        self.filename = filename
        self.backoff = False
        self.override = False
#
#        if self.override != 0:
#            if self.override == 1:
#                self.lexer.set_charset_fn(ansel_to_utf8)
#            elif self.override == 2:
#                self.lexer.set_charset_fn(latin_to_utf8)

        fullpath = os.path.normpath(os.path.abspath(filename))
        self.geddir = os.path.dirname(fullpath)
@@ -1064,9 +1026,6 @@ class GedcomParser(UpdateCallback):
        """
        text = self.groups.line
        msg = _("Line %d was not understood, so it was ignored.") % text
        import traceback
        traceback.print_stack()
        print self.groups
        self.warn(msg)
        self.error_count += 1
        self.skip_subordinate_levels(level)
@@ -4039,11 +3998,8 @@ class GedcomParser(UpdateCallback):
        if genby == "GRAMPS":
            self.gedsource = self.gedmap.get_from_source_tag(line.data)
            self.lexer.set_broken_conc(self.gedsource.get_conc())
        elif line.token == TOKEN_CHAR and not self.override:
            if line.data == "ANSEL":
                self.lexer.set_charset_fn(ansel_to_utf8)
            elif line.data not in ("UNICODE","UTF-8","UTF8"):
                self.lexer.set_charset_fn(latin_to_utf8)
        elif line.token == TOKEN_CHAR:
            pass
            self.skip_subordinate_levels(2)
        elif line.token == TOKEN_GEDC:
            self.skip_subordinate_levels(2)
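Pieced together from the hunks above and the _ReadGedcom.py changes below (not part of the commit itself), the new import flow is: a StageOne pass sniffs the declared encoding, then the matching reader from _GedcomChar wraps the file and feeds already-decoded lines to the lexer. A condensed sketch, with a hypothetical file name:

# Sketch only: the two-pass flow this commit moves to.
ifile = open("example.ged", "rU")
stage_one = StageOne(ifile)
stage_one.parse()                      # pass 1: BOM/CHAR detection, FAM/INDI maps

enc = stage_one.get_encoding()         # already upper-cased
if enc == "ANSEL":
    rdr = AnselReader(ifile)
elif enc in ("UTF-8", "UTF8"):
    rdr = UTF8Reader(ifile)
elif enc in ("UTF-16", "UTF16", "UNICODE"):
    rdr = UTF16Reader(ifile)
else:
    rdr = AnsiReader(ifile)

ifile.seek(0)
lexer = Reader(rdr)                    # pass 2: lexer sees decoded unicode lines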
@@ -66,6 +66,7 @@ def importData(database, filename, callback=None, use_trans=False):
        dialog.destroy()
    else:
        code_set = None

    import2(database, filename, callback, code_set, use_trans)

def import2(database, filename, callback, code_set, use_trans):
@@ -74,7 +75,10 @@ def import2(database, filename, callback, code_set, use_trans):
        ifile = open(filename,"rU")
        np = StageOne(ifile)
        np.parse()
        print np.get_encoding()

        if code_set:
            np.set_encoding(code_set)

        ifile.seek(0)
        gedparse = GedcomParser(database, ifile, filename, callback, np)
    except IOError, msg:
@@ -85,7 +89,6 @@ def import2(database, filename, callback, code_set, use_trans):
            _("%s could not be imported") % filename + "\n" + str(msg))
        return


    if database.get_number_of_people() == 0:
        use_trans = False