2007-02-24 Don Allingham <don@gramps-project.org>

* src/DisplayTabs/_NoteModel.py: added
	* src/DisplayTabs/_NoteTab.py: support new list
	* src/GrampsDbUtils/_GedcomParse.py: enhancements to parsing
	* src/GrampsDbUtils/_ReadGedcom.py: handle encoding properly
	* src/GrampsDbUtils/_GedcomChar.py: new encoding interface
	* src/GrampsDbUtils/_GedcomLex.py: cleanup



svn: r8231
This commit is contained in:
Don Allingham 2007-02-25 05:26:32 +00:00
parent a8ad1dcdcf
commit 706916af15
7 changed files with 268 additions and 359 deletions

View File

@ -1,3 +1,11 @@
2007-02-24 Don Allingham <don@gramps-project.org>
* src/DisplayTabs/_NoteModel.py: added
* src/DisplayTabs/_NoteTab.py: support new list
* src/GrampsDbUtils/_GedcomParse.py: enhancements to parsing
* src/GrampsDbUtils/_ReadGedcom.py: handle encoding properly
* src/GrampsDbUtils/_GedcomChar.py: new encoding interface
* src/GrampsDbUtils/_GedcomLex.py: cleanup
2007-02-24 Brian Matherly <brian@gramps-project.org>
* src/docgen/SvgDrawDoc.py.py: Fix XML error in draw_text.

View File

@ -0,0 +1,46 @@
#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2000-2006 Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id: _NoteModel.py 7068 2006-07-24 23:06:49Z rshura $
#-------------------------------------------------------------------------
#
# GTK libraries
#
#-------------------------------------------------------------------------
import gtk
#-------------------------------------------------------------------------
#
# NoteModel
#
#-------------------------------------------------------------------------
class NoteModel(gtk.ListStore):
def __init__(self, note_list, db):
gtk.ListStore.__init__(self, str, str, object)
self.db = db
for handle in note_list:
note = self.db.get_note_from_handle(handle)
self.append(row=[
str(note.get_type()),
note.get().replace('\n', ' ')[:80],
handle,
])

View File

@ -27,169 +27,60 @@
#-------------------------------------------------------------------------
from gettext import gettext as _
#-------------------------------------------------------------------------
#
# GTK libraries
#
#-------------------------------------------------------------------------
import gtk
import pango
#-------------------------------------------------------------------------
#
# GRAMPS classes
#
#-------------------------------------------------------------------------
import Spell
from _GrampsTab import GrampsTab
from DisplayTabs import log
from MarkupText import EditorBuffer
from _NoteModel import NoteModel
from _EmbeddedList import EmbeddedList
#-------------------------------------------------------------------------
#
# NoteTab
#
#-------------------------------------------------------------------------
class NoteTab(GrampsTab):
class NoteTab(EmbeddedList):
def __init__(self, dbstate, uistate, track, note_list, title=_('Note')):
self.note_list = note_list
self.original = note_list[:]
_HANDLE_COL = 2
GrampsTab.__init__(self, dbstate, uistate, track, title)
self.show_all()
_column_names = [
(_('Type'), 0, 100),
(_('Preview'), 1, 200),
]
def get_icon_name(self):
return 'gramps-notes'
def __init__(self, dbstate, uistate, track, data):
self.data = data
EmbeddedList.__init__(self, dbstate, uistate, track,
_("Notes"), NoteModel)
def _update_label(self, *obj):
cc = self.buf.get_char_count()
if cc == 0 and not self.empty:
self.empty = True
self._set_label()
elif cc != 0 and self.empty:
self.empty = False
self._set_label()
def get_editor(self):
pass
def is_empty(self):
"""
Indicates if the tab contains any data. This is used to determine
how the label should be displayed.
"""
return self.buf.get_char_count() == 0
def get_user_values(self):
return []
def build_interface(self):
BUTTON = [(_('Italic'),gtk.STOCK_ITALIC,'<i>i</i>','<Control>I'),
(_('Bold'),gtk.STOCK_BOLD,'<b>b</b>','<Control>B'),
(_('Underline'),gtk.STOCK_UNDERLINE,'<u>u</u>','<Control>U'),
#('Separator', None, None, None),
]
def get_data(self):
return self.data
vbox = gtk.VBox()
def column_order(self):
return ((1, 0), (1, 1))
self.text = gtk.TextView()
self.text.set_accepts_tab(True)
# Accelerator dictionary used for formatting shortcuts
# key: tuple(key, modifier)
# value: widget, to emit 'activate' signal on
self.accelerator = {}
self.text.connect('key-press-event', self._on_key_press_event)
def add_button_clicked(self, obj):
pass
self.flowed = gtk.RadioButton(None, _('Flowed'))
self.format = gtk.RadioButton(self.flowed, _('Formatted'))
# if self.note_obj and self.note_obj.get_format():
# self.format.set_active(True)
# self.text.set_wrap_mode(gtk.WRAP_NONE)
# else:
# self.flowed.set_active(True)
# self.text.set_wrap_mode(gtk.WRAP_WORD)
self.spellcheck = Spell.Spell(self.text)
self.flowed.connect('toggled', self.flow_changed)
scroll = gtk.ScrolledWindow()
scroll.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
scroll.add(self.text)
# FIXME: is this signal called at all
scroll.connect('focus-out-event', self.update)
vbox.pack_start(scroll, True)
vbox.set_spacing(6)
vbox.set_border_width(6)
hbox = gtk.HBox()
hbox.set_spacing(12)
hbox.set_border_width(6)
hbox.pack_start(self.flowed, False)
hbox.pack_start(self.format, False)
vbox.pack_start(hbox, False)
self.pack_start(vbox, True)
self.buf = EditorBuffer()
self.text.set_buffer(self.buf)
tooltips = gtk.Tooltips()
for tip, stock, markup, accel in BUTTON:
if markup:
button = gtk.ToggleButton()
image = gtk.Image()
image.set_from_stock(stock, gtk.ICON_SIZE_MENU)
button.set_image(image)
button.set_relief(gtk.RELIEF_NONE)
tooltips.set_tip(button, tip)
self.buf.setup_widget_from_xml(button, markup)
key, mod = gtk.accelerator_parse(accel)
self.accelerator[(key, mod)] = button
hbox.pack_start(button, False)
else:
hbox.pack_start(gtk.VSeparator(), False)
hbox.pack_start(gtk.Label(_('Additional Notes:')),False)
self.menu = gtk.ComboBox()
hbox.pack_start(self.menu, True)
# if self.note_obj:
# self.empty = False
# self.buf.set_text(self.note_obj.get(markup=True))
# log.debug("Text: %s" % self.buf.get_text())
# else:
# self.empty = True
self.buf.connect('changed', self.update)
self.buf.connect_after('apply-tag', self.update)
self.buf.connect_after('remove-tag', self.update)
def add_callback(self, name):
self.get_data().append(name)
self.changed = True
self.rebuild()
def _on_key_press_event(self, widget, event):
log.debug("Key %s (%d) was pressed on %s" %
(gtk.gdk.keyval_name(event.keyval), event.keyval, widget))
key = event.keyval
mod = event.state
if self.accelerator.has_key((key, mod)):
self.accelerator[(key, mod)].emit('activate')
return True
def edit_button_clicked(self, obj):
note = self.get_selected()
if note:
print note
def update(self, obj, *args):
# if self.note_obj:
# start = self.buf.get_start_iter()
# stop = self.buf.get_end_iter()
# text = self.buf.get_text(start, stop)
# self.note_obj.set(text)
# else:
# print "NOTE OBJ DOES NOT EXIST"
self._update_label(obj)
return False
def flow_changed(self, obj):
if obj.get_active():
self.text.set_wrap_mode(gtk.WRAP_WORD)
# self.note_obj.set_format(0)
else:
self.text.set_wrap_mode(gtk.WRAP_NONE)
# self.note_obj.set_format(1)
def rebuild(self):
self._set_label()
def cancel(self):
pass
# self.note_obj.unserialize(self.original)
def edit_callback(self, name):
self.changed = True
self.rebuild()

View File

@ -0,0 +1,76 @@
#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2000-2005 Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
from ansel_utf8 import ansel_to_utf8
class BaseReader:
def __init__(self, ifile, encoding):
self.ifile = ifile
self.enc = encoding
def reset(self):
self.ifile.seek(0)
def readline(self):
return unicode(self.ifile.readline(),
encoding=self.enc,
errors='replace').strip('\n\r')
class UTF8Reader(BaseReader):
def __init__(self, ifile):
BaseReader.__init__(self, ifile, 'utf8')
def reset(self):
self.ifile.seek(0)
data = self.ifile.read(3)
if data != "\xef\xbb\xbf":
self.ifile.seek(0)
def readline(self):
return unicode(self.ifile.readline(),
encoding=self.enc,
errors='replace').strip('\n\r')
class UTF16Reader(BaseReader):
def __init__(self, ifile):
BaseReader.__init__(self, ifile, 'utf16')
def reset(self):
self.ifile.seek(0)
data = self.ifile.read(2)
if data != "\xff\xfe":
self.ifile.seek(0)
class AnsiReader(BaseReader):
def __init__(self, ifile):
BaseReader.__init__(self, ifile, 'latin1')
class AnselReader(BaseReader):
def __init__(self, ifile):
BaseReader.__init__(self, ifile, "")
def readline(self):
return ansel_to_utf8(self.ifile.readline().strip('\n\r'))

View File

@ -22,21 +22,22 @@
"Import from GEDCOM"
__revision__ = "$Revision: $"
__author__ = "Don Allingham"
#-------------------------------------------------------------------------
#
# standard python modules
#
#-------------------------------------------------------------------------
import re
import string
from gettext import gettext as _
#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
from ansel_utf8 import ansel_to_utf8
from _GedcomInfo import *
from _GedcomTokens import *
@ -45,60 +46,25 @@ from DateHandler._DateParser import DateParser
#-------------------------------------------------------------------------
#
# latin/utf8 conversions
#
# constants #
#-------------------------------------------------------------------------
def utf8_to_latin(msg):
"""
Converts a string from unicode to iso-8859-1. If any illegal characters
are found, they are converted to ?
@param msg: unicode string to convert
@type level: unicode
@return: Returns the string, converted to a ISO-8859-1 object
@rtype: str
"""
return msg.encode('iso-8859-1', 'replace')
def latin_to_utf8(s):
if type(s) == unicode:
return s
else:
return unicode(s,'iso-8859-1')
def nocnv(s):
return unicode(s,errors='replace')
#-------------------------------------------------------------------------
#
# constants
#
#-------------------------------------------------------------------------
ANSEL = 1
UNICODE = 2
UPDATE = 25
_transtable = string.maketrans('','')
_delc = _transtable[0:8] + _transtable[10:31]
_transtable2 = _transtable[0:128] + ('?' * 128)
ged2gramps = {}
GED2GRAMPS = {}
for _val in personalConstantEvents.keys():
_key = personalConstantEvents[_val]
if _key != "":
ged2gramps[_key] = _val
GED2GRAMPS[_key] = _val
for _val in familyConstantEvents.keys():
_key = familyConstantEvents[_val]
if _key != "":
ged2gramps[_key] = _val
GED2GRAMPS[_key] = _val
ged2attr = {}
GED2ATTR = {}
for _val in personalConstantAttributes.keys():
_key = personalConstantAttributes[_val]
if _key != "":
ged2attr[_key] = _val
GED2ATTR[_key] = _val
#-------------------------------------------------------------------------
#
@ -106,26 +72,24 @@ for _val in personalConstantAttributes.keys():
#
#-------------------------------------------------------------------------
intRE = re.compile(r"\s*(\d+)\s*$")
modRegexp = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$")
calRegexp = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$")
rangeRegexp = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$")
spanRegexp = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$")
intRegexp = re.compile(r"\s*INT\s+([^(]+)\((.*)\)$")
MOD = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$")
CAL = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$")
RANGE = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$")
SPAN = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$")
_calendar_map = {
CALENDAR_MAP = {
"FRENCH R" : RelLib.Date.CAL_FRENCH,
"JULIAN" : RelLib.Date.CAL_JULIAN,
"HEBREW" : RelLib.Date.CAL_HEBREW,
}
_quality_map = {
QUALITY_MAP = {
'CAL' : RelLib.Date.QUAL_CALCULATED,
'INT' : RelLib.Date.QUAL_CALCULATED,
'EST' : RelLib.Date.QUAL_ESTIMATED,
}
_sex_map = {
SEX_MAP = {
'F' : RelLib.Person.FEMALE,
'M' : RelLib.Person.MALE,
}
@ -185,20 +149,21 @@ class GedLine:
self.data = data[2]
if self.level == 0:
if self.token_text and self.token_text[0] == '@' and self.token_text[-1] == '@':
if self.token_text and self.token_text[0] == '@' \
and self.token_text[-1] == '@':
self.token = TOKEN_ID
self.token_text = self.token_text[1:-1]
self.data = self.data.strip()
else:
f = MAP_DATA.get(self.token)
if f:
f(self)
func = MAP_DATA.get(self.token)
if func:
func(self)
def calc_sex(self):
"""
Converts the data field to a RelLib token indicating the gender
"""
self.data = _sex_map.get(self.data.strip(),RelLib.Person.UNKNOWN)
self.data = SEX_MAP.get(self.data.strip(), RelLib.Person.UNKNOWN)
def calc_date(self):
"""
@ -212,12 +177,12 @@ class GedLine:
change the type from UNKNOWN to TOKEN_GEVENT (gedcom event), and
the data is assigned to the associated GRAMPS EventType
"""
token = ged2gramps.get(self.token_text)
token = GED2GRAMPS.get(self.token_text)
if token:
self.token = TOKEN_GEVENT
self.data = token
else:
token = ged2attr.get(self.token_text)
token = GED2ATTR.get(self.token_text)
if token:
attr = RelLib.Attribute()
attr.set_value(self.data)
@ -226,10 +191,10 @@ class GedLine:
self.data = attr
def calc_note(self):
d = self.data.strip()
if len(d) > 2 and d[0] == '@' and d[-1] == '@':
gid = self.data.strip()
if len(gid) > 2 and gid[0] == '@' and gid[-1] == '@':
self.token = TOKEN_RNOTE
self.data = d[1:-1]
self.data = gid[1:-1]
def calc_nchi(self):
attr = RelLib.Attribute()
@ -245,10 +210,6 @@ class GedLine:
self.data = attr
self.token = TOKEN_ATTR
def calc_lds(self):
self.data = _
self.token = TOKEN_ATTR
def __repr__(self):
return "%d: %d (%d:%s) %s" % (self.line, self.level, self.token,
self.token_text, self.data)
@ -276,7 +237,7 @@ MAP_DATA = {
#
#-------------------------------------------------------------------------
_dp = GedcomDateParser()
DATE_CNV = GedcomDateParser()
def extract_date(text):
"""
@ -285,54 +246,55 @@ def extract_date(text):
dateobj = RelLib.Date()
try:
# extract out the MOD line
match = modRegexp.match(text)
match = MOD.match(text)
if match:
(mod, text) = match.groups()
qual = _quality_map.get(mod, RelLib.Date.QUAL_NONE)
qual = QUALITY_MAP.get(mod, RelLib.Date.QUAL_NONE)
else:
qual = RelLib.Date.QUAL_NONE
# parse the range if we match, if so, return
match = rangeRegexp.match(text)
match = RANGE.match(text)
if match:
(cal1,data1,cal2,data2) = match.groups()
(cal1, data1, cal2, data2) = match.groups()
cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN)
cal = CALENDAR_MAP.get(cal1, RelLib.Date.CAL_GREGORIAN)
start = _dp.parse(data1)
stop = _dp.parse(data2)
start = DATE_CNV.parse(data1)
stop = DATE_CNV.parse(data2)
dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_RANGE, cal,
start.get_start_date() + stop.get_start_date())
dateobj.set_quality(qual)
return dateobj
# parse a span if we match
match = spanRegexp.match(text)
match = SPAN.match(text)
if match:
(cal1,data1,cal2,data2) = match.groups()
(cal1, data1, cal2, data2) = match.groups()
cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN)
cal = CALENDAR_MAP.get(cal1, RelLib.Date.CAL_GREGORIAN)
start = _dp.parse(data1)
stop = _dp.parse(data2)
start = DATE_CNV.parse(data1)
stop = DATE_CNV.parse(data2)
dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_SPAN, cal,
start.get_start_date() + stop.get_start_date())
dateobj.set_quality(qual)
return dateobj
match = calRegexp.match(text)
match = CAL.match(text)
if match:
(abt,cal,data) = match.groups()
dateobj = _dp.parse("%s %s" % (abt, data))
dateobj.set_calendar(_calendar_map.get(cal, RelLib.Date.CAL_GREGORIAN))
(abt, cal, data) = match.groups()
dateobj = DATE_CNV.parse("%s %s" % (abt, data))
dateobj.set_calendar(CALENDAR_MAP.get(cal,
RelLib.Date.CAL_GREGORIAN))
dateobj.set_quality(qual)
return dateobj
dateobj = _dp.parse(text)
dateobj = DATE_CNV.parse(text)
dateobj.set_quality(qual)
return dateobj
except IOError:
return self.dp.set_text(text)
return DATE_CNV.set_text(text)
#-------------------------------------------------------------------------
#
@ -341,8 +303,8 @@ def extract_date(text):
#-------------------------------------------------------------------------
class Reader:
def __init__(self, f):
self.f = f
def __init__(self, ifile):
self.ifile = ifile
self.current_list = []
self.eof = False
self.cnv = None
@ -353,11 +315,7 @@ class Reader:
TOKEN_CONC : self._fix_token_conc,
}
def set_charset_fn(self,cnv):
print "Character set changed", cnv
self.cnv = cnv
def set_broken_conc(self,broken):
def set_broken_conc(self, broken):
self.func_map = {
TOKEN_CONT : self._fix_token_cont,
TOKEN_CONC : self._fix_token_broken_conc,
@ -372,46 +330,39 @@ class Reader:
return None
def _fix_token_cont(self, data):
l = self.current_list[0]
new_value = l[2]+'\n'+data[2]
self.current_list[0] = (l[0], l[1], new_value, l[3], l[4])
line = self.current_list[0]
new_value = line[2]+'\n'+data[2]
self.current_list[0] = (line[0], line[1], new_value, line[3], line[4])
def _fix_token_conc(self, data):
l = self.current_list[0]
new_value = l[2] + data[2]
self.current_list[0] = (l[0], l[1], new_value, l[3], l[4])
line = self.current_list[0]
new_value = line[2] + data[2]
self.current_list[0] = (line[0], line[1], new_value, line[3], line[4])
def _fix_token_broken_conc(self, data):
l = self.current_list[0]
new_value = u"%s %s" % (l[2], data[2])
self.current_list[0] = (l[0], l[1], new_value, l[3], l[4])
line = self.current_list[0]
new_value = u"%s %s" % (line[2], data[2])
self.current_list[0] = (line[0], line[1], new_value, line[3], line[4])
def readahead(self):
while len(self.current_list) < 5:
line = self.f.readline()
line = self.ifile.readline()
self.index += 1
if not line:
self.eof = True
return
if self.cnv:
try:
line = self.cnv(line)
except:
line = self.cnv(line.translate(_transtable2))
else:
line = unicode(line,errors='replace')
line = line.split(None, 2) + ['']
line = line.split(None,2) + ['']
val = line[2].rstrip('\r\n')
val = line[2]
try:
level = int(line[0])
except:
level = 0
data = (level, tokens.get(line[1], TOKEN_UNKNOWN), val, line[1], self.index)
data = (level, tokens.get(line[1], TOKEN_UNKNOWN), val, line[1],
self.index)
func = self.func_map.get(data[1])
if func:
@ -419,25 +370,3 @@ class Reader:
else:
self.current_list.insert(0, data)
if __name__ == "__main__":
import sys
def run():
print "Reading", sys.argv[1]
a = Reader(sys.argv[1])
while True:
line = a.readline()
print line
if not line: break
# import Utils
# Utils.profile(run)
run()
print extract_date("20 JAN 2000")
print extract_date("EST 20 JAN 2000")
print extract_date("CAL 20 JAN 2000")
print extract_date("ABT 20 JAN 2000")
print extract_date("INT 20 JAN 2000")
print extract_date("BET 20 JAN 2000 AND FEB 2000")
print extract_date("FROM 20 JAN 2000 TO FEB 2000")

View File

@ -64,13 +64,11 @@ all tokens at the lower level.
For example:
1 BIRT
2 DATE 1 JAN 2000
2 UKNOWN TAG
3 NOTE DATA
The function parsing the individual at level 1, would encounter the BIRT tag.
It would look up the BIRT token in the table to see if a function as defined
for this TOKEN, and pass control to this function. This function would then
@ -81,7 +79,6 @@ the level 2 parser, which would then encounter the "UKNOWN" tag. Since this is
not a valid token, it would not be in the table, and a function that would skip
all lines until the next level 2 token is found (in this case, skipping the
"3 NOTE DATA" line.
"""
__revision__ = "$Revision: $"
@ -94,10 +91,8 @@ __author__ = "Don Allingham"
#-------------------------------------------------------------------------
import os
import re
import string
import time
from gettext import gettext as _
import copy
#------------------------------------------------------------------------
#
@ -114,20 +109,19 @@ LOG = logging.getLogger(".GedcomImport")
#-------------------------------------------------------------------------
import Errors
import RelLib
from BasicUtils import NameDisplay
from BasicUtils import NameDisplay, UpdateCallback
import Utils
import Mime
import LdsUtils
from ansel_utf8 import ansel_to_utf8
from _GedcomInfo import *
from _GedcomTokens import *
from _GedcomLex import Reader
from _GedcomChar import *
import _GedcomUtils as GedcomUtils
from GrampsDb._GrampsDbConst import EVENT_KEY
from BasicUtils import UpdateCallback
try:
import Config
@ -145,53 +139,14 @@ ADDR_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)\s+(\d+)\s*(.*)')
ADDR2_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)\s+(\d+)')
ADDR3_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)')
TRUNC_MSG = _("Your GEDCOM file is corrupted. "
"It appears to have been truncated.")
#-------------------------------------------------------------------------
#
# latin/utf8 conversions
#
#-------------------------------------------------------------------------
def latin_to_utf8(msg):
"""
Converts a string from iso-8859-1 to unicode. If the string is already
unicode, we do nothing.
@param msg: string to convert
@type level: str
@return: Returns the string, converted to a unicode object
@rtype: unicode
"""
if type(msg) == unicode:
return msg
else:
return unicode(msg, 'iso-8859-1')
def nocnv(msg):
"""
Null operation that makes sure that a unicode string remains a unicode
string
@param msg: unicode to convert
@type level: unicode
@return: Returns the string, converted to a unicode object
@rtype: unicode
"""
return unicode(msg)
#-------------------------------------------------------------------------
#
# constants
#
#-------------------------------------------------------------------------
ANSEL = 1
UNICODE = 2
UPDATE = 25
TYPE_BIRTH = RelLib.ChildRefType()
TYPE_ADOPT = RelLib.ChildRefType(RelLib.ChildRefType.ADOPTED)
TYPE_FOSTER = RelLib.ChildRefType(RelLib.ChildRefType.FOSTER)
@ -224,10 +179,6 @@ MIME_MAP = {
EVENT_FAMILY_STR = _("%(event_name)s of %(family)s")
EVENT_PERSON_STR = _("%(event_name)s of %(person)s")
TRANS_TABLE = string.maketrans('', '')
DEL_CHARS = TRANS_TABLE[0:8] + TRANS_TABLE[10:31]
TRANS_TABLE2 = TRANS_TABLE[0:128] + ('?' * 128)
FTW_BAD_PLACE = [
RelLib.EventType.OCCUPATION,
RelLib.EventType.RELIGION,
@ -265,6 +216,7 @@ CONC_RE = re.compile(r"\s*\d+\s+CONC\s?(.*)$")
PERSON_RE = re.compile(r"\s*\d+\s+\@(\S+)\@\s+INDI(.*)$")
class StageOne:
def __init__(self, ifile):
self.ifile = ifile
self.famc = {}
@ -275,44 +227,47 @@ class StageOne:
def parse(self):
current = ""
line = self.ifile.read(3)
if line == "\xef\xbb":
self.ifile.read(1)
self.enc = "UTF8"
else:
self.ifile.seek(0)
for line in self.ifile:
self.lcnt +=1
data = line.split(None,2) + ['']
try:
(level, key, value) = data[:3]
value = value.strip()
# convert the first value to an integer. We have to be a bit
# careful here, since some GEDCOM files have garbage characters
# at the front of the first file if they are unicode encoded.
# So, if we have a failure to convert, check the last character
# of the string, which shoul de a '0'
try:
level = int(level)
except:
level = int(level[-1])
level = 0
key = key.strip()
except:
raise Errors.GedcomError("Corrupted file at line %d" % self.lcnt)
if level == 0 and key[0] == '@':
if value == "FAM":
if value == ("FAM", "FAMILY") :
current = key.strip()
current = current[1:-1]
elif value == "INDI":
elif value == ("INDI", "INDIVIDUAL"):
self.pcnt += 1
elif key in ("HUSB", "WIFE") and value and value[0] == '@':
elif key in ("HUSB", "HUSBAND", "WIFE") and value and value[0] == '@':
value = value[1:-1]
if self.fams.has_key(value):
self.fams[value].append(current)
else:
self.fams[value] = [current]
elif key == "CHIL" and value and value[0] == '@':
elif key in ("CHIL", "CHILD") and value and value[0] == '@':
value = value[1:-1]
if self.famc.has_key(value):
self.famc[value].append(current)
else:
self.famc[value] = [current]
elif key == 'CHAR':
elif key == 'CHAR' and not self.enc:
self.enc = value
def get_famc_map(self):
@ -322,7 +277,10 @@ class StageOne:
return self.fams
def get_encoding(self):
return self.enc
return self.enc.upper()
def set_encoding(self, enc):
self.enc = enc
def get_person_count(self):
return self.pcnt
@ -806,16 +764,20 @@ class GedcomParser(UpdateCallback):
data = cursor.next()
cursor.close()
self.lexer = Reader(ifile)
enc = stage_one.get_encoding()
if enc == "ANSEL":
rdr = AnselReader(ifile)
elif enc in ("UTF-8", "UTF8"):
rdr = UTF8Reader(ifile)
elif enc in ("UTF-16", "UTF16", "UNICODE"):
rdr = UTF16Reader(ifile)
else:
rdr = AnsiReader(ifile)
self.lexer = Reader(rdr)
self.filename = filename
self.backoff = False
self.override = False
#
# if self.override != 0:
# if self.override == 1:
# self.lexer.set_charset_fn(ansel_to_utf8)
# elif self.override == 2:
# self.lexer.set_charset_fn(latin_to_utf8)
fullpath = os.path.normpath(os.path.abspath(filename))
self.geddir = os.path.dirname(fullpath)
@ -1064,9 +1026,6 @@ class GedcomParser(UpdateCallback):
"""
text = self.groups.line
msg = _("Line %d was not understood, so it was ignored.") % text
import traceback
traceback.print_stack()
print self.groups
self.warn(msg)
self.error_count += 1
self.skip_subordinate_levels(level)
@ -4039,11 +3998,8 @@ class GedcomParser(UpdateCallback):
if genby == "GRAMPS":
self.gedsource = self.gedmap.get_from_source_tag(line.data)
self.lexer.set_broken_conc(self.gedsource.get_conc())
elif line.token == TOKEN_CHAR and not self.override:
if line.data == "ANSEL":
self.lexer.set_charset_fn(ansel_to_utf8)
elif line.data not in ("UNICODE","UTF-8","UTF8"):
self.lexer.set_charset_fn(latin_to_utf8)
elif line.token == TOKEN_CHAR:
pass
self.skip_subordinate_levels(2)
elif line.token == TOKEN_GEDC:
self.skip_subordinate_levels(2)

View File

@ -66,6 +66,7 @@ def importData(database, filename, callback=None, use_trans=False):
dialog.destroy()
else:
code_set = None
import2(database, filename, callback, code_set, use_trans)
def import2(database, filename, callback, code_set, use_trans):
@ -74,7 +75,10 @@ def import2(database, filename, callback, code_set, use_trans):
ifile = open(filename,"rU")
np = StageOne(ifile)
np.parse()
print np.get_encoding()
if code_set:
np.set_encoding(code_set)
ifile.seek(0)
gedparse = GedcomParser(database, ifile, filename, callback, np)
except IOError, msg:
@ -85,7 +89,6 @@ def import2(database, filename, callback, code_set, use_trans):
_("%s could not be imported") % filename + "\n" + str(msg))
return
if database.get_number_of_people() == 0:
use_trans = False