Cleaner method of handling whitespace. Contributed by Tim Lyons (tim.g.lyons@googlemail.com)

svn: r11601
This commit is contained in:
Gary Burton 2009-01-10 15:17:27 +00:00
parent 4f07e2ed52
commit ba48b9a6df

View File

@ -255,7 +255,10 @@ def extract_date(text):
""" """
dateobj = gen.lib.Date() dateobj = gen.lib.Date()
text = text.replace('BET ABT','EST BET') # Horrible hack for Tim Lyons text = text.replace('BET ABT','EST BET') # Horrible hack for importing
# illegal GEDCOM from
# Apple Macintosh Classic
# 'Gene' program
try: try:
# extract out the MOD line # extract out the MOD line
@ -352,9 +355,13 @@ class Reader:
def __fix_token_conc(self, data): def __fix_token_conc(self, data):
line = self.current_list[0] line = self.current_list[0]
if len(line[2]) == 4: if len(line[2]) == 4:
# This deals with lines of the form
# 0 @<XREF:NOTE>@ NOTE
# 1 CONC <SUBMITTER TEXT>
# The previous line contains only a tag and no data so concat a # The previous line contains only a tag and no data so concat a
# space to separate the new line from the tag. This prevents the # space to separate the new line from the tag. This prevents the
# first letter of the new line being lost later. # first letter of the new line being lost later
# in _GedcomParse.__parse_record
new_value = line[2] + ' ' + data[2] new_value = line[2] + ' ' + data[2]
else: else:
new_value = line[2] + data[2] new_value = line[2] + data[2]
@ -362,29 +369,34 @@ class Reader:
def __readahead(self): def __readahead(self):
while len(self.current_list) < 5: while len(self.current_list) < 5:
linetmp = self.ifile.readline() line = self.ifile.readline()
self.index += 1 self.index += 1
if not linetmp: if not line:
self.eof = True self.eof = True
return return
try: try:
# the space ensures no trailing whitespace on last parm # According to the GEDCOM 5.5 standard,
line = linetmp.strip(' \n\r').split(None, 2) + [''] # Chapter 1 subsection Grammar
# however keep trailing whitespace on notes only #"leading whitespace preceding a GEDCOM line should be ignored"
if line[1] == 'CONC' or line[2].startswith('NOTE'): # We will also strip the terminator which is any combination
line = linetmp.strip('\n\r').split(None, 2) + [''] # of carriage_return and line_feed
elif line[1] == 'CONT': line = line.lstrip(' ').rstrip('\n\r')
# Make sure that whitespace is preserved at start and # split into level+delim+rest
# end of CONT data line = line.partition(' ')
part_line = linetmp.strip('\n\r').partition(' CONT ')
line = [part_line[0]] + ['CONT'] + [part_line[2]] + ['']
level = int(line[0]) level = int(line[0])
# there should only be one space after the level,
# but we can ignore more,
# then split into tag+delim+line_value
# or xref_id+delim+rest
line = line[2].lstrip(' ').partition(' ')
tag = line[0]
line_value = line[2]
except: except:
continue continue
token = GedcomTokens.TOKENS.get(line[1], GedcomTokens.TOKEN_UNKNOWN) token = GedcomTokens.TOKENS.get(tag, GedcomTokens.TOKEN_UNKNOWN)
data = (level, token, line[2], line[1], self.index) data = (level, token, line_value, tag, self.index)
func = self.func_map.get(data[1]) func = self.func_map.get(data[1])
if func: if func: