From ba48b9a6dfac81073556ae5926edbcf671f45628 Mon Sep 17 00:00:00 2001
From: Gary Burton <gary.burton@zen.co.uk>
Date: Sat, 10 Jan 2009 15:17:27 +0000
Subject: [PATCH] Cleaner method of handling whitespace. Contributed by Tim
 Lyons (tim.g.lyons@googlemail.com)

svn: r11601
---
 src/GrampsDbUtils/_GedcomLex.py | 44 +++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 16 deletions(-)
diff --git a/src/GrampsDbUtils/_GedcomLex.py b/src/GrampsDbUtils/_GedcomLex.py
index d4ab3087a..70895d4bc 100644
--- a/src/GrampsDbUtils/_GedcomLex.py
+++ b/src/GrampsDbUtils/_GedcomLex.py
@@ -255,7 +255,10 @@ def extract_date(text):
     """
     dateobj = gen.lib.Date()
 
-    text = text.replace('BET ABT','EST BET') # Horrible hack for Tim Lyons
+    text = text.replace('BET ABT','EST BET') # Horrible hack for importing
+                                             # illegal GEDCOM from
+                                             # Apple Macintosh Classic
+                                             # 'Gene' program
 
     try:
         # extract out the MOD line
@@ -352,9 +355,13 @@ class Reader:
     def __fix_token_conc(self, data):
         line = self.current_list[0]
         if len(line[2]) == 4:
+            # This deals with lines of the form
+            # 0 @<XREF:NOTE>@ NOTE
+            #   1 CONC <SUBMITTER TEXT>
             # The previous line contains only a tag and no data so concat a
             # space to separate the new line from the tag. This prevents the
-            # first letter of the new line being lost later.
+            # first letter of the new line being lost later
+            # in _GedcomParse.__parse_record
             new_value = line[2] + ' ' + data[2]
         else:
             new_value = line[2] + data[2]
@@ -362,29 +369,34 @@ class Reader:
 
     def __readahead(self):
         while len(self.current_list) < 5:
-            linetmp = self.ifile.readline()
+            line = self.ifile.readline()
             self.index += 1
-            if not linetmp:
+            if not line:
                 self.eof = True
                 return
 
             try:
-                # the space ensures no trailing whitespace on last parm
-                line = linetmp.strip(' \n\r').split(None, 2) + ['']
-                # however keep trailing whitespace on notes only
-                if line[1] == 'CONC' or line[2].startswith('NOTE'):
-                    line = linetmp.strip('\n\r').split(None, 2) + ['']
-                elif line[1] == 'CONT':
-                    # Make sure that whitespace is preserved at start and
-                    # end of CONT data
-                    part_line = linetmp.strip('\n\r').partition(' CONT ')
-                    line = [part_line[0]] + ['CONT'] + [part_line[2]] + ['']
+                # According to the GEDCOM 5.5 standard,
+                # Chapter 1 subsection Grammar
+                #"leading whitespace preceding a GEDCOM line should be ignored"
+                # We will also strip the terminator which is any combination
+                # of carriage_return and line_feed
+                line = line.lstrip(' ').rstrip('\n\r')
+                # split into level+delim+rest
+                line = line.partition(' ')
                 level = int(line[0])
+                # there should only be one space after the level,
+                # but we can ignore more,
+                # then split into tag+delim+line_value
+                # or xref_id+delim+rest
+                line = line[2].lstrip(' ').partition(' ')
+                tag = line[0]
+                line_value = line[2]
             except:
                 continue
 
-            token = GedcomTokens.TOKENS.get(line[1], GedcomTokens.TOKEN_UNKNOWN)
-            data = (level, token, line[2], line[1], self.index)
+            token = GedcomTokens.TOKENS.get(tag, GedcomTokens.TOKEN_UNKNOWN)
+            data = (level, token, line_value, tag, self.index)
 
             func = self.func_map.get(data[1])
             if func: