* src/plugins/ImportGeneWeb.py (decode): Decode numeric character references and named entities. Because GRAMPS is not web-browser based, we can simply convert them to Unicode characters.

svn: r4608
2005-05-17 19:46:52 +00:00
parent f8e15f5269
commit 8c2ef473f4
2 changed files with 29 additions and 1 deletions
--- a/gramps2/ChangeLog
+++ b/gramps2/ChangeLog
@ -1,3 +1,8 @@
+2005-05-17  Martin Hawlisch  <Martin.Hawlisch@gmx.de>
+	* src/plugins/ImportGeneWeb.py (decode): Decode characters and named
+	entities. Because gramps is not web browser based we can simply use
+	unicode.
+	
 2005-05-16  Don Allingham  <don@gramps-project.org>
 	* src/PlaceView.py: select correct column for sorting
 	* src/SourceView.py: select correct column for sorting
--- a/gramps2/src/plugins/ImportGeneWeb.py
+++ b/gramps2/src/plugins/ImportGeneWeb.py
@ -52,6 +52,7 @@ import Utils
 import const
 from QuestionDialog import ErrorDialog
 from DateHandler import parser as _dp
+from htmlentitydefs import name2codepoint

 #-------------------------------------------------------------------------
 #
@ -698,7 +699,29 @@ class GeneWebParser:
        return sref

    def decode(self,s):
-        return( latin_utf8.latin_to_utf8( s.replace('_',' ')))
+        """Convert s via latin_to_utf8 ('_' -> ' '), expand HTML char refs/entities."""
+        s = latin_utf8.latin_to_utf8( s.replace('_',' '))
+        # replace numeric character references (&#NNN; and &#xHHH;)
+        charref_re = re.compile('(&#)(x?)([0-9a-zA-Z]+)(;)')
+        for match in charref_re.finditer(s):
+            try:
+                if match.group(2):  # HEX
+                    nchar = unichr(int(match.group(3),16))
+                else:   # Decimal
+                    nchar = unichr(int(match.group(3)))
+                s = s.replace(match.group(0),nchar)
+            except (ValueError, UnicodeDecodeError):
+                # int()/unichr() reject malformed or out-of-range references;
+                # leave such text untouched instead of aborting the import
+                pass
+        # replace named entities; unknown names are left exactly as written
+        entref_re = re.compile('(&)([a-zA-Z]+)(;)')
+        for match in entref_re.finditer(s):
+            try:
+                if match.group(2) in name2codepoint:
+                    nchar = unichr(name2codepoint[match.group(2)])
+                    s = s.replace(match.group(0),nchar)
+            except UnicodeDecodeError:
+                pass
+        return( s)

    def cnv(seld,s):
        return( latin_utf8.latin_to_utf8(s))