diff --git a/gramps2/ChangeLog b/gramps2/ChangeLog
index 6a744def2..0989ee8b2 100644
--- a/gramps2/ChangeLog
+++ b/gramps2/ChangeLog
@@ -1,3 +1,8 @@
+2005-05-17 Martin Hawlisch
+	* src/plugins/ImportGeneWeb.py (decode): Decode characters and named
+	entities. Because gramps is not web browser based we can simply use
+	unicode.
+
 2005-05-16 Don Allingham
 	* src/PlaceView.py: select correct column for sorting
 	* src/SourceView.py: select correct column for sorting
diff --git a/gramps2/src/plugins/ImportGeneWeb.py b/gramps2/src/plugins/ImportGeneWeb.py
index 273baf036..892b4c422 100644
--- a/gramps2/src/plugins/ImportGeneWeb.py
+++ b/gramps2/src/plugins/ImportGeneWeb.py
@@ -52,6 +52,7 @@ import Utils
 import const
 from QuestionDialog import ErrorDialog
 from DateHandler import parser as _dp
+from htmlentitydefs import name2codepoint
 
 #-------------------------------------------------------------------------
 #
@@ -698,7 +699,40 @@ class GeneWebParser:
         return sref
 
     def decode(self,s):
-        return( latin_utf8.latin_to_utf8( s.replace('_',' ')))
+        """
+        Return the text of a GeneWeb field with its escapes expanded.
+
+        Underscores are turned into spaces, the result is passed through
+        latin_utf8.latin_to_utf8() and HTML style references are expanded:
+        decimal (&#233;) and hexadecimal (&#xE9;) character references as
+        well as named entities (&eacute;) known to name2codepoint.
+        References that are unknown, malformed or out of range are kept
+        as they are.
+        """
+        s = latin_utf8.latin_to_utf8( s.replace('_',' '))
+
+        def expand(match):
+            hexnum,decnum,name = match.groups()
+            try:
+                if hexnum:
+                    return unichr(int(hexnum,16))
+                if decnum:
+                    return unichr(int(decnum))
+                return unichr(name2codepoint[name])
+            except (KeyError,ValueError,OverflowError):
+                # unknown entity name or code point that unichr() rejects
+                return match.group(0)
+
+        # One pass handles all three forms, so text produced by one
+        # replacement is never decoded again ('&#38;amp;' gives '&amp;').
+        entity_re = re.compile(
+            '&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z][a-zA-Z0-9]*));')
+        try:
+            return entity_re.sub(expand,s)
+        except UnicodeDecodeError:
+            # NOTE(review): only possible if latin_to_utf8() returned a byte
+            # string with non-ASCII bytes; keep the text undecoded then.
+            return s
 
     def cnv(seld,s):
         return( latin_utf8.latin_to_utf8(s))