* src/plugins/ImportGeneWeb.py (decode): Decode numeric character references and named entities. Because GRAMPS is not web-browser based, we can simply convert them to Unicode.

svn: r4608
This commit is contained in:
Martin Hawlisch 2005-05-17 19:46:52 +00:00
parent f8e15f5269
commit 8c2ef473f4
2 changed files with 29 additions and 1 deletions

View File

@ -1,3 +1,8 @@
2005-05-17 Martin Hawlisch <Martin.Hawlisch@gmx.de>
* src/plugins/ImportGeneWeb.py (decode): Decode numeric character
references and named entities. Because GRAMPS is not web-browser based,
we can simply convert them to Unicode.
2005-05-16 Don Allingham <don@gramps-project.org>
* src/PlaceView.py: select correct column for sorting
* src/SourceView.py: select correct column for sorting

View File

@ -52,6 +52,7 @@ import Utils
import const
from QuestionDialog import ErrorDialog
from DateHandler import parser as _dp
from htmlentitydefs import name2codepoint
#-------------------------------------------------------------------------
#
@ -698,7 +699,29 @@ class GeneWebParser:
return sref
def decode(self,s):
    """
    Decode a GeneWeb text token.

    Underscores are turned into spaces, the result is passed through
    latin_utf8.latin_to_utf8(), and HTML-style references are expanded:

      * numeric character references, decimal (&#233;) or hex (&#xE9;)
      * named entities listed in htmlentitydefs.name2codepoint (&eacute;)

    All references are expanded in a single pass, so the output of one
    expansion is never re-interpreted as another reference
    (e.g. "&#38;lt;" yields "&lt;", not "<").

    References that cannot be resolved (unknown name, code point out of
    range for unichr) are left in the text unchanged instead of
    aborting the import.
    """
    s = latin_utf8.latin_to_utf8(s.replace('_', ' '))

    def expand(match):
        # group(1): numeric reference body, group(2): entity name
        numeric, named = match.group(1), match.group(2)
        try:
            if named is not None:
                return unichr(name2codepoint[named])
            if numeric[0] in 'xX':
                return unichr(int(numeric[1:], 16))
            return unichr(int(numeric))
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or code point unichr() cannot
            # represent: keep the original text.
            return match.group(0)

    ref_re = re.compile(
        r'&(?:#([xX][0-9a-fA-F]+|[0-9]+)|([a-zA-Z][a-zA-Z0-9]*));')
    try:
        return ref_re.sub(expand, s)
    except UnicodeDecodeError:
        # NOTE(review): only reachable if latin_to_utf8() hands back a
        # byte string with non-ASCII bytes; the previous code ignored
        # this error as well, so stay best-effort and return the text
        # with its references unexpanded.
        return s
def cnv(self,s):
    """
    Return s converted by latin_utf8.latin_to_utf8().

    Unlike decode(), no underscore replacement or entity expansion is
    performed.
    """
    return latin_utf8.latin_to_utf8(s)