gramps/src/GrampsDbUtils/test/_GedcomChar_test.py

165 lines
5.2 KiB
Python
Raw Normal View History

#!/usr/bin/env python
import unittest
import os.path
import codecs
import struct
# Project-local test helpers (not stdlib). `m` builds the failure message
# that the tests below pass as the last argument of every assertion.
from test import test_util as tu
m = tu.msg
# NOTE(review): presumably makes the parent directory importable so that
# the module under test can be found below -- confirm in test_util.
par = tu.path_append_parent()
# NOTE(review): presumably the absolute directory of this test file -- confirm.
here = tu.absdir()
# Module under test. Presumably relies on the path change made above, so
# keep this import after the path_append_parent() call.
import _GedcomChar as G
# Directory that holds the data files written and read by the tests.
# NOTE(review): presumably created on demand by make_subdir -- confirm.
cdir = tu.make_subdir("test_data")
# Test data shared by the test cases in this file.
#
# The values are identical to the original Python 2 definitions, but the
# constructs also evaluate on Python 3, where unichr() does not exist and
# range() objects cannot be concatenated with "+".
try:
    _unichr = unichr  # Python 2: code point -> unicode string
except NameError:
    _unichr = chr  # Python 3: chr() already returns text

# unicode block "latin1 supplement" chars (U+00A0..U+00FF) plus a newline
utest_chars = u"".join(map(_unichr, range(0xA0, 0x100))) + u"\n"

# 12 ansel test chars plus a trailing newline (raw 8-bit bytes, here)
atest_list = list(range(0xa1, 0xa7)) + list(range(0xb1, 0xb7)) + [0x0a]
# one unsigned byte per list entry; the count is derived from the list so
# the format string cannot drift out of sync with the data
atest_bytes = struct.pack("%dB" % len(atest_list), *atest_list)

# unicode mappings of above (http://www.gymel.com/charsets/ANSEL.html)
a2u = u"".join(map(_unichr, (
    0x141, 0xd8, 0x110, 0xde, 0xc6, 0x152,
    0x142, 0xf8, 0x111, 0xfe, 0xe6, 0x153,
    0x0a,
)))
def gen_chars(filename, encoding):
    """Write the generic test chars to filename in the given encoding.

    The file is only created when it does not exist yet; an existing
    file is left untouched, whatever it contains.

    filename -- path of the data file to create
    encoding -- codec name handed to codecs.open()
    """
    if os.path.exists(filename):
        return
    out = codecs.open(filename, "wb", encoding)
    # Close explicitly so the data is flushed to disk before a test reads
    # it back; the original left closing to garbage collection.
    # try/finally (not "with") keeps this valid on every Python 2.x.
    try:
        out.write(utest_chars)
    finally:
        out.close()
###
class Test1_ansi(unittest.TestCase):
    """Test original "ANSI" reader and codecs: latin, cp1252

    The data file is written once with the latin-1 codec; every test
    expects to read the generic test chars back unchanged.
    """
    enc = "latin-1"
    cp = "cp1252"
    fil = os.path.join(cdir, enc)
    exp = utest_chars

    def setUp(self):
        """Make sure the latin-1 encoded data file exists."""
        gen_chars(self.fil, self.enc)

    def _read_with_codec(self, codec):
        """Return the whole data file decoded with codec; the file is closed."""
        f = codecs.open(self.fil, encoding=codec)
        try:
            return f.read()
        finally:
            f.close()

    def test1a_read_ansi(self):
        f = open(self.fil)
        # close the handle even when the reader raises
        try:
            got = G.AnsiReader(f).readline()
        finally:
            f.close()
        self.assertEqual(got, self.exp, m(got, self.exp, "AnsiReader"))

    def test1b_read_codec_latin1(self):
        got = self._read_with_codec(self.enc)
        self.assertEqual(got, self.exp,
                         m(got, self.exp, "using codec %s" % self.enc))

    def test1c_read_codec_cp1252(self):
        # the test expects cp1252 to decode the latin-1 file identically
        got = self._read_with_codec(self.cp)
        self.assertEqual(got, self.exp,
                         m(got, self.exp, "using codec %s" % self.cp))
###
class Test2_ansel(unittest.TestCase):
    """Test original AnselReader (later: ansel codec)

    The raw ANSEL test bytes are written to a file and read back through
    G.AnselReader; the result must equal the unicode mapping in a2u.
    """
    enc = "ansel"
    afil = os.path.join(cdir, enc)
    exp = a2u

    def setUp(self):
        """(Re)write the raw ANSEL test bytes before each test."""
        out = open(self.afil, "wb")
        # close explicitly so the bytes are on disk before the test reads
        # them; the original left closing to garbage collection
        try:
            out.write(atest_bytes)
        finally:
            out.close()

    def test2a_read_ansel(self):
        f = open(self.afil)
        # close the handle even when the reader raises
        try:
            got = G.AnselReader(f).readline()
        finally:
            f.close()
        self.assertEqual(got, self.exp, m(got, self.exp, "AnselReader"))
###
class Test3(unittest.TestCase):
    """Test original UTF8Reader and codecs: utf_8, utf_8_sig
    with no 'BOM' (sig) in input (the common case)
    out of curiosity, verify behavior reading a 1-byte file
    """
    enc = "utf_8"
    enc_sig = enc + "_sig"
    ufil = os.path.join(cdir, "chars.utf8")
    f1byte = os.path.join(cdir, "1byte")
    exp = utest_chars

    def setUp(self):
        """Make sure the utf_8 data file and the 1-byte file exist."""
        gen_chars(self.ufil, self.enc)
        if not os.path.exists(self.f1byte):
            out = open(self.f1byte, "wb")
            # close explicitly so the byte is flushed before it is read
            try:
                out.write("1")
            finally:
                out.close()

    def _reader_line(self, path):
        """Return the first line G.UTF8Reader yields for path; closes the file."""
        f = open(path)
        try:
            return G.UTF8Reader(f).readline()
        finally:
            f.close()

    def _codec_text(self, path, codec):
        """Return the whole file at path decoded with codec; closes the file."""
        f = codecs.open(path, encoding=codec)
        try:
            return f.read()
        finally:
            f.close()

    def test3a_u8_UTF8Reader_NO_BOM_sig(self):
        g = self._reader_line(self.ufil)
        self.assertEqual(g, self.exp, m(g, self.exp, "orig UTF8Reader"))
        g = self._reader_line(self.f1byte)
        self.assertEqual(g, "1",
                         m(g, "1", "read 1-byte file"))

    # NB: utf_8 reads data and never expects a BOM-sig
    def test3b_utf8_codec_NO_BOM_sig_as_expected(self):
        g = self._codec_text(self.ufil, self.enc)
        self.assertEqual(g, self.exp, m(g, self.exp, "codec utf8"))
        g = self._codec_text(self.f1byte, self.enc)
        self.assertEqual(g, "1", m(g, "1", "codec utf8"))

    # NB: utf_8_sig reads data even absent a BOM-sig (GOOD!)
    def test3c_utf8_sig_codec_NO_BOM_sig_tolerated_GOOD(self):
        g = self._codec_text(self.ufil, self.enc_sig)
        self.assertEqual(g, self.exp,
                         m(g, self.exp, "codec utf_8_sig NO sig input"))
        g = self._codec_text(self.f1byte, self.enc_sig)
        self.assertEqual(g, "1",
                         m(g, "1", "codec utf_8_sig NO sig input"))
###
class Test4(unittest.TestCase):
    """Test original UTF8Reader and codecs: utf_8, utf_8_sig
    with 'BOM' (sig) in input (uncommon, [some?] MS Windows only?)
    """
    enc = "utf_8"
    enc_sig = enc + "_sig"
    ufil = os.path.join(cdir, "chars.utf8_sig")
    exp = utest_chars

    def setUp(self):
        """Make sure the data file written with the utf_8_sig codec exists."""
        gen_chars(self.ufil, self.enc_sig)

    def _codec_text(self, codec):
        """Return the whole data file decoded with codec; closes the file."""
        f = codecs.open(self.ufil, encoding=codec)
        try:
            return f.read()
        finally:
            f.close()

    def test4a_u8_UTF8Reader_WITH_BOM_sig(self):
        f = open(self.ufil)
        # close the handle even when the reader raises
        try:
            g = G.UTF8Reader(f).readline()
        finally:
            f.close()
        self.assertEqual(g, self.exp, m(g, self.exp, "orig UTF8Reader"))

    # utf_8 reads an initial BOM-sig as data -- oops, pity
    # write the test to verify this known codec behavior
    # ==> Recommend: do not use utf8 as input codec (use utf_8_sig)
    def test4b_utf8_codec_WITH_BOM_sig_reads_as_data_PITY(self):
        g = self._codec_text(self.enc)
        e0 = u'\ufeff'
        self.assertEqual(g[0], e0,
                         m(g[0], e0, "codec utf8 reads 'BOM'-sig as data"))
        # everything after the BOM character must be the plain test chars
        g = g[1:]
        self.assertEqual(g, self.exp,
                         m(g, self.exp, "codec utf8 reads rest of data ok"))

    # utf_8_sig reads and ignores the BOM-sig
    def test4c_utf8_sig_codec_WITH_BOM_sig_as_expected(self):
        g = self._codec_text(self.enc_sig)
        # message fixed: the input of this test DOES carry a sig
        self.assertEqual(g, self.exp,
                         m(g, self.exp, "codec utf_8_sig WITH sig input"))
###
# Script entry point: hand control to the unittest runner when this file is
# executed directly; importing it only defines the test classes.
if __name__ == "__main__":
    unittest.main()
#===eof===