2007-11-06 08:03:40 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
import unittest
|
2008-02-18 20:07:09 +00:00
|
|
|
import os.path
|
2007-11-06 08:03:40 +00:00
|
|
|
import codecs
|
|
|
|
import struct
|
|
|
|
|
|
|
|
from test import test_util as tu

# Extend the import path and locate this directory via the test utilities
# before importing the module under test.
par = tu.path_append_parent()
here = tu.absdir()

# NOTE(review): presumably _GedcomChar is importable only because of the
# path_append_parent() call above -- keep this import after it.
import _GedcomChar as G

# Short alias: m(got, expected, label) builds the failure message passed as
# the third argument of the assertions below.
m = tu.msg

# Directory where the generated character-set fixture files are stored.
cdir = tu.make_subdir("test_data")
|
|
|
|
|
|
|
|
# Unicode block "Latin-1 Supplement" (U+00A0..U+00FF), newline-terminated so
# the whole string can be read back with a single readline().
utest_chars = u"".join([unichr(cp) for cp in range(0xA0, 0x100)]) + u"\n"

# 12 ANSEL test chars plus a terminating newline (raw 8-bit byte values, here).
# list() makes the concatenation independent of range() returning a list.
atest_list = list(range(0xa1, 0xa7)) + list(range(0xb1, 0xb7)) + [0x0a]

# Pack one unsigned byte ("B") per value; the count is derived from the list
# so the format cannot drift out of sync when test values are added.
atest_bytes = struct.pack("%dB" % len(atest_list), *atest_list)

# unicode mappings of above (http://www.gymel.com/charsets/ANSEL.html)
a2u = u"".join([unichr(cp) for cp in (
    0x141, 0xd8, 0x110, 0xde, 0xc6, 0x152,
    0x142, 0xf8, 0x111, 0xfe, 0xe6, 0x153,
    0x0a,
)])
|
|
|
|
|
|
|
|
def gen_chars(filename, encoding):
    """Write the generic test chars to the given file in the given encoding.

    The file is created only when it does not exist yet; an existing file is
    left untouched.

    filename -- path of the fixture file to create
    encoding -- codec name handed to codecs.open (e.g. "latin-1", "utf_8")
    """
    if os.path.exists(filename):
        return
    out = codecs.open(filename, "wb", encoding)
    try:
        out.write(utest_chars)
    finally:
        # Close explicitly: the tests read this file right away, so the data
        # must be flushed without relying on garbage collection of the handle.
        out.close()
|
2007-11-13 08:10:00 +00:00
|
|
|
|
|
|
|
###
|
2007-11-06 08:03:40 +00:00
|
|
|
class Test1_ansi(unittest.TestCase):
    """Test original "ANSI" reader and codecs: latin, cp1252

    setUp makes sure a latin-1 encoded fixture holding utest_chars exists;
    every test reads that file back and expects utest_chars again.
    """

    enc = "latin-1"
    cp = "cp1252"
    fil = os.path.join(cdir, enc)
    exp = utest_chars

    def setUp(self):
        gen_chars(self.fil, self.enc)

    def _read_with_codec(self, encoding):
        """Return the whole fixture file decoded with the given codec."""
        f = codecs.open(self.fil, encoding=encoding)
        try:
            return f.read()
        finally:
            f.close()

    def test1a_read_ansi(self):
        f = open(self.fil)
        try:
            got = G.AnsiReader(f).readline()
        finally:
            # release the handle even when the reader raises
            f.close()
        self.assertEqual(got, self.exp, m(got, self.exp, "AnsiReader"))

    def test1b_read_codec_latin1(self):
        got = self._read_with_codec(self.enc)
        self.assertEqual(got, self.exp,
                         m(got, self.exp, "using codec %s" % self.enc))

    def test1c_read_codec_cp1252(self):
        # the latin-1 encoded fixture is expected to decode identically
        # when read as cp1252
        got = self._read_with_codec(self.cp)
        self.assertEqual(got, self.exp,
                         m(got, self.exp, "using codec %s" % self.cp))
|
2007-11-13 08:10:00 +00:00
|
|
|
|
|
|
|
###
|
2007-11-06 08:03:40 +00:00
|
|
|
class Test2_ansel(unittest.TestCase):
    """Test original AnselReader (later: ansel codec)

    setUp writes the raw ANSEL bytes (atest_bytes) to a fixture file; the
    test expects AnselReader to return their unicode mapping (a2u).
    """

    enc = "ansel"
    afil = os.path.join(cdir, enc)
    exp = a2u

    def setUp(self):
        # (re)written before every test, unlike the gen_chars based fixtures
        f = open(self.afil, "wb")
        try:
            f.write(atest_bytes)
        finally:
            # close so the bytes are on disk before the test reads them
            f.close()

    def test2a_read_ansel(self):
        f = open(self.afil)
        try:
            got = G.AnselReader(f).readline()
        finally:
            f.close()
        self.assertEqual(got, self.exp, m(got, self.exp, "AnselReader"))
|
2007-11-13 08:10:00 +00:00
|
|
|
|
|
|
|
###
|
|
|
|
class Test3(unittest.TestCase):
    """Test original UTF8Reader and codecs: utf_8, utf_8_sig
    with no 'BOM' (sig) in input (the common case)

    out of curiosity, verify behavior reading a 1-byte file
    """

    enc = "utf_8"
    enc_sig = enc + "_sig"
    ufil = os.path.join(cdir, "chars.utf8")
    f1byte = os.path.join(cdir, "1byte")
    exp = utest_chars

    def setUp(self):
        gen_chars(self.ufil, self.enc)
        if not os.path.exists(self.f1byte):
            f = open(self.f1byte, "wb")
            try:
                f.write("1")
            finally:
                f.close()

    def _readline_utf8reader(self, path):
        """Return the first line of the file at path as read by G.UTF8Reader."""
        f = open(path)
        try:
            return G.UTF8Reader(f).readline()
        finally:
            f.close()

    def _read_with_codec(self, path, encoding):
        """Return the whole file at path decoded with the given codec."""
        f = codecs.open(path, encoding=encoding)
        try:
            return f.read()
        finally:
            f.close()

    def test3a_u8_UTF8Reader_NO_BOM_sig(self):
        g = self._readline_utf8reader(self.ufil)
        self.assertEqual(g, self.exp, m(g, self.exp, "orig UTF8Reader"))
        g = self._readline_utf8reader(self.f1byte)
        self.assertEqual(g, "1", m(g, "1", "read 1-byte file"))

    # NB: utf_8 reads data and never expects a BOM-sig
    def test3b_utf8_codec_NO_BOM_sig_as_expected(self):
        g = self._read_with_codec(self.ufil, self.enc)
        self.assertEqual(g, self.exp, m(g, self.exp, "codec utf8"))
        g = self._read_with_codec(self.f1byte, self.enc)
        self.assertEqual(g, "1", m(g, "1", "codec utf8"))

    # NB: utf_8_sig reads data even absent a BOM-sig (GOOD!)
    def test3c_utf8_sig_codec_NO_BOM_sig_tolerated_GOOD(self):
        g = self._read_with_codec(self.ufil, self.enc_sig)
        self.assertEqual(g, self.exp,
                         m(g, self.exp, "codec utf_8_sig NO sig input"))
        g = self._read_with_codec(self.f1byte, self.enc_sig)
        self.assertEqual(g, "1",
                         m(g, "1", "codec utf_8_sig NO sig input"))
|
|
|
|
|
|
|
|
###
|
|
|
|
class Test4(unittest.TestCase):
    """Test original UTF8Reader and codecs: utf_8, utf_8_sig
    with 'BOM' (sig) in input (uncommon, [some?] MS Windows only?)
    """

    enc = "utf_8"
    enc_sig = enc + "_sig"
    ufil = os.path.join(cdir, "chars.utf8_sig")
    exp = utest_chars

    def setUp(self):
        # written with the utf_8_sig codec, i.e. the fixture starts with a BOM
        gen_chars(self.ufil, self.enc_sig)

    def _read_with_codec(self, encoding):
        """Return the whole fixture file decoded with the given codec."""
        f = codecs.open(self.ufil, encoding=encoding)
        try:
            return f.read()
        finally:
            f.close()

    def test4a_u8_UTF8Reader_WITH_BOM_sig(self):
        f = open(self.ufil)
        try:
            g = G.UTF8Reader(f).readline()
        finally:
            f.close()
        self.assertEqual(g, self.exp, m(g, self.exp, "orig UTF8Reader"))

    # utf_8 reads an initial BOM-sig as data -- oops, pity
    # write the test to verify this known codec behavior
    # ==> Recommend: do not use utf8 as input codec (use utf_8_sig)
    def test4b_utf8_codec_WITH_BOM_sig_reads_as_data_PITY(self):
        g = self._read_with_codec(self.enc)
        e0 = u'\ufeff'
        # slice instead of index: an empty read then fails the assertion
        # with a readable message instead of raising IndexError
        first = g[:1]
        self.assertEqual(first, e0,
                         m(first, e0, "codec utf8 reads 'BOM'-sig as data"))
        g = g[1:]
        self.assertEqual(g, self.exp,
                         m(g, self.exp, "codec utf8 reads rest of data ok"))

    # utf_8_sig reads and ignores the BOM-sig
    def test4c_utf8_sig_codec_WITH_BOM_sig_as_expected(self):
        g = self._read_with_codec(self.enc_sig)
        self.assertEqual(g, self.exp,
                         m(g, self.exp, "codec utf_8_sig WITH sig input"))
|
2007-11-13 08:10:00 +00:00
|
|
|
|
|
|
|
###
|
|
|
|
|
|
|
|
|
2007-11-06 08:03:40 +00:00
|
|
|
|
|
|
|
# Run every TestCase in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|
|
|
|
|
|
|
#===eof===
|