gramps/src/GrampsDbUtils/test/_GedcomChar_test.py

#!/usr/bin/env python
import unittest
import os.path
import codecs
import struct

# project-local test helpers
from test import test_util as tu
# shorthand; used below to build the msg argument of every assertion
m = tu.msg

# NOTE(review): presumably extends sys.path with the parent directory so that
# the `import _GedcomChar` below resolves -- confirm against test_util.
# Keep this call ahead of that import.
par = tu.path_append_parent()
# NOTE(review): presumably the absolute directory of this test module;
# not referenced by the code visible in this file.
here = tu.absdir()

# module under test (provides AnsiReader, AnselReader, UTF8Reader used below)
import _GedcomChar as G

# directory in which the generated fixture files are placed
# NOTE(review): presumably created on demand by make_subdir -- confirm.
cdir = tu.make_subdir("test_data")

# unichr() exists only on Python 2; on Python 3 chr() already returns
# unicode text.  Bind a private name so both interpreters build the same data.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# unicode block "latin1 supplement" chars (U+00A0..U+00FF) plus a newline
utest_chars = u"".join(map(_unichr, range(0xA0, 0x100))) + u"\n"

# 12 ansel test chars (raw 8-bit bytes, here) followed by a newline byte
# list(...) keeps the concatenation valid where range() is not a list
atest_list = list(range(0xa1, 0xa7)) + list(range(0xb1, 0xb7)) + [0x0a]
# one unsigned byte per entry; the count follows the list instead of a
# hard-coded 13
atest_bytes = struct.pack("%dB" % len(atest_list), *atest_list)

# unicode mappings of above (http://www.gymel.com/charsets/ANSEL.html)
# NB: must stay position-for-position in step with atest_list
a2u = u"".join(map(_unichr, (
    0x141, 0xd8, 0x110, 0xde, 0xc6, 0x152,
    0x142, 0xf8, 0x111, 0xfe, 0xe6, 0x153,
    0x0a, )))

def gen_chars(filename, encoding):
    """write generic test chars as given file and encoding

    An already existing file is left untouched, so each fixture is
    generated at most once per path.

    filename -- path of the fixture file to create
    encoding -- codec name handed to codecs.open (e.g. "latin-1", "utf_8_sig")
    """
    if os.path.exists(filename):
        return
    out = codecs.open(filename, "wb", encoding)
    try:
        out.write(utest_chars)
    finally:
        # close explicitly: the tests read the file straight back, so the
        # data must not sit in a buffer waiting for garbage collection
        out.close()

###
class Test1_ansi(unittest.TestCase):
    """Test original "ANSI" reader and codecs: latin, cp1252

    The fixture file holds the generic test chars encoded as latin-1;
    every test reads it back a different way and expects the original
    unicode text.
    """
    enc = "latin-1"
    cp = "cp1252"
    fil = os.path.join(cdir,enc)   # fixture file is named after the codec
    exp  = utest_chars

    def setUp(self):
        """Make sure the latin-1 fixture file exists."""
        gen_chars(self.fil, self.enc)

    def _decode_file(self, encoding):
        """Return the whole fixture decoded with `encoding`; closes the file."""
        f = codecs.open(self.fil, encoding=encoding)
        try:
            return f.read()
        finally:
            f.close()

    def test1a_read_ansi(self):
        f = open(self.fil)
        try:
            ra = G.AnsiReader(f)
            got = ra.readline()
        finally:
            # do not leave the handle open until garbage collection
            f.close()
        self.assertEqual(got, self.exp, m(got, self.exp, "AnsiReader"))

    def test1b_read_codec_latin1(self):
        got = self._decode_file(self.enc)
        self.assertEqual(got, self.exp,
            m(got, self.exp, "using codec %s" % self.enc))

    def test1c_read_codec_cp1252(self):
        # the latin-1 fixture is expected to decode identically as cp1252
        got = self._decode_file(self.cp)
        self.assertEqual(got, self.exp,
            m(got, self.exp, "using codec %s" % self.cp))

###
class Test2_ansel(unittest.TestCase):
    """Test original AnselReader (later: ansel codec)

    The fixture holds the raw ANSEL test bytes; AnselReader is expected
    to turn them into the matching unicode string a2u.
    """
    enc = "ansel"
    afil = os.path.join(cdir,enc)   # fixture file is named "ansel"
    exp  = a2u

    def setUp(self):
        """(Re)write the raw ANSEL fixture before every test."""
        f = open(self.afil, "wb")
        try:
            f.write(atest_bytes)
        finally:
            # close so the bytes are on disk before the test reads them
            f.close()

    def test2a_read_ansel(self):
        f = open(self.afil)
        try:
            ra = G.AnselReader(f)
            got = ra.readline()
        finally:
            f.close()
        self.assertEqual(got, self.exp, m(got, self.exp, "AnselReader"))

###
class Test3(unittest.TestCase):
    """Test original UTF8Reader and codecs: utf_8, utf_8_sig
    with no 'BOM' (sig) in input (the common case)

    out of curiosity, verify behavior reading a 1-byte file
    """
    enc = "utf_8"
    enc_sig = enc + "_sig"
    ufil = os.path.join(cdir, "chars.utf8")
    f1byte = os.path.join(cdir, "1byte")
    exp  = utest_chars

    def setUp(self):
        """Create the utf-8 fixture and the 1-byte file unless present."""
        gen_chars(self.ufil, self.enc)
        if not os.path.exists(self.f1byte):
            f = open(self.f1byte, "wb")
            try:
                f.write("1")
            finally:
                f.close()

    def _reader_line(self, path):
        """Return the first line of `path` as read by G.UTF8Reader."""
        f = open(path)
        try:
            return G.UTF8Reader(f).readline()
        finally:
            # do not leave the handle open until garbage collection
            f.close()

    def _decode_file(self, path, encoding):
        """Return the whole of `path` decoded with codec `encoding`."""
        f = codecs.open(path, encoding=encoding)
        try:
            return f.read()
        finally:
            f.close()

    def test3a_u8_UTF8Reader_NO_BOM_sig(self):
        g = self._reader_line(self.ufil)
        self.assertEqual(g, self.exp, m(g, self.exp, "orig UTF8Reader"))
        g = self._reader_line(self.f1byte)
        self.assertEqual(g, "1",
            m(g, "1", "read 1-byte file"))

    # NB: utf_8 reads data and never expects a BOM-sig
    def test3b_utf8_codec_NO_BOM_sig_as_expected(self):
        g = self._decode_file(self.ufil, self.enc)
        self.assertEqual(g, self.exp, m(g, self.exp, "codec utf8"))
        g = self._decode_file(self.f1byte, self.enc)
        self.assertEqual(g, "1", m(g, "1", "codec utf8"))

    # NB: utf_8_sig reads data even absent a BOM-sig (GOOD!)
    def test3c_utf8_sig_codec_NO_BOM_sig_tolerated_GOOD(self):
        g = self._decode_file(self.ufil, self.enc_sig)
        self.assertEqual(g, self.exp,
            m(g, self.exp, "codec utf_8_sig NO sig input"))
        g = self._decode_file(self.f1byte, self.enc_sig)
        self.assertEqual(g, "1",
            m(g, "1", "codec utf_8_sig NO sig input"))

###
class Test4(unittest.TestCase):
    """Test original UTF8Reader and codecs: utf_8, utf_8_sig
    with 'BOM' (sig) in input (uncommon, [some?] MS Windows only?)
    """
    enc = "utf_8"
    enc_sig = enc + "_sig"
    ufil = os.path.join(cdir, "chars.utf8_sig")
    exp  = utest_chars

    def setUp(self):
        """Create the fixture through the utf_8_sig codec unless present."""
        gen_chars(self.ufil, self.enc_sig)

    def _decode_file(self, encoding):
        """Return the whole fixture decoded with `encoding`; closes the file."""
        f = codecs.open(self.ufil, encoding=encoding)
        try:
            return f.read()
        finally:
            f.close()

    def test4a_u8_UTF8Reader_WITH_BOM_sig(self):
        f = open(self.ufil)
        try:
            ra = G.UTF8Reader(f)
            g = ra.readline()
        finally:
            # do not leave the handle open until garbage collection
            f.close()
        self.assertEqual(g, self.exp, m(g, self.exp, "orig UTF8Reader"))

    # utf_8 reads an initial BOM-sig as data -- oops, pity
    #  write the test to verify this known codec behavior
    # ==> Recommend: do not use utf8 as input codec (use utf_8_sig)
    def test4b_utf8_codec_WITH_BOM_sig_reads_as_data_PITY(self):
        g = self._decode_file(self.enc)
        e0 = u'\ufeff'
        self.assertEqual(g[0], e0,
            m(g[0], e0, "codec utf8 reads 'BOM'-sig as data" ))
        # drop the BOM character; the remainder must be the plain test chars
        g = g[1:]
        self.assertEqual(g, self.exp,
            m(g, self.exp, "codec utf8 reads rest of data ok"))

    # utf_8_sig reads and ignores the BOM-sig
    def test4c_utf8_sig_codec_WITH_BOM_sig_as_expected(self):
        g = self._decode_file(self.enc_sig)
        # message corrected: this case reads input WITH a BOM-sig
        self.assertEqual(g, self.exp,
            m(g, self.exp, "codec utf_8_sig WITH sig input"))

###


# run every TestCase in this module when the file is executed as a script
if __name__ == "__main__":
    unittest.main()

#===eof===
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`#!/usr/bin/env python`
			`import unittest`
2008-02-18 Raphael Ackermann <raphael.ackermann@gmail.com> * src/* 143 files * test/try_tree_model.py * windows/nsis/gcheck.py imports cleanup svn: r10055 2008-02-18 20:07:09 +00:00			`import os.path`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`import codecs`
			`import struct`

			`from test import test_util as tu`
			`m = tu.msg`

			`par = tu.path_append_parent()`
			`here = tu.absdir()`

			`import _GedcomChar as G`

			`cdir = tu.make_subdir("test_data")`

			`# unicode block "latin1 supplement" chars`
			`utest_chars = "".join(map(unichr, range(0xA0,0x100))) + "\n"`

			`# 12 ansel test chars (raw 8-bit bytes, here)`
			`atest_list = range(0xa1,0xa7) + range(0xb1,0xb7) + [0x0a,]`
			`atest_bytes = struct.pack("B"13, atest_list)`

			`# unicode mappings of above (http://www.gymel.com/charsets/ANSEL.html)`
			`a2u = u"".join(map(unichr, (`
			`0x141, 0xd8, 0x110, 0xde, 0xc6, 0x152,`
			`0x142, 0xf8, 0x111, 0xfe, 0xe6, 0x153,`
			`0x0a, )))`

			`def gen_chars(filename, encoding):`
			`"""write generic test chars as given file and encoding"""`
			`if not os.path.exists(filename):`
			`codecs.open(filename, "wb", encoding).write(utest_chars)`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`###`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`class Test1_ansi(unittest.TestCase):`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`"""Test original "ANSI" reader and codecs: latin, cp1252"""`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`enc = "latin-1"`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`cp = "cp1252"`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`fil = os.path.join(cdir,enc)`
			`exp = utest_chars`

2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def setUp(self):`
			`gen_chars(self.fil, self.enc)`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test1a_read_ansi(self):`
			`f = open(self.fil)`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`ra= G.AnsiReader(f)`
			`got = ra.readline()`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(got,self.exp, m(got,self.exp, "AnsiReader"))`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test1b_read_codec_latin1(self):`
			`got=codecs.open(self.fil, encoding=self.enc).read()`
			`self.assertEquals(got,self.exp, m(got,self.exp, "using codec %s" % self.enc))`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test1c_read_codec_cp1252(self):`
			`got=codecs.open(self.fil, encoding=self.cp).read()`
			`self.assertEquals(got,self.exp, m(got,self.exp, "using codec %s" % self.cp))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`###`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`class Test2_ansel(unittest.TestCase):`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`"""Test original AnselReader (later: ansel codec)"""`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`enc = "ansel"`
			`afil = os.path.join(cdir,enc)`
			`exp = a2u`

2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def setUp(self):`
			`open(self.afil, "wb").write(atest_bytes)`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test2a_read_ansel(self):`
			`f = open(self.afil)`
fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00			`ra = G.AnselReader(f)`
			`got = ra.readline()`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(got,self.exp, m(got,self.exp, "AnselReader"))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`###`
			`class Test3(unittest.TestCase):`
			`"""Test otriginal UTF8Reader and codecs: utf_8, utf_8_sig`
			`with no 'BOM' (sig) in input (the common case)`

			`out of curiosity, verify behavior reading a 1-byte file`
			`"""`
			`enc = "utf_8"`
			`enc_sig = enc + "_sig"`
			`ufil = os.path.join(cdir, "chars.utf8")`
			`f1byte = os.path.join(cdir, "1byte")`
			`exp = utest_chars`

2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def setUp(self):`
			`gen_chars(self.ufil, self.enc)`
			`if not os.path.exists(self.f1byte):`
			`open(self.f1byte, "wb").write("1")`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test3a_u8_UTF8Reader_NO_BOM_sig(self):`
			`f=open(self.ufil)`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`ra=G.UTF8Reader(f)`
			`g = ra.readline()`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(g,self.exp, m(g,self.exp, "orig UTF8Reader"))`
			`r2 = G.UTF8Reader(open(self.f1byte))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`g = r2.readline()`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(g,"1",`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`m(g,"1", "read 1-byte file"))`

			`# NB: utf_8 reads data and never expects a BOM-sig`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test3b_utf8_codec_NO_BOM_sig_as_expected(self):`
			`g=codecs.open(self.ufil, encoding=self.enc).read()`
			`self.assertEquals(g,self.exp, m(g,self.exp, "codec utf8"))`
			`g=codecs.open(self.f1byte, encoding=self.enc).read()`
			`self.assertEquals(g,"1", m(g,"1", "codec utf8"))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`# NB: utf_8_sig reads data even absent a BOM-sig (GOOD!)`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test3c_utf8_sig_codec_NO_BOM_sig_tolerated_GOOD(self):`
			`g=codecs.open(self.ufil, encoding=self.enc_sig).read()`
			`self.assertEquals(g,self.exp,`
			`m(g,self.exp, "codec utf_8_sig NO sig input"))`
			`g=codecs.open(self.f1byte, encoding=self.enc_sig).read()`
			`self.assertEquals(g,"1",`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`m(g,"1", "codec utf_8_sig NO sig input"))`

			`###`
			`class Test4(unittest.TestCase):`
			`"""Test otriginal UTF8Reader and codecs: utf_8, utf_8_sig`
			`with 'BOM' (sig) in input (uncommon, [some?] MS Windows only?)`
			`"""`
			`enc = "utf_8"`
			`enc_sig = enc + "_sig"`
			`ufil = os.path.join(cdir, "chars.utf8_sig")`
			`exp = utest_chars`

2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def setUp(self):`
			`gen_chars(self.ufil, self.enc_sig)`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test4a_u8_UTF8Reader_WITH_BOM_sig(self):`
			`f=open(self.ufil)`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`ra=G.UTF8Reader(f)`
			`g = ra.readline()`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(g,self.exp, m(g,self.exp, "orig UTF8Reader"))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`# utf_8 reads an initial BOM-sig as data -- oops, pity`
			`# write the test to verify this known codec behavior`
			`# ==> Recommend: do not use utf8 as input codec (use utf_8_sig)`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test4b_utf8_codec_WITH_BOM_sig_reads_as_data_PITY(self):`
			`g=codecs.open(self.ufil, encoding=self.enc).read()`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`e0=u'\ufeff'`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(g[0], e0,`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00			`m(g[0],e0, "codec utf8 reads 'BOM'-sig as data" ))`
			`g = g[1:]`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`self.assertEquals(g,self.exp,`
			`m(g,self.exp, "codec utf8 reads rest of data ok"))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`# utf_8_sig reads and ignores the BOM-sig`
2008-01-09 Raphael Ackermann <raphael.ackermann@gmail.com> From PEP8 Always use 'self' for the first argument to instance methods. svn: r9767 2008-01-09 16:47:56 +00:00			`def test4c_utf8_sig_codec_WITH_BOM_sig_as_expected(self):`
			`g=codecs.open(self.ufil, encoding=self.enc_sig).read()`
			`self.assertEquals(g,self.exp,`
			`m(g,self.exp, "codec utf_8_sig NO sig input"))`
minor fix and update to test code svn: r9345 2007-11-13 08:10:00 +00:00
			`###`


fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py svn: r9305 2007-11-06 08:03:40 +00:00
			`if __name__ == "__main__":`
			`unittest.main()`

			`#===eof===`