From 8dd7840293f95535d8f825f92428f9cc23ba97e0 Mon Sep 17 00:00:00 2001 From: Alex Roitman Date: Wed, 17 Nov 2004 03:36:18 +0000 Subject: [PATCH] * src/DateParser.py: Switch from utf8 strings to unicode. * src/DateDisplay.py: Switch from utf8 strings to unicode. * src/dates/Date_ru.py: Switch from utf8 strings to unicode. * src/dates/Date_fr.py: Switch from utf8 strings to unicode. svn: r3733 --- ChangeLog | 5 ++ src/DateDisplay.py | 6 +-- src/DateParser.py | 22 +++++---- src/dates/Date_fr.py | 78 ++++++++++++++++--------------- src/dates/Date_ru.py | 108 +++++++++++++++++++++++-------------------- 5 files changed, 120 insertions(+), 99 deletions(-) diff --git a/ChangeLog b/ChangeLog index f0b0a9142..d18475c04 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,11 @@ 2004-11-16 Alex Roitman * src/DateParser.py: Typo. + * src/DateParser.py: Switch from utf8 strings to unicode. + * src/DateDisplay.py: Switch from utf8 strings to unicode. + * src/dates/Date_ru.py: Switch from utf8 strings to unicode. + * src/dates/Date_fr.py: Switch from utf8 strings to unicode. + 2004-11-15 Alex Roitman * src/DateDisplay.py: Remove localized displayers. * src/DateParser.py: Remove localized parsers. diff --git a/src/DateDisplay.py b/src/DateDisplay.py index 60fbb8c92..af90dc96e 100644 --- a/src/DateDisplay.py +++ b/src/DateDisplay.py @@ -98,9 +98,9 @@ class DateDisplay: ) _french = ( - '', 'Vend\xc3\xa9miaire', 'Brumaire', - 'Frimaire', 'Niv\xc3\xb4se', 'Pluvi\xc3\xb4se', - 'Vent\xc3\xb4se', 'Germinal', 'Flor\xc3\xa9al', + '', u'Vend\xc3\xa9miaire', 'Brumaire', + 'Frimaire', u'Niv\xc3\xb4se', u'Pluvi\xc3\xb4se', + u'Vent\xc3\xb4se', 'Germinal', u'Flor\xc3\xa9al', 'Prairial', 'Messidor', 'Thermidor', 'Fructidor', 'Extra' ) diff --git a/src/DateParser.py b/src/DateParser.py index 761f0b6f1..71dc7e4e4 100644 --- a/src/DateParser.py +++ b/src/DateParser.py @@ -137,10 +137,10 @@ class DateParser: } french_to_int = { - 'vend\xc3\xa9miaire' : 1, 'brumaire' : 2, - 'frimaire' : 3, 'niv\xc3\xb4se ': 4, - 'pluvi\xc3\xb4se' : 5, 'vent\xc3\xb4se' : 6, - 'germinal' : 7, 'flor\xc3\xa9al' : 8, + u'vend\xc3\xa9miaire' : 1, 'brumaire' : 2, + 'frimaire' : 3, u'niv\xc3\xb4se ': 4, + u'pluvi\xc3\xb4se' : 5, u'vent\xc3\xb4se' : 6, + 'germinal' : 7, u'flor\xc3\xa9al' : 8, 'prairial' : 9, 'messidor' : 10, 'thermidor' : 11, 'fructidor' : 12, 'extra' : 13 @@ -239,7 +239,12 @@ class DateParser: self._mod_str = '(' + '|'.join( [ key.replace('.','\.') for key in self.modifier_to_int.keys() ] ) + ')' - self._mon_str = '(' + '|'.join(self.month_to_int.keys()) + ')' + # Need to reverse-sort the keys, so that April matches before Apr does. + # Otherwise, 'april 2000' would be matched as 'apr' + garbage ('il 2000') + _month_keys = self.month_to_int.keys() + _month_keys.sort() + _month_keys.reverse() + self._mon_str = '(' + '|'.join(_month_keys) + ')' self._jmon_str = '(' + '|'.join(self.hebrew_to_int.keys()) + ')' self._fmon_str = '(' + '|'.join(self.french_to_int.keys()) + ')' self._pmon_str = '(' + '|'.join(self.persian_to_int.keys()) + ')' @@ -316,7 +321,7 @@ class DateParser: self.month_to_int,gregorian_valid) def _parse_calendar(self,text,regex1,regex2,mmap,check=None): - match = regex1.match(text) + match = regex1.match(text.lower()) if match: groups = match.groups() if groups[0] == None: @@ -337,9 +342,10 @@ class DateParser: value = Date.EMPTY return value - match = regex2.match(text) + match = regex2.match(text.lower()) if match: groups = match.groups() + print groups #[ g.encode('utf8') for g in groups ] if groups[1] == None: m = 0 else: @@ -421,8 +427,6 @@ class DateParser: qual = Date.QUAL_NONE cal = Date.CAL_GREGORIAN - text = text.encode('utf8') - match = self._cal.match(text) if match: grps = match.groups() diff --git a/src/dates/Date_fr.py b/src/dates/Date_fr.py index c9307deba..7e2cec340 100644 --- a/src/dates/Date_fr.py +++ b/src/dates/Date_fr.py @@ -50,48 +50,54 @@ from DateDisplay import DateDisplay class DateParserFR(DateParser): modifier_to_int = { - 'avant' : Date.MOD_BEFORE, - 'av.' : Date.MOD_BEFORE, - 'av' : Date.MOD_BEFORE, - 'après' : Date.MOD_AFTER, - 'ap.' : Date.MOD_AFTER, - 'ap' : Date.MOD_AFTER, - 'env.' : Date.MOD_ABOUT, - 'env' : Date.MOD_ABOUT, - 'circa' : Date.MOD_ABOUT, - 'c.' : Date.MOD_ABOUT, - 'vers' : Date.MOD_ABOUT, + u'avant' : Date.MOD_BEFORE, + u'av.' : Date.MOD_BEFORE, + u'av' : Date.MOD_BEFORE, + u'après' : Date.MOD_AFTER, + u'ap.' : Date.MOD_AFTER, + u'ap' : Date.MOD_AFTER, + u'env.' : Date.MOD_ABOUT, + u'env' : Date.MOD_ABOUT, + u'circa' : Date.MOD_ABOUT, + u'c.' : Date.MOD_ABOUT, + u'vers' : Date.MOD_ABOUT, } calendar_to_int = { - 'grégorien' : Date.CAL_GREGORIAN, - 'g' : Date.CAL_GREGORIAN, - 'julien' : Date.CAL_JULIAN, - 'j' : Date.CAL_JULIAN, - 'hébreu' : Date.CAL_HEBREW, - 'h' : Date.CAL_HEBREW, - 'islamique' : Date.CAL_ISLAMIC, - 'i' : Date.CAL_ISLAMIC, - 'révolutionnaire': Date.CAL_FRENCH, - 'r' : Date.CAL_FRENCH, - 'perse' : Date.CAL_PERSIAN, - 'p' : Date.CAL_PERSIAN, + u'grégorien' : Date.CAL_GREGORIAN, + u'g' : Date.CAL_GREGORIAN, + u'julien' : Date.CAL_JULIAN, + u'j' : Date.CAL_JULIAN, + u'hébreu' : Date.CAL_HEBREW, + u'h' : Date.CAL_HEBREW, + u'islamique' : Date.CAL_ISLAMIC, + u'i' : Date.CAL_ISLAMIC, + u'révolutionnaire': Date.CAL_FRENCH, + u'r' : Date.CAL_FRENCH, + u'perse' : Date.CAL_PERSIAN, + u'p' : Date.CAL_PERSIAN, } quality_to_int = { - 'estimated' : Date.QUAL_ESTIMATED, - 'est.' : Date.QUAL_ESTIMATED, - 'est' : Date.QUAL_ESTIMATED, - 'calc.' : Date.QUAL_CALCULATED, - 'calc' : Date.QUAL_CALCULATED, - 'calculated' : Date.QUAL_CALCULATED, + u'estimated' : Date.QUAL_ESTIMATED, + u'est.' : Date.QUAL_ESTIMATED, + u'est' : Date.QUAL_ESTIMATED, + u'calc.' : Date.QUAL_CALCULATED, + u'calc' : Date.QUAL_CALCULATED, + u'calculated' : Date.QUAL_CALCULATED, } def init_strings(self): DateParser.init_strings(self) - self._span = re.compile("(de)\s+(.+)\s+(à)\s+(.+)", + _span_1 = [u'de'] + _span_2 = [u'à'] + _range_1 = [u'ent.',u'ent',u'entre'] + _range_2 = [u'et'] + self._span = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" % + ('|'.join(_span_1),'|'.join(_span_2)), re.IGNORECASE) - self._range = re.compile("(ent.|ent|entre)\s+(.+)\s+(et)\s+(.+)", + self._range = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" % + ('|'.join(_range_1),'|'.join(_range_2)), re.IGNORECASE) #------------------------------------------------------------------------- @@ -102,11 +108,11 @@ class DateParserFR(DateParser): class DateDisplayFR(DateDisplay): calendar = ( - "", " (Julien)", " (Hébreu)", - " (Révolutionnaire)", " (Perse)", " (Islamique)" + "", u" (Julien)", u" (Hébreu)", + u" (Révolutionnaire)", u" (Perse)", u" (Islamique)" ) - _mod_str = ("","avant ","après ","vers ","","","") + _mod_str = ("",u"avant ",u"après ",u"vers ","","","") def display(self,date): """ @@ -126,11 +132,11 @@ class DateDisplayFR(DateDisplay): elif mod == Date.MOD_SPAN: d1 = self.display_cal[cal](start) d2 = self.display_cal[cal](date.get_stop_date()) - return "%sde %s à %s%s" % (qual_str,d1,d2,self.calendar[cal]) + return "%s%s %s %s %s%s" % (qual_str,u'de',d1,u'à',d2,self.calendar[cal]) elif mod == Date.MOD_RANGE: d1 = self.display_cal[cal](start) d2 = self.display_cal[cal](date.get_stop_date()) - return "%sentre %s et %s%s" % (qual_str,d1,d2,self.calendar[cal]) + return "%s%s %s %s %s%s" % (qual_str,u'entre',d1,u'et',d2,self.calendar[cal]) else: text = self.display_cal[date.get_calendar()](start) return "%s%s%s%s" % (qual_str,self._mod_str[mod],text,self.calendar[cal]) diff --git a/src/dates/Date_ru.py b/src/dates/Date_ru.py index 574ff0b60..6e340cae9 100644 --- a/src/dates/Date_ru.py +++ b/src/dates/Date_ru.py @@ -50,58 +50,64 @@ from DateDisplay import DateDisplay class DateParserRU(DateParser): modifier_to_int = { - 'до' : Date.MOD_BEFORE, - 'по' : Date.MOD_BEFORE, - 'после' : Date.MOD_AFTER, - 'п.' : Date.MOD_AFTER, - 'п' : Date.MOD_AFTER, - 'с' : Date.MOD_AFTER, - 'ок' : Date.MOD_ABOUT, - 'ок.' : Date.MOD_ABOUT, - 'около' : Date.MOD_ABOUT, - 'примерно' : Date.MOD_ABOUT, - 'прим' : Date.MOD_ABOUT, - 'прим.' : Date.MOD_ABOUT, - 'приблизительно' : Date.MOD_ABOUT, - 'приб.' : Date.MOD_ABOUT, - 'прибл.' : Date.MOD_ABOUT, - 'приб' : Date.MOD_ABOUT, - 'прибл' : Date.MOD_ABOUT, + u'до' : Date.MOD_BEFORE, + u'по' : Date.MOD_BEFORE, + u'после' : Date.MOD_AFTER, + u'п.' : Date.MOD_AFTER, + u'п' : Date.MOD_AFTER, + u'с' : Date.MOD_AFTER, + u'ок' : Date.MOD_ABOUT, + u'ок.' : Date.MOD_ABOUT, + u'около' : Date.MOD_ABOUT, + u'примерно' : Date.MOD_ABOUT, + u'прим' : Date.MOD_ABOUT, + u'прим.' : Date.MOD_ABOUT, + u'приблизительно' : Date.MOD_ABOUT, + u'приб.' : Date.MOD_ABOUT, + u'прибл.' : Date.MOD_ABOUT, + u'приб' : Date.MOD_ABOUT, + u'прибл' : Date.MOD_ABOUT, } calendar_to_int = { - 'григорианский' : Date.CAL_GREGORIAN, - 'г' : Date.CAL_GREGORIAN, - 'юлианский' : Date.CAL_JULIAN, - 'ю' : Date.CAL_JULIAN, - 'еврейский' : Date.CAL_HEBREW, - 'е' : Date.CAL_HEBREW, - 'исламский' : Date.CAL_ISLAMIC, - 'и' : Date.CAL_ISLAMIC, - 'республиканский': Date.CAL_FRENCH, - 'р' : Date.CAL_FRENCH, - 'персидский' : Date.CAL_PERSIAN, - 'п' : Date.CAL_PERSIAN, + u'григорианский' : Date.CAL_GREGORIAN, + u'г' : Date.CAL_GREGORIAN, + u'юлианский' : Date.CAL_JULIAN, + u'ю' : Date.CAL_JULIAN, + u'еврейский' : Date.CAL_HEBREW, + u'е' : Date.CAL_HEBREW, + u'исламский' : Date.CAL_ISLAMIC, + u'и' : Date.CAL_ISLAMIC, + u'республиканский': Date.CAL_FRENCH, + u'р' : Date.CAL_FRENCH, + u'персидский' : Date.CAL_PERSIAN, + u'п' : Date.CAL_PERSIAN, } quality_to_int = { - 'оценено' : Date.QUAL_ESTIMATED, - 'оцен.' : Date.QUAL_ESTIMATED, - 'оц.' : Date.QUAL_ESTIMATED, - 'оцен' : Date.QUAL_ESTIMATED, - 'оц' : Date.QUAL_ESTIMATED, - 'вычислено' : Date.QUAL_CALCULATED, - 'вычисл.' : Date.QUAL_CALCULATED, - 'выч.' : Date.QUAL_CALCULATED, - 'вычисл' : Date.QUAL_CALCULATED, - 'выч' : Date.QUAL_CALCULATED, + u'оценено' : Date.QUAL_ESTIMATED, + u'оцен.' : Date.QUAL_ESTIMATED, + u'оц.' : Date.QUAL_ESTIMATED, + u'оцен' : Date.QUAL_ESTIMATED, + u'оц' : Date.QUAL_ESTIMATED, + u'вычислено' : Date.QUAL_CALCULATED, + u'вычисл.' : Date.QUAL_CALCULATED, + u'выч.' : Date.QUAL_CALCULATED, + u'вычисл' : Date.QUAL_CALCULATED, + u'выч' : Date.QUAL_CALCULATED, } def init_strings(self): DateParser.init_strings(self) - self._span = re.compile("(с|от)\s+(.+)\s+(по|до)\s+(.+)", + _span_1 = [u'с',u'от'] + _span_2 = [u'по',u'до'] + _range_1 = [u'между',u'меж',u'меж.'] + _range_2 = [u'и'] + self._span = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" % + ('|'.join(_span_1),'|'.join(_span_2)), re.IGNORECASE) - self._range = re.compile("(между|меж|меж.)\s+(.+)\s+(и)\s+(.+)", + self._range = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" % + ('|'.join(_range_1),'|'.join(_range_2)), re.IGNORECASE) #------------------------------------------------------------------------- @@ -112,16 +118,16 @@ class DateParserRU(DateParser): class DateDisplayRU(DateDisplay): calendar = ( - "", " (юлианский)", - " (еврейский)", - " (республиканский)", - " (персидский)", - " (исламский)" + "", u" (юлианский)", + u" (еврейский)", + u" (республиканский)", + u" (персидский)", + u" (исламский)" ) - _mod_str = ("","до ", - "после ", - "около ","","","") + _mod_str = ("",u"до ", + u"после ", + u"около ","","","") def display(self,date): """ @@ -141,11 +147,11 @@ class DateDisplayRU(DateDisplay): elif mod == Date.MOD_SPAN: d1 = self.display_cal[cal](start) d2 = self.display_cal[cal](date.get_stop_date()) - return "%sс %s по %s%s" % (qual_str,d1,d2,self.calendar[cal]) + return "%sс %s %s %s%s" % (qual_str,d1,u'по',d2,self.calendar[cal]) elif mod == Date.MOD_RANGE: d1 = self.display_cal[cal](start) d2 = self.display_cal[cal](date.get_stop_date()) - return "%sмежду %s и %s%s" % (qual_str,d1,d2,self.calendar[cal]) + return "%s%s %s %s %s%s" % (qual_str,u'между',d1,u'и',d2,self.calendar[cal]) else: text = self.display_cal[date.get_calendar()](start) return "%s%s%s%s" % (qual_str,self._mod_str[mod],text,self.calendar[cal])