#! /usr/bin/env python
#
# check_po - a gramps tool to check validity of po files
#
# Copyright (C) 2006-2006  Kees Bakker
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# $Id:$

# NOTE: all output goes through print() with exactly one string argument
# so that the script behaves identically on Python 2 and Python 3.

import sys
import re

all_total = {}
all_fuzzy = {}
all_untranslated = {}
all_percent_s = {}
all_named_s = {}
all_bnamed_s = {}
all_context = {}
all_coverage = {}
all_template_coverage = {}

# States of the line-oriented parser in read_msgs().
_NONE = 0      # Nothing detected, yet
_CMNT = 1      # Inside comment part
_MSGID = 2     # Inside msgid part
_MSGSTR = 3    # Inside msgstr part
_STR = 4       # A continuation string
_OLD = 5       # An old pattern with #~

_EMPTY_PAT = re.compile(r'^ \s* $', re.VERBOSE)

# Line classifiers, tried in this order.  '#~' must be tested before the
# generic '#' comment pattern, which would match it as well.
_LINE_KINDS = (
    (re.compile(r'\#~ \s+ ', re.VERBOSE), _OLD),
    (re.compile(r'\#', re.VERBOSE), _CMNT),
    (re.compile(r'msgid \s+ "', re.VERBOSE), _MSGID),
    (re.compile(r'msgstr \s+ "', re.VERBOSE), _MSGSTR),
    (re.compile(r'"', re.VERBOSE), _STR),
)

# A pattern to find all %()
_NAMED_FMT_PAT = re.compile(r'% \( \w+ \) \d* \D', re.VERBOSE)

# A pattern to find %() without s or d.  findall() returns only the
# captured character that follows the name (or '' at end of string).
_NAMED_FMT_CONV_PAT = re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE)

# Special XML characters
# It is not allowed to have a quote, an ampersand or an angle bracket
_XML_CHARS_PAT = re.compile(
    r'(?<=\W) > | " | & (?!(quot|nbsp|gt|amp);)', re.VERBOSE)


def strip_quotes(st):
    """Return *st* stripped of whitespace and of one pair of double quotes.

    The quotes are removed only when the stripped text both starts and
    ends with '"' and is at least two characters long.
    """
    st = st.strip()
    if len(st) >= 2 and st[0] == '"' and st[-1] == '"':
        st = st[1:-1]
    return st


class Msgid:
    """One entry of a po file: its comments, msgid lines and msgstr lines.

    The raw lines are stored as read; msgid() and msgstr() return the
    concatenated text.  The has_* attributes are error flags that
    analyze_msgs() sets through the set_* methods.
    """

    # Only '#,' flag lines mark an entry as fuzzy.  Matching the bare word
    # in any comment would also trigger on source references such as
    # '#: src/fuzzy.py:3' or on translator remarks.
    fuzzy_pat = re.compile(r'^#,.*\bfuzzy\b')
    tips_xml_pat = re.compile(r'tips\.xml')

    # The keyword is removed only at the start of a line; an unanchored
    # substitution would also delete the word from the message text.
    _msgid_kw_pat = re.compile(r'^\s*msgid\s+')
    _msgstr_kw_pat = re.compile(r'^\s*msgstr\s+')

    def __init__(self, msgnr, lineno):
        self._msgid = []
        self._msgstr = []
        self._cmnt = []
        self.nr = msgnr
        self.lineno = lineno
        self.is_fuzzy = 0
        self.has_sfmt_mismatch = 0
        self.has_named_sfmt_mismatch = 0
        self.has_fmt_missing_sd = 0
        self.has_context_error = 0
        self.has_named_fmt_mismatch = 0
        self.has_xml_error = 0
        self.has_lastchar_error = 0

    def diag(self):
        """Print the message number, line number and the raw lines."""
        print("")
        print("msg nr: %d, lineno: %d%s"
              % (self.nr, self.lineno, " (fuzzy)" if self.is_fuzzy else ""))
        sys.stdout.write(''.join(self._msgid))
        sys.stdout.write(''.join(self._msgstr))

    @staticmethod
    def _joined(lines, keyword_pat):
        """Concatenate the unquoted text of *lines*; None if there are none."""
        if not lines:
            return None
        return ''.join(strip_quotes(keyword_pat.sub('', l)) for l in lines)

    def msgid(self):
        """Return the msgid text, or None when no msgid line was added."""
        return self._joined(self._msgid, self._msgid_kw_pat)

    def add_msgid(self, line):
        self._msgid.append(line)

    def msgstr(self):
        """Return the msgstr text, or None when no msgstr line was added."""
        return self._joined(self._msgstr, self._msgstr_kw_pat)

    def add_msgstr(self, line):
        self._msgstr.append(line)

    def add_cmnt(self, line):
        """Store a comment line and note whether it flags the entry fuzzy."""
        self._cmnt.append(line)
        if not self.is_fuzzy and self.fuzzy_pat.search(line):
            self.is_fuzzy = 1

    def is_tips_xml(self):
        """Return 1 when any comment line mentions tips.xml, else 0."""
        for c in self._cmnt:
            if self.tips_xml_pat.search(c):
                return 1
        return 0

    def set_sfmt_mismatch(self):
        self.has_sfmt_mismatch = 1

    def set_named_fmt_mismatch(self):
        self.has_named_fmt_mismatch = 1

    def set_fmt_missing_sd(self):
        self.has_fmt_missing_sd = 1

    def set_context_error(self):
        self.has_context_error = 1

    def set_xml_error(self):
        self.has_xml_error = 1

    def set_lastchar_error(self):
        self.has_lastchar_error = 1


def _classify(line):
    """Return the parser state that a non-empty *line* introduces."""
    for pat, kind in _LINE_KINDS:
        if pat.match(line):
            return kind
    return _NONE


def _new_msg(msgs, lineno):
    """Create a Msgid starting at *lineno*, append it to *msgs*, return it."""
    # The numbering historically pre-increments a counter that starts at 1,
    # so the first message of a file is number 2.  Kept for compatibility
    # with existing output.
    msg = Msgid(len(msgs) + 2, lineno)
    msgs.append(msg)
    return msg


def read_msgs(fname):
    """Parse the po/pot file *fname* and return a list of Msgid objects.

    The file is processed line by line with a small state machine.
    Obsolete entries ('#~ ...') are skipped, and entries that end up with
    neither msgid nor msgstr text are dropped from the result.

    Recoverable oddities (a msgstr or string without a preceding msgid, a
    comment inside a msgid) are reported as WARNING lines on stdout.
    Raises Exception for a msgid directly following a msgid, or a msgstr
    directly following a msgstr.
    """
    state = _NONE
    msg = None
    msgs = []

    def where(lineno):
        return '%s:%d' % (fname, lineno)

    with open(fname) as f:
        # Use line numbers for messages
        for lineno, line in enumerate(f, 1):
            if _EMPTY_PAT.match(line):
                continue        # Empty lines are not interesting

            next_state = _classify(line)

            if state == _NONE:
                # expect msgid or comment or old stuff
                if next_state == _CMNT:
                    state = _CMNT
                    msg = _new_msg(msgs, lineno)
                    msg.add_cmnt(line)
                elif next_state == _MSGID:
                    state = _MSGID
                    msg = _new_msg(msgs, lineno)
                    msg.add_msgid(line)
                elif next_state == _MSGSTR:
                    print('WARNING: Wild msgstr at ' + where(lineno))
                    state = _MSGSTR
                    msg = _new_msg(msgs, lineno)
                    msg.add_msgstr(line)
                elif next_state == _STR:
                    print('WARNING: Wild string at ' + where(lineno))
                # _OLD and unrecognised lines: just skip

            elif state == _CMNT:
                if next_state == _CMNT:
                    # Comments following an obsolete entry have no message
                    # to attach to and are skipped for now.
                    if msg is not None:
                        msg.add_cmnt(line)
                elif next_state == _MSGID:
                    state = _MSGID
                    if msg is None:
                        msg = _new_msg(msgs, lineno)
                    msg.add_msgid(line)
                elif next_state == _MSGSTR:
                    print('WARNING: Wild msgstr at ' + where(lineno))
                    state = _MSGSTR
                    msg = _new_msg(msgs, lineno)
                    msg.add_msgstr(line)
                elif next_state == _STR:
                    print('WARNING: Wild string at ' + where(lineno))
                elif next_state == _OLD:
                    msg = None

            elif state == _MSGID:
                if next_state == _CMNT:
                    # Hmmm. A comment here?
                    print('WARNING: Unexpected comment at ' + where(lineno))
                elif next_state == _MSGID:
                    raise Exception('Unexpected msgid at ' + where(lineno))
                elif next_state == _MSGSTR:
                    state = _MSGSTR
                    if msg is None:
                        # The msgid was discarded by an obsolete line.
                        print('WARNING: Wild msgstr at ' + where(lineno))
                        msg = _new_msg(msgs, lineno)
                    msg.add_msgstr(line)
                elif next_state == _STR:
                    if msg is None:
                        print('WARNING: Wild string at ' + where(lineno))
                    else:
                        msg.add_msgid(line)
                elif next_state == _OLD:
                    msg = None

            elif state == _MSGSTR:
                if next_state == _CMNT:
                    # A comment probably starts a new item
                    state = _CMNT
                    msg = _new_msg(msgs, lineno)
                    msg.add_cmnt(line)
                elif next_state == _MSGID:
                    state = _MSGID
                    msg = _new_msg(msgs, lineno)
                    msg.add_msgid(line)
                elif next_state == _MSGSTR:
                    raise Exception('Unexpected msgstr at ' + where(lineno))
                elif next_state == _STR:
                    if msg is None:
                        print('WARNING: Wild string at ' + where(lineno))
                    else:
                        msg.add_msgstr(line)
                elif next_state == _OLD:
                    msg = None

            else:
                raise Exception(
                    'Unexpected state in po parsing (state = %d)' % state)

    # Strip items with just comments. (Can this happen?)
    return [m for m in msgs if m.msgid() or m.msgstr()]


def _print_flagged(title, msgs, flag_name):
    """Print a section header and diag() of every message with the flag."""
    print("")
    print(title)
    for m in msgs:
        if getattr(m, flag_name):
            m.diag()


def analyze_msgs(fname, msgs, nr_templates=None, nth=0):
    """Check the messages of one po file and print a report on stdout.

    fname         name printed in the report
    msgs          list of Msgid objects, e.g. from read_msgs(); the error
                  flags of the objects are set as a side effect
    nr_templates  number of messages in the template; when None the
                  number of messages in *msgs* is used
    nth           zero-based index of the file in this run; a separator
                  is printed before every file except the first

    Untranslated messages (empty or missing msgstr) are counted and not
    checked any further.
    """
    nr_fuzzy = 0
    nr_untranslated = 0
    nr_sfmt_mismatches = 0
    nr_named_fmt_mismatches = 0
    nr_fmt_missing_sd = 0
    nr_context_errors = 0
    nr_xml_errors = 0
    nr_lastchar_errors = 0

    for msg in msgs:
        # A wild msgstr has no msgid at all; compare against empty text
        # rather than failing on None.
        msgid = msg.msgid() or ''
        msgstr = msg.msgstr()

        if not msgstr:
            nr_untranslated += 1
            continue

        if msg.is_fuzzy:
            nr_fuzzy += 1
            # Skip fuzzies or not?
            # For now they are checked like any other message.

        if msgid.count('%s') != msgstr.count('%s'):
            nr_sfmt_mismatches += 1
            msg.set_sfmt_mismatch()

        # Same number of named formats?
        fmts1 = _NAMED_FMT_PAT.findall(msgid)
        fmts2 = _NAMED_FMT_PAT.findall(msgstr)
        if len(fmts1) != len(fmts2):
            # Count each message only once as a %s mismatch
            if not msg.has_sfmt_mismatch:
                nr_sfmt_mismatches += 1
                msg.set_sfmt_mismatch()

        # Do we have the same named formats?
        fmts1.sort()
        fmts2.sort()
        if fmts1 != fmts2:
            nr_named_fmt_mismatches += 1
            msg.set_named_fmt_mismatch()

        # Any formats missing format letter?
        for f in _NAMED_FMT_CONV_PAT.findall(msgstr):
            if f not in ('s', 'd'):
                nr_fmt_missing_sd += 1
                msg.set_fmt_missing_sd()
                break

        # Runaway context. In the translated part we only want to see
        # the translation of the word after the |
        if (msgid.count('|') > 0 and msgstr.count('|') > 0
                and msgid != msgstr):
            nr_context_errors += 1
            msg.set_context_error()

        # XML errors
        # Only look at messages in the tips.xml
        if msg.is_tips_xml() and _XML_CHARS_PAT.search(msgstr):
            nr_xml_errors += 1
            msg.set_xml_error()

        # Last character of msgid? White space? Period?
        if not msg.is_fuzzy and (
                msgid[-1:].isspace() != msgstr[-1:].isspace()
                or (msgid[-1:] == '.') != (msgstr[-1:] == '.')):
            nr_lastchar_errors += 1
            msg.set_lastchar_error()

    nr_msgs = len(msgs)
    if nr_templates is None:
        nr_templates = nr_msgs

    # Coverage is reported as 0% when there is nothing to divide by.
    if nr_msgs:
        po_coverage = (1.0 - float(nr_untranslated) / float(nr_msgs)) * 100
    else:
        po_coverage = 0.0
    if nr_templates:
        template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
    else:
        template_coverage = 0.0

    if nth > 0:
        print("")
        print("=====================================")
    print("%-20s%s" % ("File:", fname))
    print("%-20s%d" % ("Template total:", nr_templates))
    print("%-20s%d" % ("PO total:", nr_msgs))
    print("%-20s%d" % ("Fuzzy:", nr_fuzzy))
    print("%-20s%d" % ("Untranslated:", nr_untranslated))
    print("%-20s%d" % ("%s mismatches:", nr_sfmt_mismatches))
    print("%-20s%d" % ("%() name mismatches:", nr_named_fmt_mismatches))
    print("%-20s%d" % ("%() missing s/d:", nr_fmt_missing_sd))
    print("%-20s%d" % ("Runaway context:", nr_context_errors))
    print("%-20s%d" % ("XML special chars:", nr_xml_errors))
    print("%-20s%d" % ("Last character:", nr_lastchar_errors))
    print("%-20s%5.2f%%" % ("PO Coverage:", po_coverage))
    print("%-20s%5.2f%%" % ("Template Coverage:", template_coverage))

    sections = (
        (nr_sfmt_mismatches,
         "-------- %s mismatches --------------",
         'has_sfmt_mismatch'),
        (nr_named_fmt_mismatches,
         "-------- %() name mismatches --------------",
         'has_named_fmt_mismatch'),
        (nr_fmt_missing_sd,
         "-------- %() without 's' or 'd' mismatches --------------",
         'has_fmt_missing_sd'),
        (nr_context_errors,
         "-------- Runaway context in translation ---------",
         'has_context_error'),
        (nr_xml_errors,
         "-------- unescaped XML special characters ---------",
         'has_xml_error'),
        (nr_lastchar_errors,
         "-------- last character not identical ---------",
         'has_lastchar_error'),
    )
    for count, title, flag_name in sections:
        if count:
            _print_flagged(title, msgs, flag_name)


def main():
    """Check every po file named on the command line against gramps.pot.

    gramps.pot is read from the current directory.  Any exception is
    printed on stdout instead of producing a traceback.
    """
    try:
        pot_msgs = read_msgs('gramps.pot')
        nr_templates = len(pot_msgs)
        for nth, fname in enumerate(sys.argv[1:]):
            msgs = read_msgs(fname)
            analyze_msgs(fname, msgs, nr_templates, nth)
    except Exception as e:
        print(e)


if __name__ == "__main__":
    main()