#! /usr/bin/env python
#
# check_po - a gramps tool to check validity of po files
#
# Copyright (C) 2006-2006  Kees Bakker
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

#
# TODO
#
# * Check for HTML text in msgstr when there is none in msgid
# * Check for matching HTML tag/endtag in msgstr
#

import sys
import re
from optparse import OptionParser

# Module-level statistic tables.  Nothing in the code visible here reads or
# fills them; presumably they are kept for external users -- TODO confirm.

# Coverage figures
all_coverage = {}
all_template_coverage = {}

# Message counts
all_total = {}
all_fuzzy = {}
all_untranslated = {}

# Per-check results
all_percent_s = {}
all_named_s = {}
all_bnamed_s = {}
all_context = {}

def strip_quotes(st):
	"""Return st without its enclosing double quotes.

	The quotes are removed only when st is at least two characters long and
	both starts and ends with a double quote; anything else is returned
	unchanged.
	"""
	quoted = len(st) >= 2 and st.startswith('"') and st.endswith('"')
	if not quoted:
		return st
	# Both ends are quote characters, so there is no outer whitespace to trim
	return st[1:-1]

class CheckException(Exception):
	"""Error raised when a PO file cannot be parsed any further."""

# This is a base class for all checks
class Check:
	"""Common part of every check: collects and reports offending messages.

	A subclass is expected to set the attributes diag_header and
	summary_text and to provide process( msg ), which appends each
	offending message to self.msgs.
	"""
	def __init__( self ):
		# The messages that failed this check
		self.msgs = []

	def diag( self ):
		"""Print the check header and the diagnostics of every collected message.

		Nothing is printed when no message was collected.
		"""
		if not self.msgs:
			return
		# A single-argument print( ... ) behaves the same as a statement
		# (Python 2) and as a function call (Python 3).
		print( "" )
		print( self.diag_header )
		for m in self.msgs:
			m.diag()

	def summary( self ):
		"""Print one line: the summary label and the number of collected messages."""
		print( "%-20s%d" % ( self.summary_text, len(self.msgs) ) )

class Check_fmt( Check ):
	"""Check that a plain format (such as '%s') occurs equally often in
	the original text and in its translation.

	The singular msgid is compared with msgstr[0]; when the message has a
	msgid_plural and at least two msgstr entries, msgid_plural is compared
	with msgstr[1] as well.
	"""
	def __init__( self, fmt ):
		Check.__init__( self )
		self.diag_header = "-------- %s mismatches --------------" % fmt
		self.summary_text = "%s mismatches:" % fmt
		self.fmt = fmt

	def __mismatch( self, msgid, msgstr ):
		# True when the format does not occur the same number of times
		return msgid.count( self.fmt ) != msgstr.count( self.fmt )

	def process( self, msg ):
		"""Record msg when its singular or its plural form has a mismatch.

		A message is recorded at most once, so that the summary counts
		messages and the diagnostic is not printed twice.
		"""
		if not msg.msgstr:
			return	# No translation at all, nothing to compare

		bad = self.__mismatch( msg.msgid, msg.msgstr[0] )
		if not bad and msg.msgidp and len(msg.msgstr) >= 2:
			bad = self.__mismatch( msg.msgidp, msg.msgstr[1] )
		if bad:
			self.msgs.append( msg )

class Check_named_fmt( Check ):
	"""Check that the translation uses exactly the same named formats,
	such as %(name)s, as the original text.

	The singular msgid is compared with msgstr[0]; when the message has a
	msgid_plural and at least two msgstr entries, msgid_plural is compared
	with msgstr[1] as well.
	"""
	# A pattern to find all %(name) formats, up to and including the first
	# non-digit character that follows the closing parenthesis.
	# (Raw string: the backslashes are meant for the regular expression.)
	find_named_fmt_pat = re.compile( r'% \( \w+ \) \d* \D', re.VERBOSE )

	def __init__( self ):
		Check.__init__( self )
		self.diag_header = "-------- %() name mismatches --------------"
		self.summary_text = "%() name mismatches:"

	def __mismatch( self, msgid, msgstr ):
		# Compare the sorted lists of formats; this covers a different
		# number of formats as well as different names.
		fmts1 = sorted( self.find_named_fmt_pat.findall( msgid ) )
		fmts2 = sorted( self.find_named_fmt_pat.findall( msgstr ) )
		return fmts1 != fmts2

	def process( self, msg ):
		"""Record msg when its singular or its plural form has a mismatch.

		A message is recorded at most once, so that the summary counts
		messages and the diagnostic is not printed twice.
		"""
		if not msg.msgstr:
			return	# No translation at all, nothing to compare

		bad = self.__mismatch( msg.msgid, msg.msgstr[0] )
		if not bad and msg.msgidp and len(msg.msgstr) >= 2:
			bad = self.__mismatch( msg.msgidp, msg.msgstr[1] )
		if bad:
			self.msgs.append( msg )

class Check_missing_sd( Check ):
	"""Check the translations for a named format, %(name), that is not
	followed by 's' or 'd' (after an optional number).
	"""
	# A pattern to find %() without s or d.  The group captures the character
	# after the optional digits, or the empty string at the end of the text.
	# Here is a command to use for testing
	# print( re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE).findall( '%(event_name)s: %(place)s%(endnotes)s. ' ) )
	find_named_fmt_pat2 = re.compile( r'% \( \w+ \) \d* (\D|$)', re.VERBOSE )

	def __init__( self ):
		Check.__init__( self )
		self.diag_header = "-------- %() without 's' or 'd' mismatches --------------"
		self.summary_text = "%() missing s/d:"

	def process( self, msg ):
		"""Record msg when any of its msgstr entries has an offending format.

		The message is recorded once, however many of its (plural)
		translations are affected.
		"""
		for msgstr in msg.msgstr:
			for conv in self.find_named_fmt_pat2.findall( msgstr ):
				if conv not in ( 's', 'd' ):
					self.msgs.append( msg )
					return	# One report per message is enough

class Check_runaway( Check ):
	"""Check for a context prefix ('context|text') that was carried over
	into the translation.

	The singular msgid is compared with msgstr[0]; when the message has a
	msgid_plural and at least two msgstr entries, msgid_plural is compared
	with msgstr[1] as well.
	"""
	def __init__( self ):
		Check.__init__( self )
		self.diag_header = "-------- Runaway context in translation ---------"
		self.summary_text = "Runaway context:"

	def __runaway( self, msgid, msgstr ):
		# Runaway context. In the translated part we only want to see
		# the translation of the word after the |
		return '|' in msgid and '|' in msgstr and msgid != msgstr

	def process( self, msg ):
		"""Record msg when its singular or its plural form has a runaway context.

		A message is recorded at most once, so that the summary counts
		messages and the diagnostic is not printed twice.
		"""
		if not msg.msgstr:
			return	# No translation at all, nothing to compare

		bad = self.__runaway( msg.msgid, msg.msgstr[0] )
		if not bad and msg.msgidp and len(msg.msgstr) >= 2:
			bad = self.__runaway( msg.msgidp, msg.msgstr[1] )
		if bad:
			self.msgs.append( msg )

class Check_xml_chars( Check ):
	"""Check the translation of tips.xml messages for unescaped XML
	special characters.

	Only msgstr[0] of messages flagged with is_tips_xml is looked at.
	"""
	# Special XML characters
	# It is not allowed to have a quote, an ampersand or an angle bracket,
	# except for the tags <b> </b> <i> </i> <br/> and the entities
	# &quot; &nbsp; &gt; &lt; &amp; &apos;
	# The second alternative is a negative look-behind; all of its
	# alternatives must stay two characters wide.
	xml_chars_pat = re.compile( r'''
		  < (?! (?: b> | /b> | i> | /i> | br/> ) )
		| (?<! (?: <b | /b | <i | /i | r/ ) ) >
		| "
		| & (?! (?: quot | nbsp | gt | lt | amp | apos ) ; )
		''', re.VERBOSE )

	def __init__( self ):
		Check.__init__( self )
		self.diag_header = "-------- unescaped XML special characters ---------"
		self.summary_text = "XML special chars:"

	def process( self, msg ):
		"""Record msg when its translation holds an unescaped special character."""
		# XML errors
		# Only look at messages in the tips.xml
		if not msg.is_tips_xml or not msg.msgstr:
			return
		if self.xml_chars_pat.search( msg.msgstr[0] ):
			self.msgs.append( msg )

class Check_last_char( Check ):
	"""Check that the original text and its translation agree on ending
	with white space and on ending with a period.

	Fuzzy messages are skipped.  The singular msgid is compared with
	msgstr[0]; when the message has a msgid_plural and at least two msgstr
	entries, msgid_plural is compared with msgstr[1] as well.
	"""
	def __init__( self ):
		Check.__init__( self )
		self.diag_header = "-------- last character not identical ---------"
		self.summary_text = "Last character:"

	def __mismatch( self, msgid, msgstr ):
		# The slices yield an empty string for an empty text
		msgid_last = msgid[-1:]
		msgstr_last = msgstr[-1:]
		if msgid_last.isspace() != msgstr_last.isspace():
			return True
		return ( msgid_last == '.' ) != ( msgstr_last == '.' )

	def process( self, msg ):
		"""Record msg when its singular or its plural form ends differently.

		A message is recorded at most once, so that the summary counts
		messages and the diagnostic is not printed twice.
		"""
		# Last character of msgid? White space? Period?
		if msg.is_fuzzy:
			return
		if not msg.msgstr:
			return	# No translation at all, nothing to compare

		bad = self.__mismatch( msg.msgid, msg.msgstr[0] )
		if not bad and msg.msgidp and len(msg.msgstr) >= 2:
			bad = self.__mismatch( msg.msgidp, msg.msgstr[1] )
		if bad:
			self.msgs.append( msg )

class Check_shortcut_trans( Check ):
	"""Check for an underscore (shortcut key marker) in the translation
	when the original text has none.

	The singular msgid is compared with msgstr[0]; when the message has a
	msgid_plural and at least two msgstr entries, msgid_plural is compared
	with msgstr[1] as well.
	"""
	def __init__( self ):
		Check.__init__( self )
		self.diag_header = "-------- shortcut key in translation ---------"
		self.summary_text = "Shortcut in msgstr:"

	def __added_shortcut( self, msgid, msgstr ):
		# True when only the translation contains an underscore
		return '_' not in msgid and '_' in msgstr

	def process( self, msg ):
		"""Record msg when its singular or its plural translation adds a shortcut.

		A message is recorded at most once, so that the summary counts
		messages and the diagnostic is not printed twice.
		"""
		if not msg.msgstr:
			return	# No translation at all, nothing to compare

		bad = self.__added_shortcut( msg.msgid, msg.msgstr[0] )
		if not bad and msg.msgidp and len(msg.msgstr) >= 2:
			bad = self.__added_shortcut( msg.msgidp, msg.msgstr[1] )
		if bad:
			self.msgs.append( msg )

class Msgid:
	"""One entry of a PO file: its comments, msgid, msgid_plural and msgstr(s).

	The add_* methods are fed the raw lines of the entry one at a time.
	The text between the quotes is accumulated in msgid, msgidp and msgstr
	(a list, one item per plural form); the raw lines are kept for diag().
	"""
	# The fuzzy marker is a flag, so only look for it on a '#,' line.  The
	# word may well occur in other comments, e.g. in a source file name.
	fuzzy_pat = re.compile( r'^#,.*\bfuzzy\b' )
	tips_xml_pat = re.compile( r'tips\.xml' )
	# The keyword that starts a line.  Anchored at the start of the line, so
	# that the same word inside the quoted text is left alone.
	_msgid_kw_pat = re.compile( r'^\s*msgid\s+' )
	_msgidp_kw_pat = re.compile( r'^\s*msgid_plural\s+' )
	_msgstr_kw_pat = re.compile( r'^\s*msgstr(\[\d+\])?\s+' )

	def __init__( self, msgnr, lineno ):
		self._msgid = []	# For debugging purpose the original text
		self._msgidp = []	# For debugging purpose the original text
		self._msgstr = []	# For debugging purpose the original text
		self.msgid = ''
		self.msgidp = ''
		self.msgstr = []	# This is a list to support plural
		self._cmnt = []
		self.nr = msgnr
		self.lineno = lineno
		self.is_fuzzy = False
		self.is_tips_xml = False

	def diag( self ):
		"""Print the message number and line number, then the raw lines of the entry."""
		# A single-argument print( ... ) behaves the same as a statement
		# (Python 2) and as a function call (Python 3).
		print( "" )
		fuzzy = " (fuzzy)" if self.is_fuzzy else ""
		print( "msg nr: %d, lineno: %d%s" % ( self.nr, self.lineno, fuzzy ) )
		sys.stdout.write( ''.join( self._msgid ) )
		sys.stdout.write( ''.join( self._msgidp ) )
		sys.stdout.write( ''.join( self._msgstr ) )

	def __text( self, keyword_pat, line, lineno ):
		# Return the text between the quotes of line, after removing the
		# leading keyword (if present).  A line that is not properly quoted
		# is reported and returned as it is.
		line = keyword_pat.sub( '', line, 1 ).strip()
		if len( line ) < 2 or line[0] != '"' or line[-1] != '"':
			print( "ERROR at line %d: Missing quote." % lineno )
		return strip_quotes( line )

	def add_msgid( self, line, lineno ):
		"""Add a msgid line, or one of its continuation lines."""
		self._msgid.append( line )
		self.msgid += self.__text( self._msgid_kw_pat, line, lineno )

	def add_msgidp( self, line, lineno ):
		"""Add a msgid_plural line, or one of its continuation lines."""
		self._msgidp.append( line )
		self.msgidp += self.__text( self._msgidp_kw_pat, line, lineno )

	def add_new_msgstr( self, line, lineno ):
		"""Add a msgstr line that starts a new (plural) translation."""
		self.msgstr.append( '' )	# Start a new msgstr
		self.add_msgstr( line, lineno )

	def add_msgstr( self, line, lineno ):
		"""Add a line to the current msgstr; add_new_msgstr must have been called before."""
		self._msgstr.append( line )
		self.msgstr[-1] += self.__text( self._msgstr_kw_pat, line, lineno )

	def add_cmnt( self, line ):
		"""Add a comment line and pick up the fuzzy flag and the tips.xml marker from it."""
		self._cmnt.append( line )
		if not self.is_fuzzy and self.fuzzy_pat.search( line ):
			self.is_fuzzy = True
		if not self.is_tips_xml and self.tips_xml_pat.search( line ):
			self.is_tips_xml = True

def create_new_Msgid(msgs, lineno):
	"""Append a fresh Msgid to msgs and return it.

	The new item is numbered with the length of msgs before the append and
	remembers lineno as the line it starts on.
	"""
	entry = Msgid(len(msgs), lineno)
	msgs.append(entry)
	return entry

def read_msgs( fname ):
	"""Read the PO file fname and return its entries as a list of Msgid.

	The file is parsed line by line with a small state machine.  Empty lines
	are skipped and lines starting with '#~' (obsolete entries) are ignored.
	After such an obsolete line the parser starts afresh, so that whatever
	follows is never added to the entry in front of it.  Entries that
	consist of comments only are left out of the result.

	The result is also stored in the module global 'msgs'.

	Raises CheckException when a line is not recognised, or when a msgid or
	msgid_plural shows up where it is not allowed.
	"""
	empty_pat   = re.compile( r'^ \s* $',      re.VERBOSE )
	comment_pat = re.compile( r'\#',           re.VERBOSE )
	msgid_pat   = re.compile( r'msgid \s+ "',  re.VERBOSE )
	msgid_plural_pat = re.compile( r'msgid_plural \s+ "', re.VERBOSE )
	msgstr_pat  = re.compile( r'msgstr (\[\d\])? \s+ "', re.VERBOSE )
	str_pat     = re.compile( r'"',            re.VERBOSE )
	old_pat     = re.compile( r'\#~ \s+ ',     re.VERBOSE )

	# Read everything at once; the with statement closes the file again
	with open( fname ) as f:
		lines = f.readlines()

	# parse it like a statemachine
	NONE   = 'NONE'			# Nothing detected, yet
	CMNT   = 'CMNT'			# Inside comment part
	MSGID  = 'msgid'		# Inside msgid part
	MSGIDP = 'msgid_plural'		# Inside msgid_plural part
	MSGSTR = 'msgstr'		# Inside msgstr part
	STR    = 'STR'			# A continuation string
	OLD    = 'OLD'			# An old pattern with #~

	def warn( text, lineno ):
		# A single-argument print( ... ) works as a statement and as a function
		print( 'WARNING: %s at %s:%d' % ( text, fname, lineno ) )

	def unexpected( what, lineno ):
		return CheckException( 'Unexpected %s at %s:%d' % ( what, fname, lineno ) )

	global msgs
	state = NONE	# One of NONE, CMNT, MSGID, MSGIDP, MSGSTR
	msg = None	# The item being filled; only None in state NONE

	msgs = []
	for ix, line in enumerate( lines ):	# Use line numbers for messages
		lineno = ix + 1

		if empty_pat.match( line ):
			continue	# Empty lines are not interesting

		# What kind of line is this?  The order matters: '#~' is a comment
		# too and 'msgid_plural' would never match the msgid pattern.
		if old_pat.match( line ):
			kind = OLD
		elif comment_pat.match( line ):
			kind = CMNT
		elif msgid_pat.match( line ):
			kind = MSGID
		elif msgid_plural_pat.match( line ):
			kind = MSGIDP
		elif msgstr_pat.match( line ):
			kind = MSGSTR
		elif str_pat.match( line ):
			kind = STR
		else:
			warn( 'Unexpected input', lineno )
			raise CheckException( 'Unexpected state in po parsing (state = %s)' % state )

		if kind == OLD:
			# Just skip, but forget the current item so that the lines
			# after the obsolete entry cannot end up in it.
			msg = None
			state = NONE

		elif kind == CMNT:
			if state == NONE or state == MSGSTR:
				# A comment probably starts a new item
				state = CMNT
				msg = create_new_Msgid( msgs, lineno )
				msg.add_cmnt( line )
			elif state == CMNT:
				msg.add_cmnt( line )
			else:
				# Hmmm. A comment inside msgid or msgid_plural?
				warn( 'Unexpected comment', lineno )

		elif kind == MSGID:
			if state == MSGID or state == MSGIDP:
				raise unexpected( kind, lineno )
			if state != CMNT:
				# No comments in front of it, start with an empty new item
				msg = create_new_Msgid( msgs, lineno )
			state = MSGID
			msg.add_msgid( line, lineno )

		elif kind == MSGIDP:
			# Only allowed right after the msgid
			if state != MSGID:
				raise unexpected( kind, lineno )
			state = MSGIDP
			msg.add_msgidp( line, lineno )

		elif kind == MSGSTR:
			if state == NONE or state == CMNT:
				# A msgstr without msgid always gets an item of its own
				warn( 'Wild msgstr', lineno )
				msg = create_new_Msgid( msgs, lineno )
			# In state MSGSTR this is a new msgstr, probably for plural form
			state = MSGSTR
			msg.add_new_msgstr( line, lineno )

		else:
			# kind == STR: a continuation of whatever we are in
			if state == MSGID:
				msg.add_msgid( line, lineno )
			elif state == MSGIDP:
				msg.add_msgidp( line, lineno )
			elif state == MSGSTR:
				msg.add_msgstr( line, lineno )
			else:
				warn( 'Wild string', lineno )

	# Strip items with just comments, e.g. the comments of obsolete entries
	msgs = [ m for m in msgs if m.msgid or m.msgstr ]
	return msgs

def analyze_msgs( options, fname, msgs, nr_templates = None, nth = 0 ):
	"""Run all checks on the messages of one PO file and print the report.

	options       uses the attributes skip_fuzzy and only_summary
	fname         name of the PO file, only used in the report
	msgs          the list of Msgid items of the PO file
	nr_templates  number of messages in the template; when None the number
	              of messages in msgs is used
	nth           sequence number of the file; a separator is printed in
	              front of every report but the first (nth == 0)

	Untranslated messages are counted and not checked.  Fuzzy messages are
	counted and, with options.skip_fuzzy, not checked either.
	"""
	checks = [
		Check_fmt( '%s' ),
		Check_fmt( '%d' ),
		Check_named_fmt(),
		Check_missing_sd(),
		Check_runaway(),
		Check_xml_chars(),
		Check_last_char(),
		Check_shortcut_trans(),
	]

	nr_fuzzy = 0
	nr_untranslated = 0
	for msg in msgs:
		if ''.join( msg.msgstr ) == '':
			nr_untranslated += 1
			continue

		if msg.is_fuzzy:
			nr_fuzzy += 1
			if options.skip_fuzzy:
				continue

		for c in checks:
			c.process( msg )

	nr_msgs = len(msgs)
	if nr_templates is None:
		# No template given, compare the PO file with itself
		nr_templates = nr_msgs

	# A single-argument print( ... ) behaves the same as a statement
	# (Python 2) and as a function call (Python 3).
	if nth > 0:
		print( "" )
		print( "=====================================" )
	print( "%-20s%s"     % ( "File:",              fname ) )
	print( "%-20s%d"     % ( "Template total:",    nr_templates ) )
	print( "%-20s%d"     % ( "PO total:",          nr_msgs ) )
	print( "%-20s%d"     % ( "Fuzzy:",             nr_fuzzy ) )
	print( "%-20s%d"     % ( "Untranslated:",      nr_untranslated ) )

	for c in checks:
		c.summary()

	# An empty PO file or template gives 0% instead of a division by zero
	po_coverage = 0.0
	if nr_msgs:
		po_coverage = (1.0 - (float(nr_untranslated) / float(nr_msgs))) * 100
	print( "%-20s%5.2f%%" % ( "PO Coverage:",       po_coverage ) )

	not_displayed = nr_untranslated + nr_fuzzy
	template_coverage = 0.0
	translation = 0.0
	if nr_templates:
		template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
		translation = (1.0 - (float(not_displayed) / float(nr_templates))) * 100
	print( "%-20s%5.2f%%" % ( "Template Coverage:", template_coverage ) )

	text = "%-20s%5.2f%%" % ( "Localized at:",     translation )
	# The two coverages are only the same when the PO file has as many
	# messages as the template.  Compare the counts; comparing the computed
	# floats for equality can fail on rounding.
	if nr_msgs == nr_templates:
		print( text )
	else:
		print( text + ' (previous gramps.pot)' )

	if not options.only_summary:
		for c in checks:
			c.diag()

def main():
	"""Parse the command line and report on every PO file given on it.

	The template (gramps.pot unless --template is given) is read first; its
	number of messages is the reference for the coverage figures.  A
	CheckException stops the run with a message.
	"""
	parser = OptionParser( description='This program validates a PO file for GRAMPS.', usage='%prog [options] po-file...' )

	parser.add_option("--skip-fuzzy",
			  action="store_true", dest="skip_fuzzy", default=False,
			  help="skip fuzzies")

	parser.add_option("-s", "--only-summary",
			  action="store_true", dest="only_summary", default=False,
			  help="only give the summary")

	parser.add_option("-t", "--template",
			  action="store", dest="template", default="gramps.pot",
			  metavar="FILE",
			  help="the template to compare with [default: %default]")

	(options, args) = parser.parse_args()

	try:
		pot_msgs = read_msgs( options.template )
		nr_templates = len( pot_msgs )
		for nth, fname in enumerate( args ):
			msgs = read_msgs( fname )
			analyze_msgs( options, fname, msgs, nr_templates, nth )

	except CheckException as e:
		# A single-argument print( ... ) works as a statement and as a function
		print( 'Oops. %s' % e )
		print( 'Bailing out' )

# Run the checker only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()