From 0c5b792c76f4dd8b84b5f93d24b933e8f11ce174 Mon Sep 17 00:00:00 2001 From: Benny Malengier Date: Thu, 23 Sep 2010 21:03:15 +0000 Subject: [PATCH] 3969: [NarWeb] Narrated Web Site: Newlines and white space are not preserved in note text Patch of Tim Lyons and Benny Malengier svn: r15923 --- src/plugins/docgen/AsciiDoc.py | 51 +++++++------ src/plugins/docgen/HtmlDoc.py | 67 +++++++++++------ src/plugins/docgen/LaTeXDoc.py | 6 +- src/plugins/docgen/ODFDoc.py | 79 ++++++++++++++++---- src/plugins/docgen/RTFDoc.py | 93 ++++++++++++++++-------- src/plugins/lib/libcairodoc.py | 10 +-- src/plugins/lib/libhtmlbackend.py | 100 ++++++++++++++++++++++++++ src/plugins/webreport/NarrativeWeb.py | 50 +++++++------ 8 files changed, 340 insertions(+), 116 deletions(-) diff --git a/src/plugins/docgen/AsciiDoc.py b/src/plugins/docgen/AsciiDoc.py index 366025be2..9bc94b853 100644 --- a/src/plugins/docgen/AsciiDoc.py +++ b/src/plugins/docgen/AsciiDoc.py @@ -3,7 +3,7 @@ # # Copyright (C) 2000-2006 Donald N. Allingham # Copyright (C) 2007-2009 Brian G. 
Matherly -# Copyright (C) 2009 Benny Malengier +# Copyright (C) 2009-2010 Benny Malengier # Copyright (C) 2010 Peter Landgren # # This program is free software; you can redistribute it and/or modify @@ -63,28 +63,35 @@ _WIDTH_IN_CHARS = 72 def reformat_para(para='',left=0,right=72,just=LEFT,right_pad=0,first=0): if not para.strip(): return "\n" - words = para.split() + lines = [] - line = '' - word = 0 - end_words = 0 real_left = left+first - while not end_words: - if len(words[word]) > right-real_left: # Handle very long words - line = words[word] - word +=1 - if word >= len(words): - end_words = 1 - else: # Compose line of words - while len(line)+len(words[word]) <= right-real_left: - line += words[word]+' ' - word += 1 + alllines = para.split('\n') + for realline in alllines: + words = realline.split() + line = '' + word = 0 + end_words = 0 + while not end_words: + if not words: + lines.append("\n") + break + if len(words[word]) > right-real_left: # Handle very long words + line = words[word] + word +=1 if word >= len(words): end_words = 1 - break - lines.append(line) - real_left = left - line = '' + else: # Compose line of words + while len(line)+len(words[word]) <= right-real_left: + line += words[word]+' ' + word += 1 + if word >= len(words): + end_words = 1 + break + lines.append(line) + #first line finished, discard first + real_left = left + line = '' if just==CENTER: if right_pad: return '\n'.join( @@ -375,6 +382,8 @@ class AsciiDoc(BaseDoc,TextDoc): some way. Eg, a textdoc could remove all tags, or could make sure a link is clickable. 
AsciiDoc prints the html without handling it """ + if contains_html: + return text = str(styledtext) if format: #Preformatted note, keep all white spaces, tabs, LF's @@ -390,8 +399,8 @@ class AsciiDoc(BaseDoc,TextDoc): else: for line in text.split('\n\n'): self.start_paragraph(style_name) - line = line.replace('\n',' ') - line = ' '.join(line.split()) + #line = line.replace('\n',' ') + #line = ' '.join(line.split()) self.write_text(line) self.end_paragraph() diff --git a/src/plugins/docgen/HtmlDoc.py b/src/plugins/docgen/HtmlDoc.py index d5eabebb5..cbf95beb3 100644 --- a/src/plugins/docgen/HtmlDoc.py +++ b/src/plugins/docgen/HtmlDoc.py @@ -3,8 +3,10 @@ # # Copyright (C) 2000-2006 Donald N. Allingham # Copyright (C) 2007-2009 Brian G. Matherly -# Copyright (C) 2009 Benny Malengier +# Copyright (C) 2009-2010 Benny Malengier # Copyright (C) 2010 Peter Landgren +# Copyright (C) 2010 Tim Lyons + # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -47,7 +49,7 @@ from gui.utils import open_file_with_default_application import ImgManip import const from gen.plug.docgen import BaseDoc, TextDoc, FONT_SANS_SERIF -from libhtmlbackend import HtmlBackend +from libhtmlbackend import HtmlBackend, process_spaces from libhtml import Html #------------------------------------------------------------------------ @@ -482,10 +484,6 @@ class HtmlDoc(BaseDoc, TextDoc): """ text = str(styledtext) - s_tags = styledtext.get_tags() - #FIXME: following split should be regex to match \n\s*\n instead? - markuptext = self._backend.add_markup_from_styled(text, s_tags, - split='\n\n') self.htmllist += [Html('div', id='grampsstylednote')] if contains_html: #just dump the note out as it is. 
Adding markup would be dangerous @@ -493,25 +491,48 @@ class HtmlDoc(BaseDoc, TextDoc): self.start_paragraph(style_name) self.__write_text(text, markup=True) self.end_paragraph() - elif format == 1: - #preformatted, retain whitespace. - #so use \n\n for paragraph detection - #FIXME: following split should be regex to match \n\s*\n instead? - self.htmllist += [Html('pre', indent=None, inline=True)] - for line in markuptext.split('\n\n'): - self.start_paragraph(style_name) - for realline in line.split('\n'): - self.__write_text(realline, markup=True) - self.htmllist[-1] += Html('br') + else: + s_tags = styledtext.get_tags() + markuptext = self._backend.add_markup_from_styled(text, s_tags, + split='\n') + self.start_paragraph(style_name) + inpara = True + self._empty = 1 # para is empty + # we explicitly set _empty because start and end para do not seem + # to do a very good job at setting them + linenb = 1 + # The code is tricky here, because we don't want to start a new para + # at the end of the last line if there is no newline there. + # Instead, we want to just end the current para. + for line in markuptext.split('\n'): + [line, sigcount] = process_spaces(line, format) + if sigcount == 0: + if inpara == False: + # needed for runs of three or more newlines + self.start_paragraph(style_name) + inpara = True + self._empty = 1 # para is empty + self.end_paragraph() + inpara = False + linenb = 1 + else: + if inpara == False: + self.start_paragraph(style_name) + inpara = True + self._empty = 1 # para is empty + if linenb > 1: + self.htmllist[-1] += Html('br') + self.__write_text(line, markup=True) + self._empty = 0 # para is not empty + linenb += 1 + if inpara == True: self.end_paragraph() - #end pre element - self.__reduce_list() - elif format == 0: - #flowed - #FIXME: following split should be regex to match \n\s*\n instead? 
- for line in markuptext.split('\n\n'): + if sigcount == 0: + # if the last line was blank, then as well as outputting the previous para, + # which we have just done, + # we also output a new blank para self.start_paragraph(style_name) - self.__write_text(line, markup=True) + self._empty = 1 # para is empty self.end_paragraph() #end div element self.__reduce_list() diff --git a/src/plugins/docgen/LaTeXDoc.py b/src/plugins/docgen/LaTeXDoc.py index 862190547..b8b0e257c 100644 --- a/src/plugins/docgen/LaTeXDoc.py +++ b/src/plugins/docgen/LaTeXDoc.py @@ -641,9 +641,11 @@ class LaTeXDoc(BaseDoc, TextDoc): else: for line in markuptext.split('\n\n'): self.start_paragraph(style_name) - self._backend.write(line) + for realline in line.split('\n'): + self._backend.write(realline) + self._backend.write("\\newline\n") self.end_paragraph() - self._backend.write("\n\\vspace*{0.5cm} \n\end{minipage}\n\n") + self._backend.write("\n\\vspace*{0.5cm} \n\\end{minipage}\n\n") def write_endnotes_ref(self, text, style_name): """ diff --git a/src/plugins/docgen/ODFDoc.py b/src/plugins/docgen/ODFDoc.py index 87a26b6e3..6f1a9f503 100644 --- a/src/plugins/docgen/ODFDoc.py +++ b/src/plugins/docgen/ODFDoc.py @@ -1379,48 +1379,52 @@ class ODFDoc(BaseDoc, TextDoc, DrawDoc): def write_styled_note(self, styledtext, format, style_name, contains_html=False): """ - Convenience function to write a styledtext to the latex doc. + Convenience function to write a styledtext to the ODF doc. styledtext : assumed a StyledText object to write format : = 0 : Flowed, = 1 : Preformatted style_name : name of the style to use for default presentation contains_html: bool, the backend should not check if html is present. If contains_html=True, then the textdoc is free to handle that in some way. Eg, a textdoc could remove all tags, or could make sure - a link is clickable. RTFDoc prints the html without handling it + a link is clickable. 
ODFDoc prints the html without handling it """ text = str(styledtext) s_tags = styledtext.get_tags() text = text.replace('&', '\1') # must be the first text = text.replace('<', '\2') text = text.replace('>', '\3') - markuptext = self._backend.add_markup_from_styled(text, s_tags) - + markuptext = self._backend.add_markup_from_styled(text, s_tags, '\n') # we need to know if we have new styles to add. # if markuptext contains : FontColor, FontFace, FontSize ... # we must prepare the new styles for the styles.xml file. # We are looking for the following format : # style-name="([a-zA-Z0-9]*)__([a-zA-Z0-9 ])"> # The first element is the StyleType and the second one is the value - start = 0 - while True: + while 1: m = NewStyle.search(markuptext, start) if not m: break - self.StyleList.append( - [m.group(1) + m.group(2), m.group(1), m.group(2)] - ) + self.StyleList.append([m.group(1)+m.group(2), + m.group(1), + m.group(2)]) start = m.end() - + linenb = 1 self.start_paragraph(style_name) markuptext = markuptext.replace('\1', '&') # must be the first markuptext = markuptext.replace('\2', '<') markuptext = markuptext.replace('\3', '>') - - for l, line in enumerate(markuptext.split('\n')): - if l: - self.cntnt.write('') - self.cntnt.write(line) + for line in markuptext.split('\n'): + [line, sigcount] = process_spaces(line, format) + if sigcount == 0: + self.end_paragraph() + self.start_paragraph(style_name) + linenb = 1 + else: + if ( linenb > 1 ): + self.cntnt.write('') + self.cntnt.write(line) + linenb += 1 self.end_paragraph() def write_text(self, text, mark=None): @@ -1704,3 +1708,48 @@ class ODFDoc(BaseDoc, TextDoc, DrawDoc): '' ) self.cntnt.write('\n') + +def process_spaces(line, format): + """ + Function to process spaces in text lines for flowed and pre-formatted notes. + line : text to process + format : = 0 : Flowed, = 1 : Preformatted + + If the text is flowed (format==0), then leading spaces (after ignoring XML) + are removed. 
Embedded multiple spaces are reduced to one by ODF + If the text is pre-formatted (format==1). then all spaces (after ignoring XML) + are replaced by "" + + Returns the processed text, and the number of significant (i.e. non-white-space) chars. + """ + txt = "" + xml = False + sigcount = 0 + # we loop through every character, which is very inefficient, but an attempt to use + # a regex replace didn't always work. This was the code that was replaced. + # Problem, we may not replace ' ' in xml tags, so we use a regex + # self.cntnt.write(re.sub(' (?=([^(<|>)]*<[^>]*>)*[^>]*$)', + # "", line)) + for char in line: + if char == '<' and xml == False: + xml = True + txt += char + elif char == '>' and xml == True: + xml = False + txt += char + elif xml == True: + txt += char + elif char == " " or char == "\t": + if format == 0 and sigcount == 0: + pass + elif format == 1: + #preformatted, section White-space characters of + # http://docs.oasis-open.org/office/v1.1/OS/OpenDocument-v1.1-html/OpenDocument-v1.1.html#5.1.1.White-space%20Characters|outline + txt += "" + else: + txt += char + else: + sigcount += 1 + txt += char + return [txt, sigcount] + \ No newline at end of file diff --git a/src/plugins/docgen/RTFDoc.py b/src/plugins/docgen/RTFDoc.py index 58a1636c6..a15057c38 100644 --- a/src/plugins/docgen/RTFDoc.py +++ b/src/plugins/docgen/RTFDoc.py @@ -227,6 +227,12 @@ class RTFDoc(BaseDoc,TextDoc): # #-------------------------------------------------------------------- def end_paragraph(self): + # FIXME: I don't understand why no end paragraph marker is output when + # we are inside a table. Since at least version 3.2.2, this seems to mean that + # there is no new paragraph after the first line of a table entry. + # For example in the birth cell, the first paragraph should be the + # description (21 Jan 1900 in London); if there is a note following this, + # there is no newline between the description and the note. 
if not self.in_table: self.f.write(self.text) if self.opened: @@ -404,11 +410,11 @@ class RTFDoc(BaseDoc,TextDoc): self.f.write('\n') index = index+1 self.f.write('}}\\par\n') - + def write_styled_note(self, styledtext, format, style_name, contains_html=False): """ - Convenience function to write a styledtext to the latex doc. + Convenience function to write a styledtext to the RTF doc. styledtext : assumed a StyledText object to write format : = 0 : Flowed, = 1 : Preformatted style_name : name of the style to use for default presentation @@ -418,41 +424,31 @@ class RTFDoc(BaseDoc,TextDoc): a link is clickable. RTFDoc prints the html without handling it """ text = str(styledtext) - if format: - # Preformatted note - for line in text.split('\n'): - self.start_paragraph(style_name) - self.write_text(line) + self.start_paragraph(style_name) + linenb = 1 + for line in text.split('\n'): + [line, sigcount] = process_spaces(line, format) + if sigcount == 0: if self.in_table: # # Add LF when in table as in indiv_complete report self.write_text('\n') self.end_paragraph() - else: - firstline = True - for line in text.split('\n\n'): self.start_paragraph(style_name) - if len(line) > 0: - # Remember first char, can be a LF. - firstchar = line[0] - # Replace all LF's with space and reformat. - line = line.replace('\n',' ') - line = ' '.join(line.split()) - # If remembered first char is LF, insert in front of lines - #This takes care of the case with even number of empty lines. - if firstchar == '\n': - line = firstchar + line - #Insert LF's if not first line. - if not firstline: - line = '\n\n' + line - else: - # If odd number of empty lines line will be empty. - line = '\n\n' + linenb = 1 + else: + if ( linenb > 1 ): + self.write_text('\\line ') self.write_text(line) - self.end_paragraph() - firstline = False - self.start_paragraph(style_name) + linenb += 1 + # FIXME: I don't understand why these newlines are necessary. 
+ # It may be related to the behaviour of end_paragraph inside tables, and + # write_text converting \n to end paragraph. + # This code prevents the whole document going wrong, but seems to produce an extra + # paragraph mark at the end of each table cell. + if self.in_table: + # # Add LF when in table as in indiv_complete report self.write_text('\n') - self.end_paragraph() + self.end_paragraph() def write_endnotes_ref(self,text,style_name): """ @@ -497,3 +493,40 @@ class RTFDoc(BaseDoc,TextDoc): self.text += '\\%s' % i else: self.text += i + +def process_spaces (self, line, format): + """ + Function to process spaces in text lines for flowed and pre-formatted notes. + line : text to process + format : = 0 : Flowed, = 1 : Preformatted + + If the text is flowed (format==0), then leading spaces + are removed, and multiple spaces are reduced to one. + If the text is pre-formatted (format==1). then all spaces are preserved + + Note that xml is just treated like any other text, + because it will be from the original note, and it is just printed, not interpreted. + Returns the processed text, and the number of significant (i.e. non-white-space) chars. + """ + txt = "" + xml = False + space = False + sigcount = 0 + # we loop through every character, which is very inefficient, but an attempt to use + # a regex replace didn't always work. + for char in line: + if char == " " or char == "\t": + if format == 1: + txt += char + elif format == 0 and sigcount == 0: + pass + elif format == 0 and space == False: + space = True + txt += char + elif format == 0 and space == True: + pass + else: + sigcount += 1 + space = False + txt += char + return [txt, sigcount] diff --git a/src/plugins/lib/libcairodoc.py b/src/plugins/lib/libcairodoc.py index cfe896af5..b220f4b2a 100644 --- a/src/plugins/lib/libcairodoc.py +++ b/src/plugins/lib/libcairodoc.py @@ -1335,10 +1335,12 @@ class CairoDoc(BaseDoc, TextDoc, DrawDoc): #FIXME: following split should be regex to match \n\s*\n instead? 
for line in markuptext.split('\n\n'): self.start_paragraph(style_name) - #flowed, make normal whitespace go away - line = line.replace('\n',' ') - line = ' '.join(line.split()) - self.__write_text(line, markup=True) + #flowed, normal whitespace goes away, but we keep linebreak + lines = line.split('\n') + newlines = [] + for singleline in lines: + newlines.append(' '.join(singleline.split())) + self.__write_text('\n'.join(newlines), markup=True) self.end_paragraph() def __write_text(self, text, mark=None, markup=False): diff --git a/src/plugins/lib/libhtmlbackend.py b/src/plugins/lib/libhtmlbackend.py index a2d48a160..37050102c 100644 --- a/src/plugins/lib/libhtmlbackend.py +++ b/src/plugins/lib/libhtmlbackend.py @@ -46,6 +46,106 @@ from gen.plug.docbackend import DocBackend from libhtml import Html from Utils import xml_lang + +#------------------------------------------------------------------------ +# +# Functions +# +#------------------------------------------------------------------------ + +def process_spaces(intext, format): + """ + Function to process spaces in text lines for pre-formatted notes. + line : text to process + format : = 0 : Flowed, = 1 : Preformatted + + If the text is pre-formatted (format==1), then leading spaces (after ignoring XML) + are replaced by alternating non-breaking spaces and ordinary spaces. + After the first non-space character, single spaces are left + but multiple spaces are replaced by alternating NBSP and space + If the text is flowed, the text is unchanged. + + Returns the processed text, and the number of significant + (i.e. non-xml non-white-space) chars. 
+ """ + NORMAL=1 + SPACE=2 + NBSP=3 + XML=4 + SPACEHOLD=5 + + sigcount = 0 + state = NORMAL + outtext = "" + if format == 1: + # Pre-formatted + for char in intext: + if state == NORMAL: + if char == " ": + if sigcount == 0: + state = NBSP + outtext += " " + else: + state = SPACEHOLD + elif char == "<": + state = XML + outtext += char + else: + sigcount += 1 + outtext += char + elif state == SPACE: + if char == " ": + state = NBSP + outtext += " " + elif char == "<": + state = XML + outtext += char + else: + sigcount += 1 + state = NORMAL + outtext += char + elif state == NBSP: + if char == " ": + state = SPACE + elif char == "<": + state = XML + else: + sigcount += 1 + state = NORMAL + outtext += char + elif state == XML: + if char == ">": + state = NORMAL + outtext += char + elif state == SPACEHOLD: + if char == " ": + outtext += "  " + state = NORMAL + elif char == "<": + outtext += " "+char + state = XML + else: + outtext += " "+char + sigcount += 1 + state = NORMAL + + else: + # format == 0 flowed + for char in intext: + if char == '<' and state == NORMAL: + state = XML + outtext += char + elif char == '>' and state == XML: + state = NORMAL + outtext += char + elif state == XML: + outtext += char + else: + sigcount += 1 + outtext += char + + return [outtext, sigcount] + #------------------------------------------------------------------------ # # Document Backend class for html pages diff --git a/src/plugins/webreport/NarrativeWeb.py b/src/plugins/webreport/NarrativeWeb.py index 5c99eaa68..6e0ae4f49 100644 --- a/src/plugins/webreport/NarrativeWeb.py +++ b/src/plugins/webreport/NarrativeWeb.py @@ -60,6 +60,7 @@ from cStringIO import StringIO from textwrap import TextWrapper from unicodedata import normalize from collections import defaultdict +import re import operator from decimal import Decimal @@ -104,7 +105,7 @@ from libhtml import Html # import styled notes from # src/plugins/lib/libhtmlbackend.py -from libhtmlbackend import HtmlBackend +from 
libhtmlbackend import HtmlBackend, process_spaces from libgedcom import make_gedcom_date from PlaceUtils import conv_lat_lon @@ -488,13 +489,6 @@ class BasePage(object): # return text of the note to its callers return text -################################################# -# -# Will produce styled notes for NarrativeWeb by using: -# src/plugins/lib/libhtmlbackend.py -# -################################################# - def styled_note(self, styledtext, format, contains_html=False): """ styledtext : assumed a StyledText object to write @@ -507,23 +501,37 @@ class BasePage(object): return '' s_tags = styledtext.get_tags() - #FIXME: following split should be regex to match \n\s*\n instead? markuptext = self._backend.add_markup_from_styled(text, s_tags, - split='\n\n') + split='\n') htmllist = Html("div", id = "grampsstylednote") if contains_html: htmllist.extend((Html('p') + text)) - elif format == 1: - #preformatted, retain whitespace. - #so use \n\n for paragraph detection - htmllist += Html("pre", indent=None) + markuptext.split('\n\n') - - elif format == 0: - #flowed, use \n\n for paragraph detection - htmllist.extend( - (Html('p') + para.split('\n')) - for para in markuptext.split("\n\n")) - + else: + linelist = [] + linenb = 1 + for line in markuptext.split('\n'): + [line, sigcount] = process_spaces(line, format) + if sigcount == 0: + # The rendering of an empty paragraph '<p></p>' + # is undefined so we use a non-breaking space + if linenb == 1: + linelist.append('&nbsp;') + htmllist.extend(Html('p') + linelist) + linelist = [] + linenb = 1 + else: + if linenb > 1: + linelist[-1] += '<br />' + linelist.append(line) + linenb += 1 + if linenb > 1: + htmllist.extend(Html('p') + linelist) + # if the last line was blank, then as well as outputting the previous para, + # which we have just done, + # we also output a new blank para + if sigcount == 0: + linelist = ["&nbsp;"] + htmllist.extend(Html('p') + linelist) return htmllist def dump_notes(self, notelist):