3969: [NarWeb] Narrated Web Site: Newlines and white space are not preserved in note text

Patch of Tim Lyons and Benny Malengier svn: r15923
2010-09-23 21:03:15 +00:00
parent f748668c89
commit 0c5b792c76
8 changed files with 340 additions and 116 deletions
--- a/src/plugins/docgen/AsciiDoc.py
+++ b/src/plugins/docgen/AsciiDoc.py
@ -3,7 +3,7 @@
 #
 # Copyright (C) 2000-2006  Donald N. Allingham
 # Copyright (C) 2007-2009  Brian G. Matherly
-# Copyright (C) 2009       Benny Malengier <benny.malengier@gramps-project.org>
+# Copyright (C) 2009-2010  Benny Malengier <benny.malengier@gramps-project.org>
 # Copyright (C) 2010       Peter Landgren
 #
 # This program is free software; you can redistribute it and/or modify
@ -63,28 +63,35 @@ _WIDTH_IN_CHARS = 72
 def reformat_para(para='',left=0,right=72,just=LEFT,right_pad=0,first=0):
    if not para.strip():
        return "\n"
-    words = para.split()
+    
    lines = []
-    line  = ''
-    word = 0
-    end_words = 0
    real_left = left+first
-    while not end_words:
-        if len(words[word]) > right-real_left: # Handle very long words
-            line = words[word]
-            word +=1
-            if word >= len(words):
-                end_words = 1
-        else:                             # Compose line of words
-            while len(line)+len(words[word]) <= right-real_left:
-                line += words[word]+' '
-                word += 1
+    alllines = para.split('\n')
+    for realline in alllines:
+        words = realline.split()
+        line  = ''
+        word = 0
+        end_words = 0
+        while not end_words:
+            if not words:
+                lines.append("\n")
+                break
+            if len(words[word]) > right-real_left: # Handle very long words
+                line = words[word]
+                word +=1
                if word >= len(words):
                    end_words = 1
-                    break
-        lines.append(line)
-        real_left = left
-        line = ''
+            else:                             # Compose line of words
+                while len(line)+len(words[word]) <= right-real_left:
+                    line += words[word]+' '
+                    word += 1
+                    if word >= len(words):
+                        end_words = 1
+                        break
+            lines.append(line)
+            #first line finished, discard first
+            real_left = left
+            line = ''
    if just==CENTER:
        if right_pad:
            return '\n'.join(
@ -375,6 +382,8 @@ class AsciiDoc(BaseDoc,TextDoc):
            some way. Eg, a textdoc could remove all tags, or could make sure
            a link is clickable. AsciiDoc prints the html without handling it
        """
+        if contains_html:
+            return
        text = str(styledtext)
        if format:
            #Preformatted note, keep all white spaces, tabs, LF's
@ -390,8 +399,8 @@ class AsciiDoc(BaseDoc,TextDoc):
        else:
            for line in text.split('\n\n'):
                self.start_paragraph(style_name)
-                line = line.replace('\n',' ')
-                line = ' '.join(line.split())
+                #line = line.replace('\n',' ')
+                #line = ' '.join(line.split())
                self.write_text(line)
                self.end_paragraph()

--- a/src/plugins/docgen/HtmlDoc.py
+++ b/src/plugins/docgen/HtmlDoc.py
@ -3,8 +3,10 @@
 #
 # Copyright (C) 2000-2006  Donald N. Allingham
 # Copyright (C) 2007-2009  Brian G. Matherly
-# Copyright (C) 2009       Benny Malengier <benny.malengier@gramps-project.org>
+# Copyright (C) 2009-2010  Benny Malengier <benny.malengier@gramps-project.org>
 # Copyright (C) 2010       Peter Landgren
+# Copyright (C) 2010       Tim Lyons
+
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -47,7 +49,7 @@ from gui.utils import open_file_with_default_application
 import ImgManip
 import const
 from gen.plug.docgen import BaseDoc, TextDoc, FONT_SANS_SERIF
-from libhtmlbackend import HtmlBackend
+from libhtmlbackend import HtmlBackend, process_spaces
 from libhtml import Html

 #------------------------------------------------------------------------
@ -482,10 +484,6 @@ class HtmlDoc(BaseDoc, TextDoc):
        """
        text = str(styledtext)

-        s_tags = styledtext.get_tags()
-        #FIXME: following split should be regex to match \n\s*\n instead?
-        markuptext = self._backend.add_markup_from_styled(text, s_tags, 
-                                                          split='\n\n')
        self.htmllist += [Html('div', id='grampsstylednote')]
        if contains_html:
            #just dump the note out as it is. Adding markup would be dangerous
@ -493,25 +491,48 @@ class HtmlDoc(BaseDoc, TextDoc):
            self.start_paragraph(style_name)
            self.__write_text(text, markup=True)
            self.end_paragraph()
-        elif format == 1:
-            #preformatted, retain whitespace.
-            #so use \n\n for paragraph detection
-            #FIXME: following split should be regex to match \n\s*\n instead?
-            self.htmllist += [Html('pre', indent=None, inline=True)]
-            for line in markuptext.split('\n\n'):
-                self.start_paragraph(style_name)
-                for realline in line.split('\n'):
-                    self.__write_text(realline, markup=True)
-                    self.htmllist[-1] += Html('br')
+        else:
+            s_tags = styledtext.get_tags()
+            markuptext = self._backend.add_markup_from_styled(text, s_tags, 
+                                                              split='\n')
+            self.start_paragraph(style_name)
+            inpara = True
+            self._empty = 1   # para is empty
+            # we explicitly set _empty because start and end para do not seem
+            # to do a very good job at setting them
+            linenb = 1
+            # The code is tricky here, because we don't want to start a new para
+            # at the end of the last line if there is no newline there.
+            # Instead, we want to just end the current para.
+            for line in markuptext.split('\n'):
+                [line, sigcount] = process_spaces(line, format)
+                if sigcount == 0:
+                    if inpara == False:
+                        # needed for runs of three or more newlines
+                        self.start_paragraph(style_name)
+                        inpara = True
+                        self._empty = 1   # para is empty
+                    self.end_paragraph()
+                    inpara = False
+                    linenb = 1
+                else:
+                    if inpara == False:
+                        self.start_paragraph(style_name)
+                        inpara = True
+                        self._empty = 1   # para is empty
+                    if linenb > 1:
+                        self.htmllist[-1] += Html('br')
+                    self.__write_text(line, markup=True)
+                    self._empty = 0  # para is not empty
+                    linenb += 1
+            if inpara == True:
                self.end_paragraph()
-            #end pre element
-            self.__reduce_list()
-        elif format == 0:
-            #flowed
-            #FIXME: following split should be regex to match \n\s*\n instead?
-            for line in markuptext.split('\n\n'):
+            if sigcount == 0:        
+                # if the last line was blank, then as well as outputting the previous para,
+                # which we have just done,
+                # we also output a new blank para
                self.start_paragraph(style_name)
-                self.__write_text(line, markup=True)
+                self._empty = 1   # para is empty
                self.end_paragraph()
        #end div element
        self.__reduce_list()
--- a/src/plugins/docgen/LaTeXDoc.py
+++ b/src/plugins/docgen/LaTeXDoc.py
@ -641,9 +641,11 @@ class LaTeXDoc(BaseDoc, TextDoc):
        else:
            for line in markuptext.split('\n\n'):
                self.start_paragraph(style_name)
-                self._backend.write(line)
+                for realline in line.split('\n'):
+                    self._backend.write(realline)
+                    self._backend.write("\\newline\n")
                self.end_paragraph()
-        self._backend.write("\n\\vspace*{0.5cm} \n\end{minipage}\n\n")
+        self._backend.write("\n\\vspace*{0.5cm} \n\\end{minipage}\n\n")

    def write_endnotes_ref(self, text, style_name):
        """
--- a/src/plugins/docgen/ODFDoc.py
+++ b/src/plugins/docgen/ODFDoc.py
@ -1379,48 +1379,52 @@ class ODFDoc(BaseDoc, TextDoc, DrawDoc):
    def write_styled_note(self, styledtext, format, style_name,
                          contains_html=False):
        """
-        Convenience function to write a styledtext to the latex doc. 
+        Convenience function to write a styledtext to the ODF doc. 
        styledtext : assumed a StyledText object to write
        format : = 0 : Flowed, = 1 : Preformatted
        style_name : name of the style to use for default presentation
        contains_html: bool, the backend should not check if html is present. 
            If contains_html=True, then the textdoc is free to handle that in 
            some way. Eg, a textdoc could remove all tags, or could make sure
-            a link is clickable. RTFDoc prints the html without handling it
+            a link is clickable. ODFDoc prints the html without handling it
        """
        text = str(styledtext)
        s_tags = styledtext.get_tags()
        text = text.replace('&', '\1') # must be the first
        text = text.replace('<', '\2')
        text = text.replace('>', '\3')
-        markuptext = self._backend.add_markup_from_styled(text, s_tags)
-
+        markuptext = self._backend.add_markup_from_styled(text, s_tags, '\n')
        # we need to know if we have new styles to add.
        # if markuptext contains : FontColor, FontFace, FontSize ...
        # we must prepare the new styles for the styles.xml file.
        # We are looking for the following format :
        # style-name="([a-zA-Z0-9]*)__([a-zA-Z0-9 ])">
        # The first element is the StyleType and the second one is the value
-
        start = 0
-        while True:
+        while 1:
            m = NewStyle.search(markuptext, start)
            if not m:
                break
-            self.StyleList.append(
-                [m.group(1) + m.group(2), m.group(1), m.group(2)]
-                )
+            self.StyleList.append([m.group(1)+m.group(2),
+                                  m.group(1),
+                                  m.group(2)])
            start = m.end()
-
+        linenb = 1
        self.start_paragraph(style_name)
        markuptext = markuptext.replace('\1', '&amp;') # must be the first
        markuptext = markuptext.replace('\2', '&lt;')
        markuptext = markuptext.replace('\3', '&gt;')
-
-        for l, line in enumerate(markuptext.split('\n')):
-            if l:
-                self.cntnt.write('<text:line-break/>')
-            self.cntnt.write(line)
+        for line in markuptext.split('\n'):
+            [line, sigcount] = process_spaces(line, format)
+            if sigcount == 0:
+                self.end_paragraph()
+                self.start_paragraph(style_name)
+                linenb = 1
+            else:
+                if ( linenb > 1 ):
+                    self.cntnt.write('<text:line-break/>')
+                self.cntnt.write(line)
+                linenb += 1
        self.end_paragraph()

    def write_text(self, text, mark=None):
@ -1704,3 +1708,48 @@ class ODFDoc(BaseDoc, TextDoc, DrawDoc):
                '</draw:text-box>'
                )
        self.cntnt.write('</draw:frame>\n')
+
+def process_spaces(line, format):
+    """
+    Function to process spaces in text lines for flowed and pre-formatted notes.
+    line : text to process
+    format : = 0 : Flowed, = 1 : Preformatted
+    
+    If the text is flowed (format==0), then leading spaces (after ignoring XML)
+    are removed. Embedded multiple spaces are reduced to one by ODF.
+    If the text is pre-formatted (format==1), then all spaces (after ignoring XML)
+    are replaced by "<text:s/>". Tabs are treated like spaces in both cases.
+    
+    Returns the processed text, and the number of significant (i.e. non-XML, non-white-space) chars.
+    """
+    # collect the output pieces in a list and join them once at the end
+    pieces = []
+    in_xml = False
+    sigcount = 0
+    # we loop through every character, which is very inefficient, but an attempt to use
+    # a regex replace didn't always work. This was the code that was replaced.
+    # Problem, we may not replace ' ' in xml tags, so we use a regex
+    # self.cntnt.write(re.sub(' (?=([^(<|>)]*<[^>]*>)*[^>]*$)', 
+    #                        "<text:s/>", line))
+    for char in line:
+        if in_xml:
+            # inside a tag everything is copied unchanged; '>' closes the tag
+            in_xml = char != '>'
+            pieces.append(char)
+        elif char == '<':
+            # start of a tag
+            in_xml = True
+            pieces.append(char)
+        elif char == " " or char == "\t":
+            if format == 1:
+                #preformatted, section White-space characters of
+                # http://docs.oasis-open.org/office/v1.1/OS/OpenDocument-v1.1-html/OpenDocument-v1.1.html#5.1.1.White-space%20Characters|outline
+                pieces.append("<text:s/>")
+            elif format != 0 or sigcount > 0:
+                # not leading white space of a flowed line: keep it
+                pieces.append(char)
+        else:
+            sigcount += 1
+            pieces.append(char)
+    return ["".join(pieces), sigcount]
+    
--- a/src/plugins/docgen/RTFDoc.py
+++ b/src/plugins/docgen/RTFDoc.py
@ -227,6 +227,12 @@ class RTFDoc(BaseDoc,TextDoc):
    #
    #--------------------------------------------------------------------
    def end_paragraph(self):
+        # FIXME: I don't understand why no end paragraph marker is output when
+        # we are inside a table. Since at least version 3.2.2, this seems to mean that
+        # there is no new paragraph after the first line of a table entry.
+        # For example in the birth cell, the first paragraph should be the
+        # description (21 Jan 1900 in London); if there is a note following this,
+        # there is no newline between the description and the note.
        if not self.in_table:
            self.f.write(self.text)
            if self.opened:
@ -404,11 +410,11 @@ class RTFDoc(BaseDoc,TextDoc):
                self.f.write('\n')
            index = index+1
        self.f.write('}}\\par\n')
-    
+
    def write_styled_note(self, styledtext, format, style_name,
                          contains_html=False):
        """
-        Convenience function to write a styledtext to the latex doc. 
+        Convenience function to write a styledtext to the RTF doc. 
        styledtext : assumed a StyledText object to write
        format : = 0 : Flowed, = 1 : Preformatted
        style_name : name of the style to use for default presentation
@ -418,41 +424,31 @@ class RTFDoc(BaseDoc,TextDoc):
            a link is clickable. RTFDoc prints the html without handling it
        """
        text = str(styledtext)
-        if format:
-            # Preformatted note
-            for line in text.split('\n'):
-                self.start_paragraph(style_name)
-                self.write_text(line)
+        self.start_paragraph(style_name)
+        linenb = 1
+        for line in text.split('\n'):
+            [line, sigcount] = process_spaces(line, format)
+            if sigcount == 0:
                if self.in_table:
                #    # Add LF when in table as in indiv_complete report
                    self.write_text('\n')
                self.end_paragraph()
-        else:
-            firstline = True
-            for line in text.split('\n\n'):
                self.start_paragraph(style_name)
-                if len(line) > 0:
-                    # Remember first char, can be a LF.
-                    firstchar = line[0] 
-                    # Replace all LF's with space and reformat.
-                    line = line.replace('\n',' ')
-                    line = ' '.join(line.split())
-                    # If remembered first char is LF, insert in front of lines
-                    #This takes care of the case with even number of empty lines.
-                    if firstchar == '\n':
-                        line = firstchar + line
-                    #Insert LF's if not first line.
-                    if not firstline:
-                        line = '\n\n' + line
-                else:
-                    # If odd number of empty lines line will be empty.
-                    line = '\n\n'
+                linenb = 1
+            else:
+                if ( linenb > 1 ):
+                    self.write_text('\\line ')
                self.write_text(line)
-                self.end_paragraph()
-                firstline = False
-            self.start_paragraph(style_name)
+                linenb += 1
+        # FIXME: I don't understand why these newlines are necessary.
+        # It may be related to the behaviour of end_paragraph inside tables, and
+        # write_text converting \n to end paragraph.
+        # This code prevents the whole document going wrong, but seems to produce an extra
+        # paragraph mark at the end of each table cell.
+        if self.in_table:
+        #    # Add LF when in table as in indiv_complete report
            self.write_text('\n')
-            self.end_paragraph()
+        self.end_paragraph()

    def write_endnotes_ref(self,text,style_name):
        """
@ -497,3 +493,40 @@ class RTFDoc(BaseDoc,TextDoc):
                self.text += '\\%s' % i
            else:
                self.text += i
+
+def process_spaces(line, format):
+    """
+    Function to process spaces in text lines for flowed and pre-formatted notes.
+    line : text to process
+    format : = 0 : Flowed, = 1 : Preformatted
+    
+    If the text is flowed (format==0), then leading spaces and tabs are
+    removed, and each embedded run of them is reduced to its first character.
+    If the text is pre-formatted (format==1), then all spaces are preserved.
+    
+    Note that xml is just treated like any other text, 
+    because it will be from the original note, and it is just printed, not interpreted.
+    Returns the processed text, and the number of significant (i.e. non-white-space) chars.
+
+    This is a module-level function, so it takes no 'self' argument;
+    write_styled_note calls it as process_spaces(line, format).
+    """
+    txt = ""
+    space = False
+    sigcount = 0
+    # we loop through every character, which is very inefficient, but an attempt to use
+    # a regex replace didn't always work.
+    for char in line:
+        if char == " " or char == "\t":
+            if format == 1:
+                txt += char
+            elif format == 0 and sigcount > 0 and not space:
+                # first white space of an embedded run: keep it
+                space = True
+                txt += char
+            # otherwise it is leading or repeated white space: drop it
+        else:
+            sigcount += 1
+            space = False
+            txt += char
+    return [txt, sigcount]
--- a/src/plugins/lib/libcairodoc.py
+++ b/src/plugins/lib/libcairodoc.py
@ -1335,10 +1335,12 @@ class CairoDoc(BaseDoc, TextDoc, DrawDoc):
            #FIXME: following split should be regex to match \n\s*\n instead?
            for line in markuptext.split('\n\n'):
                self.start_paragraph(style_name)
-                #flowed, make normal whitespace go away
-                line = line.replace('\n',' ')
-                line = ' '.join(line.split())
-                self.__write_text(line, markup=True)
+                #flowed, normal whitespace goes away, but we keep linebreak
+                lines = line.split('\n')
+                newlines = []
+                for singleline in lines:
+                    newlines.append(' '.join(singleline.split()))
+                self.__write_text('\n'.join(newlines), markup=True)
                self.end_paragraph()

    def __write_text(self, text, mark=None, markup=False):
--- a/src/plugins/lib/libhtmlbackend.py
+++ b/src/plugins/lib/libhtmlbackend.py
@ -46,6 +46,106 @@ from gen.plug.docbackend import DocBackend
 from libhtml import Html
 from Utils import xml_lang

+
+#------------------------------------------------------------------------
+#
+# Functions
+#
+#------------------------------------------------------------------------
+
+def process_spaces(intext, format):
+    """
+    Function to process spaces in text lines for flowed and pre-formatted notes.
+    intext : text to process
+    format : = 0 : Flowed, = 1 : Preformatted
+    
+    If the text is pre-formatted (format==1), then runs of spaces (outside XML)
+    are replaced by alternating non-breaking spaces and ordinary spaces, so
+    that a run never holds two adjacent ordinary spaces (html collapses them).
+    A single space after the first non-space character is left as it is.
+    If the text is flowed, the text is unchanged.
+    
+    Returns the processed text, and the number of significant chars: non-xml
+    non-space chars if pre-formatted, all non-xml chars if flowed.
+    """
+    NORMAL=1
+    SPACE=2
+    NBSP=3
+    XML=4
+    SPACEHOLD=5
+    # SPACEHOLD: one space seen after text, held back until the next char is known
+    sigcount = 0
+    state = NORMAL
+    outtext = ""
+    if format == 1:
+        # Pre-formatted
+        for char in intext:
+            if state == NORMAL:
+                if char == " ":
+                    if sigcount == 0:
+                        state = NBSP
+                        outtext += "&nbsp;"
+                    else:
+                        state = SPACEHOLD
+                elif char == "<":
+                    state = XML
+                    outtext += char
+                else:
+                    sigcount += 1
+                    outtext += char
+            elif state == SPACE:
+                if char == " ":
+                    state = NBSP
+                    outtext += "&nbsp;"
+                elif char == "<":
+                    state = XML
+                    outtext += char
+                else:
+                    sigcount += 1
+                    state = NORMAL
+                    outtext += char
+            elif state == NBSP:
+                if char == " ":
+                    state = SPACE
+                elif char == "<":
+                    state = XML
+                else:
+                    sigcount += 1
+                    state = NORMAL
+                outtext += char
+            elif state == XML:
+                if char == ">":
+                    state = NORMAL
+                outtext += char
+            elif state == SPACEHOLD:
+                if char == " ":
+                    outtext += " &nbsp;"
+                    state = NBSP
+                elif char == "<":
+                    outtext += " "+char
+                    state = XML
+                else:
+                    outtext += " "+char
+                    sigcount += 1
+                    state = NORMAL
+        if state == SPACEHOLD:
+            outtext += " "   # flush a space still held at the end of the line
+    else:
+        # format == 0 flowed
+        for char in intext:
+            if char == '<' and state == NORMAL:
+                state = XML
+                outtext += char
+            elif char == '>' and state == XML:
+                state = NORMAL
+                outtext += char
+            elif state == XML:
+                outtext += char
+            else:
+                sigcount += 1
+                outtext += char
+    return [outtext, sigcount]
+
 #------------------------------------------------------------------------
 #
 # Document Backend class for html pages
--- a/src/plugins/webreport/NarrativeWeb.py
+++ b/src/plugins/webreport/NarrativeWeb.py
@ -60,6 +60,7 @@ from cStringIO import StringIO
 from textwrap import TextWrapper
 from unicodedata import normalize
 from collections import defaultdict
+import re

 import operator
 from decimal import Decimal
@ -104,7 +105,7 @@ from libhtml import Html

 # import styled notes from
 # src/plugins/lib/libhtmlbackend.py
-from libhtmlbackend import HtmlBackend
+from libhtmlbackend import HtmlBackend, process_spaces

 from libgedcom import make_gedcom_date
 from PlaceUtils import conv_lat_lon
@ -488,13 +489,6 @@ class BasePage(object):
        # return text of the note to its callers
        return text

-#################################################
-#
-# Will produce styled notes for NarrativeWeb by using:
-# src/plugins/lib/libhtmlbackend.py
-#
-#################################################
-
    def styled_note(self, styledtext, format, contains_html=False):
        """
        styledtext : assumed a StyledText object to write
@ -507,23 +501,37 @@ class BasePage(object):
            return ''

        s_tags = styledtext.get_tags()
-        #FIXME: following split should be regex to match \n\s*\n instead?
        markuptext = self._backend.add_markup_from_styled(text, s_tags,
-                                                         split='\n\n')
+                                                         split='\n')
        htmllist = Html("div", id = "grampsstylednote")
        if contains_html:
            htmllist.extend((Html('p') + text))
-        elif format == 1:
-            #preformatted, retain whitespace.
-            #so use \n\n for paragraph detection
-            htmllist += Html("pre", indent=None) + markuptext.split('\n\n')
-
-        elif format == 0:
-            #flowed, use \n\n for paragraph detection
-            htmllist.extend(
-                (Html('p') + para.split('\n'))
-                    for para in markuptext.split("\n\n"))
-
+        else:
+            linelist = []
+            linenb = 1
+            for line in markuptext.split('\n'):
+                [line, sigcount] = process_spaces(line, format)
+                if sigcount == 0:
+                    # The rendering of an empty paragraph '<p></p>'
+                    # is undefined so we use a non-breaking space
+                    if linenb == 1:
+                        linelist.append('&nbsp;')
+                    htmllist.extend(Html('p') + linelist)
+                    linelist = []
+                    linenb = 1
+                else:
+                    if linenb > 1:
+                        linelist[-1] += '<br>'
+                    linelist.append(line)
+                    linenb += 1
+            if linenb > 1:
+                htmllist.extend(Html('p') + linelist)
+            # if the last line was blank, then as well as outputting the previous para,
+            # which we have just done,
+            # we also output a new blank para
+            if sigcount == 0:
+                linelist = ["&nbsp;"]
+                htmllist.extend(Html('p') + linelist)
        return htmllist

    def dump_notes(self, notelist):