changeset 38:0f98784bcc40

Improve handling of whitespace and paragraphs in html stripping code
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 21 May 2023 09:40:00 -0700
parents b2cacc853680
children c5f73ebb43a5
files silorider/format.py
diffstat 1 files changed, 30 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/format.py	Sun May 21 09:39:35 2023 -0700
+++ b/silorider/format.py	Sun May 21 09:40:00 2023 -0700
@@ -193,26 +193,30 @@
     return txt.replace('%', '%%')
 
 
+tags_valid_for_whitespace = {
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+    'p'
+}
+
+
 def _do_strip_html(elem, ctx):
     if isinstance(elem, bs4.NavigableString):
-        # Don't necessarily include this bit of text...
-        # If it belongs to a paragraph, include it. If not, include it
-        # only if there are not paragraphs in its siblings (because that
-        # means this is the white-space between the paragraph tags)
-        include_this = False
-        for parent in elem.parents:
-            if parent and parent.name == 'p':
-                include_this = True
-                break
-        else:
-            next_sib = next(elem.next_siblings, None)
-            prev_sib = next(elem.previous_siblings, None)
-            if ((prev_sib is None or prev_sib.name != 'p') and
-                (next_sib is None or next_sib.name != 'p')):
-                include_this = True
+        # We have some text.
+        # We generally include this text without any alteration except when
+        # the string is entirely whitespace. In that case, we only include
+        # it if it's inside a valid text tag like <p>. Otherwise, it's
+        # most likely whitespace inside html markup, such as indenting and
+        # newlines between html tags.
+        include_this = True
+        raw_txt = str(elem)
+        if raw_txt.isspace():
+            include_this = False
+            for p in elem.parents:
+                if p and p.name in tags_valid_for_whitespace:
+                    include_this = True
+                    break
 
         if include_this:
-            raw_txt = str(elem)
             return _escape_percents(ctx.processText(raw_txt))
         else:
             return ''
@@ -270,6 +274,16 @@
                 outtxt += '\n'
         return ctx.processText(outtxt)
 
+    if elem.name == 'p':
+        # Add a newline before starting a paragraph only if this isn't
+        # the first paragraph or piece of content.
+        p_txt = ''
+        if ctx.text_length > 0:
+            p_txt = '\n'
+        for c in elem.children:
+            p_txt += _do_strip_html(c, ctx)
+        return p_txt
+
     return ''.join([_do_strip_html(c, ctx) for c in elem.children])