# HG changeset patch
# User Ludovic Chabant <ludovic@chabant.com>
# Date 1684687200 25200
# Node ID 0f98784bcc40e6ec0ef98140b7b7d1870c1c12e8
# Parent  b2cacc853680576d594e99eb0a70aa85f4699be6
Improve handling of whitespace and paragraphs in html stripping code

diff -r b2cacc853680 -r 0f98784bcc40 silorider/format.py
--- a/silorider/format.py	Sun May 21 09:39:35 2023 -0700
+++ b/silorider/format.py	Sun May 21 09:40:00 2023 -0700
@@ -193,26 +193,30 @@
     return txt.replace('%', '%%')
 
 
+tags_valid_for_whitespace = {
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+    'p'
+}
+
+
 def _do_strip_html(elem, ctx):
     if isinstance(elem, bs4.NavigableString):
-        # Don't necessarily include this bit of text...
-        # If it belongs to a paragraph, include it. If not, include it
-        # only if there are not paragraphs in its siblings (because that
-        # means this is the white-space between the paragraph tags)
-        include_this = False
-        for parent in elem.parents:
-            if parent and parent.name == 'p':
-                include_this = True
-                break
-        else:
-            next_sib = next(elem.next_siblings, None)
-            prev_sib = next(elem.previous_siblings, None)
-            if ((prev_sib is None or prev_sib.name != 'p') and
-                (next_sib is None or next_sib.name != 'p')):
-                include_this = True
+        # We have some text.
+        # We generally include this text without any alteration except when
+        # the string is entirely whitespace. In that case, we only include
+        # it if it's inside a valid text tag like <p>. Otherwise, it's
+        # most likely whitespace inside html markup, such as indenting and
+        # newlines between html tags.
+        include_this = True
+        raw_txt = str(elem)
+        if raw_txt.isspace():
+            include_this = False
+            for p in elem.parents:
+                if p and p.name in tags_valid_for_whitespace:
+                    include_this = True
+                    break
 
         if include_this:
-            raw_txt = str(elem)
             return _escape_percents(ctx.processText(raw_txt))
         else:
             return ''
@@ -270,6 +274,16 @@
                 outtxt += '\n'
         return ctx.processText(outtxt)
 
+    if elem.name == 'p':
+        # Add a newline before starting a paragraph only if this isn't
+        # the first paragraph or piece of content.
+        p_txt = ''
+        if ctx.text_length > 0:
+            p_txt = '\n'
+        for c in elem.children:
+            p_txt += _do_strip_html(c, ctx)
+        return p_txt
+
     return ''.join([_do_strip_html(c, ctx) for c in elem.children])