Mercurial > silorider
changeset 38:0f98784bcc40
Improve handling of whitespace and paragraphs in HTML-stripping code
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Sun, 21 May 2023 09:40:00 -0700 |
parents | b2cacc853680 |
children | c5f73ebb43a5 |
files | silorider/format.py |
diffstat | 1 files changed, 30 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- a/silorider/format.py Sun May 21 09:39:35 2023 -0700 +++ b/silorider/format.py Sun May 21 09:40:00 2023 -0700 @@ -193,26 +193,30 @@ return txt.replace('%', '%%') +tags_valid_for_whitespace = { + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'p' +} + + def _do_strip_html(elem, ctx): if isinstance(elem, bs4.NavigableString): - # Don't necessarily include this bit of text... - # If it belongs to a paragraph, include it. If not, include it - # only if there are not paragraphs in its siblings (because that - # means this is the white-space between the paragraph tags) - include_this = False - for parent in elem.parents: - if parent and parent.name == 'p': - include_this = True - break - else: - next_sib = next(elem.next_siblings, None) - prev_sib = next(elem.previous_siblings, None) - if ((prev_sib is None or prev_sib.name != 'p') and - (next_sib is None or next_sib.name != 'p')): - include_this = True + # We have some text. + # We generally include this text without any alteration except when + # the string is entirely whitespace. In that case, we only include + # it if it's inside a valid text tag like <p>. Otherwise, it's + # most likely whitespace inside html markup, such as indenting and + # newlines between html tags. + include_this = True + raw_txt = str(elem) + if raw_txt.isspace(): + include_this = False + for p in elem.parents: + if p and p.name in tags_valid_for_whitespace: + include_this = True + break if include_this: - raw_txt = str(elem) return _escape_percents(ctx.processText(raw_txt)) else: return '' @@ -270,6 +274,16 @@ outtxt += '\n' return ctx.processText(outtxt) + if elem.name == 'p': + # Add a newline before starting a paragraph only if this isn't + # the first paragraph or piece of content. + p_txt = '' + if ctx.text_length > 0: + p_txt = '\n' + for c in elem.children: + p_txt += _do_strip_html(c, ctx) + return p_txt + return ''.join([_do_strip_html(c, ctx) for c in elem.children])