changeset 42:67fde62e3862

Add new url mode, and text byte length. These are for the bluesky silo in the next commit.
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 20 Aug 2023 11:20:07 -0700
parents 1db1890edcf5
children ac5911e76ef2
files silorider/format.py
diffstat 1 files changed, 52 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/format.py	Sun Aug 20 11:14:31 2023 -0700
+++ b/silorider/format.py	Sun Aug 20 11:20:07 2023 -0700
@@ -5,12 +5,15 @@
 from .config import has_lxml
 
 
-def format_entry(entry, limit=None, add_url='auto', url_flattener=None):
+def format_entry(entry, limit=None, add_url='auto', url_flattener=None,
+                 url_mode=None):
     url = entry.url
 
     ctx = HtmlStrippingContext()
     if url_flattener:
         ctx.url_flattener = url_flattener
+    if url_mode is not None:
+        ctx.url_mode = url_mode
     # Don't add the limit yet.
 
     name = get_best_text(entry, ctx)
@@ -69,6 +72,7 @@
 URLMODE_INLINE = 0
 URLMODE_LAST = 1
 URLMODE_BOTTOM_LIST = 2
+URLMODE_ERASE = 3
 
 class HtmlStrippingContext:
     def __init__(self):
@@ -85,6 +89,8 @@
 
         # Accumulated text length when accounting for shortened URLs
         self.text_length = 0
+        # Same, but computed in bytes, as per UTF8 encoding
+        self.byte_length = 0
         # Whether limit was reached
         self.limit_reached = False
 
@@ -93,6 +99,7 @@
         next_text_length = self.text_length + added_len
         if self.limit <= 0 or next_text_length <= self.limit:
             self.text_length = next_text_length
+            self.byte_length += len(txt.encode())
             return txt
 
         if allow_shorten:
@@ -104,6 +111,7 @@
                 replace_whitespace=False,
                 placeholder="...")
             self.text_length += len(short_txt)
+            self.byte_length += len(short_txt.encode())
             self.limit_reached = True
             return short_txt
         else:
@@ -156,7 +164,7 @@
     outtxt = outtxt % urls
     if ctx.limit_reached:
         return outtxt
-    if ctx.url_mode != URLMODE_INLINE and ctx.urls:
+    if ctx.urls:
         if ctx.url_mode == URLMODE_LAST:
             # Don't add unnecessary whitespace.
             # NOTE: our final measure of the text might be one character
@@ -174,18 +182,26 @@
             else:
                 outtxt += '\n'
             outtxt += '\n'.join(ctx.urls)
+    # else, if url_mode is URLMODE_ERASE, don't do anything: we have
+    # removed the markers and don't need to add the URLs anywhere.
 
-    # Add the length of URLs to the text length.
-    for url in ctx.urls:
-        ctx.text_length += ctx.url_flattener.measureUrl(url)
-    # Add spaces and other extra characters to the text length.
-    if ctx.url_mode == URLMODE_INLINE:
-        # One space per URL except the explicitly no-space-urls.
-        ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
-    else:
-        # One space or newline per URL.
-        ctx.text_length += len(ctx.urls)
-
+    if ctx.url_mode != URLMODE_ERASE:
+        # Add the length of URLs to the text length.
+        for url in ctx.urls:
+            url_len = ctx.url_flattener.measureUrl(url)
+            ctx.text_length += url_len
+            ctx.byte_length += url_len
+        # Add spaces and other extra characters to the text length.
+        if ctx.url_mode == URLMODE_INLINE:
+            # One space per URL except the explicitly no-space-urls.
+            added_spaces = len(ctx.urls) - len(ctx.nosp_urls)
+            ctx.text_length += added_spaces
+            ctx.byte_length += added_spaces
+        else:
+            # One space or newline per URL.
+            added_spaces = len(ctx.urls)
+            ctx.text_length += added_spaces
+            ctx.byte_length += added_spaces
     return outtxt
 
 
@@ -226,28 +242,37 @@
             href = elem['href']
         except KeyError:
             href = None
+
+        # Get the text under the hyperlink.
         cnts = list(elem.contents)
         if len(cnts) == 1:
-            # Use the URL flattener to reformat the hyperlink.
-            href_txt = cnts[0].string
-            old_text_length = ctx.text_length
-            href_flattened = ctx.url_flattener.replaceHref(href_txt, href, ctx)
-            if href_flattened is not None:
-                # We have a reformatted URL. Use that, but check if the
-                # flattener computed a custom text length. If not, do the
-                # standard computation.
-                if ctx.text_length == old_text_length:
-                    return ctx.processText(href_flattened, False)
-                return href_flattened
+            a_txt = cnts[0].string
+        else:
+            a_txt = ''.join([_do_strip_html(c, ctx)
+                             for c in cnts])
 
-            # If we have a simple hyperlink where the text is a substring of
-            # the target URL, just return the URL.
-            if href_txt in href:
+        # Use the URL flattener to reformat the hyperlink.
+        old_text_length = ctx.text_length
+        href_flattened = ctx.url_flattener.replaceHref(a_txt, href, ctx)
+        if href_flattened is not None:
+            # We have a reformatted URL. Use that, but check if the
+            # flattener computed a custom text length. If not, do the
+            # standard computation.
+            if ctx.text_length == old_text_length:
+                return ctx.processText(href_flattened, False)
+            return href_flattened
+
+        # If we have a simple hyperlink where the text is a substring of
+        # the target URL, just return the URL.
+        if a_txt in href:
+            if ctx.url_mode != URLMODE_ERASE:
                 a_txt = '%%(url:%d)s' % len(ctx.urls)
                 ctx.nosp_urls.append(len(ctx.urls))
                 ctx.urls.append(href)
                 # No text length to add.
                 return a_txt
+            else:
+                return a_txt
 
         # No easy way to simplify this hyperlink... let's put a marker
         # for the URL to be later replaced in the text.