Mercurial > silorider
changeset 68:c678ca85cc25
Fixes for url flatteners
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Tue, 26 Dec 2023 16:32:17 -0800 |
parents | c5bf03406a33 |
children | dafbbf25bfc8 |
files | silorider/format.py silorider/silos/bluesky.py |
diffstat | 2 files changed, 58 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/silorider/format.py Tue Dec 26 16:30:36 2023 -0800
+++ b/silorider/format.py Tue Dec 26 16:32:17 2023 -0800
@@ -74,6 +74,7 @@
                 ctx.limit = limit
                 if url_flattener:
                     ctx.url_flattener = url_flattener
+                    url_flattener.reset()
                 card.text = get_best_text(entry, ctx)
             else:
                 # We need to shorten the blurb! We can't do much else besides
@@ -81,7 +82,12 @@
                 card.text = card.text[:limit]

     # Actually add the url to the original post now.
+    # We pass it through the URL flattener in case it needs to do extra
+    # stuff with it (for instance the Bluesky silo will remember the
+    # byte offsets to insert a hyperlink).
     if do_add_url and url:
+        ctx.reportAddedText(1)
+        url = _process_end_url(url, ctx)
         card.text += ' ' + url
     return card

@@ -122,6 +128,9 @@
     def measureUrl(self, url):
         raise NotImplementedError()

+    def reset(self):
+        pass
+

 class _NullUrlFlattener(UrlFlattener):
     def replaceHref(self, text, url, ctx):
@@ -156,36 +165,56 @@
         self.nosp_urls = []

         # Accumulated text length when accounting for shortened URLs
-        self.text_length = 0
+        self._text_length = 0
         # Same, but computed in bytes, as per UTF8 encoding
-        self.byte_length = 0
+        self._byte_length = 0
         # Whether limit was reached
-        self.limit_reached = False
+        self._limit_reached = False
+
+    @property
+    def text_length(self):
+        return self._text_length
+
+    @property
+    def byte_length(self):
+        return self._byte_length
+
+    @property
+    def limit_reached(self):
+        return self._limit_reached

     def processText(self, txt, allow_shorten=True):
         added_len = len(txt)
-        next_text_length = self.text_length + added_len
+        next_text_length = self._text_length + added_len
         if self.limit <= 0 or next_text_length <= self.limit:
-            self.text_length = next_text_length
-            self.byte_length += len(txt.encode())
+            self._text_length = next_text_length
+            self._byte_length += len(txt.encode())
             return txt

         if allow_shorten:
-            max_allowed = self.limit - self.text_length
+            max_allowed = self.limit - self._text_length
             short_txt = textwrap.shorten(
                 txt,
                 width=max_allowed,
                 expand_tabs=False,
                 replace_whitespace=False,
                 placeholder="...")
-            self.text_length += len(short_txt)
-            self.byte_length += len(short_txt.encode())
-            self.limit_reached = True
+            self._text_length += len(short_txt)
+            self._byte_length += len(short_txt.encode())
+            self._limit_reached = True
             return short_txt
         else:
-            self.limit_reached = True
+            self._limit_reached = True
             return ''

+    def reportSetText(self, charlen, bytelen=None):
+        self._text_length = charlen
+        self._byte_length = bytelen if bytelen is not None else charlen
+
+    def reportAddedText(self, added_chars, added_bytes=None):
+        self._text_length += added_chars
+        self._byte_length += added_bytes if added_bytes is not None else added_chars
+

 def get_best_text(entry, ctx=None, *, plain=True):
     elem = entry.htmlFind(class_='p-title')
@@ -220,7 +249,7 @@
         if desc:
             logger.debug("Found card info, description: %s (image: %s)" %
                          (desc, img))
-            ctx.text_length = len(desc)
+            ctx.reportSetText(len(desc), len(desc.encode('utf8')))
             return CardInfo(entry, desc, img, 'card')

     return None
@@ -261,7 +290,7 @@
         # too long because of this, but that's desirable.
         if outtxt[-1] not in string.whitespace:
             outtxt += ' '
-        outtxt += ' '.join(ctx.urls)
+        outtxt += ' '.join([_process_end_url(url, ctx) for url in ctx.urls])
     elif ctx.url_mode == URLMODE_BOTTOM_LIST:
         # If the last character of the text is a whitespace, replace
         # it with a newline.
@@ -271,30 +300,33 @@
             outtxt = outtxt[:-1] + '\n'
         else:
             outtxt += '\n'
-        outtxt += '\n'.join(ctx.urls)
+        outtxt += '\n'.join([_process_end_url(url, ctx) for url in ctx.urls])
     # else, if url_mode is URLMODE_ERASE, don't do anything: we have
     # removed the markers and don't need to add the URLs anywhere.

+    # TODO: if using URLMODE_INLINE we don't process the URLs via the flatterners
     if ctx.url_mode != URLMODE_ERASE:
         # Add the length of URLs to the text length.
         for url in ctx.urls:
             url_len = ctx.url_flattener.measureUrl(url)
-            ctx.text_length += url_len
-            ctx.byte_length += url_len
+            ctx.reportAddedText(url_len)
         # Add spaces and other extra characters to the text length.
         if ctx.url_mode == URLMODE_INLINE:
             # One space per URL except the explicitly no-space-urls.
             added_spaces = len(ctx.urls) - len(ctx.nosp_urls)
-            ctx.text_length += added_spaces
-            ctx.byte_length += added_spaces
+            ctx.reportAddedText(added_spaces)
         else:
             # One space or newline per URL.
             added_spaces = len(ctx.urls)
-            ctx.text_length += added_spaces
-            ctx.byte_length += added_spaces
+            ctx.reportAddedText(added_spaces)
     return outtxt


+def _process_end_url(url, ctx):
+    new_url = ctx.url_flattener.replaceHref(url, url, ctx)
+    return new_url if new_url is not None else url
+
+
 def _escape_percents(txt):
     return txt.replace('%', '%%')

@@ -354,15 +386,10 @@
             return a_txt

         # Use the URL flattener to reformat the hyperlink.
-        old_text_length = ctx.text_length
         href_flattened = ctx.url_flattener.replaceHref(a_txt, href, ctx)
         if href_flattened is not None:
-            # We have a reformatted URL. Use that, but check if the
-            # flattener computed a custom text length. If not, do the
-            # standard computation.
-            if ctx.text_length == old_text_length:
-                return ctx.processText(href_flattened, False)
-            return href_flattened
+            # We have a reformatted URL, use that.
+            return ctx.processText(href_flattened, False)

         # If we have a simple hyperlink where the text is a substring of
         # the target URL, just return the URL.
```