Mercurial > silorider
changeset 60:b7da3d97ea99
Add profile URL handlers
Silos register these handlers so that everybody knows if a hyperlink is
a mention of another user on a particular social network. If any handler
matches, silos not related to that social network will skip that link.
It's possible that in rare cases we want that link everywhere, but so
far I haven't needed it, compared to all the times I didn't want these
links.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Sat, 28 Oct 2023 11:57:04 -0700 |
parents | b404445082f8 |
children | 91dea4fe31ec |
files | silorider/commands/process.py silorider/format.py silorider/silos/base.py silorider/silos/bluesky.py silorider/silos/facebook.py silorider/silos/mastodon.py silorider/silos/twitter.py |
diffstat | 7 files changed, 113 insertions(+), 14 deletions(-) [+] |
line wrap: on
line diff
--- a/silorider/commands/process.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/commands/process.py Sat Oct 28 11:57:04 2023 -0700 @@ -33,9 +33,17 @@ def process(self): self.preProcess() + # Get all silos to return a profile URL handler. + profile_url_handlers = {} + for silo in self.ctx.silos: + handler = silo.getProfileUrlHandler() + if handler: + profile_url_handlers[silo.SILO_TYPE] = handler + + postctx = SiloPostingContext(self.ctx, profile_url_handlers) feed = parse_url(self.url, self.name, self.config) for entry in feed.entries: - self.processEntry(entry) + self.processEntry(postctx, entry) self.postProcess() @@ -46,6 +54,8 @@ if self.ctx.args.until: self.ctx.args.until = dateparser.parse(self.ctx.args.until) + # Go over the silos needed for this command (i.e. potentially + # filtered by passing `-s`) and call their `onPostStart`. for silo in self.silos: silo.onPostStart(self.ctx) @@ -53,7 +63,7 @@ for silo in self.silos: silo.onPostEnd(self.ctx) - def processEntry(self, entry): + def processEntry(self, postctx, entry): entry_url = entry.get('url') if not entry_url: logger.warning("Found entry without a URL: %s" % repr(entry._mf_entry)) @@ -63,10 +73,10 @@ logger.debug("Entry is filtered out: %s" % entry_url) return - postctx = SiloPostingContext(self.ctx) no_cache = self.ctx.args.no_cache only_since = self.ctx.args.since only_until = self.ctx.args.until + logger.debug("Processing entry: %s" % entry_url) for silo in self.silos: if only_since or only_until:
--- a/silorider/format.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/format.py Sat Oct 28 11:57:04 2023 -0700 @@ -11,11 +11,19 @@ _disable_get_card_info = False -def format_entry(entry, *, limit=None, card_props=None, - add_url='auto', url_flattener=None, url_mode=None): + +def format_entry(entry, *, + silo_name=None, silo_type=None, + limit=None, card_props=None, + add_url='auto', url_flattener=None, + profile_url_handlers=None, url_mode=None): url = entry.url ctx = HtmlStrippingContext() + ctx.silo_name = silo_name + ctx.silo_type = silo_type + if profile_url_handlers: + ctx.profile_url_handler = ProfileUrlHandler(profile_url_handlers) if url_flattener: ctx.url_flattener = url_flattener if url_mode is not None: @@ -94,6 +102,19 @@ self.is_from = from_label +class ProfileUrlHandler: + def __init__(self, handlers=None): + self.handlers = handlers + + def handleUrl(self, text, url): + if self.handlers: + for name, handler in self.handlers.items(): + res = handler.handleUrl(text, url) + if res: + return name, res + return None, None + + class UrlFlattener: def replaceHref(self, text, url, ctx): raise NotImplementedError() @@ -117,16 +138,22 @@ class HtmlStrippingContext: def __init__(self): + # The name and type of the silo we are working for + self.silo_name = None + self.silo_type = None # Mode for inserting URLs self.url_mode = URLMODE_LAST + # Object that can handle profile links + self.profile_url_handler = ProfileUrlHandler() + # Object that can measure and shorten URLs + self.url_flattener = _NullUrlFlattener() + # Limit for how long the text can be + self.limit = -1 + # List of URLs to insert self.urls = [] # Indices of URLs that should not get a leading whitespace self.nosp_urls = [] - # Object that can measure and shorten URLs - self.url_flattener = _NullUrlFlattener() - # Limit for how long the text can be - self.limit = -1 # Accumulated text length when accounting for shortened URLs self.text_length = 0 @@ -314,6 +341,18 @@ a_txt = 
''.join([_do_strip_html(c, ctx) for c in cnts]) + # See if the URL is a link to a social media profile. If so, + # we will want to strip it out for any silo that isn't for + # that social network platform. + name, new_txt = ctx.profile_url_handler.handleUrl(a_txt, href) + if name: + if ctx.silo_type == name: + # Correct silo, return the processed text. + return ctx.processText(new_txt, False) + else: + # Another silo, strip the link. + return a_txt + # Use the URL flattener to reformat the hyperlink. old_text_length = ctx.text_length href_flattened = ctx.url_flattener.replaceHref(a_txt, href, ctx)
--- a/silorider/silos/base.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/silos/base.py Sat Oct 28 11:57:04 2023 -0700 @@ -44,7 +44,14 @@ class SiloPostingContext(SiloContextBase): - pass + def __init__(self, exec_ctx, profile_url_handlers=None): + SiloContextBase.__init__(self, exec_ctx) + self.profile_url_handlers = profile_url_handlers + + +class SiloProfileUrlHandler: + def handleUrl(self, text, url): + return None class Silo: @@ -73,7 +80,10 @@ return self.ctx.cache.setCustomValue(full_name, val) def formatEntry(self, entry, *args, **kwargs): - return format_entry(entry, *args, **kwargs) + return format_entry( + entry, + silo_name=self.name, silo_type=self.SILO_TYPE, + *args, **kwargs) def authenticate(self, ctx): raise NotImplementedError() @@ -81,6 +91,9 @@ def onPostStart(self, ctx): pass + def getProfileUrlHandler(self): + return None + def getEntryCard(self, entry, ctx): raise NotImplementedError()
--- a/silorider/silos/bluesky.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/silos/bluesky.py Sat Oct 28 11:57:04 2023 -0700 @@ -97,6 +97,7 @@ limit=300, # Use Twitter's meta properties card_props=CardProps('name', 'twitter'), + profile_url_handlers=ctx.profile_url_handlers, url_flattener=url_flattener, url_mode=URLMODE_ERASE) card.__bsky_url_flattener = url_flattener
--- a/silorider/silos/facebook.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/silos/facebook.py Sat Oct 28 11:57:04 2023 -0700 @@ -135,7 +135,10 @@ access_token=access_token) def getEntryCard(self, entry, ctx): - return self.formatEntry(entry, card_props=CardProps('property', 'og')) + return self.formatEntry( + entry, + card_props=CardProps('property', 'og'), + profile_url_handlers=ctx.profile_url_handlers) def mediaCallback(self, tmpfile, mt, url, desc): resp = self.client.post_object(
--- a/silorider/silos/mastodon.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/silos/mastodon.py Sat Oct 28 11:57:04 2023 -0700 @@ -1,8 +1,9 @@ import time import getpass import logging +import urllib.parse import mastodon -from .base import Silo +from .base import Silo, SiloProfileUrlHandler from ..format import CardProps @@ -111,6 +112,9 @@ access_token=access_token, api_base_url=self.base_url) + def getProfileUrlHandler(self): + return MastodonProfileUrlHandler() + def getEntryCard(self, entry, ctx): return self.formatEntry( entry, limit=500, @@ -147,3 +151,12 @@ continue raise +class MastodonProfileUrlHandler(SiloProfileUrlHandler): + def handleUrl(self, text, raw_url): + url = urllib.parse.urlparse(raw_url) + server_url = url.netloc + path = url.path.lstrip('/') + if path.startswith('@') and '/' not in path: + return '@%s%s' % (path, server_url) + return None +
--- a/silorider/silos/twitter.py Sat Oct 28 11:54:24 2023 -0700 +++ b/silorider/silos/twitter.py Sat Oct 28 11:57:04 2023 -0700 @@ -2,7 +2,7 @@ import logging import tweepy import urllib.parse -from .base import Silo +from .base import Silo, SiloProfileUrlHandler from ..format import CardProps, UrlFlattener from ..parse import strip_img_alt @@ -99,11 +99,15 @@ access_token_key=access_key, access_token_secret=access_secret) + def getProfileUrlHandler(self): + return TwitterProfileUrlHandler() + def getEntryCard(self, entry, ctx): return self.formatEntry( entry, limit=280, card_props=CardProps('name', 'twitter'), + profile_url_handlers=ctx.profile_url_handlers, url_flattener=TwitterUrlFlattener()) def mediaCallback(self, tmpfile, mt, url, desc): @@ -121,6 +125,22 @@ TWITTER_NETLOCS = ['twitter.com', 'www.twitter.com'] +class TwitterProfileUrlHandler(SiloProfileUrlHandler): + def handleUrl(self, text, raw_url): + url = urllib.parse.urlparse(raw_url) + + # Is it a Twitter URL? + if url.netloc not in TWITTER_NETLOCS: + return None + + path = url.path.lstrip('/') + # Is it a profile URL? + if '/' not in path: + return '@' + path + + return None + + class TwitterUrlFlattener(UrlFlattener): def replaceHref(self, text, raw_url, ctx): url = urllib.parse.urlparse(raw_url)