changeset 60:b7da3d97ea99

Add profile URL handlers. Silos register these handlers so that everybody knows if a hyperlink is a mention of another user on a particular social network. If any handler matches, silos not related to that social network will skip that link. It's possible that in rare cases we want that link everywhere, but so far I haven't needed it, compared to all the times I didn't want these links.
author Ludovic Chabant <ludovic@chabant.com>
date Sat, 28 Oct 2023 11:57:04 -0700
parents b404445082f8
children 91dea4fe31ec
files silorider/commands/process.py silorider/format.py silorider/silos/base.py silorider/silos/bluesky.py silorider/silos/facebook.py silorider/silos/mastodon.py silorider/silos/twitter.py
diffstat 7 files changed, 113 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/commands/process.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/commands/process.py	Sat Oct 28 11:57:04 2023 -0700
@@ -33,9 +33,17 @@
     def process(self):
         self.preProcess()
 
+        # Get all silos to return a profile URL handler.
+        profile_url_handlers = {}
+        for silo in self.ctx.silos:
+            handler = silo.getProfileUrlHandler()
+            if handler:
+                profile_url_handlers[silo.SILO_TYPE] = handler
+
+        postctx = SiloPostingContext(self.ctx, profile_url_handlers)
         feed = parse_url(self.url, self.name, self.config)
         for entry in feed.entries:
-            self.processEntry(entry)
+            self.processEntry(postctx, entry)
 
         self.postProcess()
 
@@ -46,6 +54,8 @@
         if self.ctx.args.until:
             self.ctx.args.until = dateparser.parse(self.ctx.args.until)
 
+        # Go over the silos needed for this command (i.e. potentially
+        # filtered by passing `-s`) and call their `onPostStart`.
         for silo in self.silos:
             silo.onPostStart(self.ctx)
 
@@ -53,7 +63,7 @@
         for silo in self.silos:
             silo.onPostEnd(self.ctx)
 
-    def processEntry(self, entry):
+    def processEntry(self, postctx, entry):
         entry_url = entry.get('url')
         if not entry_url:
             logger.warning("Found entry without a URL: %s" % repr(entry._mf_entry))
@@ -63,10 +73,10 @@
             logger.debug("Entry is filtered out: %s" % entry_url)
             return
 
-        postctx = SiloPostingContext(self.ctx)
         no_cache = self.ctx.args.no_cache
         only_since = self.ctx.args.since
         only_until = self.ctx.args.until
+
         logger.debug("Processing entry: %s" % entry_url)
         for silo in self.silos:
             if only_since or only_until:
--- a/silorider/format.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/format.py	Sat Oct 28 11:57:04 2023 -0700
@@ -11,11 +11,19 @@
 
 _disable_get_card_info = False
 
-def format_entry(entry, *, limit=None, card_props=None,
-                 add_url='auto', url_flattener=None, url_mode=None):
+
+def format_entry(entry, *,
+                 silo_name=None, silo_type=None,
+                 limit=None, card_props=None,
+                 add_url='auto', url_flattener=None,
+                 profile_url_handlers=None, url_mode=None):
     url = entry.url
 
     ctx = HtmlStrippingContext()
+    ctx.silo_name = silo_name
+    ctx.silo_type = silo_type
+    if profile_url_handlers:
+        ctx.profile_url_handler = ProfileUrlHandler(profile_url_handlers)
     if url_flattener:
         ctx.url_flattener = url_flattener
     if url_mode is not None:
@@ -94,6 +102,19 @@
         self.is_from = from_label
 
 
+class ProfileUrlHandler:
+    def __init__(self, handlers=None):
+        self.handlers = handlers
+
+    def handleUrl(self, text, url):
+        if self.handlers:
+            for name, handler in self.handlers.items():
+                res = handler.handleUrl(text, url)
+                if res:
+                    return name, res
+        return None, None
+
+
 class UrlFlattener:
     def replaceHref(self, text, url, ctx):
         raise NotImplementedError()
@@ -117,16 +138,22 @@
 
 class HtmlStrippingContext:
     def __init__(self):
+        # The name and type of the silo we are working for
+        self.silo_name = None
+        self.silo_type = None
         # Mode for inserting URLs
         self.url_mode = URLMODE_LAST
+        # Object that can handle profile links
+        self.profile_url_handler = ProfileUrlHandler()
+        # Object that can measure and shorten URLs
+        self.url_flattener = _NullUrlFlattener()
+        # Limit for how long the text can be
+        self.limit = -1
+
         # List of URLs to insert
         self.urls = []
         # Indices of URLs that should not get a leading whitespace
         self.nosp_urls = []
-        # Object that can measure and shorten URLs
-        self.url_flattener = _NullUrlFlattener()
-        # Limit for how long the text can be
-        self.limit = -1
 
         # Accumulated text length when accounting for shortened URLs
         self.text_length = 0
@@ -314,6 +341,18 @@
             a_txt = ''.join([_do_strip_html(c, ctx)
                              for c in cnts])
 
+        # See if the URL is a link to a social media profile. If so,
+        # we will want to strip it out for any silo that isn't for
+        # that social network platform.
+        name, new_txt = ctx.profile_url_handler.handleUrl(a_txt, href)
+        if name:
+            if ctx.silo_type == name:
+                # Correct silo, return the processed text.
+                return ctx.processText(new_txt, False)
+            else:
+                # Another silo, strip the link.
+                return a_txt
+
         # Use the URL flattener to reformat the hyperlink.
         old_text_length = ctx.text_length
         href_flattened = ctx.url_flattener.replaceHref(a_txt, href, ctx)
--- a/silorider/silos/base.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/silos/base.py	Sat Oct 28 11:57:04 2023 -0700
@@ -44,7 +44,14 @@
 
 
 class SiloPostingContext(SiloContextBase):
-    pass
+    def __init__(self, exec_ctx, profile_url_handlers=None):
+        SiloContextBase.__init__(self, exec_ctx)
+        self.profile_url_handlers = profile_url_handlers
+
+
+class SiloProfileUrlHandler:
+    def handleUrl(self, text, url):
+        return None
 
 
 class Silo:
@@ -73,7 +80,10 @@
         return self.ctx.cache.setCustomValue(full_name, val)
 
     def formatEntry(self, entry, *args, **kwargs):
-        return format_entry(entry, *args, **kwargs)
+        return format_entry(
+                entry,
+                silo_name=self.name, silo_type=self.SILO_TYPE,
+                *args, **kwargs)
 
     def authenticate(self, ctx):
         raise NotImplementedError()
@@ -81,6 +91,9 @@
     def onPostStart(self, ctx):
         pass
 
+    def getProfileUrlHandler(self):
+        return None
+
     def getEntryCard(self, entry, ctx):
         raise NotImplementedError()
 
--- a/silorider/silos/bluesky.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/silos/bluesky.py	Sat Oct 28 11:57:04 2023 -0700
@@ -97,6 +97,7 @@
             limit=300,
             # Use Twitter's meta properties
             card_props=CardProps('name', 'twitter'),
+            profile_url_handlers=ctx.profile_url_handlers,
             url_flattener=url_flattener,
             url_mode=URLMODE_ERASE)
         card.__bsky_url_flattener = url_flattener
--- a/silorider/silos/facebook.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/silos/facebook.py	Sat Oct 28 11:57:04 2023 -0700
@@ -135,7 +135,10 @@
             access_token=access_token)
 
     def getEntryCard(self, entry, ctx):
-        return self.formatEntry(entry, card_props=CardProps('property', 'og'))
+        return self.formatEntry(
+                entry,
+                card_props=CardProps('property', 'og'),
+                profile_url_handlers=ctx.profile_url_handlers)
 
     def mediaCallback(self, tmpfile, mt, url, desc):
         resp = self.client.post_object(
--- a/silorider/silos/mastodon.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/silos/mastodon.py	Sat Oct 28 11:57:04 2023 -0700
@@ -1,8 +1,9 @@
 import time
 import getpass
 import logging
+import urllib.parse
 import mastodon
-from .base import Silo
+from .base import Silo, SiloProfileUrlHandler
 from ..format import CardProps
 
 
@@ -111,6 +112,9 @@
             access_token=access_token,
             api_base_url=self.base_url)
 
+    def getProfileUrlHandler(self):
+        return MastodonProfileUrlHandler()
+
     def getEntryCard(self, entry, ctx):
         return self.formatEntry(
                 entry, limit=500,
@@ -147,3 +151,12 @@
                     continue
                 raise
 
+class MastodonProfileUrlHandler(SiloProfileUrlHandler):
+    def handleUrl(self, text, raw_url):
+        url = urllib.parse.urlparse(raw_url)
+        server_url = url.netloc
+        path = url.path.lstrip('/')
+        if path.startswith('@') and '/' not in path:
+            return '@%s%s' % (path, server_url)
+        return None
+
--- a/silorider/silos/twitter.py	Sat Oct 28 11:54:24 2023 -0700
+++ b/silorider/silos/twitter.py	Sat Oct 28 11:57:04 2023 -0700
@@ -2,7 +2,7 @@
 import logging
 import tweepy
 import urllib.parse
-from .base import Silo
+from .base import Silo, SiloProfileUrlHandler
 from ..format import CardProps, UrlFlattener
 from ..parse import strip_img_alt
 
@@ -99,11 +99,15 @@
             access_token_key=access_key,
             access_token_secret=access_secret)
 
+    def getProfileUrlHandler(self):
+        return TwitterProfileUrlHandler()
+
     def getEntryCard(self, entry, ctx):
         return self.formatEntry(
                 entry,
                 limit=280,
                 card_props=CardProps('name', 'twitter'),
+                profile_url_handlers=ctx.profile_url_handlers,
                 url_flattener=TwitterUrlFlattener())
 
     def mediaCallback(self, tmpfile, mt, url, desc):
@@ -121,6 +125,22 @@
 TWITTER_NETLOCS = ['twitter.com', 'www.twitter.com']
 
 
+class TwitterProfileUrlHandler(SiloProfileUrlHandler):
+    def handleUrl(self, text, raw_url):
+        url = urllib.parse.urlparse(raw_url)
+
+        # Is it a Twitter URL?
+        if url.netloc not in TWITTER_NETLOCS:
+            return None
+
+        path = url.path.lstrip('/')
+        # Is it a profile URL?
+        if '/' not in path:
+            return '@' + path
+
+        return None
+
+
 class TwitterUrlFlattener(UrlFlattener):
     def replaceHref(self, text, raw_url, ctx):
         url = urllib.parse.urlparse(raw_url)