changeset 48:486affad656e

Rewrite posting process with card system and more structured API

- The basic posting process is more opinionated so that silos have less code to write, and dry-run posting is handled by default.
- Add "card" system where SiloRider is able to fetch the page of a post to check for any custom summary or featured image. This is how Twitter, Facebook, Discord, and many others come up with their "preview card" when a link is posted.
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 08 Oct 2023 13:47:28 -0700
parents 4be321bd5ed4
children 2c433f19e467
files silorider/commands/process.py silorider/format.py silorider/silos/base.py silorider/silos/bluesky.py silorider/silos/mastodon.py silorider/silos/twitter.py tests/test_format.py tests/test_silos_bluesky.py
diffstat 8 files changed, 186 insertions(+), 112 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/commands/process.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/silorider/commands/process.py	Sun Oct 08 13:47:28 2023 -0700
@@ -1,7 +1,7 @@
 import logging
 import dateparser
 from .utils import get_named_silos, get_named_urls
-from ..silos.base import SiloPostingContext
+from ..silos.base import SiloPostingContext, upload_silo_media
 from ..parse import parse_url
 
 
@@ -98,9 +98,20 @@
                              (silo.name, entry_url))
                 continue
 
+            entry_card = silo.getEntryCard(entry, postctx)
+            if not entry_card:
+                logger.error("Can't find any content to use for entry: %s" % entry_url)
+                continue
+
+            media_callback = silo.mediaCallback
+            if self.ctx.args.dry_run:
+                media_callback = silo.dryRunMediaCallback
+            media_ids = upload_silo_media(entry_card, 'photo', media_callback)
+
             if not self.ctx.args.dry_run:
+                logger.debug("Posting to '%s': %s" % (silo.name, entry_url))
                 try:
-                    did_post = silo.postEntry(entry, postctx)
+                    did_post = silo.postEntry(entry_card, media_ids, postctx)
                 except Exception as ex:
                     did_post = False
                     logger.error("Error posting: %s" % entry_url)
@@ -110,9 +121,8 @@
                 if did_post is True or did_post is None:
                     self.ctx.cache.addPost(silo.name, entry_url)
             else:
-                logger.info("Would post entry on %s: %s" %
-                            (silo.name, entry_url))
-                silo.dryRunPostEntry(entry, postctx)
+                logger.info("Would post to '%s': %s" % (silo.name, entry_url))
+                silo.dryRunPostEntry(entry_card, media_ids, postctx)
 
     def isEntryFiltered(self, entry):
         if not self.config.has_section('filter'):
--- a/silorider/format.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/silorider/format.py	Sun Oct 08 13:47:28 2023 -0700
@@ -1,12 +1,17 @@
 import re
 import string
+import logging
+import urllib.request
 import textwrap
 import bs4
 from .config import has_lxml
 
 
-def format_entry(entry, limit=None, add_url='auto', url_flattener=None,
-                 url_mode=None):
+logger = logging.getLogger(__name__)
+
+
+def format_entry(entry, *, limit=None, card_props=None,
+                 add_url='auto', url_flattener=None, url_mode=None):
     url = entry.url
 
     ctx = HtmlStrippingContext()
@@ -16,10 +21,24 @@
         ctx.url_mode = url_mode
     # Don't add the limit yet.
 
-    name = get_best_text(entry, ctx)
-    if not name:
+    card = None
+
+    # See if we can use a nice blurb for articles instead of their title.
+    if card_props and not entry.is_micropost:
+         card = get_card_info(entry, card_props, ctx)
+
+    # Otherwise, find the best text, generally the title of the article, or the
+    # text of the micropost.
+    if card is None:
+        best_text = get_best_text(entry, ctx)
+        if best_text:
+            card = CardInfo(entry, best_text, None, 'best_text')
+
+    if not card:
         raise Exception("Can't find best text for entry: %s" % url)
 
+    # We need to add the URL to the output if we were told to, or if we
+    # are dealing with an article.
     do_add_url = ((add_url is True) or
                   (add_url == 'auto' and not entry.is_micropost))
     if limit:
@@ -31,26 +50,47 @@
 
         shortened = text_length > limit
         if shortened:
-            # We need to shorten the text! We can't really reason about it
-            # anymore at this point because we could have URLs inside the
-            # text that don't measure the correct number of characters
-            # (such as with Twitter's URL shortening). Let's just start
-            # again with a limit that's our max limit, minus the room
-            # needed to add the link to the post.
             if not do_add_url and add_url == 'auto' and url:
                 do_add_url = True
                 limit -= 1 + ctx.url_flattener.measureUrl(url)
 
-            ctx = HtmlStrippingContext()
-            ctx.limit = limit
-            if url_flattener:
-                ctx.url_flattener = url_flattener
-            name = get_best_text(entry, ctx)
+            if card.is_from == 'best_text':
+                # We need to shorten the text! We can't really reason about it
+                # anymore at this point because we could have URLs inside the
+                # text that don't measure the correct number of characters
+                # (such as with Twitter's URL shortening). Let's just start
+                # again with a limit that's our max limit, minus the room
+                # needed to add the link to the post.
+                ctx = HtmlStrippingContext()
+                ctx.limit = limit
+                if url_flattener:
+                    ctx.url_flattener = url_flattener
+                card.text = get_best_text(entry, ctx)
+            else:
+                # We need to shorten the blurb! We can't do much else besides
+                # truncate it...
+                card.text = card.text[:limit]
 
     # Actually add the url to the original post now.
     if do_add_url and url:
-        name += ' ' + url
-    return name
+        card.text += ' ' + url
+    return card
+
+
+class CardProps:
+    def __init__(self, meta_attr, namespace):
+        self.meta_attr = meta_attr
+        self.namespace = namespace
+        self.description = '%s:description' % namespace
+        self.image = '%s:image' % namespace
+
+
+class CardInfo:
+    def __init__(self, entry, txt, img, from_label=None):
+        self.entry = entry
+        self.text = txt
+        self.image = img
+        self.is_from = from_label
 
 
 class UrlFlattener:
@@ -135,6 +175,28 @@
     return None
 
 
+def get_card_info(entry, card_props, ctx):
+    logger.debug("Downloading entry page to check meta entries: %s" % entry.url)
+    with urllib.request.urlopen(entry.url) as req:
+        raw_html = req.read()
+
+    bs_html = bs4.BeautifulSoup(raw_html,
+            'lxml' if has_lxml else 'html5lib')
+    head = bs_html.find('head')
+
+    desc_meta = head.find('meta', attrs={card_props.meta_attr: card_props.description})
+    desc = desc_meta.attrs.get('content') if desc_meta else None
+
+    img_meta = head.find('meta', attrs={card_props.meta_attr: card_props.image})
+    img = img_meta.attrs.get('content') if img_meta else None
+
+    if desc:
+        logger.debug("Found card info, description: %s (image: %s)" % (desc, img))
+        ctx.text_length = len(desc)
+        return CardInfo(entry, desc, img, 'card')
+    return None
+
+
 def strip_html(bs_elem, ctx=None):
     if isinstance(bs_elem, str):
         bs_elem = bs4.BeautifulSoup(bs_elem,
--- a/silorider/silos/base.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/silorider/silos/base.py	Sun Oct 08 13:47:28 2023 -0700
@@ -23,6 +23,10 @@
         return self.exec_ctx.args
 
     @property
+    def dry_run(self):
+        return self.exec_ctx.args.dry_run
+
+    @property
     def config(self):
         return self.exec_ctx.config
 
@@ -73,11 +77,21 @@
     def onPostStart(self, ctx):
         pass
 
-    def postEntry(self, entry, ctx):
+    def getEntryCard(self, entry, ctx):
+        raise NotImplementedError()
+
+    def mediaCallback(self, tmpfile, mimetype, url, desc):
         raise NotImplementedError()
 
-    def dryRunPostEntry(self, entry, ctx):
-        pass
+    def postEntry(self, entry_card, media_ids, ctx):
+        raise NotImplementedError()
+
+    def dryRunMediaCallback(self, tmpfile, mimetype, url, desc):
+        return (url, desc)
+
+    def dryRunPostEntry(self, entry_card, media_ids, ctx):
+        logger.info(entry_card.text)
+        logger.info("...with photos: %s" % media_ids)
 
     def onPostEnd(self, ctx):
         pass
@@ -120,11 +134,19 @@
     return silos
 
 
-def upload_silo_media(entry, propname, callback):
+def upload_silo_media(card, propname, callback):
     # The provided callback must take the parameters:
     #  tmpfile path, mimetype, original media url, media description
+
+    # Upload and use forced image, if any.
+    if card.image:
+        mid = _do_upload_silo_media(card.image, None, callback)
+        if mid is not None:
+            return [mid]
+
+    # Look for media in the body of the original post.
     media_ids = None
-    media_entries = entry.get(propname, [], force_list=True)
+    media_entries = card.entry.get(propname, [], force_list=True)
     if media_entries:
         media_ids = []
         for media_entry in media_entries:
@@ -132,6 +154,7 @@
             mid = _do_upload_silo_media(url, desc, callback)
             if mid is not None:
                 media_ids.append(mid)
+
     return media_ids
 
 
--- a/silorider/silos/bluesky.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/silorider/silos/bluesky.py	Sun Oct 08 13:47:28 2023 -0700
@@ -5,8 +5,8 @@
 import getpass
 import logging
 import datetime
-from .base import Silo, upload_silo_media
-from ..format import UrlFlattener, URLMODE_ERASE
+from .base import Silo
+from ..format import CardProps, UrlFlattener, URLMODE_ERASE
 
 import atproto
 import atproto.xrpc_client.models as atprotomodels
@@ -19,7 +19,7 @@
     def __init__(self, *args, **kwargs):
         atproto.Client.__init__(self, *args, **kwargs)
 
-    def send_post(self, text, embed=None, facets=None):
+    def send_post(self, text, *, post_datetime=None, embed=None, facets=None):
         # Override the atproto.Client send_post function because it
         # doesn't support facets yet. The code is otherwise more or
         # less identical.
@@ -73,30 +73,43 @@
                                 self.ctx.silo_name)
             self.client.login(email, password)
 
-    def postEntry(self, entry, ctx):
+    def getEntryCard(self, entry, ctx):
         # We use URLMODE_ERASE to remove all hyperlinks from the
         # formatted text, and we later add them as facets to the atproto
         # record.
         url_flattener = BlueskyUrlFlattener()
-        posttxt = self.formatEntry(
+        card = self.formatEntry(
             entry,
-            limit=256,
+            limit=300,
+            # Use Twitter's meta properties
+            card_props=CardProps('name', 'twitter'),
             url_flattener=url_flattener,
             url_mode=URLMODE_ERASE)
-        if not posttxt:
-            raise Exception("Can't find any content to use for the post!")
+        card.__bsky_url_flattener = url_flattener
+        return card
+
+    def mediaCallback(self, tmpfile, mt, url, desc):
+        with open(tmpfile, 'rb') as tmpfp:
+            data = tmpfp.read()
 
-        # Upload the images as blobs and add them as an embed on the
-        # atproto record.
-        images = upload_silo_media(entry, 'photo', self._media_callback)
+        logger.debug("Uploading image to Bluesky (%d bytes) with description: %s" %
+                     (len(data), desc))
+        upload = self.client.com.atproto.repo.upload_blob(data)
 
+        if desc is None:
+            desc = ""
+        return atprotomodels.AppBskyEmbedImages.Image(alt=desc, image=upload.blob)
+
+    def postEntry(self, entry_card, media_ids, ctx):
+        # Add images as an embed on the atproto record.
         embed = None
-        if images:
-            embed = atprotomodels.AppBskyEmbedImages.Main(images=images)
+        if media_ids:
+            embed = atprotomodels.AppBskyEmbedImages.Main(images=media_ids)
 
         # Grab any URLs detected by our URL flattener and add them as
         # facets on the atproto record.
         facets = None
+        url_flattener = entry_card.__bsky_url_flattener
         if url_flattener.urls:
             facets = []
             for url_info in url_flattener.urls:
@@ -113,27 +126,10 @@
                 facets.append(facet)
 
         # Create the record!
-        self.client.send_post(text=posttxt, embed=embed, facets=facets)
-
-    def dryRunPostEntry(self, entry, ctx):
-        posttxt = self.formatEntry(entry, limit=256)
-        logger.info("Post would be:")
-        logger.info(posttxt)
-        media_urls = entry.get('photo', [], force_list=True)
-        if media_urls:
-            logger.info("...with photos: %s" % str(media_urls))
-
-    def _media_callback(self, tmpfile, mt, url, desc):
-        with open(tmpfile, 'rb') as tmpfp:
-            data = tmpfp.read()
-
-        logger.debug("Uploading image to Bluesky (%d bytes) with description: %s" %
-                     (len(data), desc))
-        upload = self.client.com.atproto.repo.upload_blob(data)
-
-        if desc is None:
-            desc = ""
-        return atprotomodels.AppBskyEmbedImages.Image(alt=desc, image=upload.blob)
+        entry_dt = entry_card.entry.get('published')
+        self.client.send_post(
+                text=entry_card.text, post_datetime=entry_dt, embed=embed,
+                facets=facets)
 
 
 BLUESKY_NETLOC = 'bsky.app'
--- a/silorider/silos/mastodon.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/silorider/silos/mastodon.py	Sun Oct 08 13:47:28 2023 -0700
@@ -2,7 +2,8 @@
 import getpass
 import logging
 import mastodon
-from .base import Silo, upload_silo_media
+from .base import Silo
+from ..format import CardProps
 
 
 logger = logging.getLogger(__name__)
@@ -110,20 +111,26 @@
             access_token=access_token,
             api_base_url=self.base_url)
 
-    def postEntry(self, entry, ctx):
-        toottxt = self.formatEntry(entry, limit=500)
-        if not toottxt:
-            raise Exception("Can't find any content to use for the toot!")
+    def getEntryCard(self, entry, ctx):
+        return self.formatEntry(
+                entry, limit=500,
+                # Use Twitter's meta properties
+                card_props=CardProps('name', 'twitter'))
 
+    def mediaCallback(self, tmpfile, mt, url, desc):
+        with open(tmpfile, 'rb') as tmpfp:
+            logger.debug("Uploading to mastodon with description: %s" % desc)
+            return self.client.media_post(
+                    tmpfp, mime_type=mt, description=desc)
+
+    def postEntry(self, entry_card, media_ids, ctx):
         visibility = self.getConfigItem('toot_visibility', fallback='public')
 
-        media_ids = upload_silo_media(entry, 'photo', self._media_callback)
-
         tries_left = 5
-        logger.debug("Posting toot: %s" % toottxt)
+        logger.debug("Posting toot: %s" % entry_card.text)
         while tries_left > 0:
             try:
-                self.client.status_post(toottxt, media_ids=media_ids,
+                self.client.status_post(entry_card.text, media_ids=media_ids,
                                         visibility=visibility)
                 break # if we got here without an exception, it's all good!
             except mastodon.MastodonAPIError as merr:
@@ -140,16 +147,3 @@
                     continue
                 raise
 
-    def dryRunPostEntry(self, entry, ctx):
-        toottxt = self.formatEntry(entry, limit=500)
-        logger.info("Toot would be:")
-        logger.info(toottxt)
-        media_urls = entry.get('photo', [], force_list=True)
-        if media_urls:
-            logger.info("...with photos: %s" % str(media_urls))
-
-    def _media_callback(self, tmpfile, mt, url, desc):
-        with open(tmpfile, 'rb') as tmpfp:
-            logger.debug("Uploading to mastodon with description: %s" % desc)
-            return self.client.media_post(
-                    tmpfp, mime_type=mt, description=desc)
--- a/silorider/silos/twitter.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/silorider/silos/twitter.py	Sun Oct 08 13:47:28 2023 -0700
@@ -2,8 +2,8 @@
 import logging
 import tweepy
 import urllib.parse
-from .base import Silo, upload_silo_media
-from ..format import UrlFlattener
+from .base import Silo
+from ..format import CardProps, UrlFlattener
 from ..parse import strip_img_alt
 
 
@@ -93,28 +93,14 @@
             access_token_key=access_key,
             access_token_secret=access_secret)
 
-    def postEntry(self, entry, ctx):
-        tweettxt = self.formatEntry(entry, limit=280,
-                                    url_flattener=TwitterUrlFlattener())
-        if not tweettxt:
-            raise Exception("Can't find any content to use for the tweet!")
-
-        media_ids = upload_silo_media(entry, 'photo', self._media_callback)
-
-        logger.debug("Posting tweet: %s" % tweettxt)
-        self.client.create_tweet(text=tweettxt, media_ids=media_ids)
+    def getEntryCard(self, entry, ctx):
+        return self.formatEntry(
+                entry,
+                limit=280,
+                card_props=CardProps('name', 'twitter'),
+                url_flattener=TwitterUrlFlattener())
 
-    def dryRunPostEntry(self, entry, ctx):
-        tweettxt = self.formatEntry(entry, limit=280,
-                                    url_flattener=TwitterUrlFlattener())
-        logger.info("Tweet would be:")
-        logger.info(tweettxt)
-        media_urls = entry.get('photo', [], force_list=True)
-        media_urls = strip_img_alt(media_urls)
-        if media_urls:
-            logger.info("...with photos: %s" % str(media_urls))
-
-    def _media_callback(self, tmpfile, mt, url, desc):
+    def mediaCallback(self, tmpfile, mt, url, desc):
         url_parsed = urllib.parse.urlparse(url)
         fname = os.path.basename(url_parsed.path)
         with open(tmpfile, 'rb') as tmpfp:
@@ -122,6 +108,9 @@
             media = self.client.simple_upload(fname, file=tmpfp)
         return media.media_id
 
+    def postEntry(self, entry_card, media_ids, ctx):
+        self.client.create_tweet(text=entry_card.text, media_ids=media_ids)
+
 
 TWITTER_NETLOCS = ['twitter.com', 'www.twitter.com']
 
--- a/tests/test_format.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/tests/test_format.py	Sun Oct 08 13:47:28 2023 -0700
@@ -81,10 +81,10 @@
      'words in it for no good reason', 80, True,
      'A test entry that is very very long because its... ' + test_url)
 ])
-def test_format_lonform_entry(title, limit, add_url, expected):
+def test_format_longform_entry(title, limit, add_url, expected):
     entry = _make_test_entry(title, False)
-    actual = format_entry(entry, limit, add_url)
-    assert actual == expected
+    actual = format_entry(entry, limit=limit, add_url=add_url)
+    assert actual.text == expected
 
 
 @pytest.mark.parametrize("text, limit, add_url, expected", [
@@ -109,5 +109,5 @@
 ])
 def test_format_micropost_entry(text, limit, add_url, expected):
     entry = _make_test_entry(text, True)
-    actual = format_entry(entry, limit, add_url)
-    assert actual == expected
+    actual = format_entry(entry, limit=limit, add_url=add_url)
+    assert actual.text == expected
--- a/tests/test_silos_bluesky.py	Sun Oct 08 13:38:08 2023 -0700
+++ b/tests/test_silos_bluesky.py	Sun Oct 08 13:47:28 2023 -0700
@@ -167,7 +167,7 @@
         self.blobs.append((tmpfile, desc))
         return len(self.blobs)
 
-    def send_post(self, text, embed=None, facets=None):
+    def send_post(self, text, post_datetime=None, embed=None, facets=None):
         self.posts.append((text, embed, facets))