Mercurial > silorider
changeset 48:486affad656e
Rewrite posting process with card system and more structured API
- The basic posting process is more opinionated so that silos have less code to
write, and dry-run posting is handled by default.
- Add a "card" system in which SiloRider fetches the page of a post to
check for a custom summary or featured image. This is how Twitter,
Facebook, Discord, and many others build their "preview card" when
a link is posted.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Sun, 08 Oct 2023 13:47:28 -0700 |
parents | 4be321bd5ed4 |
children | 2c433f19e467 |
files | silorider/commands/process.py silorider/format.py silorider/silos/base.py silorider/silos/bluesky.py silorider/silos/mastodon.py silorider/silos/twitter.py tests/test_format.py tests/test_silos_bluesky.py |
diffstat | 8 files changed, 186 insertions(+), 112 deletions(-) [+] |
line wrap: on
line diff
--- a/silorider/commands/process.py Sun Oct 08 13:38:08 2023 -0700 +++ b/silorider/commands/process.py Sun Oct 08 13:47:28 2023 -0700 @@ -1,7 +1,7 @@ import logging import dateparser from .utils import get_named_silos, get_named_urls -from ..silos.base import SiloPostingContext +from ..silos.base import SiloPostingContext, upload_silo_media from ..parse import parse_url @@ -98,9 +98,20 @@ (silo.name, entry_url)) continue + entry_card = silo.getEntryCard(entry, postctx) + if not entry_card: + logger.error("Can't find any content to use for entry: %s" % entry_url) + continue + + media_callback = silo.mediaCallback + if self.ctx.args.dry_run: + media_callback = silo.dryRunMediaCallback + media_ids = upload_silo_media(entry_card, 'photo', media_callback) + if not self.ctx.args.dry_run: + logger.debug("Posting to '%s': %s" % (silo.name, entry_url)) try: - did_post = silo.postEntry(entry, postctx) + did_post = silo.postEntry(entry_card, media_ids, postctx) except Exception as ex: did_post = False logger.error("Error posting: %s" % entry_url) @@ -110,9 +121,8 @@ if did_post is True or did_post is None: self.ctx.cache.addPost(silo.name, entry_url) else: - logger.info("Would post entry on %s: %s" % - (silo.name, entry_url)) - silo.dryRunPostEntry(entry, postctx) + logger.info("Would post to '%s': %s" % (silo.name, entry_url)) + silo.dryRunPostEntry(entry_card, media_ids, postctx) def isEntryFiltered(self, entry): if not self.config.has_section('filter'):
--- a/silorider/format.py Sun Oct 08 13:38:08 2023 -0700 +++ b/silorider/format.py Sun Oct 08 13:47:28 2023 -0700 @@ -1,12 +1,17 @@ import re import string +import logging +import urllib.request import textwrap import bs4 from .config import has_lxml -def format_entry(entry, limit=None, add_url='auto', url_flattener=None, - url_mode=None): +logger = logging.getLogger(__name__) + + +def format_entry(entry, *, limit=None, card_props=None, + add_url='auto', url_flattener=None, url_mode=None): url = entry.url ctx = HtmlStrippingContext() @@ -16,10 +21,24 @@ ctx.url_mode = url_mode # Don't add the limit yet. - name = get_best_text(entry, ctx) - if not name: + card = None + + # See if we can use a nice blurb for articles instead of their title. + if card_props and not entry.is_micropost: + card = get_card_info(entry, card_props, ctx) + + # Otherwise, find the best text, generally the title of the article, or the + # text of the micropost. + if card is None: + best_text = get_best_text(entry, ctx) + if best_text: + card = CardInfo(entry, best_text, None, 'best_text') + + if not card: raise Exception("Can't find best text for entry: %s" % url) + # We need to add the URL to the output if we were told to, or if we + # are dealing with an article. do_add_url = ((add_url is True) or (add_url == 'auto' and not entry.is_micropost)) if limit: @@ -31,26 +50,47 @@ shortened = text_length > limit if shortened: - # We need to shorten the text! We can't really reason about it - # anymore at this point because we could have URLs inside the - # text that don't measure the correct number of characters - # (such as with Twitter's URL shortening). Let's just start - # again with a limit that's our max limit, minus the room - # needed to add the link to the post. 
if not do_add_url and add_url == 'auto' and url: do_add_url = True limit -= 1 + ctx.url_flattener.measureUrl(url) - ctx = HtmlStrippingContext() - ctx.limit = limit - if url_flattener: - ctx.url_flattener = url_flattener - name = get_best_text(entry, ctx) + if card.is_from == 'best_text': + # We need to shorten the text! We can't really reason about it + # anymore at this point because we could have URLs inside the + # text that don't measure the correct number of characters + # (such as with Twitter's URL shortening). Let's just start + # again with a limit that's our max limit, minus the room + # needed to add the link to the post. + ctx = HtmlStrippingContext() + ctx.limit = limit + if url_flattener: + ctx.url_flattener = url_flattener + card.text = get_best_text(entry, ctx) + else: + # We need to shorten the blurb! We can't do much else besides + # truncate it... + card.text = card.text[:limit] # Actually add the url to the original post now. if do_add_url and url: - name += ' ' + url - return name + card.text += ' ' + url + return card + + +class CardProps: + def __init__(self, meta_attr, namespace): + self.meta_attr = meta_attr + self.namespace = namespace + self.description = '%s:description' % namespace + self.image = '%s:image' % namespace + + +class CardInfo: + def __init__(self, entry, txt, img, from_label=None): + self.entry = entry + self.text = txt + self.image = img + self.is_from = from_label class UrlFlattener: @@ -135,6 +175,28 @@ return None +def get_card_info(entry, card_props, ctx): + logger.debug("Downloading entry page to check meta entries: %s" % entry.url) + with urllib.request.urlopen(entry.url) as req: + raw_html = req.read() + + bs_html = bs4.BeautifulSoup(raw_html, + 'lxml' if has_lxml else 'html5lib') + head = bs_html.find('head') + + desc_meta = head.find('meta', attrs={card_props.meta_attr: card_props.description}) + desc = desc_meta.attrs.get('content') if desc_meta else None + + img_meta = head.find('meta', 
attrs={card_props.meta_attr: card_props.image}) + img = img_meta.attrs.get('content') if img_meta else None + + if desc: + logger.debug("Found card info, description: %s (image: %s)" % (desc, img)) + ctx.text_length = len(desc) + return CardInfo(entry, desc, img, 'card') + return None + + def strip_html(bs_elem, ctx=None): if isinstance(bs_elem, str): bs_elem = bs4.BeautifulSoup(bs_elem,
--- a/silorider/silos/base.py Sun Oct 08 13:38:08 2023 -0700 +++ b/silorider/silos/base.py Sun Oct 08 13:47:28 2023 -0700 @@ -23,6 +23,10 @@ return self.exec_ctx.args @property + def dry_run(self): + return self.exec_ctx.args.dry_run + + @property def config(self): return self.exec_ctx.config @@ -73,11 +77,21 @@ def onPostStart(self, ctx): pass - def postEntry(self, entry, ctx): + def getEntryCard(self, entry, ctx): + raise NotImplementedError() + + def mediaCallback(self, tmpfile, mimetype, url, desc): raise NotImplementedError() - def dryRunPostEntry(self, entry, ctx): - pass + def postEntry(self, entry_card, media_ids, ctx): + raise NotImplementedError() + + def dryRunMediaCallback(self, tmpfile, mimetype, url, desc): + return (url, desc) + + def dryRunPostEntry(self, entry_card, media_ids, ctx): + logger.info(entry_card.text) + logger.info("...with photos: %s" % media_ids) def onPostEnd(self, ctx): pass @@ -120,11 +134,19 @@ return silos -def upload_silo_media(entry, propname, callback): +def upload_silo_media(card, propname, callback): # The provided callback must take the parameters: # tmpfile path, mimetype, original media url, media description + + # Upload and use forced image, if any. + if card.image: + mid = _do_upload_silo_media(card.image, None, callback) + if mid is not None: + return [mid] + + # Look for media in the body of the original post. media_ids = None - media_entries = entry.get(propname, [], force_list=True) + media_entries = card.entry.get(propname, [], force_list=True) if media_entries: media_ids = [] for media_entry in media_entries: @@ -132,6 +154,7 @@ mid = _do_upload_silo_media(url, desc, callback) if mid is not None: media_ids.append(mid) + return media_ids
--- a/silorider/silos/bluesky.py Sun Oct 08 13:38:08 2023 -0700 +++ b/silorider/silos/bluesky.py Sun Oct 08 13:47:28 2023 -0700 @@ -5,8 +5,8 @@ import getpass import logging import datetime -from .base import Silo, upload_silo_media -from ..format import UrlFlattener, URLMODE_ERASE +from .base import Silo +from ..format import CardProps, UrlFlattener, URLMODE_ERASE import atproto import atproto.xrpc_client.models as atprotomodels @@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs): atproto.Client.__init__(self, *args, **kwargs) - def send_post(self, text, embed=None, facets=None): + def send_post(self, text, *, post_datetime=None, embed=None, facets=None): # Override the atproto.Client send_post function because it # doesn't support facets yet. The code is otherwise more or # less identical. @@ -73,30 +73,43 @@ self.ctx.silo_name) self.client.login(email, password) - def postEntry(self, entry, ctx): + def getEntryCard(self, entry, ctx): # We use URLMODE_ERASE to remove all hyperlinks from the # formatted text, and we later add them as facets to the atproto # record. url_flattener = BlueskyUrlFlattener() - posttxt = self.formatEntry( + card = self.formatEntry( entry, - limit=256, + limit=300, + # Use Twitter's meta properties + card_props=CardProps('name', 'twitter'), url_flattener=url_flattener, url_mode=URLMODE_ERASE) - if not posttxt: - raise Exception("Can't find any content to use for the post!") + card.__bsky_url_flattener = url_flattener + return card + + def mediaCallback(self, tmpfile, mt, url, desc): + with open(tmpfile, 'rb') as tmpfp: + data = tmpfp.read() - # Upload the images as blobs and add them as an embed on the - # atproto record. 
- images = upload_silo_media(entry, 'photo', self._media_callback) + logger.debug("Uploading image to Bluesky (%d bytes) with description: %s" % + (len(data), desc)) + upload = self.client.com.atproto.repo.upload_blob(data) + if desc is None: + desc = "" + return atprotomodels.AppBskyEmbedImages.Image(alt=desc, image=upload.blob) + + def postEntry(self, entry_card, media_ids, ctx): + # Add images as an embed on the atproto record. embed = None - if images: - embed = atprotomodels.AppBskyEmbedImages.Main(images=images) + if media_ids: + embed = atprotomodels.AppBskyEmbedImages.Main(images=media_ids) # Grab any URLs detected by our URL flattener and add them as # facets on the atproto record. facets = None + url_flattener = entry_card.__bsky_url_flattener if url_flattener.urls: facets = [] for url_info in url_flattener.urls: @@ -113,27 +126,10 @@ facets.append(facet) # Create the record! - self.client.send_post(text=posttxt, embed=embed, facets=facets) - - def dryRunPostEntry(self, entry, ctx): - posttxt = self.formatEntry(entry, limit=256) - logger.info("Post would be:") - logger.info(posttxt) - media_urls = entry.get('photo', [], force_list=True) - if media_urls: - logger.info("...with photos: %s" % str(media_urls)) - - def _media_callback(self, tmpfile, mt, url, desc): - with open(tmpfile, 'rb') as tmpfp: - data = tmpfp.read() - - logger.debug("Uploading image to Bluesky (%d bytes) with description: %s" % - (len(data), desc)) - upload = self.client.com.atproto.repo.upload_blob(data) - - if desc is None: - desc = "" - return atprotomodels.AppBskyEmbedImages.Image(alt=desc, image=upload.blob) + entry_dt = entry_card.entry.get('published') + self.client.send_post( + text=entry_card.text, post_datetime=entry_dt, embed=embed, + facets=facets) BLUESKY_NETLOC = 'bsky.app'
--- a/silorider/silos/mastodon.py Sun Oct 08 13:38:08 2023 -0700 +++ b/silorider/silos/mastodon.py Sun Oct 08 13:47:28 2023 -0700 @@ -2,7 +2,8 @@ import getpass import logging import mastodon -from .base import Silo, upload_silo_media +from .base import Silo +from ..format import CardProps logger = logging.getLogger(__name__) @@ -110,20 +111,26 @@ access_token=access_token, api_base_url=self.base_url) - def postEntry(self, entry, ctx): - toottxt = self.formatEntry(entry, limit=500) - if not toottxt: - raise Exception("Can't find any content to use for the toot!") + def getEntryCard(self, entry, ctx): + return self.formatEntry( + entry, limit=500, + # Use Twitter's meta properties + card_props=CardProps('name', 'twitter')) + def mediaCallback(self, tmpfile, mt, url, desc): + with open(tmpfile, 'rb') as tmpfp: + logger.debug("Uploading to mastodon with description: %s" % desc) + return self.client.media_post( + tmpfp, mime_type=mt, description=desc) + + def postEntry(self, entry_card, media_ids, ctx): visibility = self.getConfigItem('toot_visibility', fallback='public') - media_ids = upload_silo_media(entry, 'photo', self._media_callback) - tries_left = 5 - logger.debug("Posting toot: %s" % toottxt) + logger.debug("Posting toot: %s" % entry_card.text) while tries_left > 0: try: - self.client.status_post(toottxt, media_ids=media_ids, + self.client.status_post(entry_card.text, media_ids=media_ids, visibility=visibility) break # if we got here without an exception, it's all good! 
except mastodon.MastodonAPIError as merr: @@ -140,16 +147,3 @@ continue raise - def dryRunPostEntry(self, entry, ctx): - toottxt = self.formatEntry(entry, limit=500) - logger.info("Toot would be:") - logger.info(toottxt) - media_urls = entry.get('photo', [], force_list=True) - if media_urls: - logger.info("...with photos: %s" % str(media_urls)) - - def _media_callback(self, tmpfile, mt, url, desc): - with open(tmpfile, 'rb') as tmpfp: - logger.debug("Uploading to mastodon with description: %s" % desc) - return self.client.media_post( - tmpfp, mime_type=mt, description=desc)
--- a/silorider/silos/twitter.py Sun Oct 08 13:38:08 2023 -0700 +++ b/silorider/silos/twitter.py Sun Oct 08 13:47:28 2023 -0700 @@ -2,8 +2,8 @@ import logging import tweepy import urllib.parse -from .base import Silo, upload_silo_media -from ..format import UrlFlattener +from .base import Silo +from ..format import CardProps, UrlFlattener from ..parse import strip_img_alt @@ -93,28 +93,14 @@ access_token_key=access_key, access_token_secret=access_secret) - def postEntry(self, entry, ctx): - tweettxt = self.formatEntry(entry, limit=280, - url_flattener=TwitterUrlFlattener()) - if not tweettxt: - raise Exception("Can't find any content to use for the tweet!") - - media_ids = upload_silo_media(entry, 'photo', self._media_callback) - - logger.debug("Posting tweet: %s" % tweettxt) - self.client.create_tweet(text=tweettxt, media_ids=media_ids) + def getEntryCard(self, entry, ctx): + return self.formatEntry( + entry, + limit=280, + card_props=CardProps('name', 'twitter'), + url_flattener=TwitterUrlFlattener()) - def dryRunPostEntry(self, entry, ctx): - tweettxt = self.formatEntry(entry, limit=280, - url_flattener=TwitterUrlFlattener()) - logger.info("Tweet would be:") - logger.info(tweettxt) - media_urls = entry.get('photo', [], force_list=True) - media_urls = strip_img_alt(media_urls) - if media_urls: - logger.info("...with photos: %s" % str(media_urls)) - - def _media_callback(self, tmpfile, mt, url, desc): + def mediaCallback(self, tmpfile, mt, url, desc): url_parsed = urllib.parse.urlparse(url) fname = os.path.basename(url_parsed.path) with open(tmpfile, 'rb') as tmpfp: @@ -122,6 +108,9 @@ media = self.client.simple_upload(fname, file=tmpfp) return media.media_id + def postEntry(self, entry_card, media_ids, ctx): + self.client.create_tweet(text=entry_card.text, media_ids=media_ids) + TWITTER_NETLOCS = ['twitter.com', 'www.twitter.com']
--- a/tests/test_format.py Sun Oct 08 13:38:08 2023 -0700 +++ b/tests/test_format.py Sun Oct 08 13:47:28 2023 -0700 @@ -81,10 +81,10 @@ 'words in it for no good reason', 80, True, 'A test entry that is very very long because its... ' + test_url) ]) -def test_format_lonform_entry(title, limit, add_url, expected): +def test_format_longform_entry(title, limit, add_url, expected): entry = _make_test_entry(title, False) - actual = format_entry(entry, limit, add_url) - assert actual == expected + actual = format_entry(entry, limit=limit, add_url=add_url) + assert actual.text == expected @pytest.mark.parametrize("text, limit, add_url, expected", [ @@ -109,5 +109,5 @@ ]) def test_format_micropost_entry(text, limit, add_url, expected): entry = _make_test_entry(text, True) - actual = format_entry(entry, limit, add_url) - assert actual == expected + actual = format_entry(entry, limit=limit, add_url=add_url) + assert actual.text == expected
--- a/tests/test_silos_bluesky.py Sun Oct 08 13:38:08 2023 -0700 +++ b/tests/test_silos_bluesky.py Sun Oct 08 13:47:28 2023 -0700 @@ -167,7 +167,7 @@ self.blobs.append((tmpfile, desc)) return len(self.blobs) - def send_post(self, text, embed=None, facets=None): + def send_post(self, text, post_datetime=None, embed=None, facets=None): self.posts.append((text, embed, facets))