# HG changeset patch # User Ludovic Chabant # Date 1724859652 25200 # Node ID 5381655e7f6dc6fbc3e8cda9435c7d9972aecb91 # Parent 268b287461c06ee2fc8e17128d047b5e16a06bc9 Bluesky silo now tries to create link card embeds diff -r 268b287461c0 -r 5381655e7f6d silorider/silos/bluesky.py --- a/silorider/silos/bluesky.py Wed Aug 28 08:40:12 2024 -0700 +++ b/silorider/silos/bluesky.py Wed Aug 28 08:40:52 2024 -0700 @@ -1,12 +1,19 @@ import re +import bs4 import os.path +import gzip import json import time +import random +import signal +import urllib.error import urllib.parse +import urllib.request import getpass import logging import datetime from .base import Silo +from ..config import has_lxml from ..format import CardProps, UrlFlattener, URLMODE_ERASE import atproto @@ -122,6 +129,7 @@ # Grab any URLs detected by our URL flattener and add them as # facets on the atproto record. facets = None + first_url = None url_flattener = entry_card.__bsky_url_flattener if url_flattener.urls: facets = [] @@ -138,12 +146,194 @@ ) facets.append(facet) + if first_url is None: + first_url = url + + # Make a link embed for the first link if we didn't have an embed already. + if embed is None and first_url is not None: + embed = self._makeUrlEmbed(first_url) + # Create the record! entry_dt = entry_card.entry.get('published') self.client.send_post( - text=entry_card.text, post_datetime=entry_dt, embed=embed, + text=entry_card.text, + post_datetime=entry_dt, + embed=embed, facets=facets) + def _makeUrlEmbed(self, url): + # Fetch the document at the URL. + urlopen = urllib.request.urlopen + # Because we may hit well-known servers like YouTube, we need to: + # 1. specify a user-agent that won't get us thrown out + # 2. handle the case of an error 429, which tells us to wait + req_headers={ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-us,en;q=0.5', + 'Accept-Encoding': 'gzip,deflate' + } + logging.debug("Fetching link to build Bluesky link embed: %s" % url) + + attempts = 0 + max_attempts = 3 + html_raw = None + html_encoding = None + while attempts < max_attempts: + attempts += 1 + try: + req = _build_http_request(url, req_headers) + # Wrap the request inside a signal-based timeout just in + # case we encounter some problem in low-level code. + with SignalTimeout(6, "urlopen timed out!") as sto: + with urlopen(req, timeout=5) as resp: + logging.debug("Response status: %s" % str(resp.status)) + logging.debug("Response headers: %s" % str(resp.headers)) + html_encoding = resp.headers['Content-Encoding'] + html_raw = resp.read() + break + except Exception as ex: + logger.warning("Couldn't fetch link: %s" % url) + logger.warning(str(ex)) + # See if we are being told to retry after a while. If so, + # wait and retry. If not, abort. + if not hasattr(ex, 'headers'): + break + retry_after = ex.headers.get('Retry-After') + if not retry_after: + break + try: + wait_time = int(float(retry_after)) + 1 + except ValueError: + wait_time = -1 + if wait_time < 0: + break + logger.warning( + "Received 'Too Many Requests' error from the server, " + "waiting %d seconds" % wait_time) + if wait_time > 60: + logger.warning("Don't want to wait too long, aborting.") + break + time.sleep(wait_time) + + if html_raw is None: + logger.error("Aborting after %d attempts." % attempts) + return None + + # Optionally unzip it. + if html_encoding == 'gzip': + html_raw = gzip.decompress(html_raw).decode() + + # Use BeautifulSoup to parse the HTML. + logging.debug("Parsing '%s' html document (%d bytes)" % (url, len(html_raw))) + html_doc = bs4.BeautifulSoup( + html_raw, + 'lxml' if has_lxml else 'html5lib') + + # Look for title, description, and thumbnail image. + # We first try OpenGraph info, fallback to Twitter info, and fallback + # last on general HTML5 info. + embed_title = _find_meta(html_doc, property="og:title") + if not embed_title: + embed_title = _find_meta(html_doc, name="twitter:title") + if not embed_title: + embed_title = html_doc.find("title").string + + if not embed_title: + logger.error("Couldn't find title! Aborting making an embed.") + return None + + embed_description = _find_meta(html_doc, property="og:description") + if not embed_description: + embed_description = _find_meta(html_doc, name="twitter:description") + if not embed_description: + embed_description = _find_meta(html_doc, name="description") + if not embed_description: + logger.warning("Couldn't find description, falling back to title.") + embed_description = embed_title + + embed_image = _find_meta(html_doc, property="og:image") + if not embed_image: + embed_image = _find_meta(html_doc, name="twitter:image") + if not embed_image: + embed_image = _find_meta(html_doc, property="thumbnail") + + logger.debug( + "Creating Bluesky embed with title '%s', description '%s', and " + "image '%s'" % (embed_title, embed_description, embed_image)) + + # Upload the thumbnail image to Bluesky. + embed_thumb_blob = None + if embed_image: + try: + thumb_req = _build_http_request(embed_image) + with SignalTimeout(6, "urlopen timed out!") as sto: + with urlopen(thumb_req, timeout=5) as thumb_resp: + thumb_data = thumb_rest.read() + logger.debug( + "Uploading embed image '%s' to Bluesky (%d bytes)" % + (embed_image, len(thumb_data))) + embed_thumb_blob = self.client.com.atproto.repo.upload_blob(thumb_data) + except Exception as ex: + logger.warning( + "Couldn't fetch thumbnail URL '%s' to build Bluesky embed" % + embed_image) + logger.warning(str(ex)) + + # Make the embed! + embed = atprotomodels.AppBskyEmbedExternal.Main( + external=atprotomodels.AppBskyEmbedExternal.External( + title=embed_title, + description=embed_description, + uri=url, + thumb=embed_thumb_blob)) + return embed + + +def _build_http_request(url, headers=None): + req = urllib.request.Request(url) + req.add_header('User-Agent', _get_random_user_agent()) + if headers: + for k, v in headers.items(): + req.add_header(k, v) + return req + + +class SignalTimeout: + def __init__(self, seconds, error_message): + self.seconds = seconds + self.error_message = error_message + + def __enter__(self): + signal.signal(signal.SIGALRM, self._onTimeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + def _onTimeout(self, signum, frame): + raise TimeoutError(self.error_message) + + +_user_agents = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.3', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.1', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.3', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.3', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', + ] + +def _get_random_user_agent(): + return random.choice(_user_agents) + + +def _find_meta(html_doc, **kwargs): + # Pass kwargs as a dictionary so we can also look for tags with a property + # named 'name' without conflicting with the find() method's 'name' arg. + meta_tag = html_doc.find("meta", dict(kwargs)) + return meta_tag["content"] if meta_tag else None + BLUESKY_NETLOC = 'bsky.app'