view tests/test_format.py @ 33:9e4eb3f2754e

Improve handling of character limits in html stripping The code now more closely keeps track of character counts during html stripping, and should be absolutely exact. When the limit is exceeded, it now restarts the stripping without any URLs to prevent incorrect trimming. It also better preserves whitespace in the original post. New tests are added for Twitter silo to ensure it works as expected.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 10 May 2023 16:10:12 -0700
parents c898b4df0f29
children 486affad656e
line wrap: on
line source

import pytest
from silorider.format import (
        format_entry, strip_html, HtmlStrippingContext,
        URLMODE_INLINE, URLMODE_LAST, URLMODE_BOTTOM_LIST)


test_url = 'https://example.org/article'


def _make_test_entry(best_name, is_micropost):
    class TestEntry:
        def __init__(self):
            self.is_micropost = is_micropost
            self.url = test_url

        def get(self, _):
            return best_name

        def htmlFind(self, *args, **kwargs):
            return best_name

    entry = TestEntry()
    return entry


@pytest.mark.parametrize("text, expected", [
    ("<p>Something</p>",
     "Something"),
    ("<p>Something with <em>emphasis</em> in it</p>",
     "Something with emphasis in it"),
    ("<p>Something with <a href=\"http://example.org/blah\">a link</a>",
     "Something with a link http://example.org/blah"),
    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>",  # NOQA
     "Something with a link http://example.org/blah"),
    ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>",  # NOQA
     "Something with one link here http://example.org/first and another there http://example.org/second...")  # NOQA
])
def test_strip_html(text, expected):
    ctx = HtmlStrippingContext()
    ctx.url_mode = URLMODE_INLINE
    actual = strip_html(text, ctx)
    print(actual)
    print(expected)
    assert actual == expected


@pytest.mark.parametrize("text, expected", [
    ("<p>Something with <a href=\"http://example.org/blah\">a link</a></p>",
     "Something with a link\nhttp://example.org/blah"),
    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a></p>",  # NOQA
     "Something with a link\nhttp://example.org/blah"),
    ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>",  # NOQA
     "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second")  # NOQA
])
def test_strip_html_with_bottom_urls(text, expected):
    ctx = HtmlStrippingContext()
    ctx.url_mode = URLMODE_BOTTOM_LIST
    actual = strip_html(text, ctx)
    print(actual)
    print(expected)
    assert actual == expected


@pytest.mark.parametrize("title, limit, add_url, expected", [
    ('A test entry', None, False, 'A test entry'),
    ('A test entry', None, 'auto', 'A test entry ' + test_url),
    ('A test entry', None, True, 'A test entry ' + test_url),

    ('A test entry', 80, False, 'A test entry'),
    ('A test entry', 80, 'auto', 'A test entry ' + test_url),
    ('A test entry', 80, True, 'A test entry ' + test_url),

    ('A test entry that is very very long because its title has many many '
     'words in it for no good reason', 80, False,
     'A test entry that is very very long because its title has many many '
     'words in...'),
    ('A test entry that is very very long because its title has many many '
     'words in it for no good reason', 80, 'auto',
     'A test entry that is very very long because its... ' + test_url),
    ('A test entry that is very very long because its title has many many '
     'words in it for no good reason', 80, True,
     'A test entry that is very very long because its... ' + test_url)
])
def test_format_lonform_entry(title, limit, add_url, expected):
    entry = _make_test_entry(title, False)
    actual = format_entry(entry, limit, add_url)
    assert actual == expected


@pytest.mark.parametrize("text, limit, add_url, expected", [
    ('A test entry', None, False, 'A test entry'),
    ('A test entry', None, 'auto', 'A test entry'),
    ('A test entry', None, True, 'A test entry ' + test_url),

    ('A test entry', 80, False, 'A test entry'),
    ('A test entry', 80, 'auto', 'A test entry'),
    ('A test entry', 80, True, 'A test entry ' + test_url),

    ('A test entry that is very very long because its title has many many '
     'words in it for no good reason', 80, False,
     'A test entry that is very very long because its title has many many '
     'words in...'),
    ('A test entry that is very very long because its title has many many '
     'words in it for no good reason', 80, 'auto',
     'A test entry that is very very long because its... ' + test_url),
    ('A test entry that is very very long because its title has many many '
     'words in it for no good reason', 80, True,
     'A test entry that is very very long because its... ' + test_url)
])
def test_format_micropost_entry(text, limit, add_url, expected):
    entry = _make_test_entry(text, True)
    actual = format_entry(entry, limit, add_url)
    assert actual == expected