Mercurial > silorider
annotate tests/test_commands_populate.py @ 33:9e4eb3f2754e
Improve handling of character limits in HTML stripping
The code now keeps closer track of character counts during HTML
stripping, and should be absolutely exact. When the limit is exceeded,
it now restarts the stripping without any URLs to prevent incorrect
trimming. It also better preserves whitespace in the original post.
New tests are added for the Twitter silo to ensure it works as expected.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Wed, 10 May 2023 16:10:12 -0700 |
parents | a921cc2306bc |
children | b739ca5feb45 |
rev | line source |
---|---|
0 | 1 |
# Minimal HTML page holding a single h-entry article: a p-name title, an
# e-content body with two paragraphs, and a u-url permalink pointing at
# https://example.org/a-new-article.
# NOTE(review): any leading indentation inside the literal could not be
# recovered from the annotated view -- confirm against the repository copy.
feed1 = """
<html><body>
<article class="h-entry">
<h1 class="p-name">A new article</h1>
<div class="e-content">
<p>This is the text of the article.</p>
<p>It has 2 paragraphs.</p>
</div>
<a class="u-url" href="https://example.org/a-new-article">permalink</a>
</article>
</body></html>"""
13 | |
14 | |
def test_populate(cli):
    """Check that `populate -s test` records the feed's article as posted.

    The single-article `feed1` page is handed to the `cli` fixture as a
    temporary feed, a silo named 'test' (type 'print', items='name') is
    configured, and the `populate` command is run for that silo only.
    Afterwards the context's cache must report the article URL as posted
    for the 'test' silo.
    """
    # Setup order matters: the feed is created first so its handle can be
    # stored under the 'feed' config key after the silo has been declared.
    feed_ref = cli.createTempFeed(feed1)
    cli.appendSiloConfig('test', 'print', items='name')
    cli.setFeedConfig('feed', feed_ref)

    # cli.run returns a pair; only the first element (the context) is used.
    context, _unused = cli.run('populate', '-s', 'test')

    cache = context.cache
    assert cache.wasPosted('test', 'https://example.org/a-new-article')
21 | |
22 | |
# HTML page holding three h-entry articles ("First", "Second", "Third"),
# each with a dt-published timestamp one day apart (2018-01-07, 2018-01-08,
# 2018-01-09) and a u-url permalink under https://example.org/.
# NOTE(review): any leading indentation inside the literal could not be
# recovered from the annotated view -- confirm against the repository copy.
feed2 = """
<html><body>
<article class="h-entry">
<h1 class="p-name">First article</h1>
<div><time class="dt-published" datetime="2018-01-07T09:30:00-00:00"></time></div>
<a class="u-url" href="https://example.org/first-article">permalink</a>
</article>
<article class="h-entry">
<h1 class="p-name">Second article</h1>
<div><time class="dt-published" datetime="2018-01-08T09:30:00-00:00"></time></div>
<a class="u-url" href="https://example.org/second-article">permalink</a>
</article>
<article class="h-entry">
<h1 class="p-name">Third article</h1>
<div><time class="dt-published" datetime="2018-01-09T09:30:00-00:00"></time></div>
<a class="u-url" href="https://example.org/third-article">permalink</a>
</article>
</body></html>"""  # NOQA
41 | |
42 | |
def test_populate_until(cli):
    """Check that `populate --until 2018-01-08` stops at the cutoff date.

    The three-article `feed2` page (published 2018-01-07, -08 and -09) is
    handed to the `cli` fixture as a temporary feed and `populate` is run
    for the 'test' silo with `--until 2018-01-08`. The test expects the
    first and second articles to be reported as posted by the cache, and
    the third article (dated after the cutoff) not to be.
    """
    # Setup order matters: the feed is created first so its handle can be
    # stored under the 'feed' config key after the silo has been declared.
    feed_ref = cli.createTempFeed(feed2)
    cli.appendSiloConfig('test', 'print', items='name')
    cli.setFeedConfig('feed', feed_ref)

    # cli.run returns a pair; only the first element (the context) is used.
    context, _unused = cli.run(
        'populate', '-s', 'test', '--until', '2018-01-08')

    cache = context.cache
    # Entries dated up to and including the cutoff are expected as posted.
    assert cache.wasPosted('test', 'https://example.org/first-article')
    assert cache.wasPosted('test', 'https://example.org/second-article')
    # The entry dated after the cutoff must be left alone.
    assert not cache.wasPosted('test', 'https://example.org/third-article')