Commit: Remove unittest and switch to pytest fully

pjrobertson committed Jan 14, 2025
1 parent cef4037 commit 6f10270
Showing 9 changed files with 159 additions and 206 deletions.
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -75,3 +75,7 @@ repository = "https://github.com/bellingcat/auto-archiver"
 documentation = "https://github.com/bellingcat/auto-archiver"
 
 
+[tool.pytest.ini_options]
+markers = [
+    "download: marks tests that download content from the network",
+]
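Registering the marker under `[tool.pytest.ini_options]` keeps pytest from emitting `PytestUnknownMarkWarning` for the `@pytest.mark.download` decorations used in the tests below, and it lets the network-dependent tests be deselected locally with `pytest -m "not download"`.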
6 changes: 1 addition & 5 deletions tests/__init__.py
@@ -1,10 +1,6 @@
-import unittest
 import tempfile
 
 from auto_archiver.core.context import ArchivingContext
 
 ArchivingContext.reset(full_reset=True)
-ArchivingContext.set_tmp_dir(tempfile.gettempdir())
-
-if __name__ == '__main__':
-    unittest.main()
+ArchivingContext.set_tmp_dir(tempfile.gettempdir())
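With pytest as the sole runner, the `if __name__ == '__main__': unittest.main()` entry point has no remaining callers, so only the `ArchivingContext` setup that the whole suite depends on is kept.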
12 changes: 5 additions & 7 deletions tests/archivers/test_archiver_base.py
@@ -1,22 +1,20 @@
-from auto_archiver.core import Metadata
-from auto_archiver.core import Step
+import pytest
+
+from auto_archiver.core.metadata import Metadata
 
 class TestArchiverBase(object):
 
     archiver_class = None
     config = None
 
-    def setUp(self):
+    @pytest.fixture(autouse=True)
+    def setup_archiver(self):
         assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
         assert self.config is not None, "self.config must be a dict set on the subclass"
         self.archiver = self.archiver_class(self.config)
 
     def create_item(self, url, **kwargs):
         item = Metadata().set_url(url)
         for key, value in kwargs.items():
             item.set(key, value)
         return item
 
     def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
         assert test_response is not False
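The `autouse=True` fixture is pytest's replacement for `unittest.setUp`: pytest runs it automatically before every test in the class and its subclasses. A minimal sketch of the subclass contract — the archiver class here is a hypothetical stand-in, not part of this commit:

    from .test_archiver_base import TestArchiverBase

    class FakeArchiver:
        # stand-in for a real archiver: accepts a config dict, nothing else
        def __init__(self, config):
            self.config = config

    class TestFakeArchiver(TestArchiverBase):
        archiver_class = FakeArchiver
        config = {}

        def test_archiver_is_constructed(self):
            # setup_archiver has already run by the time this body executes
            assert isinstance(self.archiver, FakeArchiver)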
3 changes: 1 addition & 2 deletions tests/archivers/test_bluesky_archiver.py
@@ -1,10 +1,9 @@
 import pytest
-import unittest
 
 from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
 from .test_archiver_base import TestArchiverBase
 
-class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
+class TestBlueskyArchiver(TestArchiverBase):
     """Tests Bluesky Archiver
     Note that these tests will download API responses from the bluesky API, so they may be slow.
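Dropping `unittest.TestCase` is more than cosmetic: pytest collects any class named `Test*` on its own, and `pytest.mark.parametrize` (used heavily in the Twitter tests below) is not supported on `unittest.TestCase` methods, so removing the mixin is what allows the base class's fixtures and parametrization to work.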
154 changes: 62 additions & 92 deletions tests/archivers/test_twitter_archiver.py
@@ -1,64 +1,37 @@
-import unittest
 import datetime
+import pytest
 
 from auto_archiver.archivers.twitter_archiver import TwitterArchiver
 
 from .test_archiver_base import TestArchiverBase
 
-class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
+class TestTwitterArchiver(TestArchiverBase):
 
     archiver_class = TwitterArchiver
     config = {}
 
-    def test_sanitize_url(self):
-
-        # should expand t.co URLs
-        t_co_url = "https://t.co/yl3oOJatFp"
-        t_co_resolved_url = "https://www.bellingcat.com/category/resources/"
-        assert t_co_resolved_url == self.archiver.sanitize_url(t_co_url)
-
-        # shouldn't alter valid x URLs
-        x_url = "https://x.com/bellingcat/status/1874097816571961839"
-        assert x_url == self.archiver.sanitize_url(x_url)
-
-        # shouldn't alter valid twitter.com URLs
-        twitter_url = "https://twitter.com/bellingcat/status/1874097816571961839"
-        assert twitter_url == self.archiver.sanitize_url(twitter_url)
-
-        # should strip tracking params
-        tracking_url = "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
-        assert "https://twitter.com/bellingcat/status/1874097816571961839" == self.archiver.sanitize_url(tracking_url)
-
-        # shouldn't alter non-twitter/x URLs
-        test_url = "https://www.bellingcat.com/category/resources/"
-        assert test_url == self.archiver.sanitize_url(test_url)
-
-        # shouldn't strip params from non-twitter/x URLs
-        test_url = "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
-        assert test_url == self.archiver.sanitize_url(test_url)
-
-    def test_get_username_tweet_id_from_url(self):
-
-        # test valid twitter URL
-        url = "https://twitter.com/bellingcat/status/1874097816571961839"
-        username, tweet_id = self.archiver.get_username_tweet_id(url)
-        assert "bellingcat" == username
-        assert "1874097816571961839" == tweet_id
-
-        # test valid x URL
-        url = "https://x.com/bellingcat/status/1874097816571961839"
-        username, tweet_id = self.archiver.get_username_tweet_id(url)
-        assert "bellingcat" == username
-        assert "1874097816571961839" == tweet_id
-
-        # test invalid URL
-        # TODO: should this return None, False or raise an exception? Right now it returns False
-        url = "https://www.bellingcat.com/category/resources/"
-        username, tweet_id = self.archiver.get_username_tweet_id(url)
-        assert not username
-        assert not tweet_id
+    @pytest.mark.parametrize("url, expected", [
+        ("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"),  # t.co URL
+        ("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"),  # x.com URLs unchanged
+        ("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"),  # twitter URLs unchanged
+        ("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839"),  # strip tracking params
+        ("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"),  # non-twitter/x URLs unchanged
+        ("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")  # shouldn't strip params from non-twitter/x URLs
+    ])
+    def test_sanitize_url(self, url, expected):
+        assert expected == self.archiver.sanitize_url(url)
+
+    @pytest.mark.parametrize("url, expected_username, expected_tweetid", [
+        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
+        ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
+        ("https://www.bellingcat.com/category/resources/", False, False)
+    ])
+    def test_get_username_tweet_id_from_url(self, url, expected_username, expected_tweetid):
+
+        username, tweet_id = self.archiver.get_username_tweet_id(url)
+
+        assert expected_username == username
+        assert expected_tweetid == tweet_id
 
     def test_choose_variants(self):
         # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
         variant_list = [{'content_type': 'application/x-mpegURL', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'},
@@ -68,25 +41,26 @@ def test_choose_variants(self):
         ]
         chosen_variant = self.archiver.choose_variant(variant_list)
         assert chosen_variant == variant_list[3]
 
-    def test_reverse_engineer_token(self):
-
+    @pytest.mark.parametrize("tweet_id, expected_token", [
+        ("1874097816571961839", "4jjngwkifa"),
+        ("1674700676612386816", "42586mwa3uv"),
+        ("1877747914073620506", "4jv4aahw36n"),
+        ("1876710769913450647", "4jruzjz5lux"),
+        ("1346554693649113090", "39ibqxei7mo")
+    ])
+    def test_reverse_engineer_token(self, tweet_id, expected_token):
         # see Vercel's implementation here: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27C1-L31C2
         # and the discussion here: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
 
-        for tweet_id, real_token in [
-            ("1874097816571961839", "4jjngwkifa"),
-            ("1674700676612386816", "42586mwa3uv"),
-            ("1877747914073620506", "4jv4aahw36n"),
-            ("1876710769913450647", "4jruzjz5lux"),
-            ("1346554693649113090", "39ibqxei7mo"),]:
-            generated_token = self.archiver.generate_token(tweet_id)
-            self.assertEqual(real_token, generated_token)
+        generated_token = self.archiver.generate_token(tweet_id)
+        assert expected_token == generated_token
 
     @pytest.mark.download
-    def test_youtube_dlp_archiver(self):
+    def test_youtube_dlp_archiver(self, make_item):
 
         url = "https://x.com/bellingcat/status/1874097816571961839"
-        post = self.archiver.download_yt_dlp(self.create_item(url), url, "1874097816571961839")
+        post = self.archiver.download_yt_dlp(make_item(url), url, "1874097816571961839")
         assert post
         self.assertValidResponseMetadata(
             post,
@@ -96,35 +70,35 @@
         )
 
     @pytest.mark.download
-    def test_syndication_archiver(self):
+    def test_syndication_archiver(self, make_item):
 
         url = "https://x.com/bellingcat/status/1874097816571961839"
-        post = self.archiver.download_syndication(self.create_item(url), url, "1874097816571961839")
-        self.assertTrue(post)
+        post = self.archiver.download_syndication(make_item(url), url, "1874097816571961839")
+        assert post
         self.assertValidResponseMetadata(
             post,
             "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
             datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
         )
 
     @pytest.mark.download
-    def test_download_nonexistent_tweet(self):
+    def test_download_nonexistent_tweet(self, make_item):
         # this tweet does not exist
         url = "https://x.com/Bellingcat/status/17197025860711058"
-        response = self.archiver.download(self.create_item(url))
-        self.assertFalse(response)
+        response = self.archiver.download(make_item(url))
+        assert not response
 
     @pytest.mark.download
-    def test_download_malformed_tweetid(self):
+    def test_download_malformed_tweetid(self, make_item):
         # this tweet ID is malformed
         url = "https://x.com/Bellingcat/status/1719702586071100058"
-        response = self.archiver.download(self.create_item(url))
-        self.assertFalse(response)
+        response = self.archiver.download(make_item(url))
+        assert not response
 
     @pytest.mark.download
-    def test_download_tweet_no_media(self):
+    def test_download_tweet_no_media(self, make_item):
 
-        item = self.create_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
+        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
         post = self.archiver.download(item)
 
         self.assertValidResponseMetadata(
@@ -135,36 +109,32 @@ def test_download_tweet_no_media(self):
         )
 
     @pytest.mark.download
-    def test_download_video(self):
+    def test_download_video(self, make_item):
         url = "https://x.com/bellingcat/status/1871552600346415571"
 
-        post = self.archiver.download(self.create_item(url))
+        post = self.archiver.download(make_item(url))
         self.assertValidResponseMetadata(
             post,
             "This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
             datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
         )
 
+    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
     @pytest.mark.download
-    def test_download_sensitive_media(self):
-
-        """Download tweets with sensitive media
-        Note: currently failing, youtube-dlp requres logged in users + download_syndication requires logging in"""
-
-        test_data = [
+    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
         ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
         ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
         ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-        ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash")
-        ]
-
-        for url, title, timestamp, image_hash in test_data:
-            post = self.archiver.download(self.create_item(url))
-            self.assertValidResponseMetadata(
-                post,
-                title,
-                timestamp
-            )
-            assert len(post.media) == 1
-            assert post.media[0].hash == image_hash
+        ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+    ])
+    def test_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
+
+        """Download tweets with sensitive media"""
+
+        post = self.archiver.download(make_item(url))
+        self.assertValidResponseMetadata(
+            post,
+            title,
+            timestamp
+        )
+        assert len(post.media) == 1
+        assert post.media[0].hash == image_hash
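Beyond the unittest-to-pytest assertion swaps, the parametrize conversions change failure behaviour: each tuple now runs as its own test case with its own node ID, so a single failing URL no longer short-circuits the remaining cases the way the old for-loops did, and the known-broken sensitive-media cases are recorded as expected failures via `xfail` rather than leaving a permanently red test.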
12 changes: 12 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,12 @@
+import pytest
+from auto_archiver.core.metadata import Metadata
+
+@pytest.fixture
+def make_item():
+    def _make_item(url: str, **kwargs) -> Metadata:
+        item = Metadata().set_url(url)
+        for key, value in kwargs.items():
+            item.set(key, value)
+        return item
+
+    return _make_item
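A quick sketch of the factory fixture in use — the URL and title are illustrative, and `Metadata.get` is the same accessor the CSV test below relies on:

    def test_make_item_sets_fields(make_item):
        # pytest injects the factory; each call builds a fresh Metadata item
        item = make_item("https://example.com", title="Example")
        assert item.get("title") == "Example"

Returning an inner function is the standard pytest factory-fixture pattern: a single fixture can then build as many items per test as needed.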
34 changes: 12 additions & 22 deletions tests/databases/test_csv_db.py
@@ -1,32 +1,22 @@
-import tempfile
-import os
-import unittest
-
 from auto_archiver.databases.csv_db import CSVDb
 from auto_archiver.core import Metadata
 
 
-class TestCSVdb(unittest.TestCase):
-
-    def setUp(self):
-        _, temp_db = tempfile.mkstemp(suffix="csv")
-        self.temp_db = temp_db
-
-    def tearDown(self):
-        os.remove(self.temp_db)
-
-    def test_store_item(self):
-        db = CSVDb({
-            "csv_db": {"csv_file": self.temp_db}
-        })
-
-        item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
-
-        db.done(item)
-
-        with open(self.temp_db, "r") as f:
-            assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
-
-        # TODO: csv db doesn't have a fetch method - need to add it (?)
-        # assert db.fetch(item) == item
+def test_store_item(tmp_path):
+    """Tests storing an item in the CSV database"""
+
+    temp_db = tmp_path / "temp_db.csv"
+    db = CSVDb({
+        "csv_db": {"csv_file": temp_db.as_posix()}
+    })
+
+    item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
+
+    db.done(item)
+
+    with open(temp_db, "r", encoding="utf-8") as f:
+        assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
+
+    # TODO: csv db doesn't have a fetch method - need to add it (?)
+    # assert db.fetch(item) == item
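`tmp_path` is one of pytest's built-in fixtures: it supplies a fresh `pathlib.Path` directory for each test, and pytest manages its cleanup, which is what lets this test shed the `tempfile`/`os` imports, the class wrapper, and the `setUp`/`tearDown` pair. `as_posix()` simply converts the path to the string the `csv_file` config expects.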
(The two remaining file diffs did not load in the page capture.)
