From 10d61f0ace150cb7aa207726969b1eef4f1a1ccc Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Fri, 22 Jul 2022 10:00:22 +0200
Subject: [PATCH] Twitter - add language to corpus
---
orangecontrib/text/tests/test_twitter.py | 31 ++++++++++-
orangecontrib/text/twitter.py | 67 +++++++++++++-----------
2 files changed, 65 insertions(+), 33 deletions(-)
diff --git a/orangecontrib/text/tests/test_twitter.py b/orangecontrib/text/tests/test_twitter.py
index 0bd18421c..78443a1da 100644
--- a/orangecontrib/text/tests/test_twitter.py
+++ b/orangecontrib/text/tests/test_twitter.py
@@ -247,7 +247,7 @@ def test_query_authors(self, mock, user_mock):
def test_load_data(self):
corpus = self.client.search_content(["orange"])
self.assertEqual(4, len(corpus))
- self.assertTupleEqual(tuple(m[0] for m in twitter.METAS), corpus.domain.metas)
+ self.assertTupleEqual(tuple(m[0]() for m in twitter.METAS), corpus.domain.metas)
df = table_to_frame(corpus, include_metas=True)
pd.testing.assert_frame_equal(
@@ -261,7 +261,7 @@ def test_load_authors(self, user_mock):
corpus = self.client.search_authors(["orange"])
self.assertEqual(4, len(corpus))
- self.assertTupleEqual(tuple(m[0] for m in twitter.METAS), corpus.domain.metas)
+ self.assertTupleEqual(tuple(m[0]() for m in twitter.METAS), corpus.domain.metas)
df = table_to_frame(corpus, include_metas=True)
pd.testing.assert_frame_equal(
@@ -369,6 +369,33 @@ def test_author_not_existing(self, user_patch):
with self.assertRaises(NoAuthorError):
self.client.search_authors(["orange"], collecting=True)
+ @patch("tweepy.Client.get_user")
+ def test_tweets_language(self, user_mock):
+ user_mock.return_value = MagicMock(data=MagicMock(id=1))
+
+ with patch("tweepy.Paginator", DummyPaginator(tweets, users, places)):
+ # language should be None when returned tweets have different languages
+ corpus = self.client.search_content(["orange"])
+ self.assertIsNone(corpus.language)
+
+ # corpus language should be the same as the language in the request
+ corpus = self.client.search_content(["orange"], lang="en")
+ self.assertEqual("en", corpus.language)
+
+ # language should be None when returned tweets have different languages
+ corpus = self.client.search_content(["orange"])
+ self.assertIsNone(corpus.language)
+
+ with patch(
+ "tweepy.Paginator", DummyPaginator([tweets[0], tweets[2]], users, places)
+ ):
+ # corpus language should be the same as the language in the request
+ corpus = self.client.search_content(["orange"])
+ self.assertEqual("en", corpus.language)
+
+ corpus = self.client.search_content(["orange"])
+ self.assertEqual("en", corpus.language)
+
if __name__ == "__main__":
unittest.main()
diff --git a/orangecontrib/text/twitter.py b/orangecontrib/text/twitter.py
index d2aa959ba..1b6494130 100644
--- a/orangecontrib/text/twitter.py
+++ b/orangecontrib/text/twitter.py
@@ -10,13 +10,13 @@
Domain,
StringVariable,
TimeVariable,
+ Variable,
)
from Orange.util import dummy_callback, wrap_callback
from tweepy import TooManyRequests
from orangecontrib.text import Corpus
-from orangecontrib.text.language import ISO2LANG
-
+from orangecontrib.text.language import ISO2LANG, infer_language_from_variable
log = logging.getLogger(__name__)
@@ -46,57 +46,61 @@ def country_code(tweet, _, places):
return places[place_id].country_code if place_id else ""
-tv = TimeVariable("Date")
METAS = [
- (StringVariable("Content"), lambda doc, _, __: doc.text),
+ (partial(StringVariable, "Content"), lambda doc, _, __: doc.text),
(
- DiscreteVariable("Author"),
+ partial(DiscreteVariable, "Author"),
lambda doc, users, _: "@" + users[doc.author_id].username,
),
- (tv, lambda doc, _, __: tv.parse(doc.created_at.isoformat())),
- (DiscreteVariable("Language"), lambda doc, _, __: doc.lang),
- (DiscreteVariable("Location"), country_code),
+ # Twitter API returns values in UTC; since the Date variable is created later we
+ # don't use TimeVariable.parse but transform to UNIX timestamp manually
+ (partial(TimeVariable, "Date"), lambda doc, _, __: doc.created_at.timestamp()),
+ (partial(DiscreteVariable, "Language"), lambda doc, _, __: doc.lang),
+ (partial(DiscreteVariable, "Location"), country_code),
(
- ContinuousVariable("Number of Likes", number_of_decimals=0),
+ partial(ContinuousVariable, "Number of Likes", number_of_decimals=0),
lambda doc, _, __: doc.public_metrics["like_count"],
),
(
- ContinuousVariable("Number of Retweets", number_of_decimals=0),
+ partial(ContinuousVariable, "Number of Retweets", number_of_decimals=0),
lambda doc, _, __: doc.public_metrics["retweet_count"],
),
(
- DiscreteVariable("In Reply To"),
+ partial(DiscreteVariable, "In Reply To"),
lambda doc, users, _: "@" + users[doc.in_reply_to_user_id].username
if doc.in_reply_to_user_id and doc.in_reply_to_user_id in users
else "",
),
- (DiscreteVariable("Author Name"), lambda doc, users, __: users[doc.author_id].name),
(
- StringVariable("Author Description"),
+ partial(DiscreteVariable, "Author Name"),
+ lambda doc, users, __: users[doc.author_id].name,
+ ),
+ (
+ partial(StringVariable, "Author Description"),
lambda doc, users, _: users[doc.author_id].description,
),
(
- ContinuousVariable("Author Tweets Count", number_of_decimals=0),
+ partial(ContinuousVariable, "Author Tweets Count", number_of_decimals=0),
lambda doc, users, _: users[doc.author_id].public_metrics["tweet_count"],
),
(
- ContinuousVariable("Author Following Count", number_of_decimals=0),
+ partial(ContinuousVariable, "Author Following Count", number_of_decimals=0),
lambda doc, users, _: users[doc.author_id].public_metrics["following_count"],
),
(
- ContinuousVariable("Author Followers Count", number_of_decimals=0),
+ partial(ContinuousVariable, "Author Followers Count", number_of_decimals=0),
lambda doc, users, _: users[doc.author_id].public_metrics["followers_count"],
),
(
- ContinuousVariable("Author Listed Count", number_of_decimals=0),
+ partial(ContinuousVariable, "Author Listed Count", number_of_decimals=0),
lambda doc, users, _: users[doc.author_id].public_metrics["listed_count"],
),
(
- DiscreteVariable("Author Verified"),
+ partial(DiscreteVariable, "Author Verified"),
lambda doc, users, _: str(users[doc.author_id].verified),
),
- (ContinuousVariable("Longitude"), partial(coordinates, dim=0)),
- (ContinuousVariable("Latitude"), partial(coordinates, dim=1)),
+ (partial(ContinuousVariable, "Longitude"), partial(coordinates, dim=0)),
+ (partial(ContinuousVariable, "Latitude"), partial(coordinates, dim=1)),
]
# maximum number of tweets that can be downloaded in one set of requests
# max 450requests/15min, request can contain max 100 tweets
@@ -128,10 +132,6 @@ class TwitterAPI:
call `reset` method before searching or provide `collecting=False`
argument to search method.
"""
-
- text_features = [METAS[0][0]] # Content
- string_attributes = [m for m, _ in METAS if isinstance(m, StringVariable)]
-
def __init__(self, bearer_token):
self.api = tweepy.Client(bearer_token)
self.tweets = {}
@@ -187,7 +187,7 @@ def build_query():
)
count = self._fetch(paginator, max_tweets, callback=callback)
self.append_history("Content", content, lang or "Any", allow_retweets, count)
- return self._create_corpus()
+ return self._create_corpus(lang)
def search_authors(
self,
@@ -241,10 +241,10 @@ def _fetch(
count = 0
try:
done = False
- for i, response in enumerate(paginator):
+ for response in paginator:
users = {u.id: u for u in response.includes.get("users", [])}
places = {p.id: p for p in response.includes.get("places", [])}
- for j, tweet in enumerate(response.data or [], start=1):
+ for tweet in response.data or []:
if tweet.id not in self.tweets:
count += 1
self.tweets[tweet.id] = [f(tweet, users, places) for _, f in METAS]
@@ -258,7 +258,7 @@ def _fetch(
log.debug("TooManyRequests raised")
return count
- def _create_corpus(self) -> Optional[Corpus]:
+ def _create_corpus(self, language: Optional[str] = None) -> Optional[Corpus]:
if len(self.tweets) == 0:
return None
@@ -267,19 +267,24 @@ def to_val(attr, val):
attr.val_from_str_add(val)
return attr.to_val(val)
- m = [attr for attr, _ in METAS]
+ m = [attr() for attr, _ in METAS]
domain = Domain(attributes=[], class_vars=[], metas=m)
metas = np.array(
[
- [to_val(attr, t) for (attr, _), t in zip(METAS, ts)]
+ [to_val(attr, t) for attr, t in zip(m, ts)]
for ts in self.tweets.values()
],
dtype=object,
)
x = np.empty((len(metas), 0))
- return Corpus.from_numpy(domain, x, metas=metas, text_features=self.text_features)
+ language_var = domain["Language"]
+ assert isinstance(language_var, DiscreteVariable)
+ language = language or infer_language_from_variable(language_var)
+ return Corpus.from_numpy(
+ domain, x, metas=metas, text_features=[domain["Content"]], language=language
+ )
def append_history(
self,