Skip to content

Commit

Permalink
PubMed - add language to corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Jan 11, 2023
1 parent 25952ea commit 1b5c4ea
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
4 changes: 3 additions & 1 deletion orangecontrib/text/pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,10 @@ def _corpus_from_records(records, includes_metadata):

Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

# As documented at https://www.nlm.nih.gov/bsd/mms/medlineelements.html#ab,
# all abstracts are in English, so the corpus language is set to English ("en").
return Corpus.from_numpy(
domain=domain, X=np.empty((len(Y), 0)), Y=Y, metas=meta_values
domain=domain, X=np.empty((len(Y), 0)), Y=Y, metas=meta_values, language="en"
)


Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/tests/test_pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def test_date_to_iso(self):
)
self.assertEqual(type(_date_to_iso(unexpected_input)), type(np.nan))


def test_record_to_corpus(self):
mock_records = [
{
Expand Down Expand Up @@ -150,6 +149,7 @@ def test_record_to_corpus(self):
self.assertCountEqual(meta_values[0], correct_metas[0])
self.assertCountEqual(class_values, correct_classes)
self.assertIsNotNone(corpus)
self.assertEqual(corpus.language, "en")

@patch('Bio.Entrez.esearch', mock_entrez.esearch)
@patch('Bio.Entrez.read', mock_entrez.read)
Expand Down

0 comments on commit 1b5c4ea

Please sign in to comment.