import csv
import logging
import re
import sys

import numpy as np
import spacy

# Install the ScispaCy model with:
#   pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
nlp = spacy.load("en_core_sci_lg")  # ScispaCy model en_core_sci_lg


def get_entities(doc):
    '''
    Retrieves entities from a sequence of ScispaCy Token objects.

    Parameters
    ----------
    doc: spacy.tokens.doc.Doc
        Sequence of Token objects from ScispaCy.

    Returns
    -------
    ents: list
        A list of identified entities.
    '''
    ents = list(doc.ents)
    return ents
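
# Example usage (a sketch; the exact entities returned depend on the
# en_core_sci_lg model):
#   doc = nlp("Myeloid derived suppressor cells (MDSC) are immature myeloid cells.")
#   get_entities(doc)  # e.g. [Myeloid derived suppressor cells, MDSC, ...]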


def get_tokens(text):
    '''
    Tokenize the text.

    Parameters
    ----------
    text: str
        Plain text that is to be tokenized.

    Returns
    -------
    tokens: list
        A list of tokens as strings.
    '''
    tokens = []
    doc = nlp(text)
    # Get entities in the text.
    text_ents = get_entities(doc)
    # Keep track of how many entities have been consumed.
    ent_count = 0
    for token in doc:
        # If the token is the beginning of an entity, add the entity's
        # individual words to the token list.
        if token.ent_iob_ == "B":
            ent = str(text_ents[ent_count])
            tokens += ent.split()
            ent_count += 1
        # Skip tokens inside an entity; their words were already added
        # at the entity's "B" token.
        elif token.ent_iob_ == "I":
            continue
        # Add all words outside of entities as plain strings.
        else:
            tokens.append(token.text)
    return tokens
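
# Example usage (a sketch; how words are grouped into entities depends on the
# loaded model):
#   get_tokens("breast cancer is treated with tamoxifen.")
#   could return ['breast', 'cancer', 'is', 'treated', 'with', 'tamoxifen', '.']
# Multi-word entities are split back into their words, so the result is always
# a flat list of strings.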


def preprocessPhrases(filepathIn=None, filepathOut=None):
    '''
    Transforms a TREC or RELISH tsv file to lowercase and removes all special
    characters aside from the hyphen. Saves the transformed TREC or RELISH
    data in .npy format as a multi-dimensional array.

    Parameters
    ----------
    filepathIn: str
        The input filepath of the RELISH or TREC tsv file to be transformed.
    filepathOut: str
        The output filepath for the transformed .npy file.
    '''
    if not isinstance(filepathIn, str):
        logging.warning("Wrong parameter type for preprocessPhrases.")
        sys.exit("filepathIn needs to be of type string")
    elif not isinstance(filepathOut, str):
        logging.warning("Wrong parameter type for preprocessPhrases.")
        sys.exit("filepathOut needs to be of type string")
    else:
        # Matches any character that is a letter, a digit or a hyphen.
        letters_pattern = r'[a-zA-Z\d\-]'
        outputList = []
        with open(filepathIn) as infile:
            inputFile = csv.reader(infile, delimiter="\t")
            headerline = True
            for line in inputFile:
                if headerline:
                    headerline = False
                else:
                    title = get_tokens(line[1].lower())
                    abstract = get_tokens(line[2].lower())
                    # Strip every character that is not a letter, digit or hyphen.
                    title = ["".join(c for c in word if re.match(letters_pattern, c))
                             for word in title]
                    abstract = ["".join(c for c in word if re.match(letters_pattern, c))
                                for word in abstract]
                    # Drop tokens that became empty after cleaning.
                    cleanedTitle = [word for word in title if word != ""]
                    cleanedAbstract = [word for word in abstract if word != ""]
                    outputList.append([np.asanyarray(line[0]),
                                       np.asanyarray(cleanedTitle),
                                       np.asanyarray(cleanedAbstract)])
        # The rows are ragged, so they have to be stored as an object array.
        np.save(filepathOut, np.asanyarray(outputList, dtype=object))
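

# Minimal usage sketch, assuming a tab-separated input file whose columns are
# an ID, a title and an abstract (both file paths below are hypothetical):
if __name__ == "__main__":
    preprocessPhrases(filepathIn="data/RELISH_documents.tsv",
                      filepathOut="data/RELISH_tokens.npy")
    # The saved object array can be loaded again with:
    #   np.load("data/RELISH_tokens.npy", allow_pickle=True)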