ReadDataSet.py
from __future__ import print_function

import os

import nltk


def readD(txtdoc):
    """Reads the given dataset and writes its answer text, one sentence
    per line, into <basename>.ready."""
    # Find the base name of the input file.
    base = os.path.basename(txtdoc)
    # Read the file.
    with open(txtdoc, "r") as myfile:
        text = myfile.readlines()

    def clean(raw):
        # Strip escaped newlines and HTML remnants and normalise
        # punctuation so the sentence tokenizer splits more reliably.
        return (raw.replace("\\n", " ")
                   .replace(";", ".")
                   .replace("<br />\\n", "")
                   .replace("\n", " ")
                   .replace("...", ".")
                   .replace("<", " ")
                   .replace("<.br />.", ""))

    # Write the extracted sentences into the output document.
    with open(base + ".ready", "w") as f:
        # Counts the sentences written so far.
        a = 0
        # For every line: extract the relevant text from the dataset.
        for line in text:
            # Drop the surrounding <bestanswer>...</bestanswer> or
            # <answer_item>...</answer_item> tags.
            if line.startswith("<bestanswer>"):
                cleansentence = clean(line[12:-13])
            elif line.startswith("<answer_item>"):
                cleansentence = clean(line[13:-14])
            else:
                continue
            # Split the line into sentences.
            sentences = nltk.sent_tokenize(cleansentence)
            # Write each sentence into the document on its own line.
            for sentence in sentences:
                f.write(sentence + "\n")
                a += 1
            # Progress counter, overwritten in place.
            print(str(a), end='\r')
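

# A minimal usage sketch, not part of the original script: the input file
# name below is a hypothetical example; readD expects a text file whose
# lines wrap answers in <bestanswer>...</bestanswer> or
# <answer_item>...</answer_item> tags. nltk.sent_tokenize needs the
# "punkt" tokenizer models, so we fetch them first in case they are missing.
if __name__ == "__main__":
    nltk.download("punkt", quiet=True)  # no-op if already installed
    readD("dataset.txt")  # hypothetical input; writes dataset.txt.ready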