-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathchunk_by_authors.py
63 lines (40 loc) · 1.06 KB
/
chunk_by_authors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
'''
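Randomly split a file into a given number of chunks, by authors:
consecutive rows that share an author are written to the same chunk.
Usage: chunk_by_authors.py <orig train file> <input file> <number of chunks> [<seed>]
Both input files have a header row and are read in parallel, row by row;
the author is taken from the first column of <orig train file>.
Output files are named <input basename>_<chunk number><ext> and have no headers.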
'''
import sys, random, os, csv
def _open_csv(path):
    '''Open *path* the way the csv module expects on the running Python version.'''
    if sys.version_info[0] >= 3:
        # Python 3: text mode, newline translation left to the csv module.
        return open(path, newline='')
    # Python 2: the csv module wants a binary-mode file.
    return open(path, 'rb')


def chunk_by_authors(orig_train_file, input_file, num_chunks, seed=None,
                     output_dir=None):
    '''Split the rows of input_file into num_chunks files, grouped by author.

    Both files are read in parallel after skipping one header line each.
    The author of a row is the first column of the matching row in
    orig_train_file. Whenever the author differs from the previous row's
    author, a chunk is drawn at random; every row is written, unchanged,
    to the most recently drawn chunk.

    Parameters:
        orig_train_file: path of the CSV file whose first column is the author.
        input_file: path of the file to split; its base name and extension
            are used to build the output names "<name>_<n><ext>".
        num_chunks: number of output files (at least 1).
        seed: optional value passed to random.seed(); None leaves the
            generator state untouched.
        output_dir: directory for the output files; None means the
            current working directory.

    Returns:
        The number of rows written.

    Raises:
        ValueError: if num_chunks is below 1, or if orig_train_file has
            fewer data rows than input_file.
    '''
    if num_chunks < 1:
        raise ValueError('number of chunks must be at least 1')
    if seed is not None:
        random.seed(seed)
    if output_dir is None:
        output_dir = os.curdir

    stem, ext = os.path.splitext(os.path.basename(input_file))

    outputs = []
    counter = 0
    try:
        for n in range(num_chunks):
            output_file = os.path.join(output_dir, '%s_%s%s' % (stem, n, ext))
            outputs.append(open(output_file, 'wb'))

        # The input is read as bytes so that lines are copied verbatim into
        # the binary-mode output files on both Python 2 and Python 3.
        with _open_csv(orig_train_file) as orig_handle, \
                open(input_file, 'rb') as input_handle:
            orig_reader = csv.reader(orig_handle)

            # Skip the header of each file; output files carry no headers.
            next(orig_reader, None)
            next(input_handle, None)

            current_writer = None
            target = None
            for line in input_handle:
                orig_row = next(orig_reader, None)
                if orig_row is None:
                    raise ValueError(
                        '%s has fewer rows than %s'
                        % (orig_train_file, input_file))

                writer = orig_row[0]
                if target is None or writer != current_writer:
                    # New author: draw the chunk that receives its rows.
                    current_writer = writer
                    target = outputs[random.randint(0, num_chunks - 1)]

                target.write(line)

                counter += 1
                if counter % 100000 == 0:
                    print(counter)
    finally:
        # Close (and thereby flush) every chunk, even after an error.
        for handle in outputs:
            handle.close()

    return counter


def main(argv=None):
    '''Command-line entry point; see the usage message below.'''
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit('Usage: chunk_by_authors.py <orig train file> <input file> '
                 '<number of chunks> [<seed>]')

    seed = argv[4] if len(argv) > 4 else None
    # An empty seed argument is ignored, as it always has been.
    chunk_by_authors(argv[1], argv[2], int(argv[3]), seed or None)


if __name__ == '__main__':
    main()