-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpull_data.py
83 lines (68 loc) · 2.76 KB
/
pull_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import urllib
#import lzma
from tqdm import tqdm #from tqdm import tqdm_notebook as tqdm
import pandas as pd
import json
import requests
import traceback
import subprocess
import os
# --- URL pieces -------------------------------------------------------------
# Base of the Pushshift file server and the path segments appended to it.
PUSHSHIFT_URL = "https://files.pushshift.io/"
REDDIT = "reddit"
SUBMISSIONS = "submissions"
COMMENTS = "comments"  # not referenced in the visible code

# --- Dump file name prefixes --------------------------------------------------
# Each prefix ends in the century ("20"); the two-digit year and the month are
# appended to it, e.g. RS + "05" + "-06" -> "RS_2005-06".
RS = "RS_20"
RS_V2 = "RS_v2_20"
RC = "RC_20"  # not referenced in the visible code

# --- File suffixes -------------------------------------------------------------
XZ = ".xz"
ZST = ".zst"
BZ = ".bz2"
JSON = ".json"

# --- Destinations --------------------------------------------------------------
TMP = "/tmp/"  # not referenced in the visible code
# Command prefix that uploads stdin to the bucket; the object name is appended.
AWS = "aws s3 cp - s3://pushshift-data/"

# --- Year range (two-digit years, both ends inclusive) --------------------------
START_YEAR = 5
END_YEAR = 19
def get_submissions():
    """Stream every monthly Reddit submissions dump from Pushshift to S3.

    Iterates over the two-digit years ``START_YEAR``..``END_YEAR`` (inclusive)
    and months 1..12, skipping the months before 2005-06. For each month it
    runs a ``curl | <decompressor> | aws s3 cp -`` shell pipeline, so the
    archive is downloaded, decompressed and uploaded in one stream.

    Problems are reported on stdout and never raised, so one failing month
    does not stop the remaining transfers.
    """
    # Shell command that decompresses stdin to stdout, keyed by archive suffix.
    decomp_commands = {
        XZ: "xz --decompress",
        BZ: "bzip2 -d",
        ZST: "zstd --decompress",
    }

    def _archive_format(year, month):
        """Return ``(suffix, name_prefix)`` of the dump for a two-digit year and month."""
        if year <= 10:  # pre-2011 everything is xz encoded and named RS_v2_*
            return XZ, RS_V2
        if year <= 14:  # 2011-2014 inclusive is bz2 encoded
            return BZ, RS
        if year <= 16:  # 2015-2016 inclusive is zst encoded
            return ZST, RS
        if year == 17 and month <= 11:  # 2017-01 through 2017-11 are bz2 encoded
            return BZ, RS
        # 2017-12 through 2018-10 inclusive are xz encoded. Years are two-digit
        # here, so the comparison must be against 17, not 2017.
        if (year == 17 and month == 12) or (year == 18 and month <= 10):
            return XZ, RS
        return ZST, RS  # from 2018-11 to current is zst encoded

    def _get_file(year, month, url, how=XZ, rs=RS):
        """Pipe one monthly dump from ``url`` through its decompressor into S3.

        Args:
            year: Two-digit year; zero-padded and appended to ``rs``.
            month: Month number 1..12; zero-padded.
            url: Directory URL (with trailing slash) that holds the archive.
            how: Archive suffix, one of ``XZ``, ``BZ`` or ``ZST``.
            rs: File name prefix, e.g. ``RS`` or ``RS_V2``.
        """
        file_ = '{}{}-{}'.format(rs, str(year).zfill(2), str(month).zfill(2))
        compressed_file = "{}{}".format(file_, how)
        decompressed_file = "{}{}".format(file_, JSON)
        endpoint = '{}{}'.format(url, compressed_file)
        print("File: {}".format(file_))
        print("Compressed file: {}".format(compressed_file))
        print("Decompressed File: {}".format(decompressed_file))
        print("Endpoint: {}".format(endpoint))

        decomp_command = decomp_commands.get(how)
        if decomp_command is None:
            # Report an unsupported suffix explicitly instead of relying on an
            # UnboundLocalError further down.
            print("ERROR: unsupported compression {!r} for {}".format(how, file_))
            return

        # Example of the resulting command:
        # curl https://files.pushshift.io/reddit/submissions/RS_v2_2008-01.xz | xz --decompress | aws s3 cp - s3://mybucket/stream.txt
        pipeline = "curl {} | {} | {}{}".format(
            endpoint, decomp_command, AWS, decompressed_file)
        try:
            status = os.system(pipeline)
        except Exception as e:
            # Best effort: report and carry on with the next month.
            traceback.print_exc()
            print("ERROR: {}".format(e))
            return
        if status != 0:
            # The shell reports the status of the last pipeline stage only, so
            # this catches a failed upload but not necessarily a failed download.
            print("ERROR: transfer of {} exited with status {}".format(
                compressed_file, status))

    url = '{}{}/{}/'.format(PUSHSHIFT_URL, REDDIT, SUBMISSIONS)
    for year in tqdm(range(START_YEAR, END_YEAR + 1)):
        for month in tqdm(range(1, 13)):
            if (year == 5) and (month <= 5):
                continue  # the first file is 2005-06
            how, rs = _archive_format(year, month)
            _get_file(year, month, url, how, rs)
            # Blank lines separate the log output of consecutive files.
            print('\n\n\n')
# Script entry point: start the submissions transfer only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    get_submissions()