- #!/usr/bin/python
+ #!/bin/env python
  # -*- coding: utf-8 -*-
+ from __future__ import print_function
+
+ try:
+     from urllib.parse import urlparse, urlsplit
+     from urllib.request import urlopen
+ except ImportError:
+     from urlparse import urlparse, urlsplit
+     from urllib2 import urlopen

  import pkgutil
- import urlparse
+ import base64
  import json
  import logging
- import urllib2
  from argparse import ArgumentParser

  __all__ = ['main']
@@ -38,7 +45,8 @@ def decode_gfwlist(content):
      try:
          if '.' in content:
              raise Exception()
-         return content.decode('base64')
+         content = base64.b64decode(content)
+         return content.decode('UTF-8')
      except:
          return content
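Note: a minimal sketch, not part of this commit, of why the base64 change above is needed. On Python 2, str.decode('base64') works, but Python 3 strings have no decode() method at all; base64.b64decode() exists on both interpreters and returns data that can then be decoded as UTF-8:

    import base64

    encoded = 'd3d3Lmdvb2dsZS5jb20='
    # encoded.decode('base64') only works on Python 2
    raw = base64.b64decode(encoded)   # bytes on Python 3, str on Python 2
    print(raw.decode('UTF-8'))        # prints 'www.google.com'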
@@ -48,7 +56,7 @@ def get_hostname(something):
          # quite enough for GFW
          if not something.startswith('http:'):
              something = 'http://' + something
-         r = urlparse.urlparse(something)
+         r = urlparse(something)
          return r.hostname
      except Exception as e:
          logging.error(e)
@@ -64,6 +72,7 @@ def add_domain_to_set(s, something):
  def combine_lists(content, user_rule=None):
      builtin_rules = pkgutil.get_data('gfwlist2pac',
                                       'resources/builtin.txt').splitlines(False)
+     builtin_rules = [rule.decode('UTF8') for rule in builtin_rules]
      gfwlist = content.splitlines(False)
      gfwlist.extend(builtin_rules)
      if user_rule:
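Note: a brief illustration, not from the repository, of why the decode step is added here (it assumes the gfwlist2pac package and its bundled resource are importable). pkgutil.get_data() returns bytes on Python 3, so splitlines() yields bytes items that cannot be mixed with the str rules parsed from the downloaded list:

    import pkgutil

    # bytes on Python 3, str on Python 2
    data = pkgutil.get_data('gfwlist2pac', 'resources/builtin.txt')
    # without decoding, later calls such as rule.startswith('||') would raise
    # TypeError on Python 3 because rule would be bytes, not str
    builtin_rules = [line.decode('UTF8') for line in data.splitlines(False)]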
@@ -99,12 +108,13 @@ def reduce_domains(domains):
      # reduce 'www.google.com' to 'google.com'
      # remove invalid domains
      tld_content = pkgutil.get_data('gfwlist2pac', 'resources/tld.txt')
+     tld_content = tld_content.decode('UTF-8')
      tlds = set(tld_content.splitlines(False))
      new_domains = set()
      for domain in domains:
          domain_parts = domain.split('.')
          last_root_domain = None
-         for i in xrange(0, len(domain_parts)):
+         for i in range(0, len(domain_parts)):
              root_domain = '.'.join(domain_parts[len(domain_parts) - i - 1:])
              if i == 0:
                  if not tlds.__contains__(root_domain):
@@ -121,7 +131,7 @@ def reduce_domains(domains):
      uni_domains = set()
      for domain in new_domains:
          domain_parts = domain.split('.')
-         for i in xrange(0, len(domain_parts) - 1):
+         for i in range(0, len(domain_parts) - 1):
              root_domain = '.'.join(domain_parts[len(domain_parts) - i - 1:])
              if domains.__contains__(root_domain):
                  break
@@ -133,6 +143,7 @@ def reduce_domains(domains):
  def generate_pac_fast(domains, proxy):
      # render the pac file
      proxy_content = pkgutil.get_data('gfwlist2pac', 'resources/proxy.pac')
+     proxy_content = proxy_content.decode('UTF-8')
      domains_dict = {}
      for domain in domains:
          domains_dict[domain] = 1
@@ -155,7 +166,8 @@ def grep_rule(rule):
          return None
      # render the pac file
      proxy_content = pkgutil.get_data('gfwlist2pac', 'resources/abp.js')
-     rules = filter(grep_rule, rules)
+     proxy_content = proxy_content.decode('UTF-8')
+     rules = list(filter(grep_rule, rules))
      proxy_content = proxy_content.replace('__PROXY__', json.dumps(str(proxy)))
      proxy_content = proxy_content.replace('__RULES__',
                                            json.dumps(rules, indent=2))
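Note: a hypothetical snippet, not from the repository, showing why the list() wrapper matters. On Python 3, filter() returns a lazy iterator, and json.dumps() cannot serialize a filter object; list(filter(...)) behaves identically on Python 2 and 3:

    import json

    rules = ['||example.com', '! a comment', '@@||allowed.org']
    keep = lambda rule: not rule.startswith('!')   # stand-in for grep_rule()

    # json.dumps(filter(keep, rules), indent=2) raises TypeError on Python 3
    print(json.dumps(list(filter(keep, rules)), indent=2))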
@@ -166,22 +178,23 @@ def main():
      args = parse_args()
      user_rule = None
      if (args.input):
-         with open(args.input, 'rb') as f:
+         with open(args.input, 'r') as f:
              content = f.read()
      else:
-         print 'Downloading gfwlist from %s' % gfwlist_url
-         content = urllib2.urlopen(gfwlist_url, timeout=10).read()
+         print('Downloading gfwlist from %s' % gfwlist_url)
+         content = urlopen(gfwlist_url, timeout=10).read()
+         content = content.decode('UTF-8')
      if args.user_rule:
-         userrule_parts = urlparse.urlsplit(args.user_rule)
+         userrule_parts = urlsplit(args.user_rule)
          if not userrule_parts.scheme or not userrule_parts.netloc:
              # It's not an URL, deal it as local file
-             with open(args.user_rule, 'rb') as f:
+             with open(args.user_rule, 'r') as f:
                  user_rule = f.read()
          else:
              # Yeah, it's an URL, try to download it
-             print 'Downloading user rules file from %s' % args.user_rule
-             user_rule = urllib2.urlopen(args.user_rule, timeout=10).read()
-
+             print('Downloading user rules file from %s' % args.user_rule)
+             user_rule = urlopen(args.user_rule, timeout=10).read()
+             user_rule = user_rule.decode('UTF-8')
      content = decode_gfwlist(content)
      gfwlist = combine_lists(content, user_rule)
      if args.precise:
@@ -190,7 +203,7 @@ def main():
          domains = parse_gfwlist(gfwlist)
          domains = reduce_domains(domains)
          pac_content = generate_pac_fast(domains, args.proxy)
-     with open(args.output, 'wb') as f:
+     with open(args.output, 'w') as f:
          f.write(pac_content)
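Note: a short sketch, an inference about intent rather than part of the commit, of why the output file is now opened in text mode. On Python 3 the generated pac_content is a str, and writing a str to a file opened with 'wb' raises TypeError, while 'w' accepts str on both interpreters:

    pac_content = 'var proxy = "SOCKS5 127.0.0.1:1080;";\n'  # example content

    # open(path, 'wb') would require bytes on Python 3
    with open('proxy.pac', 'w') as f:
        f.write(pac_content)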