1
+ <?php
2
+ /**
3
+ * @Author: Nate Bosscher (c) 2015
4
+ * @Date: 2016-03-22 16:21:30
5
+ * @Last Modified by: Nate Bosscher
6
+ * @Last Modified time: 2016-03-22 18:38:22
7
+ */
8
+
9
+ namespace Crawler ;
10
+
11
/**
 * Crawls every page that lives on the same host as, and at or below the path
 * of, a base URL, and writes each downloaded page into an output directory.
 *
 * Fatal problems (invalid base URL, directories that cannot be created) are
 * reported on stdout and terminate the script with exit(-1).
 */
class Crawler
{
    /** @var array urls still waiting to be downloaded (url => 1) */
    private $_activeUrls = array();

    /** @var array urls that have already been downloaded (url => 1) */
    private $_downloadedUrls = array();

    /** @var bool value last passed to set_DownloadImages() */
    private $_downloadImages = false;

    /** @var bool when true, "#fragment" suffixes are stripped before a url is queued */
    private $_removeHashTrail = true;

    /** @var string the base url handed to the constructor */
    public $base;

    /** @var string "scheme://host" of the base url */
    public $host;

    /** @var string resolved path component of the base url ("/" when it has none) */
    public $path;

    /** @var string directory the downloaded pages are written to */
    public $output;

    /**
     * @param string       $base_name  the base url to crawl from
     *                                 (e.g. http://www.google.ca or http://www.google.ca/dogs)
     * @param string|false $output_dir directory to write pages to; defaults to
     *                                 "<directory of this file>/output"
     */
    public function __construct($base_name, $output_dir = false)
    {
        $this->base = $base_name;

        // Validate the pieces separately: the concatenated "scheme://host"
        // string is never empty, so testing it afterwards cannot detect a
        // url that failed to parse.
        $scheme = parse_url($this->base, PHP_URL_SCHEME);
        $host = parse_url($this->base, PHP_URL_HOST);
        if (!$scheme || !$host) {
            $this->fail("base_name '$base_name' is not valid. Couldn't parse host name");
        }
        $this->host = $scheme . "://" . $host;

        // A url path is not a filesystem path, so it is normalised with
        // resolvePath() rather than realpath() (which would look on disk).
        $path = parse_url($this->base, PHP_URL_PATH);
        if (!is_string($path) || $path === "") {
            $path = "/";
        }
        $this->path = $this->resolvePath($path);
        if ($this->path === null) {
            $this->fail("base_name '$base_name' is not valid. Couldn't parse path");
        }

        if (!$output_dir) {
            $output_dir = __DIR__ . "/output";
        }
        $this->output = $output_dir;

        if (!is_dir($this->output) && !mkdir($this->output, 0777, true)) {
            $this->fail("Couldn't make directory '{$this->output}'");
        }

        $this->addUrlIfValid($this->base);
    }

    /**
     * Turns image downloading on or off. When turned on, makes sure the
     * "<output>/images" directory exists.
     *
     * @param bool $tf
     */
    public function set_DownloadImages($tf = false)
    {
        $this->_downloadImages = $tf;

        if ($this->_downloadImages == true) {
            $imagesDir = $this->output . "/images";

            // Only create the directory when it is missing, so calling this
            // twice (or re-using an output directory) is not an error.
            if (!is_dir($imagesDir) && !mkdir($imagesDir, 0777, true)) {
                $this->fail("Couldn't make directory '$imagesDir'");
            }
        }
    }

    /**
     * Downloads queued urls until the queue is empty. Each page is written to
     * the file given by fileNameFromUrl(), and every url found on the page is
     * offered to addUrlIfValid().
     */
    public function run()
    {
        while (count($this->_activeUrls) > 0) {
            // Iterate over a snapshot of the keys: the queue is modified
            // inside the loop, and urls added meanwhile are picked up by the
            // next pass of the outer while.
            foreach (array_keys($this->_activeUrls) as $url) {
                echo "(downloading " . basename($url) . ")\n";

                $page = new CrawlPage($url);
                $page->download();

                $this->_downloadedUrls[$url] = 1;
                unset($this->_activeUrls[$url]);

                foreach ($page->fetchUrls() as $found) {
                    $this->addUrlIfValid($found);
                }

                $filename = $this->fileNameFromUrl($url);
                $dirname = dirname($filename);

                if (!is_dir($dirname) && !mkdir($dirname, 0777, true)) {
                    $this->fail("Couldn't create directory '" . $dirname . "'");
                }

                file_put_contents($filename, $page->getContents());

                // update status
                echo "\033[A\033[2K"; // cursor up one line, clear it
                echo "Downloaded " . count($this->_downloadedUrls) . " pages";
            }
        }

        echo "\n";
        echo "Done!\n\n";
    }

    /**
     * Converts the given url to a local filename inside the output directory.
     * Urls that end in a slash (directory indexes) are stored as
     * "__root__.html" inside the matching sub-directory.
     *
     * @param string $url
     * @return string
     */
    public function fileNameFromUrl($url)
    {
        // remove the leading scheme only (anchored, so an "http://" that
        // appears later in the url is left alone)
        $url = preg_replace("#^https?://#", "", $url);

        // if no slashes exist, add a trailing slash
        if (strpos($url, "/") === false) {
            $url .= "/";
        }

        // remove domain name
        $upath = substr($url, strpos($url, "/"));

        // Keep the directory part so that "/a/" and "/b/" do not both end up
        // in the same "<output>/__root__.html".
        if (substr($upath, -1) == "/") {
            $upath .= "__root__.html";
        }

        return $this->output . $upath;
    }

    /**
     * Resolves "../" and "./" segments and collapses repeated slashes
     * ("dir//subdir").
     *
     * A leading slash is preserved. A trailing slash is preserved, and a path
     * that ends in a "." or ".." segment is treated as a directory and gets
     * one (e.g. "/a/b/.." becomes "/a/").
     *
     * @param string $path
     * @return string|null the resolved path, or null when ".." would climb
     *                     above the start of the path
     */
    public function resolvePath($path)
    {
        $segments = explode("/", $path);
        $isAbsolute = strlen($path) > 0 && $path[0] === "/";

        $last = $segments[count($segments) - 1];
        $isDirectory = ($last === "" || $last === "." || $last === "..");

        $resolved = array();
        foreach ($segments as $segment) {
            if ($segment === "" || $segment === ".") {
                // empty segments come from leading/trailing/repeated slashes
                continue;
            }

            if ($segment === "..") {
                if (count($resolved) === 0) {
                    return null;
                }
                array_pop($resolved);
                continue;
            }

            $resolved[] = $segment;
        }

        $result = ($isAbsolute ? "/" : "") . implode("/", $resolved);
        if ($isDirectory && count($resolved) > 0) {
            $result .= "/";
        }

        return $result;
    }

    /**
     * Adds the url to _activeUrls if it has the same scheme and host as
     * $this->host, its path is equal to or below $this->path, and it has not
     * been downloaded already.
     *
     * @param string $url
     */
    private function addUrlIfValid($url)
    {
        // remove trailing hash
        if ($this->_removeHashTrail) {
            $url = preg_replace("/#.*$/", "", $url);
        }

        // check that host is the same
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);
        if (!$scheme || !$host || $scheme . "://" . $host !== $this->host) {
            return;
        }

        // check that path is the same or below the path specified
        $path = parse_url($url, PHP_URL_PATH);
        // ensure that there's at least a slash
        if (!is_string($path) || $path === "") {
            $path = "/";
        }

        $path = $this->resolvePath($path);
        if ($path === null || !$this->isWithinBasePath($path)) {
            return;
        }

        // already downloaded
        if (isset($this->_downloadedUrls[$url])) {
            return;
        }

        $this->_activeUrls[$url] = 1;
    }

    /**
     * Tells whether a resolved url path is equal to, or below, $this->path.
     *
     * The comparison is made on a segment boundary so that "/dogsled" is not
     * considered to be below "/dogs".
     *
     * @param string $path
     * @return bool
     */
    private function isWithinBasePath($path)
    {
        $base = rtrim($this->path, "/");

        if ($path === $base) {
            return true;
        }

        // strpos() returns false when there is no match, so the result has to
        // be compared with ===.
        return strpos($path, $base . "/") === 0;
    }

    /**
     * Prints the message followed by "Exiting..." and terminates the script.
     *
     * @param string $message
     */
    private function fail($message)
    {
        echo $message . "\nExiting...\n";
        exit(-1);
    }
}
0 commit comments