Skip to content

Commit e4d119d

Browse files
committed
Initial commit
0 parents  commit e4d119d

8 files changed

+643
-0
lines changed

README.md

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# PHP Web Crawler
2+
3+
This CLI script crawls a website starting from the given URL, following only links that stay on the same domain and, when the URL includes a path, under that path. An optional output directory argument sets where the downloaded pages are saved.
4+
5+
## Usage
6+
7+
```
8+
php crawl.php <url> <optional:output-dir>
9+
```
10+
11+
## Examples
12+
13+
```
14+
php crawl.php http://www.foo.com
15+
16+
php crawl.php http://www.foo.com ~/websites/foo.com
17+
18+
php crawl.php http://www.foo.com/some-sub-dir/
19+
# will only download files within /some-sub-dir
20+
```
21+
22+
## Tests
23+
24+
Selected tests can be run using the following:
25+
Note: the tests have very little code coverage; they only exercise a few basic functions.
26+
27+
```
28+
php test/test.php
29+
```

class.crawl-page.php

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
<?php
2+
/**
3+
* @Author: Nate Bosscher (c) 2015
4+
* @Date: 2016-03-22 16:38:33
5+
* @Last Modified by: Nate Bosscher
6+
* @Last Modified time: 2016-03-22 18:26:26
7+
*/
8+
9+
namespace Crawler;
10+
11+
class CrawlPage{

	/** @var string the url this page was constructed with */
	public $url;

	/** @var Curl transfer object created for $url (class is defined elsewhere in the project) */
	public $c;

	/**
	 * @param string $url the url to download and scan for links
	 */
	public function __construct($url){
		$this->url = $url;
		$this->c = new Curl($this->url);
	}

	/**
	 * Downloads the url and prints an error message when the response
	 * status is outside the 2xx success range. Nothing is returned in
	 * either case.
	 */
	public function download(){
		$this->c->download();
		$status = $this->c->getHttpStatus();

		// anything outside 200-299 is reported but does not abort
		if($status < 200 || $status >= 300){
			echo "\nError: got response '$status' for url '$this->url'\n";
		}
	}

	/**
	 * returns the contents of the url that was downloaded
	 * @return string the string form of the Curl object
	 */
	public function getContents(){
		return $this->c->__tostring();
	}

	/**
	 * Collects the targets of <a href> and <img src> in the downloaded
	 * content and turns them into absolute urls.
	 *
	 * Dropped entirely: empty links, in-page anchors (#...), and links
	 * with a scheme other than http/https (mailto:, javascript:, tel:, ...).
	 *
	 * Resolved against $this->url:
	 *   //host/x   -> uses the base scheme
	 *   /x         -> relative to the site root
	 *   ?q=1       -> same page, new query string
	 *   x          -> relative to the directory of the current page
	 *
	 * If the scheme or host of $this->url can't be parsed, relative links
	 * are returned unresolved.
	 *
	 * @return array urls keyed by their position in the raw match list
	 *               (keys are not contiguous once entries are dropped)
	 */
	public function fetchUrls(){
		$html = $this->c->__tostring();

		// gather raw attribute values: links first, then images
		$list = array();
		foreach(array("a" => "href", "img" => "src") as $tag => $attribute){
			$matches = array();

			// [^>] keeps the match inside a single tag; /i accepts <A HREF=...>
			preg_match_all(
				"#<" . $tag . "\\s(?:[^>]*?\\s)?" . $attribute . "=[\"']([^\"']*)[\"']#i",
				$html,
				$matches
			);

			foreach($matches[1] as $v)
				$list[] = $v;
		}

		// split the base url once instead of re-parsing it for every link
		$parts = parse_url($this->url);
		$hasBase = is_array($parts) && !empty($parts["scheme"]) && !empty($parts["host"]);

		$origin = "";
		$dir = "";
		$pagePath = "/";

		if($hasBase){
			$origin = $parts["scheme"] . "://" . $parts["host"];

			// keep a non-default port so relative links stay on the same server
			if(isset($parts["port"]))
				$origin .= ":" . $parts["port"];

			if(isset($parts["path"]) && $parts["path"] != "")
				$pagePath = $parts["path"];

			// strip the trailing slash so "<dir>/<link>" never yields "//"
			$dir = rtrim($this->getDirFromUrl($pagePath), "/");
		}

		$filtered = array();
		foreach($list as $k => $v){
			$link = trim($v);

			// nothing to crawl: empty link or in-page anchor
			if($link === "" || strpos($link, "#") === 0)
				continue;

			$isAbsolute = (preg_match("#^https?://#i", $link) === 1);

			// any other scheme (mailto:, javascript:, tel:, data:, ftp:, ...) is not crawlable
			if(!$isAbsolute && preg_match("#^[a-z][a-z0-9+.\\-]*:#i", $link))
				continue;

			// already absolute, or no valid base to resolve against
			if($isAbsolute || !$hasBase){
				$filtered[$k] = $link;
				continue;
			}

			// canonize
			if(strpos($link, "//") === 0){
				$filtered[$k] = $parts["scheme"] . ":" . $link;
			}else if(strpos($link, "/") === 0){
				$filtered[$k] = $origin . $link;
			}else if(strpos($link, "?") === 0){
				$filtered[$k] = $origin . $pagePath . $link;
			}else{
				$filtered[$k] = $origin . $dir . "/" . $link;
			}
		}

		return $filtered;
	}

	/**
	 * takes a urlPath and returns its directory
	 * (everything before the last slash; "/" for the root or an empty path)
	 *
	 * @param string $urlPath
	 * @return string
	 */
	private function getDirFromUrl($urlPath){
		if($urlPath == "/" || $urlPath == "")
			return "/";

		return substr($urlPath, 0, strrpos($urlPath, "/"));
	}
}

class.crawler.php

+198
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
<?php
2+
/**
3+
* @Author: Nate Bosscher (c) 2015
4+
* @Date: 2016-03-22 16:21:30
5+
* @Last Modified by: Nate Bosscher
6+
* @Last Modified time: 2016-03-22 18:38:22
7+
*/
8+
9+
namespace Crawler;
10+
11+
class Crawler{

	/** @var array urls still waiting to be downloaded (url => 1) */
	private $_activeUrls = array();

	/** @var array urls that have already been processed (url => 1) */
	private $_downloadedUrls = array();

	private $_downloadImages = false;
	private $_removeHashTrail = true;

	/** @var string the base url as given to the constructor */
	public $base;

	/** @var string scheme + "://" + host of the base url */
	public $host;

	/** @var string normalized url path of the base url; only urls at or below it are crawled */
	public $path;

	/** @var string directory the downloaded files are written to */
	public $output;

	/**
	 * Validates the base url, prepares the output directory and queues
	 * the base url as the first page to crawl. Prints a message and
	 * exits with -1 when the url can't be parsed or the directory
	 * can't be created.
	 *
	 * @param string $base_name the base url to crawl from (e.g. http://www.google.ca or http://www.google.ca/dogs)
	 * @param string|false $output_dir where to store downloads; defaults to "output" next to this file
	 */
	public function __construct($base_name, $output_dir = false){
		$this->base = $base_name;

		$scheme = parse_url($this->base, PHP_URL_SCHEME);
		$hostName = parse_url($this->base, PHP_URL_HOST);

		// check the pieces individually: the joined string always contains "://"
		if($scheme == NULL || $hostName == NULL){
			echo "base_name '$base_name' is not valid. Couldn't parse host name\nExiting...\n";
			exit(-1);
		}

		$this->host = $scheme . "://" . $hostName;

		// a bare domain has no path component; treat it as the site root
		$path = parse_url($this->base, PHP_URL_PATH);
		if($path == "")
			$path = "/";

		// normalize as a url path (realpath() would consult the local filesystem)
		$this->path = $this->resolvePath($path);

		if($this->path === NULL){
			echo "base_name '$base_name' is not valid. Couldn't parse path\nExiting...\n";
			exit(-1);
		}

		if(!$output_dir)
			$output_dir = __DIR__ . "/output";

		$this->output = $output_dir;

		if(!is_dir($this->output)){
			if(!mkdir($this->output, 0777, true)){
				echo "Couldn't make directory '$this->output'\nExiting...\n";
				exit(-1);
			}
		}

		$this->addUrlIfValid($this->base);
	}

	/**
	 * Stores the image-download flag and, when enabled, makes sure an
	 * "images" directory exists inside the output directory (exits
	 * with -1 if it can't be created).
	 *
	 * @param bool $tf
	 */
	public function set_DownloadImages($tf = false){
		$this->_downloadImages = $tf;

		if($this->_downloadImages == true){
			$imagesDir = $this->output . "/images";

			// an existing directory (e.g. from a previous run) is fine
			if(!is_dir($imagesDir) && !mkdir($imagesDir, 0777)){
				echo "Couldn't make directory '$imagesDir'\nExiting...";
				exit(-1);
			}
		}
	}

	/**
	 * Downloads every queued url, queues the in-scope links found on
	 * each page and writes each page below the output directory.
	 * Returns once the queue is empty.
	 */
	public function run(){
		while(count($this->_activeUrls) > 0){

			// iterate over a snapshot; urls queued during this pass are
			// handled by the next pass of the outer loop
			foreach(array_keys($this->_activeUrls) as $url){
				echo "(downloading ".basename($url).")\n";

				$dp = new CrawlPage($url);
				$dp->download();

				$this->_downloadedUrls[$url] = 1;
				unset($this->_activeUrls[$url]);

				foreach($dp->fetchUrls() as $u)
					$this->addUrlIfValid($u);

				$filename = $this->fileNameFromUrl($url);
				$dirname = dirname($filename);

				if(!is_dir($dirname)){
					if(!mkdir($dirname, 0777, true)){
						echo "Couldn't create directory '" . $dirname . "'\nExiting...\n";
						exit(-1);
					}
				}

				// report a failed write instead of silently dropping the page
				if(file_put_contents($filename, $dp->getContents()) === false){
					echo "\nError: couldn't write file '$filename'\n";
				}

				// update status
				echo "\033[A\033[2K"; // clear line
				echo "Downloaded " . count($this->_downloadedUrls) . " pages ";
			}
		}

		echo "\n";
		echo "Done!\n\n";
	}

	/**
	 * converts given url to a local filename below the output directory
	 *
	 * Urls that end in a slash (or have no path at all) get the file
	 * name "__root__.html" inside the matching directory, so
	 * http://host/a/ and http://host/b/ map to different files.
	 *
	 * @param string $url
	 * @return string
	 */
	public function fileNameFromUrl($url){

		// remove leading http(s):// only; the same text further in (e.g. in a query string) is kept
		$url = preg_replace("#^https?://#i", "", $url);

		// everything from the first slash on is the path; none means the site root
		$slash = strpos($url, "/");
		$upath = ($slash === false) ? "/" : substr($url, $slash);

		if(substr($upath, -1) == "/")
			$upath .= "__root__.html";

		return $this->output . $upath;
	}

	/**
	 * resolves ../ and ./ and dir//subdir...
	 *
	 * A leading slash and a trailing slash are preserved, e.g.
	 * "/a/./b/../c" gives "/a/c", "/a//b/" gives "/a/b/" and
	 * "/a/.." gives "/".
	 *
	 * @param string $path
	 * @return string on success
	 * @return null on failure (a ".." that would climb above the start of the path)
	 */
	public function resolvePath($path){

		// collapse runs of forward slashes of any length
		$path = preg_replace("#/{2,}#", "/", (string) $path);

		$isAbsolute = (substr($path, 0, 1) === "/");
		$segments = explode("/", $isAbsolute ? ltrim($path, "/") : $path);

		$resolved = array();
		foreach($segments as $segment){
			if($segment === "."){
				// current directory: nothing to add
				continue;
			}

			if($segment === ".."){
				// nothing left to step out of
				if(count($resolved) == 0){
					return null;
				}

				array_pop($resolved);
				continue;
			}

			$resolved[] = $segment;
		}

		return ($isAbsolute ? "/" : "") . implode("/", $resolved);
	}

	/**
	 * Adds the url to _activeUrls if it has the same host as $this->host
	 * and the path is below or equal to $this->path
	 *
	 * Urls that were already downloaded are ignored. When
	 * _removeHashTrail is set, the "#fragment" part is stripped first.
	 *
	 * @param string $url absolute url to consider
	 */
	private function addUrlIfValid($url){

		// remove trailing hash
		if($this->_removeHashTrail){
			$url = preg_replace("/#.*$/", "", $url);
		}

		// check that host is the same
		$host = parse_url($url, PHP_URL_SCHEME) . "://" . parse_url($url, PHP_URL_HOST);
		if($host != $this->host){
			return;
		}

		// check that path is the same or below the path specified
		$path = parse_url($url, PHP_URL_PATH);

		// ensure that there's at least a slash
		if($path == "")
			$path = "/";

		$path = $this->resolvePath($path);

		if($path === NULL || !$this->isWithinBasePath($path)){
			return;
		}

		// already downloaded
		if(array_key_exists($url, $this->_downloadedUrls)){
			return;
		}

		$this->_activeUrls[$url] = 1;
	}

	/**
	 * true when $path equals $this->path or lies below it
	 * (base "/dogs" accepts "/dogs" and "/dogs/x" but not "/dogs-and-cats")
	 *
	 * @param string $path normalized url path
	 * @return bool
	 */
	private function isWithinBasePath($path){
		$base = rtrim($this->path, "/");

		if($path === $base){
			return true;
		}

		return strpos($path, $base . "/") === 0;
	}
}

0 commit comments

Comments
 (0)