ntbosscher
diff --git a/‎README.md
+29 b/‎README.md
+29
diff --git a/‎class.crawl-page.php
+95 b/‎class.crawl-page.php
+95
diff --git a/‎class.crawler.php
+198 b/‎class.crawler.php
+198
@@ -0,0 +1,29 @@
+# PHP Web Crawler
+
+This CLI script crawls a given URL and downloads every linked page whose URL lies within the same domain and, when the URL includes a path, within that path. An optional output-directory argument lets you choose where the downloaded files are saved.
+
+## Usage
+
+```
+	php crawl.php <url> <optional:output-dir>
+```
+
+## Examples
+
+```
+	php crawl.php http://www.foo.com
+
+	php crawl.php http://www.foo.com ~/websites/foo.com
+
+	php crawl.php http://www.foo.com/some-sub-dir/
+	# will only download files within /some-sub-dir
+```
+
+## Tests
+
+A small set of tests can be run using the command below.
+Note: code coverage is very low; the tests only exercise a few basic functions.
+
+```
+	php test/test.php
+```
@@ -0,0 +1,95 @@
+<?php
+/**
+ * @Author: Nate Bosscher (c) 2015
+ * @Date:   2016-03-22 16:38:33
+ * @Last Modified by:   Nate Bosscher
+ * @Last Modified time: 2016-03-22 18:26:26
+ */
+
+namespace Crawler;
+
+/**
+ * Represents a single page of the crawl: downloads it through Curl and
+ * extracts the link (<a href>) and image (<img src>) urls found in it.
+ */
+class CrawlPage{
+
+	/** @var string url of the page this object downloads */
+	public $url;
+
+	/** @var Curl transfer object used to download $url */
+	public $c;
+
+	/**
+	 * @param string $url absolute url of the page to download
+	 */
+	function __construct($url){
+		$this->url = $url;
+		$this->c = new Curl($this->url);
+	}
+
+	/**
+	 * Downloads the page. A response status outside the 2xx range is
+	 * reported on stdout; nothing is thrown or returned.
+	 */
+	function download(){
+		$this->c->download();
+		$status = $this->c->getHttpStatus();
+
+		// anything outside the success range is reported but is not fatal
+		if($status < 200 || $status >= 300){
+			echo "\nError: got response '$status' for url '$this->url'\n";
+		}
+	}
+
+	/**
+	 * returns the contents of the url that was downloaded
+	 * @return string
+	 */
+	function getContents(){
+		return $this->c->__toString();
+	}
+
+	/**
+	 * Collects the urls referenced by <a href="..."> and <img src="...">
+	 * in the downloaded contents (all anchors first, then all images).
+	 *
+	 * Fragment-only links, empty links and links with a scheme other than
+	 * http/https (mailto:, javascript:, tel:, ...) are dropped. Every
+	 * remaining relative link is made absolute against the page url.
+	 *
+	 * @return array list of urls; keys are not guaranteed to be sequential
+	 */
+	function fetchUrls(){
+		$html = $this->c->__toString();
+		$list = array();
+
+		// tag => attribute holding the url
+		$sources = array("a" => "href", "img" => "src");
+
+		foreach($sources as $tag => $attr){
+			$matches = array();
+
+			// case-insensitive, stays inside the tag ([^>]) and requires the
+			// value to be closed by the same quote that opened it
+			$pattern = '#<' . $tag . '\s(?:[^>]*?\s)?' . $attr . '\s*=\s*(["\'])(.*?)\1#i';
+			preg_match_all($pattern, $html, $matches);
+
+			foreach($matches[2] as $v){
+				// attribute values are html-encoded (e.g. &amp; in query strings)
+				$list[] = trim(html_entity_decode($v, ENT_QUOTES, "UTF-8"));
+			}
+		}
+
+		// remove garbage
+		$filtered = array_filter($list, function($v){
+			// empty links and in-page anchors point back at this page
+			if($v === "" || $v[0] === "#"){
+				return false;
+			}
+
+			// any explicit scheme other than http(s) can't be crawled
+			if(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $v) && !preg_match('#^https?://#i', $v)){
+				return false;
+			}
+
+			return true;
+		});
+
+		// pieces of the page url needed to canonize relative links
+		$scheme = parse_url($this->url, PHP_URL_SCHEME);
+		$host = parse_url($this->url, PHP_URL_HOST);
+		$port = parse_url($this->url, PHP_URL_PORT);
+		$path = parse_url($this->url, PHP_URL_PATH);
+
+		if($path == "")
+			$path = "/";
+
+		$origin = $scheme . "://" . $host . ($port ? ":" . $port : "");
+		$dir = $this->getDirFromUrl($path);
+
+		// canonize relative links
+		foreach($filtered as $k => $v){
+			if(preg_match('#^https?://#i', $v)){
+				continue;
+			}
+
+			// the page url is not a valid base, leave the link untouched
+			if($host == null)
+				continue;
+
+			if(strpos($v, "//") === 0){
+				// protocol-relative: only the scheme is inherited
+				$filtered[$k] = $scheme . ":" . $v;
+			}else if($v[0] === "/"){
+				// root-relative: replaces the whole path of the page
+				$filtered[$k] = $origin . $v;
+			}else if($v[0] === "?"){
+				// query-only: same page, different query string
+				$filtered[$k] = $origin . $path . $v;
+			}else{
+				// relative to the directory of the page
+				$filtered[$k] = $origin . $dir . $v;
+			}
+		}
+
+		return $filtered;
+	}
+
+	/**
+	 * takes a urlPath and returns its directory, always ending in a slash
+	 * (e.g. "/a/b.html" => "/a/", "/a/" => "/a/", "" => "/")
+	 */
+	private function getDirFromUrl($urlPath){
+		$urlPath = (string)$urlPath;
+		$slash = strrpos($urlPath, "/");
+
+		if($slash === false)
+			return "/";
+
+		return substr($urlPath, 0, $slash + 1);
+	}
+}
@@ -0,0 +1,198 @@
+<?php
+/**
+ * @Author: Nate Bosscher (c) 2015
+ * @Date:   2016-03-22 16:21:30
+ * @Last Modified by:   Nate Bosscher
+ * @Last Modified time: 2016-03-22 18:38:22
+ */
+
+namespace Crawler;
+
+/**
+ * Crawls a site starting from a base url, downloading every page that is
+ * on the same host and at or below the base url's path, and stores the
+ * pages in an output directory.
+ */
+class Crawler{
+
+	// urls waiting to be downloaded (url => 1)
+	private $_activeUrls = array();
+	// urls that were already processed (url => 1)
+	private $_downloadedUrls = array();
+
+	private $_downloadImages = false;
+	private $_removeHashTrail = true;
+
+	/** @var string base url as given to the constructor */
+	public $base;
+
+	/** @var string lower-cased "scheme://host" every crawled url must match */
+	public $host;
+
+	/** @var string normalized url path every crawled url must be at or below */
+	public $path;
+
+	/** @var string directory the downloaded files are written to */
+	public $output;
+
+	/**
+	 * Prints a message and exits with -1 when the base url can't be parsed
+	 * or the output directory can't be created.
+	 *
+	 * @param $base_name the base url to crawl from (e.g. http://www.google.ca or http://www.google.ca/dogs)
+	 * @param $output_dir directory to download to, defaults to "output" next to this file
+	 */
+	function __construct($base_name, $output_dir = false){
+		$this->base = $base_name;
+
+		$scheme = parse_url($this->base, PHP_URL_SCHEME);
+		$hostName = parse_url($this->base, PHP_URL_HOST);
+
+		// check the parts themselves: the concatenated string is never empty
+		if(!$scheme || !$hostName){
+			echo "base_name '$base_name' is not valid. Couldn't parse host name\nExiting...\n";
+			exit(-1);
+		}
+
+		$this->host = strtolower($scheme . "://" . $hostName);
+
+		// a url path is not a filesystem path, so it is normalized with
+		// resolvePath() and not with realpath()
+		$path = parse_url($this->base, PHP_URL_PATH);
+		if($path == "")
+			$path = "/";
+
+		$this->path = $this->resolvePath($path);
+
+		if($this->path === null){
+			echo "base_name '$base_name' is not valid. Couldn't parse path\nExiting...\n";
+			exit(-1);
+		}
+
+		if(!$output_dir)
+			$output_dir = __DIR__ . "/output";
+
+		$this->output = $output_dir;
+
+		if(!is_dir($this->output)){
+			if(!mkdir($this->output, 0777, true)){
+				echo "Couldn't make directory '$this->output'\nExiting...\n";
+				exit(-1);
+			}
+		}
+
+		$this->addUrlIfValid($this->base);
+	}
+
+	/**
+	 * Turns the image download flag on or off. When turned on, the
+	 * "images" directory inside the output directory is created if needed
+	 * (prints a message and exits with -1 when that fails).
+	 *
+	 * @param bool $tf
+	 */
+	function set_DownloadImages($tf = false){
+		$this->_downloadImages = $tf;
+
+		if($this->_downloadImages == true){
+			$imagesDir = $this->output . "/images";
+
+			// an already existing directory is fine
+			if(!is_dir($imagesDir) && !mkdir($imagesDir, 0777, true)){
+				echo "Couldn't make directory '$imagesDir'\nExiting...";
+				exit(-1);
+			}
+		}
+	}
+
+	/**
+	 * Downloads every queued url, queues the valid urls found in each
+	 * downloaded page and writes the page to the output directory. Returns
+	 * when no urls are left. Progress is printed to stdout.
+	 */
+	function run(){
+		while(count($this->_activeUrls) > 0){
+			// work on a snapshot: urls queued while it is processed are
+			// handled by the next pass of the while loop
+			foreach(array_keys($this->_activeUrls) as $url){
+				echo "(downloading ".basename($url).")\n";
+
+				$dp = new CrawlPage($url);
+				$dp->download();
+
+				// mark as done before collecting links so a page linking
+				// to itself is not queued again
+				$this->_downloadedUrls[$url] = 1;
+				unset($this->_activeUrls[$url]);
+
+				foreach($dp->fetchUrls() as $u)
+					$this->addUrlIfValid($u);
+
+				$filename = $this->fileNameFromUrl($url);
+				$dirname = dirname($filename);
+
+				if(!is_dir($dirname)){
+					if(!mkdir($dirname, 0777, true)){
+						echo "Couldn't create directory '" . $dirname . "'\nExiting...\n";
+						exit(-1);
+					}
+				}
+
+				// a single unwritable file should not stop the crawl
+				if(file_put_contents($filename, $dp->getContents()) === false){
+					echo "\nError: couldn't write file '$filename'\n";
+				}
+
+				// update status
+				echo "\033[A\033[2K"; // clear line
+				echo "Downloaded " . count($this->_downloadedUrls) . " pages ";
+			}
+		}
+
+		echo "\n";
+		echo "Done!\n\n";
+	}
+
+	/**
+	 * converts given url to a local filename inside the output directory.
+	 * Urls ending in a slash (directory indexes) are stored as
+	 * "__root__.html" inside the matching directory.
+	 * @param  string $url
+	 * @return string
+	 */
+	function fileNameFromUrl($url){
+
+		// remove leading http (only at the start, not inside a query string)
+		$url = preg_replace("#^https?://#i", "", $url);
+
+		// split off the query string so only the path gets normalized
+		$query = "";
+		$queryPos = strpos($url, "?");
+		if($queryPos !== false){
+			$query = substr($url, $queryPos);
+			$url = substr($url, 0, $queryPos);
+		}
+
+		// remove domain name; a url without any slash is the site root
+		$slashPos = strpos($url, "/");
+		$upath = ($slashPos === false) ? "/" : substr($url, $slashPos);
+
+		// urls come from crawled pages: resolve ./ and ../ so the file can
+		// never end up outside of the output directory
+		$upath = $this->resolvePath($upath);
+		if($upath === null || $upath === "")
+			$upath = "/";
+
+		// keep the directory part so /a/ and /b/ don't overwrite each other
+		if(substr($upath, -1) == "/")
+			$upath .= "__root__.html";
+
+		// slashes inside the query would be treated as directories
+		$query = str_replace(array("/", "\\"), "_", $query);
+
+		return $this->output . $upath . $query;
+	}
+
+	/**
+	 * resolves ../ and ./ and dir//subdir...
+	 * A leading and a trailing slash are preserved.
+	 * @param  string $path
+	 * @return string on success
+	 * @return null on failure (a ../ that climbs above the start of the path)
+	 */
+	function resolvePath($path){
+
+		// collapse any run of fwd slashes
+		$path = preg_replace("#/+#", "/", (string)$path);
+		$isAbsolute = (strpos($path, "/") === 0);
+
+		$resolved = array();
+		foreach(explode("/", $path) as $segment){
+			if($segment === "."){
+				continue;
+			}
+
+			if($segment === ".."){
+				// nothing to remove, or only the empty segment that stands
+				// for the leading slash
+				$atRoot = (count($resolved) == 1 && $resolved[0] === "");
+				if(count($resolved) == 0 || $atRoot){
+					return null;
+				}
+
+				array_pop($resolved);
+				continue;
+			}
+
+			$resolved[] = $segment;
+		}
+
+		$result = implode("/", $resolved);
+
+		// e.g. "/a/.." leaves only the root marker behind
+		if($result === "" && $isAbsolute){
+			return "/";
+		}
+
+		return $result;
+	}
+
+	/**
+	 * Adds the url to _activeUrls if it has the same host as $this->host
+	 * and the path is below or equal to $this->path
+	 * 
+	 * @param string $url absolute url, anything else is ignored
+	 */
+	private function addUrlIfValid($url){
+
+		// remove trailing hash
+		if($this->_removeHashTrail){
+			$url = preg_replace("/#.*$/s", "", $url);
+		}
+
+		// check that host is the same
+		$scheme = parse_url($url, PHP_URL_SCHEME);
+		$hostName = parse_url($url, PHP_URL_HOST);
+		if(!$scheme || !$hostName){
+			return;
+		}
+
+		// host names are case-insensitive
+		if(strtolower($scheme . "://" . $hostName) !== strtolower($this->host)){
+			return;
+		}
+
+		// check that path is the same or below the path specified
+		$path = parse_url($url, PHP_URL_PATH);
+		// ensure that there's at least a slash
+		if($path == "")
+			$path = "/";
+
+		$path = $this->resolvePath($path);
+
+		if($path === null){
+			return;
+		}
+
+		// compare whole segments so "/dogs" doesn't also match "/dogsled"
+		$basePath = rtrim($this->path, "/");
+		if(strpos($path . "/", $basePath . "/") !== 0){
+			return;
+		}
+
+		// already downloaded
+		if(array_key_exists($url, $this->_downloadedUrls)){
+			return;
+		}
+
+		$this->_activeUrls[$url] = 1;
+	}
+}