diff options
Diffstat (limited to 'src/phinde')
| -rw-r--r-- | src/phinde/Crawler.php | 131 | ||||
| -rw-r--r-- | src/phinde/Helper.php | 16 | ||||
| -rw-r--r-- | src/phinde/HttpRequest.php | 27 | ||||
| -rw-r--r-- | src/phinde/LinkExtractor/Atom.php | 54 | ||||
| -rw-r--r-- | src/phinde/LinkExtractor/Html.php | 79 | ||||
| -rw-r--r-- | src/phinde/LinkInfo.php | 44 | ||||
| -rw-r--r-- | src/phinde/Queue.php | 86 |
7 files changed, 437 insertions, 0 deletions
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
new file mode 100644
index 0000000..53320ec
--- /dev/null
+++ b/src/phinde/Crawler.php
@@ -0,0 +1,131 @@
+<?php
+namespace phinde;
+
+/**
+ * Fetches a URL, extracts the links of the response and queues them
+ * for indexing and crawling.
+ */
+class Crawler
+{
+    /**
+     * Used to look up and mark URLs that are already known
+     *
+     * @var Elasticsearch
+     */
+    protected $es;
+
+    /**
+     * Job queue that receives the extracted links
+     *
+     * @var Queue
+     */
+    protected $queue;
+
+    /**
+     * Maps a MIME type to the link extractor class that handles it
+     *
+     * @var array
+     */
+    public static $supportedIndexTypes = array(
+        'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
+        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
+        'text/html' => '\\phinde\\LinkExtractor\\Html',
+    );
+
+    /**
+     * Sets up the Elasticsearch client from the global phinde
+     * configuration and creates the job queue.
+     */
+    public function __construct()
+    {
+        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+        $this->queue = new Queue();
+    }
+
+    /**
+     * Fetches $url and queues every link found in the response.
+     *
+     * @param string $url URL to crawl
+     *
+     * @return void
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    public function crawl($url)
+    {
+        $res = $this->fetch($url);
+        $linkInfos = $this->extractLinks($res);
+        $this->enqueue($linkInfos);
+    }
+
+    /**
+     * Requests $url and returns the response.
+     *
+     * @param string $url URL to fetch
+     *
+     * @return \HTTP_Request2_Response Response with status 200
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    protected function fetch($url)
+    {
+        $req = new HttpRequest($url);
+        $res = $req->send();
+        if ($res->getStatus() !== 200) {
+            throw new \Exception(
+                "Response code is not 200 but "
+                . $res->getStatus() . ", stopping"
+            );
+        }
+        return $res;
+    }
+
+    /**
+     * Runs the link extractor registered for the response's MIME type.
+     *
+     * @param \HTTP_Request2_Response $res Fetched document
+     *
+     * @return array LinkInfo objects, empty for unsupported MIME types
+     */
+    protected function extractLinks(\HTTP_Request2_Response $res)
+    {
+        //media types are case-insensitive and may carry parameters
+        //("Text/HTML ; charset=utf-8"); the header may also be absent
+        $header = (string) $res->getHeader('content-type');
+        $mimetype = strtolower(trim(explode(';', $header)[0]));
+        if (!isset(static::$supportedIndexTypes[$mimetype])) {
+            echo "MIME type not supported for indexing: $mimetype\n";
+            return array();
+        }
+
+        $class = static::$supportedIndexTypes[$mimetype];
+        $extractor = new $class();
+        return $extractor->extract($res);
+    }
+
+    /**
+     * Queues links for indexing and, where allowed, for crawling.
+     *
+     * URLs that Elasticsearch already knows are skipped.
+     *
+     * @param array $linkInfos LinkInfo objects to queue
+     *
+     * @return void
+     */
+    protected function enqueue($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            if ($this->es->isKnown($linkInfo->url)) {
+                continue;
+            }
+            $this->es->markQueued($linkInfo->url);
+            $this->queue->addToIndex(
+                $linkInfo->url, $linkInfo->title, $linkInfo->source
+            );
+            if (Helper::isUrlAllowed($linkInfo->url)) {
+                $this->queue->addToCrawl($linkInfo->url);
+            }
+        }
+    }
+}
+?>
diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php
index 0b98521..40ea751 100644
--- a/src/phinde/Helper.php
+++ b/src/phinde/Helper.php
@@ -20,5 +20,21 @@ class Helper
             $url
         );
     }
+
+    /**
+     * Prepends "http://" to URLs that lack an http or https scheme.
+     *
+     * @param string $url URL with or without scheme
+     *
+     * @return string URL that carries an http or https scheme
+     */
+    public static function addSchema($url)
+    {
+        //URI schemes are case-insensitive, so "HTTP://host" already has one
+        if (preg_match('#^https?://#i', $url)) {
+            return $url;
+        }
+        return 'http://' . $url;
+    }
 }
 ?>
diff --git a/src/phinde/HttpRequest.php b/src/phinde/HttpRequest.php
new file mode 100644
index 0000000..e68bd84
--- /dev/null
+++ b/src/phinde/HttpRequest.php
@@ -0,0 +1,27 @@
+<?php
+namespace phinde;
+
+/**
+ * HTTP_Request2 preconfigured with the crawler's redirect, timeout,
+ * TLS and user agent settings.
+ */
+class HttpRequest extends \HTTP_Request2
+{
+    /**
+     * Creates a request for $url and applies the crawler defaults.
+     *
+     * @param string $url URL to request
+     */
+    public function __construct($url)
+    {
+        parent::__construct($url);
+        $this->setConfig('follow_redirects', true);
+        $this->setConfig('connect_timeout', 5);
+        $this->setConfig('timeout', 10);
+        //NOTE(review): certificate verification is off, presumably so
+        //hosts with self-signed certificates can be crawled - confirm.
+        $this->setConfig('ssl_verify_peer', false);
+        $this->setHeader('user-agent', 'phinde/bot');
+    }
+}
+?>
diff --git a/src/phinde/LinkExtractor/Atom.php b/src/phinde/LinkExtractor/Atom.php
new file mode 100644
index 0000000..bb4d90b
--- /dev/null
+++ b/src/phinde/LinkExtractor/Atom.php
@@ -0,0 +1,54 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the entry links of an Atom feed.
+ */
+class Atom
+{
+    /**
+     * Returns one LinkInfo per distinct "alternate" link of the feed's
+     * entries, titled with the entry title.
+     *
+     * @param \HTTP_Request2_Response $res Response containing the feed
+     *
+     * @return array LinkInfo objects, empty when the body is not valid XML
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url = $res->getEffectiveUrl();
+        $base = new \Net_URL2($url);
+
+        $linkInfos = array();
+        $sx = simplexml_load_string($res->getBody());
+        if ($sx === false) {
+            //body could not be parsed, so there is nothing to extract
+            return $linkInfos;
+        }
+
+        $alreadySeen = array();
+        foreach ($sx->entry as $entry) {
+            $linkTitle = (string) $entry->title;
+            foreach ($entry->link as $xlink) {
+                //a link without rel attribute is an "alternate" link
+                $rel = isset($xlink['rel'])
+                    ? (string) $xlink['rel'] : 'alternate';
+                if ($rel !== 'alternate') {
+                    continue;
+                }
+
+                $linkUrl = (string) $base->resolve((string) $xlink['href']);
+                if (isset($alreadySeen[$linkUrl])) {
+                    continue;
+                }
+                $alreadySeen[$linkUrl] = true;
+                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
+            }
+        }
+
+        return $linkInfos;
+    }
+}
+?>
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
new file mode 100644
index 0000000..538d6c4
--- /dev/null
+++ b/src/phinde/LinkExtractor/Html.php
@@ -0,0 +1,79 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the targets of the anchor elements of an HTML document.
+ */
+class Html
+{
+    /**
+     * Returns one LinkInfo per distinct http(s) URL linked from the
+     * document. Fragments are stripped and same-page links are ignored.
+     *
+     * @param \HTTP_Request2_Response $res Response containing the HTML
+     *
+     * @return array LinkInfo objects
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url = $res->getEffectiveUrl();
+
+        $linkInfos = array();
+
+        $body = $res->getBody();
+        if ($body == '') {
+            //loadHTML() rejects empty input
+            return $linkInfos;
+        }
+
+        //FIXME: mime type switch for cdata
+        $doc = new \DOMDocument();
+        //@ to hide parse warning messages in invalid html
+        @$doc->loadHTML($body);
+
+        //FIXME: extract base url from html
+        $base = new \Net_URL2($url);
+
+        $xpath = new \DOMXPath($doc);
+        $links = $xpath->evaluate('//a');
+        //FIXME: link rel, img, video
+
+        $alreadySeen = array();
+
+        foreach ($links as $link) {
+            $linkTitle = $link->textContent;
+            $href = $link->getAttribute('href');
+            if ($href == '' || $href[0] == '#') {
+                //link on this page
+                continue;
+            }
+
+            $linkUrlObj = $base->resolve($href);
+            $linkUrlObj->setFragment(false);
+            $linkUrl = (string) $linkUrlObj;
+            if (isset($alreadySeen[$linkUrl])) {
+                continue;
+            }
+
+            switch ($linkUrlObj->getScheme()) {
+            case 'http':
+            case 'https':
+                break;
+            default:
+                continue 2;
+            }
+
+            //FIXME: check target type
+            //FIXME: check nofollow
+            $linkInfos[] = new LinkInfo(
+                $linkUrl, $linkTitle, $url
+            );
+            $alreadySeen[$linkUrl] = true;
+        }
+
+        return $linkInfos;
+    }
+}
+?>
diff --git a/src/phinde/LinkInfo.php b/src/phinde/LinkInfo.php
new file mode 100644
index 0000000..4e3980c
--- /dev/null
+++ b/src/phinde/LinkInfo.php
@@ -0,0 +1,44 @@
+<?php
+namespace phinde;
+
+/**
+ * A link found in a crawled document.
+ */
+class LinkInfo
+{
+    /**
+     * URL the link points to
+     *
+     * @var string
+     */
+    public $url;
+
+    /**
+     * Title of the link, if any
+     *
+     * @var string|null
+     */
+    public $title;
+
+    /**
+     * URL of the document the link was found in, if known
+     *
+     * @var string|null
+     */
+    public $source;
+
+    /**
+     * Stores the given values in the public properties.
+     *
+     * @param string      $url    URL the link points to
+     * @param string|null $title  Title of the link
+     * @param string|null $source URL of the containing document
+     */
+    public function __construct($url, $title = null, $source = null)
+    {
+        $this->url = $url;
+        $this->title = $title;
+        $this->source = $source;
+    }
+}
+?>
diff --git a/src/phinde/Queue.php b/src/phinde/Queue.php
new file mode 100644
index 0000000..98f6462
--- /dev/null
+++ b/src/phinde/Queue.php
@@ -0,0 +1,86 @@
+<?php
+namespace phinde;
+
+/**
+ * Hands indexing and crawling jobs to the Gearman server on localhost.
+ */
+class Queue
+{
+    /**
+     * Client connected to the Gearman job server
+     *
+     * @var \GearmanClient
+     */
+    protected $gmclient;
+
+    /**
+     * Creates the Gearman client and registers the server at 127.0.0.1.
+     */
+    public function __construct()
+    {
+        $this->gmclient = new \GearmanClient();
+        $this->gmclient->addServer('127.0.0.1');
+    }
+
+    /**
+     * Queues a "phinde_index" background job for the URL.
+     *
+     * Terminates the script with exit code 2 when queueing fails.
+     *
+     * @param string $linkUrl   URL to index
+     * @param string $linkTitle Title of the link
+     * @param string $sourceUrl URL of the document containing the link
+     *
+     * @return void
+     */
+    public function addToIndex($linkUrl, $linkTitle, $sourceUrl)
+    {
+        echo "Queuing for indexing: $linkUrl\n";
+        $this->submit(
+            'phinde_index',
+            array(
+                'url' => $linkUrl,
+                'title' => $linkTitle,
+                'source' => $sourceUrl
+            ),
+            'indexing'
+        );
+    }
+
+    /**
+     * Queues a "phinde_crawl" background job for the URL.
+     *
+     * Terminates the script with exit code 2 when queueing fails.
+     *
+     * @param string $linkUrl URL to crawl
+     *
+     * @return void
+     */
+    public function addToCrawl($linkUrl)
+    {
+        echo "Queuing for crawling: $linkUrl\n";
+        $this->submit('phinde_crawl', array('url' => $linkUrl), 'crawling');
+    }
+
+    /**
+     * Sends a serialized payload as background job and exits with code 2
+     * if the server does not accept it.
+     *
+     * @param string $function Name of the Gearman function
+     * @param array  $payload  Job data; must contain the "url" key
+     * @param string $action   Word used in the error message
+     *
+     * @return void
+     */
+    protected function submit($function, array $payload, $action)
+    {
+        $this->gmclient->doBackground($function, serialize($payload));
+        if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) {
+            echo 'Error queueing URL ' . $action . ' for '
+                . $payload['url'] . "\n"
+                . 'Error code: ' . $this->gmclient->returnCode() . "\n";
+            exit(2);
+        }
+    }
+}
+?>
