diff options
Diffstat (limited to 'src/phinde/Crawler.php')
| -rw-r--r-- | src/phinde/Crawler.php | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php new file mode 100644 index 0000000..53320ec --- /dev/null +++ b/src/phinde/Crawler.php @@ -0,0 +1,70 @@ +<?php +namespace phinde; + +class Crawler +{ + protected $es; + protected $queue; + + static $supportedIndexTypes = array( + 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', + 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', + 'text/html' => '\\phinde\\LinkExtractor\\Html', + ); + + public function __construct() + { + $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); + $this->queue = new Queue(); + } + + public function crawl($url) + { + $res = $this->fetch($url); + $linkInfos = $this->extractLinks($res); + $this->enqueue($linkInfos); + } + + protected function fetch($url) + { + $req = new HttpRequest($url); + $res = $req->send(); + if ($res->getStatus() !== 200) { + throw new \Exception( + "Response code is not 200 but " + . $res->getStatus() . ", stopping" + ); + } + return $res; + } + + protected function extractLinks(\HTTP_Request2_Response $res) + { + $mimetype = explode(';', $res->getHeader('content-type'))[0]; + if (!isset(static::$supportedIndexTypes[$mimetype])) { + echo "MIME type not supported for indexing: $mimetype\n"; + return array(); + } + + $class = static::$supportedIndexTypes[$mimetype]; + $extractor = new $class(); + return $extractor->extract($res); + } + + protected function enqueue($linkInfos) + { + foreach ($linkInfos as $linkInfo) { + if ($this->es->isKnown($linkInfo->url)) { + continue; + } + $this->es->markQueued($linkInfo->url); + $this->queue->addToIndex( + $linkInfo->url, $linkInfo->title, $linkInfo->source + ); + if (Helper::isUrlAllowed($linkInfo->url)) { + $this->queue->addToCrawl($linkInfo->url); + } + } + } +} +?> |
