diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2016-02-10 14:56:20 +0100 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-02-10 14:56:20 +0100 |
| commit | cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8 (patch) | |
| tree | 8cc7ee5d841f868e38ccc0b54d8cc6d33a852ed7 /src/phinde/Crawler.php | |
| parent | f67e8f0bc3f51f2d280a86a8c7cffa68d812efe1 (diff) | |
| download | phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.tar.gz phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.zip | |
rework crawler; add atom link extraction
Diffstat (limited to 'src/phinde/Crawler.php')
| -rw-r--r-- | src/phinde/Crawler.php | 70 |
1 files changed, 70 insertions, 0 deletions
<?php
namespace phinde;

/**
 * Fetches a URL, extracts the links found in the response and puts
 * them into the index and crawl queues.
 */
class Crawler
{
    /**
     * Elasticsearch helper, used to check whether a URL is already known
     * and to mark URLs as queued.
     */
    protected $es;

    /**
     * Queue that receives the "index" and "crawl" tasks.
     */
    protected $queue;

    /**
     * Maps a lower-case MIME type to the link extractor class
     * that handles responses of that type.
     */
    public static $supportedIndexTypes = array(
        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
        'text/html'             => '\\phinde\\LinkExtractor\\Html',
    );

    /**
     * Sets up the Elasticsearch helper (configured through
     * $GLOBALS['phinde']['elasticsearch']) and the task queue.
     */
    public function __construct()
    {
        $this->es    = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
        $this->queue = new Queue();
    }

    /**
     * Fetch the given URL, extract its links and enqueue them.
     *
     * @param string $url URL to crawl
     *
     * @return void
     *
     * @throws \Exception When the response status is not 200
     */
    public function crawl($url)
    {
        $res       = $this->fetch($url);
        $linkInfos = $this->extractLinks($res);
        $this->enqueue($linkInfos);
    }

    /**
     * Request the URL and return the response.
     *
     * @param string $url URL to fetch
     *
     * @return object Response returned by HttpRequest::send()
     *
     * @throws \Exception When the response status is not 200
     */
    protected function fetch($url)
    {
        $req = new HttpRequest($url);
        $res = $req->send();

        $status = $res->getStatus();
        if ($status !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $status . ", stopping"
            );
        }
        return $res;
    }

    /**
     * Pick the link extractor matching the response's MIME type and
     * let it extract the links.
     *
     * Prints a notice and returns an empty array when the MIME type
     * has no extractor registered in $supportedIndexTypes.
     *
     * @param \HTTP_Request2_Response $res Response to extract links from
     *
     * @return array Whatever the extractor's extract() method returns;
     *               enqueue() expects items with url, title and source
     */
    protected function extractLinks(\HTTP_Request2_Response $res)
    {
        // The header may be absent; cast so explode() always gets a string.
        $contentType = (string) $res->getHeader('content-type');

        // Media types are case-insensitive and may be surrounded by
        // whitespace ("Text/HTML ; charset=utf-8"), so normalize the
        // part before the parameters before looking it up.
        $mimetype = strtolower(trim(explode(';', $contentType)[0]));

        if (!isset(static::$supportedIndexTypes[$mimetype])) {
            echo "MIME type not supported for indexing: $mimetype\n";
            return array();
        }

        $class     = static::$supportedIndexTypes[$mimetype];
        $extractor = new $class();
        return $extractor->extract($res);
    }

    /**
     * Queue every link that is not yet known.
     *
     * Each new URL is marked as queued, added to the index queue and,
     * if Helper::isUrlAllowed() accepts it, to the crawl queue as well.
     *
     * @param array $linkInfos Items with url, title and source properties
     *
     * @return void
     */
    protected function enqueue($linkInfos)
    {
        foreach ($linkInfos as $linkInfo) {
            if ($this->es->isKnown($linkInfo->url)) {
                continue;
            }

            // Mark first so the URL is flagged before the tasks are added.
            $this->es->markQueued($linkInfo->url);
            $this->queue->addToIndex(
                $linkInfo->url, $linkInfo->title, $linkInfo->source
            );
            if (Helper::isUrlAllowed($linkInfo->url)) {
                $this->queue->addToCrawl($linkInfo->url);
            }
        }
    }
}
