X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/083fa4116b22f4898123006880fb52061763513d..6c95d7fe170a54449755a8b571a191a3aaaf954e:/src/phinde/Crawler.php diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index 72726a5..1f63e60 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -11,7 +11,7 @@ class Crawler */ protected $showLinksOnly = false; - static $supportedIndexTypes = array( + static $supportedTypes = array( 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', 'text/html' => '\\phinde\\LinkExtractor\\Html', @@ -23,58 +23,28 @@ class Crawler $this->queue = new Queue(); } - public function crawl($url) + public function run(Retrieved $retrieved) { - $res = $this->fetch($url); - if ($res === false) { - return; - } - - $linkInfos = $this->extractLinks($res); + $linkInfos = $this->extractLinks($retrieved->httpRes); $linkInfos = $this->filterLinks($linkInfos); if ($this->showLinksOnly) { $this->showLinks($linkInfos); + return false; } else { $this->enqueue($linkInfos); + return true; } } - protected function fetch($url) - { - $existingDoc = $this->es->get($url); - - $req = new HttpRequest($url); - $req->setHeader( - 'accept', - implode(',', array_keys(static::$supportedIndexTypes)) - ); - if ($existingDoc && isset($existingDoc->modate)) { - $nMoDate = strtotime($existingDoc->modate); - $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); - } - - $res = $req->send(); - if ($res->getStatus() === 304) { - //not modified since last time, so don't crawl again - return false; - } else if ($res->getStatus() !== 200) { - throw new \Exception( - "Response code is not 200 but " - . $res->getStatus() . ", stopping" - ); - } - return $res; - } - protected function extractLinks(\HTTP_Request2_Response $res) { $mimetype = explode(';', $res->getHeader('content-type'))[0]; - if (!isset(static::$supportedIndexTypes[$mimetype])) { - echo "MIME type not supported for indexing: $mimetype\n"; + if (!isset(static::$supportedTypes[$mimetype])) { + Log::info("MIME type not supported for crawling: $mimetype"); return array(); } - $class = static::$supportedIndexTypes[$mimetype]; + $class = static::$supportedTypes[$mimetype]; $extractor = new $class(); return $extractor->extract($res); } @@ -111,28 +81,32 @@ class Crawler } if ($linkInfo->crawl || $linkInfo->index) { $this->es->markQueued($linkInfo->url); - } - if ($linkInfo->index) { - $this->queue->addToIndex( - $linkInfo->url, $linkInfo->title, $linkInfo->source + $actions = array(); + if ($linkInfo->index) { + $actions[] = 'index'; + } + if ($linkInfo->crawl) { + $actions[] = 'crawl'; + } + $this->queue->addToProcessList( + $linkInfo->url, $actions ); } - if ($linkInfo->crawl) { - $this->queue->addToCrawl($linkInfo->url); - } } } protected function showLinks($linkInfos) { foreach ($linkInfos as $linkInfo) { - echo $linkInfo->url . "\n"; + Log::msg($linkInfo->url); if ($linkInfo->title) { - echo ' title: ' . $linkInfo->title . "\n"; - echo ' source: ' . $linkInfo->source . "\n"; - echo ' known: ' . intval($linkInfo->known) + Log::msg(' title: ' . $linkInfo->title); + Log::msg(' source: ' . $linkInfo->source); + Log::msg( + ' known: ' . intval($linkInfo->known) . ', crawl: ' . intval($linkInfo->crawl) - . ', index: ' . intval($linkInfo->index) . "\n"; + . ', index: ' . intval($linkInfo->index) + ); } } }