X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/ba1f6fc77827dba86b1ac0b62aa288dd0f977cb8..f98e891b454e5677bdf61f476e366b01af713b50:/src/phinde/Crawler.php diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index a63815d..1f63e60 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -11,7 +11,7 @@ class Crawler */ protected $showLinksOnly = false; - static $supportedIndexTypes = array( + static $supportedTypes = array( 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', 'text/html' => '\\phinde\\LinkExtractor\\Html', @@ -23,81 +23,74 @@ class Crawler $this->queue = new Queue(); } - public function crawl($url) + public function run(Retrieved $retrieved) { - $res = $this->fetch($url); - if ($res === false) { - return; - } - - $linkInfos = $this->extractLinks($res); + $linkInfos = $this->extractLinks($retrieved->httpRes); + $linkInfos = $this->filterLinks($linkInfos); if ($this->showLinksOnly) { $this->showLinks($linkInfos); + return false; } else { $this->enqueue($linkInfos); + return true; } } - protected function fetch($url) - { - $existingDoc = $this->es->get($url); - - $req = new HttpRequest($url); - $req->setHeader( - 'accept', - implode(',', array_keys(static::$supportedIndexTypes)) - ); - if ($existingDoc && isset($existingDoc->modate)) { - $nMoDate = strtotime($existingDoc->modate); - $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); - } - - $res = $req->send(); - if ($res->getStatus() === 304) { - //not modified since last time, so don't crawl again - return false; - } else if ($res->getStatus() !== 200) { - throw new \Exception( - "Response code is not 200 but " - . $res->getStatus() . ", stopping" - ); - } - return $res; - } - protected function extractLinks(\HTTP_Request2_Response $res) { $mimetype = explode(';', $res->getHeader('content-type'))[0]; - if (!isset(static::$supportedIndexTypes[$mimetype])) { - echo "MIME type not supported for indexing: $mimetype\n"; + if (!isset(static::$supportedTypes[$mimetype])) { + Log::info("MIME type not supported for crawling: $mimetype"); return array(); } - $class = static::$supportedIndexTypes[$mimetype]; + $class = static::$supportedTypes[$mimetype]; $extractor = new $class(); return $extractor->extract($res); } - protected function enqueue($linkInfos) + protected function filterLinks($linkInfos) { + $filteredLinkInfos = array(); foreach ($linkInfos as $linkInfo) { - if ($this->es->isKnown($linkInfo->url)) { - continue; - } $allowed = Helper::isUrlAllowed($linkInfo->url); $crawl = $allowed; $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed; - if ($crawl || $index) { - $this->es->markQueued($linkInfo->url); + if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) { + foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) { + if (preg_match('#' . $bl . '#', $linkInfo->url)) { + $crawl = false; + } + } } - if ($index) { - $this->queue->addToIndex( - $linkInfo->url, $linkInfo->title, $linkInfo->source - ); + + $linkInfo->known = $this->es->isKnown($linkInfo->url); + $linkInfo->crawl = $crawl; + $linkInfo->index = $index; + $filteredLinkInfos[] = $linkInfo; + } + return $filteredLinkInfos; + } + + protected function enqueue($linkInfos) + { + foreach ($linkInfos as $linkInfo) { + if ($linkInfo->known) { + continue; } - if ($allowed) { - $this->queue->addToCrawl($linkInfo->url); + if ($linkInfo->crawl || $linkInfo->index) { + $this->es->markQueued($linkInfo->url); + $actions = array(); + if ($linkInfo->index) { + $actions[] = 'index'; + } + if ($linkInfo->crawl) { + $actions[] = 'crawl'; + } + $this->queue->addToProcessList( + $linkInfo->url, $actions + ); } } } @@ -105,10 +98,15 @@ class Crawler protected function showLinks($linkInfos) { foreach ($linkInfos as $linkInfo) { - echo $linkInfo->url . "\n"; + Log::msg($linkInfo->url); if ($linkInfo->title) { - echo ' title: ' . $linkInfo->title . "\n"; - echo ' source: ' . $linkInfo->source . "\n"; + Log::msg(' title: ' . $linkInfo->title); + Log::msg(' source: ' . $linkInfo->source); + Log::msg( + ' known: ' . intval($linkInfo->known) + . ', crawl: ' . intval($linkInfo->crawl) + . ', index: ' . intval($linkInfo->index) + ); } } }