X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8..f98e891b454e5677bdf61f476e366b01af713b50:/src/phinde/Crawler.php diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index 53320ec..1f63e60 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -6,7 +6,12 @@ class Crawler protected $es; protected $queue; - static $supportedIndexTypes = array( + /** + * If the links only should be shown, not queued + */ + protected $showLinksOnly = false; + + static $supportedTypes = array( 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', 'text/html' => '\\phinde\\LinkExtractor\\Html', @@ -18,53 +23,97 @@ class Crawler $this->queue = new Queue(); } - public function crawl($url) - { - $res = $this->fetch($url); - $linkInfos = $this->extractLinks($res); - $this->enqueue($linkInfos); - } - - protected function fetch($url) + public function run(Retrieved $retrieved) { - $req = new HttpRequest($url); - $res = $req->send(); - if ($res->getStatus() !== 200) { - throw new \Exception( - "Response code is not 200 but " - . $res->getStatus() . ", stopping" - ); + $linkInfos = $this->extractLinks($retrieved->httpRes); + $linkInfos = $this->filterLinks($linkInfos); + if ($this->showLinksOnly) { + $this->showLinks($linkInfos); + return false; + } else { + $this->enqueue($linkInfos); + return true; } - return $res; } protected function extractLinks(\HTTP_Request2_Response $res) { $mimetype = explode(';', $res->getHeader('content-type'))[0]; - if (!isset(static::$supportedIndexTypes[$mimetype])) { - echo "MIME type not supported for indexing: $mimetype\n"; + if (!isset(static::$supportedTypes[$mimetype])) { + Log::info("MIME type not supported for crawling: $mimetype"); return array(); } - $class = static::$supportedIndexTypes[$mimetype]; + $class = static::$supportedTypes[$mimetype]; $extractor = new $class(); return $extractor->extract($res); } + protected function filterLinks($linkInfos) + { + $filteredLinkInfos = array(); + foreach ($linkInfos as $linkInfo) { + $allowed = Helper::isUrlAllowed($linkInfo->url); + $crawl = $allowed; + $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed; + + if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) { + foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) { + if (preg_match('#' . $bl . '#', $linkInfo->url)) { + $crawl = false; + } + } + } + + $linkInfo->known = $this->es->isKnown($linkInfo->url); + $linkInfo->crawl = $crawl; + $linkInfo->index = $index; + $filteredLinkInfos[] = $linkInfo; + } + return $filteredLinkInfos; + } + protected function enqueue($linkInfos) { foreach ($linkInfos as $linkInfo) { - if ($this->es->isKnown($linkInfo->url)) { + if ($linkInfo->known) { continue; } - $this->es->markQueued($linkInfo->url); - $this->queue->addToIndex( - $linkInfo->url, $linkInfo->title, $linkInfo->source - ); - if (Helper::isUrlAllowed($linkInfo->url)) { - $this->queue->addToCrawl($linkInfo->url); + if ($linkInfo->crawl || $linkInfo->index) { + $this->es->markQueued($linkInfo->url); + $actions = array(); + if ($linkInfo->index) { + $actions[] = 'index'; + } + if ($linkInfo->crawl) { + $actions[] = 'crawl'; + } + $this->queue->addToProcessList( + $linkInfo->url, $actions + ); + } + } + } + + protected function showLinks($linkInfos) + { + foreach ($linkInfos as $linkInfo) { + Log::msg($linkInfo->url); + if ($linkInfo->title) { + Log::msg(' title: ' . $linkInfo->title); + Log::msg(' source: ' . $linkInfo->source); + Log::msg( + ' known: ' . intval($linkInfo->known) + . ', crawl: ' . intval($linkInfo->crawl) + . ', index: ' . intval($linkInfo->index) + ); } } } + + public function setShowLinksOnly($showLinksOnly) + { + $this->showLinksOnly = $showLinksOnly; + } } ?>