X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8..ba1f6fc77827dba86b1ac0b62aa288dd0f977cb8:/src/phinde/Crawler.php?ds=inline

diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 53320ec..a63815d 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -6,6 +6,11 @@ class Crawler
     protected $es;
     protected $queue;

+    /**
+     * If the links should only be shown, not queued
+     */
+    protected $showLinksOnly = false;
+
     static $supportedIndexTypes = array(
         'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
         'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
@@ -20,16 +25,45 @@ class Crawler

     public function crawl($url)
     {
-        $res = $this->fetch($url);
+        $res = $this->fetch($url);
+        if ($res === false) {
+            return;
+        }
+
         $linkInfos = $this->extractLinks($res);
-        $this->enqueue($linkInfos);
+        if ($this->showLinksOnly) {
+            $this->showLinks($linkInfos);
+        } else {
+            $this->enqueue($linkInfos);
+        }
     }

     protected function fetch($url)
     {
+        $existingDoc = $this->es->get($url);
+
         $req = new HttpRequest($url);
+        $req->setHeader(
+            'accept',
+            implode(',', array_keys(static::$supportedIndexTypes))
+        );
+        if ($existingDoc && isset($existingDoc->modate)) {
+            $nMoDate = strtotime($existingDoc->modate);
+            if ($nMoDate !== false) {
+                //HTTP dates have to be in GMT ("IMF-fixdate");
+                // date('r') would send a local numeric zone offset
+                $req->setHeader(
+                    'If-Modified-Since',
+                    gmdate('D, d M Y H:i:s', $nMoDate) . ' GMT'
+                );
+            }
+        }
+
         $res = $req->send();
-        if ($res->getStatus() !== 200) {
+        if ($res->getStatus() === 304) {
+            //not modified since last time, so don't crawl again
+            return false;
+        } else if ($res->getStatus() !== 200) {
             throw new \Exception(
                 "Response code is not 200 but " . $res->getStatus() .
                 ", stopping"
@@ -57,14 +91,52 @@ class Crawler
             if ($this->es->isKnown($linkInfo->url)) {
                 continue;
             }
-            $this->es->markQueued($linkInfo->url);
-            $this->queue->addToIndex(
-                $linkInfo->url, $linkInfo->title, $linkInfo->source
-            );
-            if (Helper::isUrlAllowed($linkInfo->url)) {
+            $allowed = Helper::isUrlAllowed($linkInfo->url);
+            $crawl = $allowed;
+            $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
+
+            if ($crawl || $index) {
+                $this->es->markQueued($linkInfo->url);
+            }
+            if ($index) {
+                $this->queue->addToIndex(
+                    $linkInfo->url, $linkInfo->title, $linkInfo->source
+                );
+            }
+            if ($allowed) {
                 $this->queue->addToCrawl($linkInfo->url);
             }
         }
     }
+
+    /**
+     * Print the URL of each link; title and source follow if a title is set
+     *
+     * @param mixed $linkInfos Iterable of objects with url, title and source
+     *
+     * @return void
+     */
+    protected function showLinks($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            echo $linkInfo->url . "\n";
+            if ($linkInfo->title) {
+                echo ' title: ' . $linkInfo->title . "\n";
+                echo ' source: ' . $linkInfo->source . "\n";
+            }
+        }
+    }
+
+    /**
+     * Switch between printing the found links and queueing them
+     *
+     * @param boolean $showLinksOnly True to make crawl() only show the links
+     *
+     * @return void
+     */
+    public function setShowLinksOnly($showLinksOnly)
+    {
+        $this->showLinksOnly = $showLinksOnly;
+    }
 }
 ?>