about summary refs log tree commit diff
path: root/src/phinde/Crawler.php
diff options
context:
space:
mode:
author Christian Weiske <cweiske@cweiske.de> 2016-11-07 21:41:36 +0100
committer Christian Weiske <cweiske@cweiske.de> 2016-11-07 21:41:36 +0100
commit d7651fd96dcfa2829519504e4c8ec1ce511cd57f (patch)
tree e24d7a9f90060b0fee5a652de43bd0627f1c5bde /src/phinde/Crawler.php
parent f90790c6b2a54c9b1c8a0aeaf1f23e6aa67d7aca (diff)
download phinde-d7651fd96dcfa2829519504e4c8ec1ce511cd57f.tar.gz
phinde-d7651fd96dcfa2829519504e4c8ec1ce511cd57f.zip
Big patch merging crawling+indexing into one command, new json document structure
Diffstat (limited to 'src/phinde/Crawler.php')
-rw-r--r-- src/phinde/Crawler.php 66
1 files changed, 16 insertions, 50 deletions
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 1cf9bdc..38e3c3f 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -11,7 +11,7 @@ class Crawler
*/
protected $showLinksOnly = false;
- static $supportedIndexTypes = array(
+ static $supportedTypes = array(
'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
'text/html' => '\\phinde\\LinkExtractor\\Html',
@@ -23,59 +23,28 @@ class Crawler
$this->queue = new Queue();
}
- public function crawl($url)
+ public function run(Retrieved $retrieved)
{
- $res = $this->fetch($url);
- if ($res === false) {
- return;
- }
-
- $linkInfos = $this->extractLinks($res);
+ $linkInfos = $this->extractLinks($retrieved->httpRes);
$linkInfos = $this->filterLinks($linkInfos);
if ($this->showLinksOnly) {
$this->showLinks($linkInfos);
+ return false;
} else {
$this->enqueue($linkInfos);
+ return true;
}
}
- protected function fetch($url)
- {
- $existingDoc = $this->es->get($url);
-
- $req = new HttpRequest($url);
- $req->setHeader(
- 'accept',
- implode(',', array_keys(static::$supportedIndexTypes))
- );
- if ($existingDoc && isset($existingDoc->modate)) {
- $nMoDate = strtotime($existingDoc->modate);
- $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
- }
-
- $res = $req->send();
- if ($res->getStatus() === 304) {
- //not modified since last time, so don't crawl again
- $this->log('Not modified since last fetch');
- return false;
- } else if ($res->getStatus() !== 200) {
- throw new \Exception(
- "Response code is not 200 but "
- . $res->getStatus() . ", stopping"
- );
- }
- return $res;
- }
-
protected function extractLinks(\HTTP_Request2_Response $res)
{
$mimetype = explode(';', $res->getHeader('content-type'))[0];
- if (!isset(static::$supportedIndexTypes[$mimetype])) {
+ if (!isset(static::$supportedTypes[$mimetype])) {
echo "MIME type not supported for indexing: $mimetype\n";
return array();
}
- $class = static::$supportedIndexTypes[$mimetype];
+ $class = static::$supportedTypes[$mimetype];
$extractor = new $class();
return $extractor->extract($res);
}
@@ -112,15 +81,17 @@ class Crawler
}
if ($linkInfo->crawl || $linkInfo->index) {
$this->es->markQueued($linkInfo->url);
- }
- if ($linkInfo->index) {
- $this->queue->addToIndex(
- $linkInfo->url, $linkInfo->title, $linkInfo->source
+ $actions = array();
+ if ($linkInfo->index) {
+ $actions[] = 'index';
+ }
+ if ($linkInfo->crawl) {
+ $actions[] = 'crawl';
+ }
+ $this->queue->addToProcessList(
+ $linkInfo->url, $actions
);
}
- if ($linkInfo->crawl) {
- $this->queue->addToCrawl($linkInfo->url);
- }
}
}
@@ -142,10 +113,5 @@ class Crawler
{
$this->showLinksOnly = $showLinksOnly;
}
-
- protected function log($msg)
- {
- echo $msg . "\n";
- }
}
?>