X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/f90790c6b2a54c9b1c8a0aeaf1f23e6aa67d7aca..d7651fd96dcfa2829519504e4c8ec1ce511cd57f:/src/phinde/Fetcher.php diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php new file mode 100644 index 0000000..b5644af --- /dev/null +++ b/src/phinde/Fetcher.php @@ -0,0 +1,93 @@ +es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); + } + + /** + * @return Retrieved HTTP response and elasticsearch document + */ + public function fetch($url, $actions, $force = false) + { + $esDoc = $this->es->get($url); + if (isset($esDoc->status->location) + && $esDoc->status->location != '' + ) { + //TODO: what if location redirects change? + $url = $esDoc->status->location; + $esDoc = $this->es->get($url); + } + + $types = array(); + foreach ($actions as $action) { + $types = array_merge($action::$supportedTypes); + } + $types = array_unique($types); + + $req = new HttpRequest($url); + $req->setHeader('accept', implode(',', $types)); + if (!$force && $esDoc + && isset($esDoc->status->processed) + && $esDoc->status->processed != '' + ) { + $nCrawlTime = strtotime($esDoc->status->processed); + $req->setHeader('If-Modified-Since: ' . gmdate('r', $nCrawlTime)); + } + + $res = $req->send(); + if ($res->getStatus() === 304) { + //not modified since last time, so don't crawl again + echo "Not modified since last fetch\n"; + return false; + } else if ($res->getStatus() !== 200) { + throw new \Exception( + "Response code is not 200 but " + . $res->getStatus() . ", stopping" + ); + } + + $effUrl = $res->getEffectiveUrl(); + if ($effUrl != $url) { + $this->storeRedirect($url, $effUrl); + $url = $effUrl; + $esDoc = $this->es->get($url); + } + //FIXME: etag, hash on content + + $retrieved = new Retrieved(); + $retrieved->httpRes = $res; + $retrieved->esDoc = $esDoc; + $retrieved->url = $url; + return $retrieved; + } + + protected function storeRedirect($url, $target) + { + $esDoc = new \stdClass(); + $esDoc->status = (object) array( + 'location' => $target + ); + $esDoc->url = $url; + $this->storeDoc($url, $esDoc); + } + + public function storeDoc($url, $esDoc) + { + echo "Store $url\n"; + $esDoc->status->processed = gmdate('c'); + $r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'] . 'document/' + . ElasticSearch::getDocId($url), + \HTTP_Request2::METHOD_PUT + ); + $r->setBody(json_encode($esDoc)); + $r->send(); + } +} +?>