// Set up the elasticsearch client used by this object: instantiates
// Elasticsearch with the value of $GLOBALS['phinde']['elasticsearch']
// and keeps it in $this->es.
8 public function __construct()
10 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
14 * @return Retrieved HTTP response and elasticsearch document
// Request $url over HTTP and populate a Retrieved object with the HTTP
// response, the elasticsearch document and the URL that was used.
//
// $url     URL to fetch; replaced by status->location of the stored
//          document when that field is set and non-empty
// $actions iterated for their static $supportedTypes, which feed the
//          "accept" request header
// $force   NOTE(review): presumably skips the conditional
//          (If-Modified-Since) request - confirm against the condition
//          that guards it
16 public function fetch($url, $actions, $force = false)
// Load the elasticsearch document for this URL.
18 $esDoc = $this->es->get($url);
// A recorded redirect location takes precedence: continue with that URL
// and load its document instead.
19 if (isset($esDoc->status->location)
20 && $esDoc->status->location != ''
22 //TODO: what if location redirects change?
23 $url = $esDoc->status->location;
24 $esDoc = $this->es->get($url);
// Determine the types to send in the "accept" header.
// NOTE(review): array_merge() receives a single argument here, so every
// iteration replaces $types instead of extending it; after the loop only
// the last action's $supportedTypes remain. Probably meant
// array_merge($types, $action::$supportedTypes) - confirm.
28 foreach ($actions as $action) {
29 $types = array_merge($action::$supportedTypes);
31 $types = array_unique($types);
33 $req = new HttpRequest($url);
34 $req->setHeader('accept', implode(',', $types));
36 && isset($esDoc->status->processed)
37 && $esDoc->status->processed != ''
// Turn the stored processing time into an If-Modified-Since header so the
// server can answer 304 when nothing changed.
// NOTE(review): gmdate('r') produces an RFC 2822 date ending in "+0000";
// HTTP dates normally end in "GMT" - confirm servers accept this form.
// NOTE(review): the header is passed as one "Name: value" string, unlike
// the two-argument call for "accept" - confirm HttpRequest::setHeader()
// supports that.
39 $nCrawlTime = strtotime($esDoc->status->processed);
40 $req->setHeader('If-Modified-Since: ' . gmdate('r', $nCrawlTime));
44 if ($res->getStatus() === 304) {
45 //not modified since last time, so don't crawl again
46 echo "Not modified since last fetch\n";
// Every status other than 304 and 200 is treated as a failure.
48 } else if ($res->getStatus() !== 200) {
50 "Response code is not 200 but "
51 . $res->getStatus() . ", stopping"
// If the response came from a different URL than the one requested,
// record that as a redirect and read the document again.
55 $effUrl = $res->getEffectiveUrl();
56 if ($effUrl != $url) {
57 $this->storeRedirect($url, $effUrl);
59 $esDoc = $this->es->get($url);
61 //FIXME: etag, hash on content
// Bundle response, document and URL for the caller.
63 $retrieved = new Retrieved();
64 $retrieved->httpRes = $res;
65 $retrieved->esDoc = $esDoc;
66 $retrieved->url = $url;
// Store a document for $url that consists only of a status object, then
// hand it to storeDoc().
// NOTE(review): presumably the status object records $target as
// "location" (fetch() reads status->location) - confirm against the
// array contents.
70 protected function storeRedirect($url, $target)
72 $esDoc = new \stdClass();
73 $esDoc->status = (object) array(
77 $this->storeDoc($url, $esDoc);
80 public function storeDoc($url, $esDoc)
83 $esDoc->status->processed = gmdate('c');
84 $r = new Elasticsearch_Request(
85 $GLOBALS['phinde']['elasticsearch'] . 'document/'
86 . ElasticSearch::getDocId($url),
87 \HTTP_Request2::METHOD_PUT
89 $r->setBody(json_encode($esDoc));