// Sets up the fetcher: creates the Elasticsearch client used for all
// document lookups, configured from the global phinde 'elasticsearch' setting.
8 public function __construct()
10 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
14 * @return Retrieved HTTP response and elasticsearch document
16 public function fetch($url, $actions, $force = false)
// Fetches $url over HTTP and pairs the response with the document that
// Elasticsearch holds for that URL.
//
// $url     - address to fetch; normalised with Helper::rewriteUrl() first.
// $actions - list of classes/objects exposing a static $supportedTypes
//            array; its keys are sent as the "accept" request header.
// $force   - NOTE(review): not referenced on any line visible here;
//            presumably bypasses the conditional request - confirm.
//
// The result is a Retrieved object with httpRes, esDoc and url filled in.
18 $url = Helper::rewriteUrl($url);
// A stored document with a non-empty status->location is a redirect
// recorded earlier by storeRedirect(): continue with its target instead.
// Only a single hop is resolved here.
20 $esDoc = $this->es->get($url);
21 if (isset($esDoc->status->location)
22 && $esDoc->status->location != ''
24 //TODO: what if location redirects change?
25 $url = $esDoc->status->location;
26 $url = Helper::rewriteUrl($url);
27 $esDoc = $this->es->get($url);
// Collect the type keys of every action (duplicates removed) so the
// server is only asked for content that some action declares.
31 foreach ($actions as $action) {
32 $types = array_merge($types, array_keys($action::$supportedTypes));
34 $types = array_unique($types);
36 $req = new HttpRequest($url);
37 $req->setHeader('accept', implode(',', $types));
39 && isset($esDoc->status->processed)
40 && $esDoc->status->processed != ''
// Conditional request based on the time the document was last stored.
// HTTP requires the IMF-fixdate form ending in "GMT"; gmdate('r') ends in
// "+0000", which is not a valid HTTP-date and may be ignored by the server.
// NOTE(review): strtotime() yields false for an unparsable timestamp,
// which would be formatted as the Unix epoch - confirm that is acceptable.
42 $nCrawlTime = strtotime($esDoc->status->processed);
43 $req->setHeader('If-Modified-Since: ' . gmdate('D, d M Y H:i:s', $nCrawlTime) . ' GMT');
// 304 means the page is unchanged since the stored timestamp; any status
// other than 200 is reported with the message built below.
47 if ($res->getStatus() === 304) {
48 //not modified since last time, so don't crawl again
49 Log::info("Not modified since last fetch");
51 } else if ($res->getStatus() !== 200) {
53 "Response code is not 200 but "
54 . $res->getStatus() . ", stopping"
// If the request ended on a different URL (anchor stripped, then rewritten),
// record the redirect and reload the stored document.
58 $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
59 $effUrl = Helper::rewriteUrl($effUrl);
60 if ($effUrl != $url) {
61 $this->storeRedirect($url, $effUrl);
63 $esDoc = $this->es->get($url);
65 //FIXME: etag, hash on content
// Bundle the HTTP response, the stored document and the URL for the caller.
67 $retrieved = new Retrieved();
68 $retrieved->httpRes = $res;
69 $retrieved->esDoc = $esDoc;
70 $retrieved->url = $url;
// Records that $url redirects to $target: starts from the base document
// that Helper::baseDoc() builds for $url, replaces its status with an
// object holding the target as 'location', and persists it via storeDoc().
// NOTE(review): the status array may carry further keys on lines that are
// not visible here - confirm before relying on its exact shape.
74 protected function storeRedirect($url, $target)
76 $esDoc = Helper::baseDoc($url);
77 $esDoc->status = (object) array(
78 'location' => $target,
81 $this->storeDoc($url, $esDoc);
84 public function storeDoc($url, $esDoc)
86 Log::info("Store $url");
87 $esDoc->status->processed = gmdate('c');
88 $r = new Elasticsearch_Request(
89 $GLOBALS['phinde']['elasticsearch'] . 'document/'
90 . ElasticSearch::getDocId($url),
91 \HTTP_Request2::METHOD_PUT
93 $r->setBody(json_encode($esDoc));