8 public function __construct()
//Create the Elasticsearch client used by fetch() and keep it in $this->es.
//It is configured from $GLOBALS['phinde']['elasticsearch'], the same
//setting that storeDoc() prefixes to 'document/<id>' when writing.
10 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
14 * @return Retrieved HTTP response and elasticsearch document
16 public function fetch($url, $actions, $force = false)
//Fetch $url over HTTP, sending a conditional request when a previous
//crawl time is stored, and bundle the response, the stored document
//and the final URL into a Retrieved object.
//
//$url     - URL to fetch; passed through Helper::rewriteUrl() first
//$actions - class names/objects exposing a static $supportedTypes array
//           whose keys are MIME types; they form the Accept header
//$force   - passed on unchanged to the recursive retry below
//           NOTE(review): presumably also disables the conditional
//           request; the line that would test it is not visible here
18 $url = Helper::rewriteUrl($url);
//Load what is already stored for this URL
20 $esDoc = $this->es->get($url);
22 if (isset($esDoc->status->location)
23 && $esDoc->status->location != ''
25 //Location redirect: Use modified time of known target
26 $locUrl = $esDoc->status->location;
27 $locUrl = Helper::rewriteUrl($locUrl);
28 $esDoc = $this->es->get($locUrl);
//Collect every MIME type any of the actions can handle
32 foreach ($actions as $action) {
33 $types = array_merge($types, array_keys($action::$supportedTypes));
35 $types = array_unique($types);
37 $req = new HttpRequest($url);
38 $req->setHeader('accept', implode(',', $types));
//Only ask for changes since the last crawl when a crawl time is stored
40 && isset($esDoc->status->processed)
41 && $esDoc->status->processed != ''
43 $nCrawlTime = strtotime($esDoc->status->processed);
//HTTP-date must be in IMF-fixdate form and end in the literal "GMT".
//gmdate('r') would end in "+0000", which servers must treat as invalid
//and ignore, so the request would never be answered with 304.
44 $req->setHeader('If-Modified-Since', gmdate('D, d M Y H:i:s', $nCrawlTime) . ' GMT');
//Compare against the URL the request finally ended up at, without anchor
48 $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
49 $effUrl = Helper::rewriteUrl($effUrl);
51 if ($res->getStatus() === 304) {
52 //not modified since last time, so don't crawl again
53 if ($locUrl !== null && $effUrl != $locUrl) {
54 //location URL changed, and we used the wrong crawl timestamp
//Store the new redirect target and retry, so that the timestamp
//of the correct target document is used
55 $this->storeRedirect($url, $effUrl);
56 return $this->fetch($url, $actions, $force);
59 Log::info("Not modified since last fetch");
61 } else if ($res->getStatus() !== 200) {
63 "Response code is not 200 but "
64 . $res->getStatus() . ", stopping"
//We were redirected: remember the redirect and reload the stored document
68 if ($effUrl != $url) {
69 $this->storeRedirect($url, $effUrl);
71 $esDoc = $this->es->get($url);
73 //FIXME: etag, hash on content
//Bundle HTTP response, stored document and URL for the caller
75 $retrieved = new Retrieved();
76 $retrieved->httpRes = $res;
77 $retrieved->esDoc = $esDoc;
78 $retrieved->url = $url;
82 protected function storeRedirect($url, $target)
//Record that $url redirects to $target: start from Helper::baseDoc($url),
//set status->location to $target and persist the document via storeDoc(),
//which also stamps status->processed. fetch() reads status->location on
//a later run to find the known redirect target.
84 $esDoc = Helper::baseDoc($url);
85 $esDoc->status = (object) array(
86 'location' => $target,
89 $this->storeDoc($url, $esDoc);
92 public function storeDoc($url, $esDoc)
94 Log::info("Store $url");
95 $esDoc->status->processed = gmdate('c');
96 $r = new Elasticsearch_Request(
97 $GLOBALS['phinde']['elasticsearch'] . 'document/'
98 . ElasticSearch::getDocId($url),
99 \HTTP_Request2::METHOD_PUT
101 $r->setBody(json_encode($esDoc));