// Constructor: creates the Elasticsearch client used by this object and keeps
// it in $this->es. The client is configured from the global setting
// $GLOBALS['phinde']['elasticsearch'].
8 public function __construct()
10 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
14 * @return Retrieved HTTP response and elasticsearch document
// Fetches $url over HTTP and pairs the response with the Elasticsearch
// document stored for that URL, bundling both into a Retrieved object.
//
// NOTE(review): this view of the method has gaps (braces, parts of conditions,
// the statement assigning $res and the return/throw statements are not shown),
// so the comments below describe only the statements that are visible.
//
// $url     URL to fetch; replaced by the stored redirect target when the
//          indexed document has a non-empty status->location.
// $actions Iterated to read each element's static $supportedTypes, which
//          feed the "accept" request header.
// $force   Defaults to false; its use is not visible in this view --
//          presumably it bypasses the If-Modified-Since check, TODO confirm.
16 public function fetch($url, $actions, $force = false)
// Look up the document currently stored for this URL.
18 $esDoc = $this->es->get($url);
// A non-empty stored redirect location takes the place of the requested URL,
// and the document is read again for that target.
19 if (isset($esDoc->status->location)
20 && $esDoc->status->location != ''
22 //TODO: what if location redirects change?
23 $url = $esDoc->status->location;
24 $esDoc = $this->es->get($url);
// Gather the content types advertised in the accept header.
// NOTE(review): array_merge() is given only the current action's types, so
// $types is overwritten on every iteration instead of being accumulated;
// only the last action's types survive. Looks unintended -- confirm.
28 foreach ($actions as $action) {
29 $types = array_merge($action::$supportedTypes);
// Drop duplicate type entries before joining them.
31 $types = array_unique($types);
// Build the request and send the gathered types as a comma-separated list.
33 $req = new HttpRequest($url);
34 $req->setHeader('accept', implode(',', $types));
36 && isset($esDoc->status->processed)
37 && $esDoc->status->processed != ''
// Reached when status->processed is set and non-empty (the first part of the
// condition is not visible here): make the request conditional on the time
// the document was last processed. The header is passed as one
// "name: value" string.
// NOTE(review): gmdate('r') formats the offset as "+0000", whereas the
// standard HTTP date format ends in "GMT" -- strict servers may ignore this
// header; confirm.
39 $nCrawlTime = strtotime($esDoc->status->processed);
40 $req->setHeader('If-Modified-Since: ' . gmdate('r', $nCrawlTime));
// Status 304 is only logged in the visible lines; what is returned to the
// caller in that case is not visible in this view.
44 if ($res->getStatus() === 304) {
45 //not modified since last time, so don't crawl again
46 Log::info("Not modified since last fetch");
// Any status other than 200 produces the message below; how it is raised
// (exception, log, ...) is not visible in this view.
48 } else if ($res->getStatus() !== 200) {
50 "Response code is not 200 but "
51 . $res->getStatus() . ", stopping"
// Compare the effective URL reported by the response (passed through
// Helper::removeAnchor, presumably stripping the #fragment) with the URL
// that was requested.
55 $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
56 if ($effUrl != $url) {
// Record the redirect and load the stored document again.
// NOTE(review): whether $url is switched to $effUrl before this reload is
// not visible in this view -- confirm.
57 $this->storeRedirect($url, $effUrl);
59 $esDoc = $this->es->get($url);
61 //FIXME: etag, hash on content
// No stored document available: start from a base document for this URL.
63 if ($esDoc === null) {
65 $esDoc = Helper::baseDoc($url);
// Modification date: either derived from the Last-Modified response header
// or set to the current time, both as ISO 8601 ('c') in GMT. The condition
// choosing between the two assignments is not visible in this view.
68 $lm = $res->getHeader('last-modified');
70 $esDoc->status->modate = gmdate('c', strtotime($lm));
72 $esDoc->status->modate = gmdate('c');
// An empty creation date is initialised with the modification date.
74 if ($esDoc->status->crdate == '') {
75 $esDoc->status->crdate = $esDoc->status->modate;
// Bundle the HTTP response, the document and the URL for the caller.
78 $retrieved = new Retrieved();
79 $retrieved->httpRes = $res;
80 $retrieved->esDoc = $esDoc;
81 $retrieved->url = $url;
// Stores a redirect marker document for $url that points at $target.
//
// $url    URL the document is stored under.
// $target URL written to the document's status->location; fetch() reads
//         this field to follow known redirects.
85 protected function storeRedirect($url, $target)
// Start from a base document and replace its status with a fresh object
// holding the redirect target (further keys of the array, if any, are not
// visible in this view).
87 $esDoc = Helper::baseDoc($url);
88 $esDoc->status = (object) array(
89 'location' => $target,
// Persist the marker document through storeDoc().
92 $this->storeDoc($url, $esDoc);
95 public function storeDoc($url, $esDoc)
97 Log::info("Store $url");
98 $esDoc->status->processed = gmdate('c');
99 $r = new Elasticsearch_Request(
100 $GLOBALS['phinde']['elasticsearch'] . 'document/'
101 . ElasticSearch::getDocId($url),
102 \HTTP_Request2::METHOD_PUT
104 $r->setBody(json_encode($esDoc));