Fix accept header in crawler
[phinde.git] / src / phinde / Fetcher.php
1 <?php
2 namespace phinde;
3
/**
 * Fetches a URL via HTTP and keeps the matching elasticsearch document
 * (crawl status, redirect location) in sync.
 */
class Fetcher
{
    /**
     * Elasticsearch client used to look up stored documents by URL
     *
     * @var Elasticsearch
     */
    protected $es;

    /**
     * Creates the elasticsearch client from the value configured in
     * $GLOBALS['phinde']['elasticsearch'].
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
    }

    /**
     * Fetch a URL, following a redirect location already stored for it,
     * and return the HTTP response together with its elasticsearch document.
     *
     * Unless $force is set, a conditional request (If-Modified-Since) is
     * sent when the stored document carries a "processed" timestamp.
     *
     * @param string  $url     URL to fetch
     * @param array   $actions Action classes (names or objects) that expose
     *                         a static $supportedTypes array; its keys are
     *                         sent as the values of the Accept header
     * @param boolean $force   When true, never send If-Modified-Since
     *
     * @return Retrieved|false Retrieved HTTP response and elasticsearch
     *                         document, or false when the server answered
     *                         with 304 Not Modified
     *
     * @throws \Exception When the response status is neither 200 nor 304
     */
    public function fetch($url, $actions, $force = false)
    {
        $url = Helper::rewriteUrl($url);

        $esDoc = $this->es->get($url);
        if (isset($esDoc->status->location)
            && $esDoc->status->location != ''
        ) {
            //TODO: what if location redirects change?
            // A redirect was recorded earlier: continue with its target
            $url = $esDoc->status->location;
            $url = Helper::rewriteUrl($url);
            $esDoc = $this->es->get($url);
        }

        // Collect the keys of all actions' $supportedTypes, without duplicates
        $types = array();
        foreach ($actions as $action) {
            $types = array_merge($types, array_keys($action::$supportedTypes));
        }
        $types = array_unique($types);

        $req = new HttpRequest($url);
        $req->setHeader('accept', implode(',', $types));
        if (!$force && $esDoc
            && isset($esDoc->status->processed)
            && $esDoc->status->processed != ''
        ) {
            $nCrawlTime = strtotime($esDoc->status->processed);
            // Skip the conditional header when the stored timestamp cannot
            // be parsed instead of sending a bogus 1970 date.
            if ($nCrawlTime !== false) {
                // HTTP-date must be in IMF-fixdate form and end in "GMT";
                // date format "r" would produce "+0000" instead.
                $req->setHeader(
                    'If-Modified-Since',
                    gmdate('D, d M Y H:i:s \G\M\T', $nCrawlTime)
                );
            }
        }

        $res = $req->send();
        if ($res->getStatus() === 304) {
            //not modified since last time, so don't crawl again
            Log::info("Not modified since last fetch");
            return false;
        } else if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }

        // The request may have ended up on a different URL; record that as
        // a redirect and continue with the document of the effective URL.
        $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
        $effUrl = Helper::rewriteUrl($effUrl);
        if ($effUrl != $url) {
            $this->storeRedirect($url, $effUrl);
            $url = $effUrl;
            $esDoc = $this->es->get($url);
        }
        //FIXME: etag, hash on content

        $retrieved = new Retrieved();
        $retrieved->httpRes = $res;
        $retrieved->esDoc   = $esDoc;
        $retrieved->url     = $url;
        return $retrieved;
    }

    /**
     * Store a document for $url that only records where it redirects to.
     *
     * The document is marked as not findable.
     *
     * @param string $url    URL that was requested
     * @param string $target URL the request ended up at
     *
     * @return void
     */
    protected function storeRedirect($url, $target)
    {
        $esDoc = Helper::baseDoc($url);
        $esDoc->status = (object) array(
            'location' => $target,
            'findable' => false,
        );
        $this->storeDoc($url, $esDoc);
    }

    /**
     * Write a document to elasticsearch via HTTP PUT.
     *
     * Sets status->processed to the current time (ISO 8601, UTC) before
     * sending, so the passed object is modified.
     *
     * @param string $url   URL the document belongs to; used to derive
     *                      the document ID
     * @param object $esDoc Document to store, sent JSON-encoded
     *
     * @return void
     */
    public function storeDoc($url, $esDoc)
    {
        Log::info("Store $url");
        // Assigning a property on a missing status object is a fatal
        // error on PHP 8, so make sure it exists first.
        if (!isset($esDoc->status)) {
            $esDoc->status = new \stdClass();
        }
        $esDoc->status->processed = gmdate('c');
        $r = new Elasticsearch_Request(
            $GLOBALS['phinde']['elasticsearch'] . 'document/'
            . Elasticsearch::getDocId($url),
            \HTTP_Request2::METHOD_PUT
        );
        $r->setBody(json_encode($esDoc));
        $r->send();
    }
}
97 ?>