wip
[phinde.git] / src / phinde / Fetcher.php
1 <?php
2 namespace phinde;
3
/**
 * Fetches a URL over HTTP and pairs the response with its
 * elasticsearch document.
 */
class Fetcher
{
    /**
     * Elasticsearch client used to load stored documents by URL.
     *
     * @var Elasticsearch
     */
    protected $es;

    /**
     * Creates the elasticsearch client from the base URL configured in
     * $GLOBALS['phinde']['elasticsearch'].
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
    }

    /**
     * Fetch the given URL and load or create the matching elasticsearch
     * document.
     *
     * A conditional request (If-Modified-Since) is sent when the document
     * has been processed before and $force is not set.
     *
     * @param string  $url     URL to fetch
     * @param array   $actions Action classes (names or instances) exposing
     *                         a static $supportedTypes array of MIME types
     * @param boolean $force   When true, never send a conditional request
     *
     * @return Retrieved|false Retrieved HTTP response and elasticsearch
     *                         document, or false when the server answered
     *                         with "304 Not Modified"
     *
     * @throws \Exception When the response status is neither 200 nor 304
     */
    public function fetch($url, $actions, $force = false)
    {
        $esDoc = $this->es->get($url);
        if (isset($esDoc->status->location)
            && $esDoc->status->location != ''
        ) {
            //TODO: what if location redirects change?
            // A redirect was stored earlier; continue with its target.
            $url = $esDoc->status->location;
            $esDoc = $this->es->get($url);
        }

        // Collect the MIME types of *all* actions for the Accept header.
        $types = array();
        foreach ($actions as $action) {
            $types = array_merge($types, $action::$supportedTypes);
        }
        $types = array_unique($types);

        $req = new HttpRequest($url);
        $req->setHeader('accept', implode(',', $types));
        if (!$force && $esDoc
            && isset($esDoc->status->processed)
            && $esDoc->status->processed != ''
        ) {
            $nCrawlTime = strtotime($esDoc->status->processed);
            if ($nCrawlTime !== false) {
                // HTTP dates must be expressed in GMT ("IMF-fixdate");
                // gmdate('r') would end in "+0000" instead.
                $req->setHeader(
                    'If-Modified-Since',
                    gmdate('D, d M Y H:i:s', $nCrawlTime) . ' GMT'
                );
            }
        }

        $res = $req->send();
        if ($res->getStatus() === 304) {
            //not modified since last time, so don't crawl again
            Log::info("Not modified since last fetch");
            return false;
        } else if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }

        // The request may have been redirected: remember the redirect and
        // continue with the document of the final URL.
        $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
        if ($effUrl != $url) {
            $this->storeRedirect($url, $effUrl);
            $url = $effUrl;
            $esDoc = $this->es->get($url);
        }
        //FIXME: etag, hash on content

        if ($esDoc === null) {
            //not known yet
            $esDoc = Helper::baseDoc($url);
        }

        // Use the server's Last-Modified date when it is present and
        // parseable; otherwise fall back to the current time instead of
        // storing the 1970 epoch for an unparseable header.
        $lm = $res->getHeader('last-modified');
        $nModTime = ($lm !== null) ? strtotime($lm) : false;
        if ($nModTime !== false) {
            $esDoc->status->modate = gmdate('c', $nModTime);
        } else {
            $esDoc->status->modate = gmdate('c');
        }
        if (!isset($esDoc->status->crdate)
            || $esDoc->status->crdate == ''
        ) {
            // First time we see this document: creation = modification.
            $esDoc->status->crdate = $esDoc->status->modate;
        }

        $retrieved = new Retrieved();
        $retrieved->httpRes = $res;
        $retrieved->esDoc   = $esDoc;
        $retrieved->url     = $url;
        return $retrieved;
    }

    /**
     * Store a document for $url that only records the redirect target
     * and is marked as not findable.
     *
     * @param string $url    URL that redirected
     * @param string $target URL the request ended up at
     *
     * @return void
     */
    protected function storeRedirect($url, $target)
    {
        $esDoc = Helper::baseDoc($url);
        $esDoc->status = (object) array(
            'location' => $target,
            'findable' => false,
        );
        $this->storeDoc($url, $esDoc);
    }

    /**
     * Send the document to elasticsearch with a PUT request, stamping
     * status->processed with the current time first.
     *
     * @param string $url   URL the document belongs to; used to build the
     *                      document ID
     * @param object $esDoc Document to store; must have a "status" object
     *
     * @return void
     */
    public function storeDoc($url, $esDoc)
    {
        Log::info("Store $url");
        $esDoc->status->processed = gmdate('c');
        $r = new Elasticsearch_Request(
            $GLOBALS['phinde']['elasticsearch'] . 'document/'
            . Elasticsearch::getDocId($url),
            \HTTP_Request2::METHOD_PUT
        );
        $r->setBody(json_encode($esDoc));
        $r->send();
    }
}
108 ?>