add log class
[phinde.git] / src / phinde / Fetcher.php
1 <?php
2 namespace phinde;
3
/**
 * Fetches a URL over HTTP for the given actions and stores
 * redirect/status documents in Elasticsearch.
 */
class Fetcher
{
    /**
     * Elasticsearch client used to look up stored documents by URL.
     *
     * @var Elasticsearch
     */
    protected $es;

    /**
     * Create the Elasticsearch client from the global phinde configuration
     * ($GLOBALS['phinde']['elasticsearch']).
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
    }

    /**
     * Fetch a URL via HTTP.
     *
     * If the stored document for $url carries a redirect location, that
     * location is requested instead. Unless $force is set, the request is
     * made conditional on the stored "processed" time, so that an unchanged
     * resource is answered with 304 and not handled again.
     * When the effective URL of the response differs from the requested one,
     * the redirect is stored and the document of the effective URL is used.
     *
     * @param string  $url     URL to fetch
     * @param array   $actions Action classes (names or objects), each
     *                         exposing a static $supportedTypes array of
     *                         MIME types
     * @param boolean $force   Fetch even when the document has been
     *                         processed before (no conditional request)
     *
     * @return Retrieved|false Retrieved HTTP response and elasticsearch
     *                         document, or false when the server answered
     *                         with 304 Not Modified
     *
     * @throws \Exception When the response status is neither 200 nor 304
     */
    public function fetch($url, $actions, $force = false)
    {
        $esDoc = $this->es->get($url);
        if (isset($esDoc->status->location)
            && $esDoc->status->location != ''
        ) {
            //TODO: what if location redirects change?
            $url = $esDoc->status->location;
            $esDoc = $this->es->get($url);
        }

        $req = new HttpRequest($url);
        $req->setHeader(
            'accept', implode(',', $this->collectAcceptTypes($actions))
        );
        if (!$force && $esDoc
            && isset($esDoc->status->processed)
            && $esDoc->status->processed != ''
        ) {
            $nCrawlTime = strtotime($esDoc->status->processed);
            // an unparseable stored date must not turn into a bogus
            // "modified since 1970" condition
            if ($nCrawlTime !== false) {
                $req->setHeader(
                    'If-Modified-Since', $this->formatHttpDate($nCrawlTime)
                );
            }
        }

        $res = $req->send();
        if ($res->getStatus() === 304) {
            //not modified since last time, so don't crawl again
            Log::info("Not modified since last fetch");
            return false;
        } else if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }

        $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
        if ($effUrl != $url) {
            // we have been redirected: remember that for the next fetch
            // and continue with the document of the final URL
            $this->storeRedirect($url, $effUrl);
            $url = $effUrl;
            $esDoc = $this->es->get($url);
        }
        //FIXME: etag, hash on content

        $retrieved = new Retrieved();
        $retrieved->httpRes = $res;
        $retrieved->esDoc   = $esDoc;
        $retrieved->url     = $url;
        return $retrieved;
    }

    /**
     * Collect the distinct MIME types supported by all given actions.
     *
     * @param array $actions Action classes (names or objects), each
     *                       exposing a static $supportedTypes array
     *
     * @return array Unique MIME types in order of first appearance
     */
    protected function collectAcceptTypes($actions)
    {
        $types = array();
        foreach ($actions as $action) {
            // accumulate; merging without $types would keep only the
            // types of the last action
            $types = array_merge($types, $action::$supportedTypes);
        }
        return array_values(array_unique($types));
    }

    /**
     * Format a unix timestamp as HTTP-date (IMF-fixdate), e.g.
     * "Sun, 06 Nov 1994 08:49:37 GMT".
     *
     * gmdate('r') is not suitable here because it ends in "+0000"
     * instead of the "GMT" that HTTP requires.
     *
     * @param integer $timestamp Unix timestamp
     *
     * @return string Date usable in HTTP headers like If-Modified-Since
     */
    protected function formatHttpDate($timestamp)
    {
        return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT';
    }

    /**
     * Store a document for $url that only records the redirect target
     * and is marked as not findable.
     *
     * @param string $url    URL that redirects
     * @param string $target URL that $url redirects to
     *
     * @return void
     */
    protected function storeRedirect($url, $target)
    {
        $esDoc = Helper::baseDoc($url);
        $esDoc->status = (object) array(
            'location' => $target,
            'findable' => false,
        );
        $this->storeDoc($url, $esDoc);
    }

    /**
     * Stamp the document with the current processing time and PUT it
     * as JSON into the elasticsearch "document" type.
     *
     * @param string $url   URL the document belongs to; determines the
     *                      elasticsearch document id
     * @param object $esDoc Document to store. Its status->processed
     *                      property gets overwritten with the current time.
     *
     * @return void
     */
    public function storeDoc($url, $esDoc)
    {
        Log::info("Store $url");
        if (!isset($esDoc->status)) {
            // documents without status information would otherwise make
            // the property assignment below fail
            $esDoc->status = new \stdClass();
        }
        $esDoc->status->processed = gmdate('c');
        $r = new Elasticsearch_Request(
            $GLOBALS['phinde']['elasticsearch'] . 'document/'
            . Elasticsearch::getDocId($url),
            \HTTP_Request2::METHOD_PUT
        );
        $r->setBody(json_encode($esDoc));
        $r->send();
    }
}
93 ?>