1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
<?php
namespace phinde;
/**
 * Fetches a URL via HTTP and keeps the status information of the
 * matching Elasticsearch document (redirect target, processing time)
 * up to date.
 */
class Fetcher
{
    /**
     * Client used to look up the stored document of a URL
     *
     * @var Elasticsearch
     */
    protected $es;

    /**
     * Create the Elasticsearch client from the global phinde configuration.
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
    }

    /**
     * Fetch the given URL, unless it has not been modified since it was
     * processed the last time.
     *
     * @param string  $url     URL to fetch
     * @param array   $actions Class names (or objects) that provide a static
     *                         $supportedTypes array of MIME types
     * @param boolean $force   When true, no "If-Modified-Since" header is
     *                         sent, so the server cannot answer with 304
     *
     * @return Retrieved|false Retrieved HTTP response and elasticsearch
     *                         document, or false when the server reported
     *                         "304 Not Modified"
     *
     * @throws \Exception When the response status is neither 200 nor 304
     */
    public function fetch($url, $actions, $force = false)
    {
        $esDoc = $this->es->get($url);
        if (isset($esDoc->status->location)
            && $esDoc->status->location != ''
        ) {
            //TODO: what if location redirects change?
            //a redirect was stored for this URL earlier; continue with its
            //target and the target's document
            $url   = $esDoc->status->location;
            $esDoc = $this->es->get($url);
        }

        //collect the MIME types of *all* actions for the Accept header
        $types = array();
        foreach ($actions as $action) {
            $types = array_merge($types, $action::$supportedTypes);
        }
        $types = array_unique($types);

        $req = new HttpRequest($url);
        $req->setHeader('accept', implode(',', $types));
        if (!$force && $esDoc
            && isset($esDoc->status->processed)
            && $esDoc->status->processed != ''
        ) {
            $nCrawlTime = strtotime($esDoc->status->processed);
            if ($nCrawlTime !== false) {
                //HTTP dates have to end with "GMT"; gmdate('r') would emit
                //"+0000", which servers are required to ignore
                $req->setHeader(
                    'If-Modified-Since',
                    gmdate('D, d M Y H:i:s', $nCrawlTime) . ' GMT'
                );
            }
        }

        $res = $req->send();
        if ($res->getStatus() === 304) {
            //not modified since last time, so don't crawl again
            echo "Not modified since last fetch\n";
            return false;
        } else if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }

        $effUrl = $res->getEffectiveUrl();
        if ($effUrl != $url) {
            //the request ended at a different URL: remember the redirect
            //and continue with the document of the effective URL
            $this->storeRedirect($url, $effUrl);
            $url   = $effUrl;
            $esDoc = $this->es->get($url);
        }
        //FIXME: etag, hash on content

        $retrieved = new Retrieved();
        $retrieved->httpRes = $res;
        $retrieved->esDoc   = $esDoc;
        $retrieved->url     = $url;
        return $retrieved;
    }

    /**
     * Store a document for $url that only records where it redirects to.
     *
     * @param string $url    URL that was requested
     * @param string $target URL the request ended at
     *
     * @return void
     */
    protected function storeRedirect($url, $target)
    {
        $esDoc = new \stdClass();
        $esDoc->status = (object) array(
            'location' => $target
        );
        $esDoc->url = $url;
        $this->storeDoc($url, $esDoc);
    }

    /**
     * Send the document to Elasticsearch with an HTTP PUT request, after
     * setting its "status->processed" field to the current time.
     *
     * @param string $url   URL the document belongs to; used for the
     *                      document ID
     * @param object $esDoc Document to store. It gets modified:
     *                      status->processed is overwritten.
     *
     * @return void
     */
    public function storeDoc($url, $esDoc)
    {
        echo "Store $url\n";
        if (!isset($esDoc->status)) {
            //avoid writing a property on a non-existing status object
            $esDoc->status = new \stdClass();
        }
        $esDoc->status->processed = gmdate('c');
        $r = new Elasticsearch_Request(
            $GLOBALS['phinde']['elasticsearch'] . 'document/'
            . Elasticsearch::getDocId($url),
            \HTTP_Request2::METHOD_PUT
        );
        $r->setBody(json_encode($esDoc));
        $r->send();
    }
}
?>
|