f3158aa07223979239f41bca742979d159f1aa2c
[phinde.git] / src / phinde / Crawler.php
1 <?php
2 namespace phinde;
3
/**
 * Fetches a single URL, extracts the links contained in the response
 * and hands every link that is not known yet over to the queue.
 */
class Crawler
{
    /**
     * Elasticsearch wrapper; used here to ask whether a URL is already
     * known and to mark a URL as queued.
     */
    protected $es;

    /**
     * Queue object that receives the "index" and "crawl" tasks.
     */
    protected $queue;

    /**
     * Map of MIME type (lowercase) => link extractor class name.
     *
     * The keys are also sent as the "accept" header of the request.
     */
    public static $supportedIndexTypes = array(
        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
        'text/html'             => '\\phinde\\LinkExtractor\\Html',
    );

    /**
     * Creates the Elasticsearch wrapper from the global phinde
     * configuration and a fresh queue object.
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
        $this->queue = new Queue();
    }

    /**
     * Fetches the URL, extracts its links and enqueues the unknown ones.
     *
     * @param string $url URL to crawl
     *
     * @return void
     *
     * @throws \Exception When the response status code is not 200
     */
    public function crawl($url)
    {
        $res       = $this->fetch($url);
        $linkInfos = $this->extractLinks($res);
        $this->enqueue($linkInfos);
    }

    /**
     * Requests the URL, advertising all supported MIME types in the
     * "accept" header.
     *
     * @param string $url URL to request
     *
     * @return object Response returned by the request's send() method
     *
     * @throws \Exception When the response status code is not 200
     */
    protected function fetch($url)
    {
        $req = new HttpRequest($url);
        $req->setHeader(
            'accept',
            implode(',', array_keys(static::$supportedIndexTypes))
        );
        $res = $req->send();
        if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }
        return $res;
    }

    /**
     * Picks the link extractor matching the response's MIME type and
     * lets it extract the links.
     *
     * Prints a notice and returns an empty array when there is no
     * extractor for the MIME type.
     *
     * @param \HTTP_Request2_Response $res Response to extract links from
     *
     * @return mixed Whatever the extractor's extract() method returns;
     *               enqueue() iterates over it
     */
    protected function extractLinks(\HTTP_Request2_Response $res)
    {
        $mimetype = static::normalizeMimeType($res->getHeader('content-type'));
        if (!isset(static::$supportedIndexTypes[$mimetype])) {
            echo "MIME type not supported for indexing: $mimetype\n";
            return array();
        }

        $class = static::$supportedIndexTypes[$mimetype];
        $extractor = new $class();
        return $extractor->extract($res);
    }

    /**
     * Reduces a Content-Type header value to its bare media type.
     *
     * Parameters such as "; charset=utf-8" are cut off, surrounding
     * whitespace is removed and the result is lowercased, because media
     * types compare case-insensitively while the keys of
     * $supportedIndexTypes are lowercase.
     *
     * @param string|null $contentType Raw header value; null or an empty
     *                                 string when the header is missing
     *
     * @return string Lowercase "type/subtype", or an empty string
     */
    protected static function normalizeMimeType($contentType)
    {
        // The string cast keeps a missing header (null) from reaching explode().
        $parts = explode(';', (string) $contentType, 2);
        return strtolower(trim($parts[0]));
    }

    /**
     * Queues all links that Elasticsearch does not know yet.
     *
     * Each unknown link is marked as queued and added to the index
     * queue. It is added to the crawl queue only when
     * Helper::isUrlAllowed() accepts its URL.
     *
     * @param array|\Traversable $linkInfos Items exposing the properties
     *                                      url, title and source
     *
     * @return void
     */
    protected function enqueue($linkInfos)
    {
        foreach ($linkInfos as $linkInfo) {
            if ($this->es->isKnown($linkInfo->url)) {
                continue;
            }
            $this->es->markQueued($linkInfo->url);
            $this->queue->addToIndex(
                $linkInfo->url, $linkInfo->title, $linkInfo->source
            );
            if (Helper::isUrlAllowed($linkInfo->url)) {
                $this->queue->addToCrawl($linkInfo->url);
            }
        }
    }
}
74 ?>