/**
 * Lookup table from MIME type to the fully qualified name of the
 * class that is instantiated to extract links from a response of
 * that type.
 *
 * @var array
 */
public static $supportedIndexTypes = [
    'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
    'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
    'text/html'             => '\\phinde\\LinkExtractor\\Html',
];
/**
 * Create the two collaborators used by this class: an Elasticsearch
 * object configured from $GLOBALS['phinde']['elasticsearch'], and a
 * Queue object.
 */
public function __construct()
{
    $esConfig = $GLOBALS['phinde']['elasticsearch'];

    $this->es    = new Elasticsearch($esConfig);
    $this->queue = new Queue();
}
/**
 * Process a single URL: fetch it, extract the link information from
 * the response and pass that information to enqueue().
 *
 * @param mixed $url URL to crawl; handed to fetch() unchanged
 *
 * @return void
 */
public function crawl($url)
{
    $this->enqueue(
        $this->extractLinks(
            $this->fetch($url)
        )
    );
}
/**
 * Build an HTTP request for the given URL and check the response status.
 *
 * NOTE(review): only part of this method is visible here. The statements
 * that send the request, assign $res and consume the message built below
 * are not shown. Presumably the response is returned, since crawl()
 * passes the result on to extractLinks() -- confirm against the full file.
 *
 * @param mixed $url URL handed to the HttpRequest constructor
 */
28 protected function fetch($url)
30 $req = new HttpRequest($url);
// Comma-separated list of every MIME type key in $supportedIndexTypes.
33 implode(',', array_keys(static::$supportedIndexTypes))
// Any status other than exactly 200 takes the branch below.
36 if ($res->getStatus() !== 200) {
38 "Response code is not 200 but "
39 . $res->getStatus() . ", stopping"
/**
 * Select the extractor class registered for the response's MIME type
 * and run it on the response.
 *
 * @param \HTTP_Request2_Response $res Response to extract links from
 *
 * @return mixed Whatever the extractor's extract() method returns.
 *               NOTE(review): what happens after the message for an
 *               unsupported MIME type is not visible here -- confirm
 *               against the full file.
 */
45 protected function extractLinks(\HTTP_Request2_Response $res)
// Keep only the part of the content-type header before the first ";".
// NOTE(review): the value is used as-is (no trimming or case folding)
// when it is looked up in $supportedIndexTypes.
47 $mimetype = explode(';', $res->getHeader('content-type'))[0];
48 if (!isset(static::$supportedIndexTypes[$mimetype])) {
49 echo "MIME type not supported for indexing: $mimetype\n";
// The table value is a class name; it is instantiated without arguments.
53 $class = static::$supportedIndexTypes[$mimetype];
54 $extractor = new $class();
55 return $extractor->extract($res);
/**
 * Put the given links into the queue.
 *
 * A link whose URL is already known to $this->es is skipped. Every
 * other link is marked as queued and added to the index queue; its URL
 * is added to the crawl queue as well when Helper::isUrlAllowed()
 * accepts it.
 *
 * @param array|\Traversable $linkInfos Objects exposing the properties
 *                                      url, title and source
 *
 * @return void
 */
protected function enqueue($linkInfos)
{
    foreach ($linkInfos as $linkInfo) {
        if ($this->es->isKnown($linkInfo->url)) {
            // Already known: do not queue it a second time.
            continue;
        }

        // Mark first, so the URL counts as known from here on.
        $this->es->markQueued($linkInfo->url);
        $this->queue->addToIndex(
            $linkInfo->url, $linkInfo->title, $linkInfo->source
        );

        // Indexing happens for every new URL; crawling only for allowed ones.
        if (Helper::isUrlAllowed($linkInfo->url)) {
            $this->queue->addToCrawl($linkInfo->url);
        }
    }
}