'\\phinde\\LinkExtractor\\Atom',
        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
        'text/html'             => '\\phinde\\LinkExtractor\\Html',
    );

    /**
     * Create the Elasticsearch client and the queue used by enqueue().
     *
     * The Elasticsearch setting is read from
     * $GLOBALS['phinde']['elasticsearch'].
     */
    public function __construct()
    {
        $this->es    = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
        $this->queue = new Queue();
    }

    /**
     * Fetch a URL, extract the links from the response and queue them.
     *
     * @param string $url URL to fetch
     *
     * @return void
     *
     * @throws \Exception When the response status is not 200 (see fetch())
     */
    public function crawl($url)
    {
        $res       = $this->fetch($url);
        $linkInfos = $this->extractLinks($res);
        $this->enqueue($linkInfos);
    }

    /**
     * Request the URL, advertising every supported MIME type in the
     * "accept" header.
     *
     * @param string $url URL to fetch
     *
     * @return mixed Response object returned by HttpRequest::send();
     *               crawl() hands it to extractLinks(), which requires an
     *               \HTTP_Request2_Response
     *
     * @throws \Exception When the response status is not exactly 200
     */
    protected function fetch($url)
    {
        $req = new HttpRequest($url);
        $req->setHeader(
            'accept',
            implode(',', array_keys(static::$supportedIndexTypes))
        );

        $res = $req->send();
        if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }

        return $res;
    }

    /**
     * Run the link extractor registered for the response's MIME type.
     *
     * The MIME type is the part of the content-type header before the first
     * ";". When no extractor class is registered for it in
     * static::$supportedIndexTypes, a notice is echoed and an empty array
     * is returned.
     *
     * @param \HTTP_Request2_Response $res Response to extract links from
     *
     * @return mixed Whatever the extractor's extract() method returns, or an
     *               empty array for unsupported MIME types
     */
    protected function extractLinks(\HTTP_Request2_Response $res)
    {
        // Cast to string so a missing header yields '' instead of handing
        // null to explode(); trim because whitespace is permitted around
        // the ";" that separates the media type from its parameters.
        $contentType = (string) $res->getHeader('content-type');
        $mimetype    = trim(explode(';', $contentType)[0]);

        if (!isset(static::$supportedIndexTypes[$mimetype])) {
            echo "MIME type not supported for indexing: $mimetype\n";
            return array();
        }

        $class     = static::$supportedIndexTypes[$mimetype];
        $extractor = new $class();

        return $extractor->extract($res);
    }

    /**
     * Queue every link that is not yet known to Elasticsearch.
     *
     * Each unknown link is marked as queued and added to the index queue.
     * It is added to the crawl queue as well only when
     * Helper::isUrlAllowed() accepts its URL.
     *
     * @param array $linkInfos Objects exposing the properties "url",
     *                         "title" and "source"
     *
     * @return void
     */
    protected function enqueue($linkInfos)
    {
        foreach ($linkInfos as $linkInfo) {
            if ($this->es->isKnown($linkInfo->url)) {
                continue;
            }

            $this->es->markQueued($linkInfo->url);
            $this->queue->addToIndex(
                $linkInfo->url,
                $linkInfo->title,
                $linkInfo->source
            );

            if (Helper::isUrlAllowed($linkInfo->url)) {
                $this->queue->addToCrawl($linkInfo->url);
            }
        }
    }
}
?>