10 * If the links should only be shown, not queued
12 protected $showLinksOnly = false; // read by crawl(), changed through setShowLinksOnly()
// Maps a MIME type to the fully-qualified name of the link extractor class
// used for it. The keys are joined into a comma-separated list in fetch()
// and are looked up by content type in extractLinks().
14 static $supportedIndexTypes = array(
15 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
16 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
17 'text/html' => '\\phinde\\LinkExtractor\\Html',
/**
 * Creates the Elasticsearch and Queue objects the other methods work with
 * and stores them in $this->es and $this->queue.
 */
20 public function __construct()
// The Elasticsearch settings come from the global phinde configuration array.
22 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
23 $this->queue = new Queue();
/**
 * Fetches the given URL, extracts the links from the response and hands
 * them to showLinks() or enqueue().
 *
 * @param mixed $url URL to crawl; passed unchanged to fetch()
 */
26 public function crawl($url)
28 $res = $this->fetch($url);
29 $linkInfos = $this->extractLinks($res);
// When $showLinksOnly is set, the links are printed.
30 if ($this->showLinksOnly) {
31 $this->showLinks($linkInfos);
// NOTE(review): the line between the two calls is not visible in this
// excerpt; presumably this is the else branch - confirm.
33 $this->enqueue($linkInfos);
/**
 * Builds an HttpRequest for the URL and checks that the response status
 * is 200.
 *
 * NOTE(review): sending the request, the handling of a non-200 status and
 * the return statement are not visible in this excerpt. crawl() passes the
 * result to extractLinks(), which requires a \HTTP_Request2_Response, so
 * presumably the response object is returned - confirm.
 *
 * @param mixed $url URL to request
 */
37 protected function fetch($url)
39 $req = new HttpRequest($url);
// Comma-separated list of every supported MIME type.
// NOTE(review): the call receiving this value is not visible here;
// presumably it is set as the Accept header - confirm.
42 implode(',', array_keys(static::$supportedIndexTypes))
// Strict comparison: only an integer 200 status passes this check.
45 if ($res->getStatus() !== 200) {
// Error message text; presumably used for an exception - TODO confirm.
47 "Response code is not 200 but "
48 . $res->getStatus() . ", stopping"
/**
 * Picks the link extractor class registered for the response's MIME type
 * and returns the result of its extract() method.
 *
 * @param \HTTP_Request2_Response $res Response to extract links from
 */
54 protected function extractLinks(\HTTP_Request2_Response $res)
// Keep only the part before the first ";" so that parameters such as a
// charset are dropped from the content type.
// NOTE(review): the value is neither trimmed nor lowercased before the
// lookup, so it has to match a key of $supportedIndexTypes exactly.
56 $mimetype = explode(';', $res->getHeader('content-type'))[0];
57 if (!isset(static::$supportedIndexTypes[$mimetype])) {
58 echo "MIME type not supported for indexing: $mimetype\n";
// NOTE(review): what happens after the message is not visible in this
// excerpt; presumably the method returns early - confirm.
// Instantiate the extractor class by its name taken from the map.
62 $class = static::$supportedIndexTypes[$mimetype];
63 $extractor = new $class();
64 return $extractor->extract($res);
/**
 * Goes through the link infos, marks each URL as queued in Elasticsearch
 * and adds it to the index queue. URLs accepted by Helper::isUrlAllowed()
 * are added to the crawl queue as well.
 *
 * @param mixed $linkInfos Iterable of objects with url, title and source
 *                         properties
 */
67 protected function enqueue($linkInfos)
69 foreach ($linkInfos as $linkInfo) {
// NOTE(review): the body of this check is not visible in this excerpt;
// presumably known URLs are skipped - confirm.
70 if ($this->es->isKnown($linkInfo->url)) {
73 $this->es->markQueued($linkInfo->url);
74 $this->queue->addToIndex(
75 $linkInfo->url, $linkInfo->title, $linkInfo->source
// Only URLs that pass the Helper check are queued for crawling.
77 if (Helper::isUrlAllowed($linkInfo->url)) {
78 $this->queue->addToCrawl($linkInfo->url);
/**
 * Prints each link's URL on its own line, followed by title and source
 * lines for links that have a title.
 *
 * @param mixed $linkInfos Iterable of objects with url, title and source
 *                         properties
 */
83 protected function showLinks($linkInfos)
85 foreach ($linkInfos as $linkInfo) {
86 echo $linkInfo->url . "\n";
// Truthiness check: an empty title (or the string "0") skips the block.
87 if ($linkInfo->title) {
88 echo ' title: ' . $linkInfo->title . "\n";
// NOTE(review): this line directly follows the title line, so the source
// appears to be printed only together with a title - confirm intent.
89 echo ' source: ' . $linkInfo->source . "\n";
/**
 * Sets the flag that crawl() checks before calling showLinks().
 *
 * @param mixed $showLinksOnly New flag value; stored as given
 */
94 public function setShowLinksOnly($showLinksOnly)
96 $this->showLinksOnly = $showLinksOnly;