10 * Whether the links should only be shown instead of being queued
// Flag checked by crawl(): when it is set, the extracted links are handed
// to showLinks(). Defaults to false and is changed via setShowLinksOnly().
12 protected $showLinksOnly = false;
// Lookup table from a response MIME type to the fully qualified class name
// of the link extractor used for that type. extractLinks() uses the MIME
// type as the key and instantiates the mapped class; fetch() joins the keys
// with commas. Both HTML-flavoured types map to the same Html extractor.
14 static $supportedIndexTypes = array(
15 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
16 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
17 'text/html' => '\\phinde\\LinkExtractor\\Html',
// Creates the two collaborators the other methods rely on: an Elasticsearch
// client ($this->es) and a Queue ($this->queue).
20 public function __construct()
// The client is configured from the global phinde 'elasticsearch' setting.
// NOTE(review): the shape of that setting is not visible here — confirm
// against the Elasticsearch class.
22 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
23 $this->queue = new Queue();
// Entry point: fetches $url, extracts the links from the response and then
// either prints them or puts them into the queue.
//
// $url is passed unchanged to fetch().
26 public function crawl($url)
28 $res = $this->fetch($url);
// NOTE(review): the statements between the fetch and the extraction are not
// visible in this view; fetch() has a dedicated 304 branch, so presumably
// an early exit for "nothing to do" lives here — confirm.
33 $linkInfos = $this->extractLinks($res);
// With the showLinksOnly flag set, the links are only printed.
34 if ($this->showLinksOnly) {
35 $this->showLinks($linkInfos);
// Presumably the alternative branch of the condition above (the branch
// separator is not visible here — confirm).
37 $this->enqueue($linkInfos);
// Requests $url over HTTP, using the document already stored in
// Elasticsearch (if any) to send a conditional request.
//
// A 304 response and any status other than 200 are handled separately; the
// statements executed in those branches and the return value are not
// visible in this view.
41 protected function fetch($url)
// Look up what is already stored for this URL.
43 $existingDoc = $this->es->get($url);
45 $req = new HttpRequest($url);
// Comma-separated list of all MIME types that have a link extractor.
// Presumably used as the value of an Accept header — the surrounding call
// is not visible here, confirm.
48 implode(',', array_keys(static::$supportedIndexTypes))
// Turn the stored modification date into a Unix timestamp and send it as
// If-Modified-Since.
// NOTE(review): date('r') produces an RFC 2822 date with a numeric offset in
// the configured timezone, whereas HTTP expects a GMT HTTP-date; servers
// may ignore the header. Consider gmdate('D, d M Y H:i:s', ...) . ' GMT'.
// NOTE(review): strtotime() returns false for unparsable input — confirm
// that modate is always a parsable date string.
51 $nMoDate = strtotime($existingDoc->modate);
52 $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
56 if ($res->getStatus() === 304) {
57 //not modified since last time, so don't crawl again
59 } else if ($res->getStatus() !== 200) {
// Message text for the non-200 case; it includes the actual status code.
61 "Response code is not 200 but "
62 . $res->getStatus() . ", stopping"
// Chooses the link extractor matching the response's content type and
// returns whatever its extract() method returns for the response.
//
// $res is the HTTP response to extract links from.
68 protected function extractLinks(\HTTP_Request2_Response $res)
// Keep only the part of the Content-Type header before the first ';',
// i.e. drop parameters such as a charset.
// NOTE(review): the value is neither trimmed nor lower-cased, so a header
// like "Text/HTML" would not match the lookup table — confirm whether that
// matters for the crawled sites.
70 $mimetype = explode(';', $res->getHeader('content-type'))[0];
71 if (!isset(static::$supportedIndexTypes[$mimetype])) {
// Unsupported types are reported on standard output. What is returned in
// this case is not visible in this view.
72 echo "MIME type not supported for indexing: $mimetype\n";
// Instantiate the extractor class registered for this MIME type.
76 $class = static::$supportedIndexTypes[$mimetype];
77 $extractor = new $class();
78 return $extractor->extract($res);
// Walks over the extracted link infos and hands them to the queue for
// indexing and/or crawling.
//
// Each element of $linkInfos is read through its url, title and source
// properties.
81 protected function enqueue($linkInfos)
83 foreach ($linkInfos as $linkInfo) {
// Ask Elasticsearch whether this URL is already known. Presumably known
// URLs are skipped — the body of this condition is not visible, confirm.
84 if ($this->es->isKnown($linkInfo->url)) {
87 $allowed = Helper::isUrlAllowed($linkInfo->url);
// A URL is indexed when it is allowed, or when the global indexNonAllowed
// setting is truthy.
89 $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
// NOTE(review): $crawl is assigned on a line that is not visible in this
// view — confirm how it is derived.
91 if ($crawl || $index) {
// Record in Elasticsearch that the URL has been queued.
92 $this->es->markQueued($linkInfo->url);
// The index job gets the URL together with its title and source; the crawl
// job only gets the URL. The conditions guarding these two calls are not
// visible here — presumably $index and $crawl respectively, confirm.
95 $this->queue->addToIndex(
96 $linkInfo->url, $linkInfo->title, $linkInfo->source
100 $this->queue->addToCrawl($linkInfo->url);
// Prints the extracted links to standard output instead of queueing them.
//
// Each element of $linkInfos is read through its url, title and source
// properties.
105 protected function showLinks($linkInfos)
107 foreach ($linkInfos as $linkInfo) {
// One line per link: the URL itself.
108 echo $linkInfo->url . "\n";
// Extra detail lines are printed for links with a truthy title. The source
// line directly follows the title line, so it appears to be printed only
// in that case as well — confirm against the full file.
109 if ($linkInfo->title) {
110 echo ' title: ' . $linkInfo->title . "\n";
111 echo ' source: ' . $linkInfo->source . "\n";
// Setter for the showLinksOnly flag that crawl() checks before deciding
// to call showLinks().
//
// $showLinksOnly is stored as given, without validation or casting.
116 public function setShowLinksOnly($showLinksOnly)
118 $this->showLinksOnly = $showLinksOnly;