10 * Whether extracted links should only be shown instead of being queued
12 protected $showLinksOnly = false; // read by crawl(), written by setShowLinksOnly()
/**
 * Maps each MIME type that can be indexed to the class name of the
 * link extractor responsible for it.
 *
 * fetch() joins the keys with "," and extractLinks() instantiates the
 * class registered for the MIME type of a response.
 *
 * NOTE(review): declared without a visibility keyword, so PHP treats
 * it as public.
 */
14 static $supportedIndexTypes = array(
15 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
16 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
17 'text/html' => '\\phinde\\LinkExtractor\\Html',
/**
 * Creates the two collaborators used by the other methods: an
 * Elasticsearch object built from $GLOBALS['phinde']['elasticsearch']
 * and a Queue object.
 */
20 public function __construct()
22 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
23 $this->queue = new Queue();
/**
 * Fetches a URL, extracts and filters the links found in the response
 * and then either shows them or puts them into the queue.
 *
 * @param string $url URL to fetch and extract links from
 */
26 public function crawl($url)
28 $res = $this->fetch($url);
// NOTE(review): how the fetch() result is checked before link
// extraction is not visible here - confirm that a "not modified"
// result never reaches extractLinks().
33 $linkInfos = $this->extractLinks($res);
34 $linkInfos = $this->filterLinks($linkInfos);
// showLinksOnly selects printing; enqueue() is presumably the
// alternative branch - TODO confirm
35 if ($this->showLinksOnly) {
36 $this->showLinks($linkInfos);
38 $this->enqueue($linkInfos);
/**
 * Prepares the HTTP request for a URL and evaluates the response status.
 *
 * When Elasticsearch already holds a document for the URL that has a
 * "modate" property, that date is sent as If-Modified-Since header.
 *
 * NOTE(review): the statements that send the request and that
 * return/throw are not visible here; presumably a 304 response yields
 * a "nothing to do" result and any other non-200 status is reported
 * as an error - TODO confirm.
 *
 * @param string $url URL to fetch
 */
42 protected function fetch($url)
44 $existingDoc = $this->es->get($url);
46 $req = new HttpRequest($url);
// comma-separated list of all MIME types that have a link extractor
49 implode(',', array_keys(static::$supportedIndexTypes))
51 if ($existingDoc && isset($existingDoc->modate)) {
// NOTE(review): strtotime() returns false for an unparsable date,
// which date() would then treat as timestamp 0.
52 $nMoDate = strtotime($existingDoc->modate);
// NOTE(review): date('r') gives an RFC 2822 date in the local
// timezone with a numeric offset; HTTP dates are expected in GMT,
// e.g. gmdate('D, d M Y H:i:s', $nMoDate) . ' GMT'.
53 $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
57 if ($res->getStatus() === 304) {
58 //not modified since last time, so don't crawl again
60 } else if ($res->getStatus() !== 200) {
// message used when the status is neither 304 nor 200
62 "Response code is not 200 but "
63 . $res->getStatus() . ", stopping"
/**
 * Runs the link extractor that is registered for the MIME type of
 * the response.
 *
 * @param \HTTP_Request2_Response $res Response to extract links from
 *
 * @return mixed Whatever the extractor's extract() method returns;
 *               filterLinks() iterates over it
 */
69 protected function extractLinks(\HTTP_Request2_Response $res)
// Use only the part before the first ";" so that parameters such as
// a charset are ignored.
// NOTE(review): the value is neither trimmed nor lower-cased, so
// "Text/HTML" would not match a key of $supportedIndexTypes.
71 $mimetype = explode(';', $res->getHeader('content-type'))[0];
72 if (!isset(static::$supportedIndexTypes[$mimetype])) {
73 echo "MIME type not supported for indexing: $mimetype\n";
// NOTE(review): what is returned for unsupported types is not
// visible here - TODO confirm
77 $class = static::$supportedIndexTypes[$mimetype];
78 $extractor = new $class();
79 return $extractor->extract($res);
/**
 * Annotates each link info object with "known", "crawl" and "index"
 * properties and returns the annotated objects.
 *
 * @param iterable $linkInfos Objects with a "url" property
 *
 * @return array List of the same objects with the three properties set
 */
82 protected function filterLinks($linkInfos)
84 $filteredLinkInfos = array();
85 foreach ($linkInfos as $linkInfo) {
86 $allowed = Helper::isUrlAllowed($linkInfo->url);
// index when the URL is allowed or when the "indexNonAllowed"
// configuration option is set
88 $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
// NOTE(review): the assignment of $crawl is not visible here;
// presumably it is derived from $allowed - TODO confirm
90 if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) {
91 foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
// Each blacklist entry is used as a regular expression.
// NOTE(review): entries are put unescaped between "#" delimiters,
// so an entry containing "#" yields an invalid pattern.
92 if (preg_match('#' . $bl . '#', $linkInfo->url)) {
98 $linkInfo->known = $this->es->isKnown($linkInfo->url);
99 $linkInfo->crawl = $crawl;
100 $linkInfo->index = $index;
101 $filteredLinkInfos[] = $linkInfo;
103 return $filteredLinkInfos;
/**
 * Puts links into the index and/or crawl queue, depending on the
 * flags that filterLinks() has set on them.
 *
 * @param iterable $linkInfos Link info objects with "url", "title",
 *                            "source", "known", "crawl" and "index"
 *                            properties
 */
106 protected function enqueue($linkInfos)
108 foreach ($linkInfos as $linkInfo) {
// NOTE(review): the body of this branch is not visible here;
// presumably already known links are skipped - TODO confirm
109 if ($linkInfo->known) {
// mark the URL as queued in Elasticsearch if at least one of the
// two queues is going to receive it
112 if ($linkInfo->crawl || $linkInfo->index) {
113 $this->es->markQueued($linkInfo->url);
115 if ($linkInfo->index) {
116 $this->queue->addToIndex(
117 $linkInfo->url, $linkInfo->title, $linkInfo->source
120 if ($linkInfo->crawl) {
121 $this->queue->addToCrawl($linkInfo->url);
/**
 * Prints the links to standard output instead of queueing them.
 *
 * Each URL is printed on a line of its own; the detail lines below
 * show title, source and the known/crawl/index flags as integers.
 *
 * @param iterable $linkInfos Link info objects as returned by
 *                            filterLinks()
 */
126 protected function showLinks($linkInfos)
128 foreach ($linkInfos as $linkInfo) {
129 echo $linkInfo->url . "\n";
// NOTE(review): where this condition ends is not visible here, so
// it is unclear which of the following lines depend on a title.
130 if ($linkInfo->title) {
131 echo ' title: ' . $linkInfo->title . "\n";
132 echo ' source: ' . $linkInfo->source . "\n";
// intval() turns the flags into integers for display
133 echo ' known: ' . intval($linkInfo->known)
134 . ', crawl: ' . intval($linkInfo->crawl)
135 . ', index: ' . intval($linkInfo->index) . "\n";
/**
 * Sets the flag that crawl() checks to decide whether links are
 * shown via showLinks().
 *
 * @param mixed $showLinksOnly Value stored as-is; crawl() evaluates it
 *                             as a boolean condition
 */
140 public function setShowLinksOnly($showLinksOnly)
142 $this->showLinksOnly = $showLinksOnly;