10 * If the links should only be shown, not queued
12 protected $showLinksOnly = false;
// Maps a response MIME type to the fully qualified link extractor class
// that extractLinks() instantiates for it. The keys are also joined with
// commas in fetch().
// NOTE(review): the closing line of this array literal is not visible in
// this excerpt.
14 static $supportedIndexTypes = array(
15 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
16 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
17 'text/html' => '\\phinde\\LinkExtractor\\Html',
/**
 * Create the Elasticsearch client from the global phinde configuration
 * and the queue that receives crawl and index jobs.
 */
public function __construct()
{
    $esConfig = $GLOBALS['phinde']['elasticsearch'];

    $this->es    = new Elasticsearch($esConfig);
    $this->queue = new Queue();
}
/**
 * Crawl one URL: fetch it, pull the links out of the response, filter
 * them, and then either print them or put them on the queue.
 *
 * NOTE(review): the original line numbering jumps from 28 to 33, so
 * whatever happens between fetch() and extractLinks() is not visible in
 * this excerpt - presumably a bail-out when fetch() yields no usable
 * response; confirm against the full file.
 *
 * @param string $url URL handed to fetch() (string type assumed - confirm)
 */
26 public function crawl($url)
28 $res = $this->fetch($url);
33 $linkInfos = $this->extractLinks($res);
34 $linkInfos = $this->filterLinks($linkInfos);
// Display mode: print the links instead of queueing them
35 if ($this->showLinksOnly) {
36 $this->showLinks($linkInfos);
// NOTE(review): presumably the else branch (line 37 is not shown) - confirm
38 $this->enqueue($linkInfos);
/**
 * Request the given URL over HTTP, making the request conditional when
 * Elasticsearch already holds a document with a modification date for it.
 *
 * NOTE(review): many original lines of this method are missing from this
 * excerpt - the request configuration around the MIME type list, the
 * call that produces $res, the return statements and the call that emits
 * the non-200 message. Only what is visible is described here; confirm
 * the rest against the full file.
 *
 * @param string $url URL to request (string type assumed - confirm)
 */
42 protected function fetch($url)
// Existing document for this URL, if any; used below for its "modate"
44 $existingDoc = $this->es->get($url);
46 $req = new HttpRequest($url);
// Comma-separated list of all supported MIME types
// (presumably the Accept header value - the enclosing call is not shown)
49 implode(',', array_keys(static::$supportedIndexTypes))
51 if ($existingDoc && isset($existingDoc->modate)) {
52 $nMoDate = strtotime($existingDoc->modate);
// NOTE(review): date('r') formats in the default timezone with a numeric
// offset, while the HTTP-date format is GMT-based - confirm servers accept
// this, or consider gmdate() with an explicit GMT format.
53 $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
57 if ($res->getStatus() === 304) {
58 //not modified since last time, so don't crawl again
59 $this->log('Not modified since last fetch');
// Any status other than 200 is reported with the message built below
61 } else if ($res->getStatus() !== 200) {
63 "Response code is not 200 but "
64 . $res->getStatus() . ", stopping"
/**
 * Pick the link extractor matching the response's content type and
 * return whatever its extract() method yields for the response.
 *
 * NOTE(review): what the unsupported-type branch returns is not visible
 * in this excerpt (original lines 75-77 are missing). crawl() passes the
 * result on to filterLinks(), which iterates over it.
 *
 * @param \HTTP_Request2_Response $res Response to extract links from
 */
70 protected function extractLinks(\HTTP_Request2_Response $res)
// Keep only the part before the first ";" (drops parameters such as the
// charset). The value is used as-is: no trimming or case folding.
72 $mimetype = explode(';', $res->getHeader('content-type'))[0];
73 if (!isset(static::$supportedIndexTypes[$mimetype])) {
74 echo "MIME type not supported for indexing: $mimetype\n";
// Instantiate the extractor class registered for this MIME type
78 $class = static::$supportedIndexTypes[$mimetype];
79 $extractor = new $class();
80 return $extractor->extract($res);
/**
 * Annotate every link info object with "known", "crawl" and "index"
 * flags and return them all in a new array.
 *
 * NOTE(review): the line that first assigns $crawl (original line 88)
 * and the body of the blacklist match (lines 94-98) are missing from this
 * excerpt - presumably $crawl starts from $allowed and a blacklist hit
 * turns it off; confirm against the full file.
 *
 * @param array $linkInfos Link info objects with at least a "url" property
 *
 * @return array The same objects with known/crawl/index set
 */
83 protected function filterLinks($linkInfos)
85 $filteredLinkInfos = array();
86 foreach ($linkInfos as $linkInfo) {
87 $allowed = Helper::isUrlAllowed($linkInfo->url);
// Index allowed URLs, or every URL when "indexNonAllowed" is configured
89 $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
// Only consult the blacklist when crawling is still on and it has entries
91 if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) {
92 foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
// Each blacklist entry is used as a regex body between "#" delimiters.
// NOTE(review): an entry that itself contains "#" would end the pattern
// early - confirm entries never do.
93 if (preg_match('#' . $bl . '#', $linkInfo->url)) {
99 $linkInfo->known = $this->es->isKnown($linkInfo->url);
100 $linkInfo->crawl = $crawl;
101 $linkInfo->index = $index;
102 $filteredLinkInfos[] = $linkInfo;
104 return $filteredLinkInfos;
/**
 * Put links on the index and/or crawl queue according to the flags set
 * by filterLinks().
 *
 * NOTE(review): the body of the "known" check (original lines 111-112)
 * is missing from this excerpt - presumably known links are skipped;
 * confirm against the full file.
 *
 * @param array $linkInfos Link info objects carrying url, title, source,
 *                         known, crawl and index properties
 */
107 protected function enqueue($linkInfos)
109 foreach ($linkInfos as $linkInfo) {
110 if ($linkInfo->known) {
// Mark the URL as queued in Elasticsearch when it goes to either queue
113 if ($linkInfo->crawl || $linkInfo->index) {
114 $this->es->markQueued($linkInfo->url);
116 if ($linkInfo->index) {
117 $this->queue->addToIndex(
118 $linkInfo->url, $linkInfo->title, $linkInfo->source
121 if ($linkInfo->crawl) {
122 $this->queue->addToCrawl($linkInfo->url);
/**
 * Print the given links to standard output.
 *
 * The URL is printed for every link. Title, source and the
 * known/crawl/index flags follow only for links that have a title.
 *
 * @param array $linkInfos Link info objects, as produced by filterLinks()
 *
 * @return void
 */
protected function showLinks($linkInfos)
{
    foreach ($linkInfos as $info) {
        echo $info->url, "\n";

        // Links without a title get the URL line only
        if (!$info->title) {
            continue;
        }

        echo ' title: ', $info->title, "\n";
        echo ' source: ', $info->source, "\n";
        echo ' known: ', intval($info->known),
            ', crawl: ', intval($info->crawl),
            ', index: ', intval($info->index), "\n";
    }
}
/**
 * Switch between printing links and queueing them.
 *
 * crawl() prints the links via showLinks() when this value is truthy and
 * passes them to enqueue() otherwise.
 *
 * @param mixed $showLinksOnly Stored as-is in the property of the same name
 *
 * @return void
 */
public function setShowLinksOnly($showLinksOnly)
{
    $this->showLinksOnly = $showLinksOnly;
}
146 protected function log($msg)