10 * Whether the links should only be shown instead of being queued
12 protected $showLinksOnly = false;
// Maps a response MIME type to the fully qualified class name of the link
// extractor that handles it. extractLinks() looks the MIME type up here,
// instantiates the mapped class and calls its extract() method; MIME types
// without an entry are reported as unsupported.
14 static $supportedTypes = array(
15 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
16 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
17 'text/html' => '\\phinde\\LinkExtractor\\Html',
/**
 * Create the collaborators used by this object.
 *
 * The Elasticsearch helper is built from the settings stored under
 * $GLOBALS['phinde']['elasticsearch']; the queue is created without
 * arguments.
 */
public function __construct()
{
    $esSettings  = $GLOBALS['phinde']['elasticsearch'];
    $this->es    = new Elasticsearch($esSettings);
    $this->queue = new Queue();
}
/**
 * Extract the links of a retrieved document, annotate them, and either
 * print them or hand them to the queue.
 *
 * Only the httpRes property of $retrieved is read here.
 *
 * NOTE(review): the return value (if any) and the exact branch layout
 * around showLinks()/enqueue() are not visible in this excerpt - confirm
 * against the full method before relying on them.
 *
 * @param Retrieved $retrieved Retrieved document whose HTTP response is scanned
 */
26 public function run(Retrieved $retrieved)
// Raw link list produced by the MIME-type specific extractor
28 $linkInfos = $this->extractLinks($retrieved->httpRes);
// Adds the known/crawl/index flags to every link
29 $linkInfos = $this->filterLinks($linkInfos);
// When the show-only flag is set, the links are printed
30 if ($this->showLinksOnly) {
31 $this->showLinks($linkInfos);
// Presumably the alternative branch: queue the links for processing
34 $this->enqueue($linkInfos);
/**
 * Extract link information from an HTTP response.
 *
 * Looks up the extractor class registered in static::$supportedTypes for
 * the response's MIME type, instantiates it and delegates to its
 * extract() method.
 *
 * The MIME type is the Content-Type header value up to the first ";",
 * trimmed and lower-cased: media types are case-insensitive and servers
 * may put whitespace before the parameter list ("text/html ; charset=..."),
 * so the raw header prefix cannot be compared directly against the keys
 * of static::$supportedTypes.
 *
 * @param \HTTP_Request2_Response $res Response of the fetched document
 *
 * @return array Result of the extractor's extract() call (iterated by
 *               filterLinks() as a list of link info objects); an empty
 *               array when the MIME type is missing or not supported
 */
protected function extractLinks(\HTTP_Request2_Response $res)
{
    // A missing header yields null; cast so explode() always gets a string
    $contentType = (string) $res->getHeader('content-type');
    $mimetype    = strtolower(trim(explode(';', $contentType)[0]));

    if (!isset(static::$supportedTypes[$mimetype])) {
        echo "MIME type not supported for indexing: $mimetype\n";
        // Nothing to extract; callers iterate the result, so keep it an array
        return array();
    }

    $class     = static::$supportedTypes[$mimetype];
    $extractor = new $class();
    return $extractor->extract($res);
}
/**
 * Annotate every link info object with "known", "crawl" and "index" flags.
 *
 * In the code visible here every incoming link is appended to the result;
 * the method decides what may be done with a link rather than dropping it.
 *
 * @param iterable $linkInfos Link info objects; each must expose a url property
 *
 * @return array The same objects with known, crawl and index properties set
 */
52 protected function filterLinks($linkInfos)
54 $filteredLinkInfos = array();
55 foreach ($linkInfos as $linkInfo) {
56 $allowed = Helper::isUrlAllowed($linkInfo->url);
// NOTE(review): $crawl is read below but its initialisation is not visible
// in this excerpt - presumably it starts out equal to $allowed; confirm.
// Indexing is permitted for allowed URLs, or for any URL when the
// "indexNonAllowed" setting is truthy.
58 $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
// Only consult the crawl blacklist for links that are still crawl
// candidates, and only when the blacklist is non-empty.
60 if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) {
61 foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
// Each blacklist entry is used as a regular expression body, wrapped in
// "#" delimiters and matched against the URL.
// NOTE(review): the action taken on a match is not visible here -
// presumably crawling is disabled for the link; confirm.
62 if (preg_match('#' . $bl . '#', $linkInfo->url)) {
// Ask the Elasticsearch helper whether the URL is already known
68 $linkInfo->known = $this->es->isKnown($linkInfo->url);
69 $linkInfo->crawl = $crawl;
70 $linkInfo->index = $index;
71 $filteredLinkInfos[] = $linkInfo;
73 return $filteredLinkInfos;
/**
 * Put the given links onto the processing queue.
 *
 * Reads the known, crawl and index flags of each link info object (as set
 * by filterLinks()) and passes the URL together with an action list to
 * the queue's addToProcessList().
 *
 * @param iterable $linkInfos Link info objects carrying url, known, crawl
 *                            and index properties
 */
76 protected function enqueue($linkInfos)
78 foreach ($linkInfos as $linkInfo) {
// NOTE(review): the body of this check is not visible in this excerpt -
// presumably already-known links are skipped; confirm.
79 if ($linkInfo->known) {
// A link that will be crawled or indexed is marked as queued through the
// Elasticsearch helper.
82 if ($linkInfo->crawl || $linkInfo->index) {
83 $this->es->markQueued($linkInfo->url);
// NOTE(review): the initialisation of $actions and the bodies of the two
// checks below are not visible here - presumably they collect the
// actions to perform for this URL; confirm.
85 if ($linkInfo->index) {
88 if ($linkInfo->crawl) {
// Hand the URL and its action list to the queue
91 $this->queue->addToProcessList(
92 $linkInfo->url, $actions
/**
 * Print the given links to standard output.
 *
 * The URL of every link is printed on its own line. For links that have
 * a truthy title, three detail lines follow: the title, the source and
 * the known/crawl/index flags rendered as integers.
 *
 * @param iterable $linkInfos Link info objects exposing url, title,
 *                            source, known, crawl and index properties
 */
protected function showLinks($linkInfos)
{
    foreach ($linkInfos as $info) {
        echo $info->url . "\n";

        // Links without a title only get their URL printed
        if (!$info->title) {
            continue;
        }

        echo ' title: ' . $info->title . "\n";
        echo ' source: ' . $info->source . "\n";

        $known = (int) $info->known;
        $crawl = (int) $info->crawl;
        $index = (int) $info->index;
        echo ' known: ' . $known . ', crawl: ' . $crawl . ', index: ' . $index . "\n";
    }
}
/**
 * Set the flag that run() checks before calling showLinks().
 *
 * @param mixed $showLinksOnly Value stored as-is in $this->showLinksOnly;
 *                             run() evaluates it for truthiness
 */
112 public function setShowLinksOnly($showLinksOnly)
114 $this->showLinksOnly = $showLinksOnly;