10 * Whether the links should only be shown, not queued
// Checked by run(): when truthy, the filtered links are passed to showLinks().
12 protected $showLinksOnly = false;
// Maps a response MIME type to the fully qualified link extractor class
// that extractLinks() instantiates for it. Types without an entry are
// logged by extractLinks() as not supported for crawling.
14 static $supportedTypes = array(
15 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
16 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
17 'text/html' => '\\phinde\\LinkExtractor\\Html',
/**
 * Create the objects the other methods work with: an Elasticsearch
 * instance built from $GLOBALS['phinde']['elasticsearch'] (used for
 * isKnown() and markQueued()) and a Queue (used for addToProcessList()).
 */
20 public function __construct()
22 $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
23 $this->queue = new Queue();
/**
 * Extract the links from a retrieved document, filter them, and hand
 * them to showLinks() or enqueue() depending on $showLinksOnly.
 *
 * @param Retrieved $retrieved Retrieval result; only its httpRes
 *                             property is read here
 *
 * NOTE(review): the exact branch layout and any return statements are
 * not visible in this excerpt; confirm the return value in the full file.
 */
26 public function run(Retrieved $retrieved)
28 $linkInfos = $this->extractLinks($retrieved->httpRes);
// filterLinks() rewrites the URLs and sets the known/crawl/index flags
// that both showLinks() and enqueue() read.
29 $linkInfos = $this->filterLinks($linkInfos);
30 if ($this->showLinksOnly) {
31 $this->showLinks($linkInfos);
34 $this->enqueue($linkInfos);
/**
 * Select a link extractor class by the response's MIME type and return
 * the result of its extract() method.
 *
 * @param \HTTP_Request2_Response $res HTTP response to extract links from
 *
 * @return mixed Result of the extractor's extract() call; run() passes
 *               it on to filterLinks(), which iterates over it
 */
39 protected function extractLinks(\HTTP_Request2_Response $res)
// Keep only the part of the Content-Type header before the first ";",
// which drops parameters such as a charset. The value is not trimmed
// or lowercased before the lookup.
41 $mimetype = explode(';', $res->getHeader('content-type'))[0];
42 if (!isset(static::$supportedTypes[$mimetype])) {
43 Log::info("MIME type not supported for crawling: $mimetype");
// NOTE(review): what is returned for an unsupported type is not visible
// in this excerpt; confirm in the full file.
// static:: (late static binding) means a subclass that redeclares
// $supportedTypes is honoured here.
47 $class = static::$supportedTypes[$mimetype];
48 $extractor = new $class();
49 return $extractor->extract($res);
/**
 * Rewrite each link's URL and annotate the link object with the
 * known, crawl and index properties.
 *
 * @param iterable $linkInfos Link info objects; their url property is
 *                            read and overwritten
 *
 * @return array The link info objects that were appended to the result,
 *               each modified in place
 */
52 protected function filterLinks($linkInfos)
54 $filteredLinkInfos = array();
55 foreach ($linkInfos as $linkInfo) {
// The rewritten URL replaces the original one on the object, so all
// later checks (allowed, blacklist, known) see the rewritten form.
56 $linkInfo->url = Helper::rewriteUrl($linkInfo->url);
57 $allowed = Helper::isUrlAllowed($linkInfo->url);
// Index when the URL is allowed, or when the indexNonAllowed setting
// is truthy.
59 $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
// NOTE(review): the assignment of $crawl is not visible in this
// excerpt; presumably it is derived from $allowed. Confirm.
61 if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) {
62 foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
// Each blacklist entry is used as a regular expression with "#" as
// delimiter, so an entry that itself contains "#" would break the
// pattern.
// NOTE(review): the action taken on a match is not visible here.
63 if (preg_match('#' . $bl . '#', $linkInfo->url)) {
// enqueue() and showLinks() later read these three properties.
69 $linkInfo->known = $this->es->isKnown($linkInfo->url);
70 $linkInfo->crawl = $crawl;
71 $linkInfo->index = $index;
72 $filteredLinkInfos[] = $linkInfo;
74 return $filteredLinkInfos;
/**
 * Mark links flagged for crawling or indexing as queued in
 * Elasticsearch and add their URLs to the queue's process list.
 *
 * @param iterable $linkInfos Link info objects; the known, crawl, index
 *                            and url properties are read
 */
77 protected function enqueue($linkInfos)
79 foreach ($linkInfos as $linkInfo) {
// NOTE(review): the body of this branch is not visible in this excerpt;
// presumably links that are already known are skipped. Confirm.
80 if ($linkInfo->known) {
83 if ($linkInfo->crawl || $linkInfo->index) {
84 $this->es->markQueued($linkInfo->url);
// NOTE(review): $actions is initialised and filled in lines that are
// not visible here, presumably one entry per flag checked below.
86 if ($linkInfo->index) {
89 if ($linkInfo->crawl) {
// The URL is handed over together with the collected $actions.
92 $this->queue->addToProcessList(
93 $linkInfo->url, $actions
/**
 * Write each link to the log through Log::msg(): its URL, title and
 * source, and its known/crawl/index flags.
 *
 * @param iterable $linkInfos Link info objects; the url, title, source,
 *                            known, crawl and index properties are read
 */
99 protected function showLinks($linkInfos)
101 foreach ($linkInfos as $linkInfo) {
102 Log::msg($linkInfo->url);
// The title line is only logged when the title is truthy.
103 if ($linkInfo->title) {
104 Log::msg(' title: ' . $linkInfo->title);
105 Log::msg(' source: ' . $linkInfo->source);
// The flags are converted with intval() before being concatenated.
// NOTE(review): the opening of this call is not visible in this
// excerpt; presumably another Log::msg(.
107 ' known: ' . intval($linkInfo->known)
108 . ', crawl: ' . intval($linkInfo->crawl)
109 . ', index: ' . intval($linkInfo->index)
/**
 * Set the flag that run() checks before calling showLinks().
 *
 * @param mixed $showLinksOnly New flag value; stored as given, without
 *                             casting or validation
 */
115 public function setShowLinksOnly($showLinksOnly)
117 $this->showLinksOnly = $showLinksOnly;