From 083fa4116b22f4898123006880fb52061763513d Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Tue, 30 Aug 2016 13:35:05 +0200 Subject: [PATCH] Add crawlBlacklist configuration option Resolves: #7 --- data/config.php.dist | 3 +++ src/phinde/Crawler.php | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/data/config.php.dist b/data/config.php.dist index 3c1ccea..20bf208 100644 --- a/data/config.php.dist +++ b/data/config.php.dist @@ -10,6 +10,9 @@ $GLOBALS['phinde'] = array( 'blacklist' => array( 'http://bad.example.org/' ), + //list of regexes for URLs that should not be crawled + 'crawlBlacklist' => array( + ), //list of URLs that should be subscribed to with PubSubHubbub 'subscriptions' => array( 'http://www.example.org/feed', diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index a63815d..72726a5 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -31,6 +31,7 @@ class Crawler } $linkInfos = $this->extractLinks($res); + $linkInfos = $this->filterLinks($linkInfos); if ($this->showLinksOnly) { $this->showLinks($linkInfos); } else { @@ -78,25 +79,45 @@ class Crawler return $extractor->extract($res); } - protected function enqueue($linkInfos) + protected function filterLinks($linkInfos) { + $filteredLinkInfos = array(); foreach ($linkInfos as $linkInfo) { - if ($this->es->isKnown($linkInfo->url)) { - continue; - } $allowed = Helper::isUrlAllowed($linkInfo->url); $crawl = $allowed; $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed; - if ($crawl || $index) { + if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) { + foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) { + if (preg_match('#' . $bl . '#', $linkInfo->url)) { + $crawl = false; + } + } + } + + $linkInfo->known = $this->es->isKnown($linkInfo->url); + $linkInfo->crawl = $crawl; + $linkInfo->index = $index; + $filteredLinkInfos[] = $linkInfo; + } + return $filteredLinkInfos; + } + + protected function enqueue($linkInfos) + { + foreach ($linkInfos as $linkInfo) { + if ($linkInfo->known) { + continue; + } + if ($linkInfo->crawl || $linkInfo->index) { $this->es->markQueued($linkInfo->url); } - if ($index) { + if ($linkInfo->index) { $this->queue->addToIndex( $linkInfo->url, $linkInfo->title, $linkInfo->source ); } - if ($allowed) { + if ($linkInfo->crawl) { $this->queue->addToCrawl($linkInfo->url); } } @@ -107,8 +128,11 @@ class Crawler foreach ($linkInfos as $linkInfo) { echo $linkInfo->url . "\n"; if ($linkInfo->title) { - echo ' title: ' . $linkInfo->title . "\n"; + echo ' title: ' . $linkInfo->title . "\n"; echo ' source: ' . $linkInfo->source . "\n"; + echo ' known: ' . intval($linkInfo->known) + . ', crawl: ' . intval($linkInfo->crawl) + . ', index: ' . intval($linkInfo->index) . "\n"; } } } -- 2.30.2