diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2016-08-30 13:35:05 +0200 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-08-30 13:35:05 +0200 |
| commit | 083fa4116b22f4898123006880fb52061763513d (patch) | |
| tree | d5d04fccb84c82295446f736cda78ab1fb9d8f02 /src/phinde/Crawler.php | |
| parent | f77dad8f1dda382a23b9d22393e239be6c087a07 (diff) | |
| download | phinde-083fa4116b22f4898123006880fb52061763513d.tar.gz phinde-083fa4116b22f4898123006880fb52061763513d.zip | |
Add crawlBlacklist configuration option
Resolves: #7
Diffstat (limited to 'src/phinde/Crawler.php')
| -rw-r--r-- | src/phinde/Crawler.php | 40 |
1 files changed, 32 insertions, 8 deletions
```diff
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index a63815d..72726a5 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -31,6 +31,7 @@ class Crawler
         }
 
         $linkInfos = $this->extractLinks($res);
+        $linkInfos = $this->filterLinks($linkInfos);
         if ($this->showLinksOnly) {
             $this->showLinks($linkInfos);
         } else {
@@ -78,25 +79,45 @@ class Crawler
         return $extractor->extract($res);
     }
 
-    protected function enqueue($linkInfos)
+    protected function filterLinks($linkInfos)
     {
+        $filteredLinkInfos = array();
         foreach ($linkInfos as $linkInfo) {
-            if ($this->es->isKnown($linkInfo->url)) {
-                continue;
-            }
             $allowed = Helper::isUrlAllowed($linkInfo->url);
             $crawl = $allowed;
             $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
 
-            if ($crawl || $index) {
+            if ($crawl && count($GLOBALS['phinde']['crawlBlacklist'])) {
+                foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
+                    if (preg_match('#' . $bl . '#', $linkInfo->url)) {
+                        $crawl = false;
+                    }
+                }
+            }
+
+            $linkInfo->known = $this->es->isKnown($linkInfo->url);
+            $linkInfo->crawl = $crawl;
+            $linkInfo->index = $index;
+            $filteredLinkInfos[] = $linkInfo;
+        }
+        return $filteredLinkInfos;
+    }
+
+    protected function enqueue($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            if ($linkInfo->known) {
+                continue;
+            }
+            if ($linkInfo->crawl || $linkInfo->index) {
                 $this->es->markQueued($linkInfo->url);
             }
-            if ($index) {
+            if ($linkInfo->index) {
                 $this->queue->addToIndex(
                     $linkInfo->url, $linkInfo->title, $linkInfo->source
                 );
             }
-            if ($allowed) {
+            if ($linkInfo->crawl) {
                 $this->queue->addToCrawl($linkInfo->url);
             }
         }
@@ -107,8 +128,11 @@ class Crawler
         foreach ($linkInfos as $linkInfo) {
             echo $linkInfo->url . "\n";
             if ($linkInfo->title) {
-                echo ' title: ' . $linkInfo->title . "\n";
+                echo ' title: ' . $linkInfo->title . "\n";
                 echo ' source: ' . $linkInfo->source . "\n";
+                echo ' known: ' . intval($linkInfo->known)
+                    . ', crawl: ' . intval($linkInfo->crawl)
+                    . ', index: ' . intval($linkInfo->index) . "\n";
             }
         }
     }
```
