diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2016-08-30 08:13:33 +0200 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-08-30 08:13:33 +0200 |
| commit | 45638a5de3c8c05c1792f8a3ab93acb11a2c1a86 (patch) | |
| tree | 47f67feac7e1d85d42b6f36427acfd77e0d3a7d3 | |
| parent | 59f931647a2b4a13be20ba8f2baa4ec93e334ee5 (diff) | |
| download | phinde-45638a5de3c8c05c1792f8a3ab93acb11a2c1a86.tar.gz phinde-45638a5de3c8c05c1792f8a3ab93acb11a2c1a86.zip | |
Option to disable linked URL indexing
Resolves: #2
| -rw-r--r-- | data/config.php.dist | 4 | ||||
| -rw-r--r-- | src/phinde/Crawler.php | 18 |
2 files changed, 17 insertions, 5 deletions
```diff
diff --git a/data/config.php.dist b/data/config.php.dist
index b4d7d5c..dc1cff8 100644
--- a/data/config.php.dist
+++ b/data/config.php.dist
@@ -1,6 +1,7 @@
 <?php
 $GLOBALS['phinde'] = array(
     'elasticsearch' => 'http://127.0.0.1:9200/phinde/',
+    //whitelist of domains that shall be crawled
     'domains' => array(
         'www.example.org',
         'test.example.org'
@@ -15,5 +16,8 @@ $GLOBALS['phinde'] = array(
     ),
     //time in seconds after which URLs may be re-indexed
     'refreshtime' => 86400,
+    //if directly linked URLs shall be indexed, even if they are
+    // on a non-whitelisted domain
+    'indexNonAllowed' => true,
 );
 ?>
\ No newline at end of file
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 43d9459..6459fb4 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -84,11 +84,19 @@ class Crawler
             if ($this->es->isKnown($linkInfo->url)) {
                 continue;
             }
-            $this->es->markQueued($linkInfo->url);
-            $this->queue->addToIndex(
-                $linkInfo->url, $linkInfo->title, $linkInfo->source
-            );
-            if (Helper::isUrlAllowed($linkInfo->url)) {
+            $allowed = Helper::isUrlAllowed($linkInfo->url);
+            $crawl = $allowed;
+            $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
+
+            if ($crawl || $index) {
+                $this->es->markQueued($linkInfo->url);
+            }
+            if ($index) {
+                $this->queue->addToIndex(
+                    $linkInfo->url, $linkInfo->title, $linkInfo->source
+                );
+            }
+            if ($allowed) {
                 $this->queue->addToCrawl($linkInfo->url);
             }
         }
```
