aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Christian Weiske <cweiske@cweiske.de> 2016-08-30 08:13:33 +0200
committer: Christian Weiske <cweiske@cweiske.de> 2016-08-30 08:13:33 +0200
commit: 45638a5de3c8c05c1792f8a3ab93acb11a2c1a86 (patch)
tree: 47f67feac7e1d85d42b6f36427acfd77e0d3a7d3
parent: 59f931647a2b4a13be20ba8f2baa4ec93e334ee5 (diff)
download: phinde-45638a5de3c8c05c1792f8a3ab93acb11a2c1a86.tar.gz
download: phinde-45638a5de3c8c05c1792f8a3ab93acb11a2c1a86.zip
Option to disable linked URL indexing
Resolves: #2
-rw-r--r-- data/config.php.dist | 4
-rw-r--r-- src/phinde/Crawler.php | 18
2 files changed, 17 insertions, 5 deletions
diff --git a/data/config.php.dist b/data/config.php.dist
index b4d7d5c..dc1cff8 100644
--- a/data/config.php.dist
+++ b/data/config.php.dist
@@ -1,6 +1,7 @@
<?php
$GLOBALS['phinde'] = array(
'elasticsearch' => 'http://127.0.0.1:9200/phinde/',
+ //whitelist of domains that shall be crawled
'domains' => array(
'www.example.org',
'test.example.org'
@@ -15,5 +16,8 @@ $GLOBALS['phinde'] = array(
),
//time in seconds after which URLs may be re-indexed
'refreshtime' => 86400,
+ //if directly linked URLs shall be indexed, even if they are
+ // on a non-whitelisted domain
+ 'indexNonAllowed' => true,
);
?>
\ No newline at end of file
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 43d9459..6459fb4 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -84,11 +84,19 @@ class Crawler
if ($this->es->isKnown($linkInfo->url)) {
continue;
}
- $this->es->markQueued($linkInfo->url);
- $this->queue->addToIndex(
- $linkInfo->url, $linkInfo->title, $linkInfo->source
- );
- if (Helper::isUrlAllowed($linkInfo->url)) {
+ $allowed = Helper::isUrlAllowed($linkInfo->url);
+ $crawl = $allowed;
+ $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
+
+ if ($crawl || $index) {
+ $this->es->markQueued($linkInfo->url);
+ }
+ if ($index) {
+ $this->queue->addToIndex(
+ $linkInfo->url, $linkInfo->title, $linkInfo->source
+ );
+ }
+ if ($allowed) {
$this->queue->addToCrawl($linkInfo->url);
}
}