about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- data/config.php.dist 4
-rw-r--r-- src/phinde/Crawler.php 18
2 files changed, 17 insertions, 5 deletions
diff --git a/data/config.php.dist b/data/config.php.dist
index b4d7d5c..dc1cff8 100644
--- a/data/config.php.dist
+++ b/data/config.php.dist
@@ -1,6 +1,7 @@
<?php
$GLOBALS['phinde'] = array(
'elasticsearch' => 'http://127.0.0.1:9200/phinde/',
+ //whitelist of domains that shall be crawled
'domains' => array(
'www.example.org',
'test.example.org'
@@ -15,5 +16,8 @@ $GLOBALS['phinde'] = array(
),
//time in seconds after which URLs may be re-indexed
'refreshtime' => 86400,
+ //whether directly linked URLs shall be indexed even if they are
+ // on a non-whitelisted domain (such URLs are indexed, but not crawled)
+ 'indexNonAllowed' => true,
);
?>
\ No newline at end of file
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 43d9459..6459fb4 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -84,11 +84,19 @@ class Crawler
if ($this->es->isKnown($linkInfo->url)) {
continue;
}
- $this->es->markQueued($linkInfo->url);
- $this->queue->addToIndex(
- $linkInfo->url, $linkInfo->title, $linkInfo->source
- );
- if (Helper::isUrlAllowed($linkInfo->url)) {
+ $allowed = Helper::isUrlAllowed($linkInfo->url);
+ $crawl = $allowed;
+ $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
+
+ if ($crawl || $index) {
+ $this->es->markQueued($linkInfo->url);
+ }
+ if ($index) {
+ $this->queue->addToIndex(
+ $linkInfo->url, $linkInfo->title, $linkInfo->source
+ );
+ }
+ if ($allowed) {
$this->queue->addToCrawl($linkInfo->url);
}
}