Option to disable linked URL indexing
author Christian Weiske <cweiske@cweiske.de>
Tue, 30 Aug 2016 06:13:33 +0000 (08:13 +0200)
committer Christian Weiske <cweiske@cweiske.de>
Tue, 30 Aug 2016 06:13:33 +0000 (08:13 +0200)
Resolves: #2

data/config.php.dist
src/phinde/Crawler.php

index b4d7d5c..dc1cff8 100644 (file)
@@ -1,6 +1,7 @@
 <?php
 $GLOBALS['phinde'] = array(
     'elasticsearch' => 'http://127.0.0.1:9200/phinde/',
+    //whitelist of domains that shall be crawled
     'domains' => array(
         'www.example.org',
         'test.example.org'
@@ -15,5 +16,8 @@ $GLOBALS['phinde'] = array(
     ),
     //time in seconds after which URLs may be re-indexed
     'refreshtime' => 86400,
+    //if directly linked URLs shall be indexed, even if they are
+    // on a non-whitelisted domain
+    'indexNonAllowed' => true,
 );
 ?>
\ No newline at end of file
index 43d9459..6459fb4 100644 (file)
@@ -84,11 +84,19 @@ class Crawler
             if ($this->es->isKnown($linkInfo->url)) {
                 continue;
             }
-            $this->es->markQueued($linkInfo->url);
-            $this->queue->addToIndex(
-                $linkInfo->url, $linkInfo->title, $linkInfo->source
-            );
-            if (Helper::isUrlAllowed($linkInfo->url)) {
+            $allowed = Helper::isUrlAllowed($linkInfo->url);
+            $crawl   = $allowed;
+            $index   = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
+
+            if ($crawl || $index) {
+                $this->es->markQueued($linkInfo->url);
+            }
+            if ($index) {
+                $this->queue->addToIndex(
+                    $linkInfo->url, $linkInfo->title, $linkInfo->source
+                );
+            }
+            if ($allowed) {
                 $this->queue->addToCrawl($linkInfo->url);
             }
         }