<?php
$GLOBALS['phinde'] = array(
'elasticsearch' => 'http://127.0.0.1:9200/phinde/',
+ //whitelist of domains (host names) that shall be crawled
'domains' => array(
'www.example.org',
'test.example.org'
),
//time in seconds after which URLs may be re-indexed
'refreshtime' => 86400,
+ //if true, directly linked URLs are indexed even if they are on a
+ // non-whitelisted domain; such URLs are still never crawled themselves
+ 'indexNonAllowed' => true,
);
?>
\ No newline at end of file
if ($this->es->isKnown($linkInfo->url)) {
continue;
}
- $this->es->markQueued($linkInfo->url);
- $this->queue->addToIndex(
- $linkInfo->url, $linkInfo->title, $linkInfo->source
- );
- if (Helper::isUrlAllowed($linkInfo->url)) {
+ $allowed = Helper::isUrlAllowed($linkInfo->url); //decide separately whether this link gets indexed and/or crawled
+ $crawl = $allowed; //only allowed URLs are ever queued for crawling
+ $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed; //non-allowed URLs are indexed only when the config option is truthy; NOTE(review): the key is read without an isset() check - confirm older config files define it
+
+ if ($crawl || $index) { //NOTE(review): $index is already true whenever $crawl is, so this condition equals $index
+ $this->es->markQueued($linkInfo->url);
+ }
+ if ($index) {
+ $this->queue->addToIndex(
+ $linkInfo->url, $linkInfo->title, $linkInfo->source
+ );
+ }
+ if ($allowed) { //same value as $crawl
$this->queue->addToCrawl($linkInfo->url);
}
}