about | summary | refs | log | tree | commit | diff
path: root/src/phinde
diff options
context:
space:
mode:
author: Christian Weiske <cweiske@cweiske.de> 2016-11-09 21:46:05 +0100
committer: Christian Weiske <cweiske@cweiske.de> 2016-11-09 21:46:05 +0100
commit: 31f0bc4f5a980b40ab8d6ebc6cf682e97f59f647 (patch)
tree: ebc4b1599f4dab89c762bb1928b71371d076e588 /src/phinde
parent: d7651fd96dcfa2829519504e4c8ec1ce511cd57f (diff)
download: phinde-31f0bc4f5a980b40ab8d6ebc6cf682e97f59f647.tar.gz
download: phinde-31f0bc4f5a980b40ab8d6ebc6cf682e97f59f647.zip
properly handle noindex pages
Diffstat (limited to 'src/phinde')
-rw-r--r-- src/phinde/Fetcher.php 8
-rw-r--r-- src/phinde/Helper.php 9
-rw-r--r-- src/phinde/Indexer.php 11
3 files changed, 17 insertions, 11 deletions
diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php
index b5644af..5ea0cf2 100644
--- a/src/phinde/Fetcher.php
+++ b/src/phinde/Fetcher.php
@@ -52,7 +52,7 @@ class Fetcher
);
}
- $effUrl = $res->getEffectiveUrl();
+ $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
if ($effUrl != $url) {
$this->storeRedirect($url, $effUrl);
$url = $effUrl;
@@ -69,11 +69,11 @@ class Fetcher
protected function storeRedirect($url, $target)
{
- $esDoc = new \stdClass();
+ $esDoc = Helper::baseDoc($url);
$esDoc->status = (object) array(
- 'location' => $target
+ 'location' => $target,
+ 'findable' => false,
);
- $esDoc->url = $url;
$this->storeDoc($url, $esDoc);
}
diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php
index 00215fe..aeb8ba5 100644
--- a/src/phinde/Helper.php
+++ b/src/phinde/Helper.php
@@ -77,5 +77,14 @@ class Helper
$diff = microtime(true) - static::$timer[$timer];
echo '+timer: ' . number_format($diff, 3) . 'ms ' . $timer . "\n";
}
+
+ public static function baseDoc($url)
+ {
+ $esDoc = new \stdClass();
+ $esDoc->status = new \stdClass();
+ $esDoc->url = $url;
+ $esDoc->schemalessUrl = Helper::noSchema($url);
+ return $esDoc;
+ }
}
?>
diff --git a/src/phinde/Indexer.php b/src/phinde/Indexer.php
index 98b52c3..2e40ba9 100644
--- a/src/phinde/Indexer.php
+++ b/src/phinde/Indexer.php
@@ -21,10 +21,8 @@ class Indexer
}
if ($esDoc === null) {
- $esDoc = new \stdClass();
- }
- if (!isset($esDoc->status)) {
- $esDoc->status = new \stdClass();
+ $esDoc = Helper::baseDoc($url);
+ $retrieved->esDoc = $esDoc;
}
//FIXME: update index only if changed since last index time
@@ -52,8 +50,8 @@ class Indexer
$robots = $meta->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $robots) as $value) {
if (trim($value) == 'noindex') {
- echo "URL does not want to be indexed: $url\n";
- exit(0);
+ $esDoc->status->findable = false;
+ return true;
}
}
}
@@ -188,7 +186,6 @@ class Indexer
//var_dump($esDoc);die();
- $retrieved->esDoc = $esDoc;
return true;
}