From 31f0bc4f5a980b40ab8d6ebc6cf682e97f59f647 Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Wed, 9 Nov 2016 21:46:05 +0100 Subject: [PATCH] properly handle noindex pages --- src/phinde/Fetcher.php | 8 ++++---- src/phinde/Helper.php | 9 +++++++++ src/phinde/Indexer.php | 11 ++++------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php index b5644af..5ea0cf2 100644 --- a/src/phinde/Fetcher.php +++ b/src/phinde/Fetcher.php @@ -52,7 +52,7 @@ class Fetcher ); } - $effUrl = $res->getEffectiveUrl(); + $effUrl = Helper::removeAnchor($res->getEffectiveUrl()); if ($effUrl != $url) { $this->storeRedirect($url, $effUrl); $url = $effUrl; @@ -69,11 +69,11 @@ class Fetcher protected function storeRedirect($url, $target) { - $esDoc = new \stdClass(); + $esDoc = Helper::baseDoc($url); $esDoc->status = (object) array( - 'location' => $target + 'location' => $target, + 'findable' => false, ); - $esDoc->url = $url; $this->storeDoc($url, $esDoc); } diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php index 00215fe..aeb8ba5 100644 --- a/src/phinde/Helper.php +++ b/src/phinde/Helper.php @@ -77,5 +77,14 @@ class Helper $diff = microtime(true) - static::$timer[$timer]; echo '+timer: ' . number_format($diff, 3) . 'ms ' . $timer . "\n"; } + + public static function baseDoc($url) + { + $esDoc = new \stdClass(); + $esDoc->status = new \stdClass(); + $esDoc->url = $url; + $esDoc->schemalessUrl = Helper::noSchema($url); + return $esDoc; + } } ?> diff --git a/src/phinde/Indexer.php b/src/phinde/Indexer.php index 98b52c3..2e40ba9 100644 --- a/src/phinde/Indexer.php +++ b/src/phinde/Indexer.php @@ -21,10 +21,8 @@ class Indexer } if ($esDoc === null) { - $esDoc = new \stdClass(); - } - if (!isset($esDoc->status)) { - $esDoc->status = new \stdClass(); + $esDoc = Helper::baseDoc($url); + $retrieved->esDoc = $esDoc; } //FIXME: update index only if changed since last index time @@ -52,8 +50,8 @@ class Indexer $robots = $meta->attributes->getNamedItem('content')->textContent; foreach (explode(',', $robots) as $value) { if (trim($value) == 'noindex') { - echo "URL does not want to be indexed: $url\n"; - exit(0); + $esDoc->status->findable = false; + return true; } } } @@ -188,7 +186,6 @@ class Indexer //var_dump($esDoc);die(); - $retrieved->esDoc = $esDoc; return true; } -- 2.30.2