properly handle noindex pages
author Christian Weiske <cweiske@cweiske.de>
Wed, 9 Nov 2016 20:46:05 +0000 (21:46 +0100)
committer Christian Weiske <cweiske@cweiske.de>
Wed, 9 Nov 2016 20:46:05 +0000 (21:46 +0100)
src/phinde/Fetcher.php
src/phinde/Helper.php
src/phinde/Indexer.php

index b5644af1310d16f2dbe107b36e75d2b059d57868..5ea0cf24fd7b0f2d27389815756aad91bdce9372 100644 (file)
@@ -52,7 +52,7 @@ class Fetcher
             );
         }
 
-        $effUrl = $res->getEffectiveUrl();
+        $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
         if ($effUrl != $url) {
             $this->storeRedirect($url, $effUrl);
             $url = $effUrl;
@@ -69,11 +69,11 @@ class Fetcher
 
     protected function storeRedirect($url, $target)
     {
-        $esDoc = new \stdClass();
+        $esDoc = Helper::baseDoc($url);
         $esDoc->status = (object) array(
-            'location' => $target
+            'location' => $target,
+            'findable' => false,
         );
-        $esDoc->url = $url;
         $this->storeDoc($url, $esDoc);
     }
 
index 00215fee0ca72b0cdb5d1fee5f175d8b46ad0202..aeb8ba5d4e8c08874963fc5cf2b28843677d753f 100644 (file)
@@ -77,5 +77,14 @@ class Helper
         $diff = microtime(true) - static::$timer[$timer];
         echo '+timer: ' . number_format($diff, 3) . 'ms ' . $timer . "\n";
     }
+
+    public static function baseDoc($url)
+    {
+        $esDoc = new \stdClass();
+        $esDoc->status = new \stdClass();
+        $esDoc->url = $url;
+        $esDoc->schemalessUrl = Helper::noSchema($url);
+        return $esDoc;
+    }
 }
 ?>
index 98b52c3a22304da42e9858e7d0588e5b7e5c2eab..2e40ba9a4222c67087298d50b5a41b4abeb9322a 100644 (file)
@@ -21,10 +21,8 @@ class Indexer
         }
 
         if ($esDoc === null) {
-            $esDoc = new \stdClass();
-        }
-        if (!isset($esDoc->status)) {
-            $esDoc->status = new \stdClass();
+            $esDoc = Helper::baseDoc($url);
+            $retrieved->esDoc = $esDoc;
         }
 
         //FIXME: update index only if changed since last index time
@@ -52,8 +50,8 @@ class Indexer
             $robots = $meta->attributes->getNamedItem('content')->textContent;
             foreach (explode(',', $robots) as $value) {
                 if (trim($value) == 'noindex') {
-                    echo "URL does not want to be indexed: $url\n";
-                    exit(0);
+                    $esDoc->status->findable = false;
+                    return true;
                 }
             }
         }
@@ -188,7 +186,6 @@ class Indexer
 
         //var_dump($esDoc);die();
 
-        $retrieved->esDoc = $esDoc;
         return true;
     }