Add URL rewrites/replacements
[phinde.git] / src / phinde / Crawler.php
index 72726a5a950fb808ac7c0019f2306c07cc42c855..4d596b40e0abc49031fa4127211defb78aed58c4 100644 (file)
@@ -11,7 +11,7 @@ class Crawler
      */
     protected $showLinksOnly = false;
 
-    static $supportedIndexTypes = array(
+    static $supportedTypes = array(
         'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
         'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
         'text/html'             => '\\phinde\\LinkExtractor\\Html',
@@ -23,58 +23,28 @@ class Crawler
         $this->queue = new Queue();
     }
 
-    public function crawl($url)
+    public function run(Retrieved $retrieved)
     {
-        $res = $this->fetch($url);
-        if ($res === false) {
-            return;
-        }
-
-        $linkInfos = $this->extractLinks($res);
+        $linkInfos = $this->extractLinks($retrieved->httpRes);
         $linkInfos = $this->filterLinks($linkInfos);
         if ($this->showLinksOnly) {
             $this->showLinks($linkInfos);
+            return false;
         } else {
             $this->enqueue($linkInfos);
+            return true;
         }
     }
 
-    protected function fetch($url)
-    {
-        $existingDoc = $this->es->get($url);
-
-        $req = new HttpRequest($url);
-        $req->setHeader(
-            'accept',
-            implode(',', array_keys(static::$supportedIndexTypes))
-        );
-        if ($existingDoc && isset($existingDoc->modate)) {
-            $nMoDate = strtotime($existingDoc->modate);
-            $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
-        }
-
-        $res = $req->send();
-        if ($res->getStatus() === 304) {
-            //not modified since last time, so don't crawl again
-            return false;
-        } else if ($res->getStatus() !== 200) {
-            throw new \Exception(
-                "Response code is not 200 but "
-                . $res->getStatus() . ", stopping"
-            );
-        }
-        return $res;
-    }
-
     protected function extractLinks(\HTTP_Request2_Response $res)
     {
         $mimetype = explode(';', $res->getHeader('content-type'))[0];
-        if (!isset(static::$supportedIndexTypes[$mimetype])) {
-            echo "MIME type not supported for indexing: $mimetype\n";
+        if (!isset(static::$supportedTypes[$mimetype])) {
+            Log::info("MIME type not supported for crawling: $mimetype");
             return array();
         }
 
-        $class = static::$supportedIndexTypes[$mimetype];
+        $class = static::$supportedTypes[$mimetype];
         $extractor = new $class();
         return $extractor->extract($res);
     }
@@ -83,6 +53,7 @@ class Crawler
     {
         $filteredLinkInfos = array();
         foreach ($linkInfos as $linkInfo) {
+            $linkInfo->url = Helper::rewriteUrl($linkInfo->url);
             $allowed = Helper::isUrlAllowed($linkInfo->url);
             $crawl   = $allowed;
             $index   = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
@@ -111,28 +82,32 @@ class Crawler
             }
             if ($linkInfo->crawl || $linkInfo->index) {
                 $this->es->markQueued($linkInfo->url);
-            }
-            if ($linkInfo->index) {
-                $this->queue->addToIndex(
-                    $linkInfo->url, $linkInfo->title, $linkInfo->source
+                $actions = array();
+                if ($linkInfo->index) {
+                    $actions[] = 'index';
+                }
+                if ($linkInfo->crawl) {
+                    $actions[] = 'crawl';
+                }
+                $this->queue->addToProcessList(
+                    $linkInfo->url, $actions
                 );
             }
-            if ($linkInfo->crawl) {
-                $this->queue->addToCrawl($linkInfo->url);
-            }
         }
     }
 
     protected function showLinks($linkInfos)
     {
         foreach ($linkInfos as $linkInfo) {
-            echo $linkInfo->url . "\n";
+            Log::msg($linkInfo->url);
             if ($linkInfo->title) {
-                echo '   title: ' . $linkInfo->title . "\n";
-                echo '  source: ' . $linkInfo->source . "\n";
-                echo '   known: ' . intval($linkInfo->known)
+                Log::msg('   title: ' . $linkInfo->title);
+                Log::msg('  source: ' . $linkInfo->source);
+                Log::msg(
+                    '   known: ' . intval($linkInfo->known)
                     . ', crawl: ' . intval($linkInfo->crawl)
-                    . ', index: ' . intval($linkInfo->index) . "\n";
+                    . ', index: ' . intval($linkInfo->index)
+                );
             }
         }
     }