Add URL rewrites/replacements
[phinde.git] / src / phinde / Crawler.php
index f3158aa07223979239f41bca742979d159f1aa2c..4d596b40e0abc49031fa4127211defb78aed58c4 100644 (file)
@@ -6,7 +6,12 @@ class Crawler
     protected $es;
     protected $queue;
 
-    static $supportedIndexTypes = array(
+    /**
+     * Whether the links should only be shown instead of being queued
+     */
+    protected $showLinksOnly = false;
+
+    static $supportedTypes = array(
         'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
         'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
         'text/html'             => '\\phinde\\LinkExtractor\\Html',
@@ -18,57 +23,160 @@ class Crawler
         $this->queue = new Queue();
     }
 
-    public function crawl($url)
-    {
-        $res       = $this->fetch($url);
-        $linkInfos = $this->extractLinks($res);
-        $this->enqueue($linkInfos);
-    }
-
-    protected function fetch($url)
+    /**
+     * Extract the links from a retrieved document, flag them and either
+     * show or enqueue them.
+     *
+     * @param Retrieved $retrieved Provides the HTTP response as "httpRes"
+     *
+     * @return boolean False if the links were only shown, true if they
+     *                 were passed to enqueue()
+     */
+    public function run(Retrieved $retrieved)
     {
-        $req = new HttpRequest($url);
-        $req->setHeader(
-            'accept',
-            implode(',', array_keys(static::$supportedIndexTypes))
-        );
-        $res = $req->send();
-        if ($res->getStatus() !== 200) {
-            throw new \Exception(
-                "Response code is not 200 but "
-                . $res->getStatus() . ", stopping"
-            );
+        $linkInfos = $this->extractLinks($retrieved->httpRes);
+        $linkInfos = $this->filterLinks($linkInfos);
+        if ($this->showLinksOnly) {
+            $this->showLinks($linkInfos);
+            return false;
+        } else {
+            $this->enqueue($linkInfos);
+            return true;
         }
-        return $res;
     }
 
+    /**
+     * Extract links with the extractor class registered for the MIME
+     * type of the response.
+     *
+     * @param \HTTP_Request2_Response $res Response to extract links from
+     *
+     * @return array Whatever the extractor's extract() returns, or an
+     *               empty array if the MIME type is not supported
+     */
     protected function extractLinks(\HTTP_Request2_Response $res)
     {
         $mimetype = explode(';', $res->getHeader('content-type'))[0];
-        if (!isset(static::$supportedIndexTypes[$mimetype])) {
-            echo "MIME type not supported for indexing: $mimetype\n";
+        if (!isset(static::$supportedTypes[$mimetype])) {
+            Log::info("MIME type not supported for crawling: $mimetype");
             return array();
         }
 
-        $class = static::$supportedIndexTypes[$mimetype];
+        $class = static::$supportedTypes[$mimetype];
         $extractor = new $class();
         return $extractor->extract($res);
     }
 
+    /**
+     * Rewrite the URL of every link and flag the link as known,
+     * crawlable and indexable.
+     *
+     * A link is indexed if its URL is allowed or the "indexNonAllowed"
+     * setting is enabled. It is crawled if its URL is allowed and matches
+     * none of the "crawlBlacklist" patterns.
+     *
+     * @param array $linkInfos Link info objects with a "url" property
+     *
+     * @return array The same objects with rewritten "url" and the new
+     *               properties "known", "crawl" and "index"
+     */
+    protected function filterLinks($linkInfos)
+    {
+        // Read the settings once; missing ones count as "off"/"empty"
+        $indexNonAllowed = !empty($GLOBALS['phinde']['indexNonAllowed']);
+        $blacklist = isset($GLOBALS['phinde']['crawlBlacklist'])
+            ? $GLOBALS['phinde']['crawlBlacklist']
+            : array();
+
+        $filteredLinkInfos = array();
+        foreach ($linkInfos as $linkInfo) {
+            $linkInfo->url = Helper::rewriteUrl($linkInfo->url);
+            $allowed = Helper::isUrlAllowed($linkInfo->url);
+            $crawl   = $allowed;
+
+            if ($crawl) {
+                foreach ($blacklist as $bl) {
+                    if (preg_match('#' . $bl . '#', $linkInfo->url)) {
+                        $crawl = false;
+                        // One matching pattern is enough
+                        break;
+                    }
+                }
+            }
+
+            $linkInfo->known = $this->es->isKnown($linkInfo->url);
+            $linkInfo->crawl = $crawl;
+            $linkInfo->index = $indexNonAllowed || $allowed;
+            $filteredLinkInfos[] = $linkInfo;
+        }
+        return $filteredLinkInfos;
+    }
+
+    /**
+     * Queue all links that are not known yet and shall be indexed
+     * and/or crawled.
+     *
+     * @param array $linkInfos Link info objects flagged by filterLinks()
+     *
+     * @return void
+     */
     protected function enqueue($linkInfos)
     {
         foreach ($linkInfos as $linkInfo) {
-            if ($this->es->isKnown($linkInfo->url)) {
+            if ($linkInfo->known) {
                 continue;
             }
-            $this->es->markQueued($linkInfo->url);
-            $this->queue->addToIndex(
-                $linkInfo->url, $linkInfo->title, $linkInfo->source
-            );
-            if (Helper::isUrlAllowed($linkInfo->url)) {
-                $this->queue->addToCrawl($linkInfo->url);
+            if ($linkInfo->crawl || $linkInfo->index) {
+                $this->es->markQueued($linkInfo->url);
+                $actions = array();
+                if ($linkInfo->index) {
+                    $actions[] = 'index';
+                }
+                if ($linkInfo->crawl) {
+                    $actions[] = 'crawl';
+                }
+                $this->queue->addToProcessList(
+                    $linkInfo->url, $actions
+                );
+            }
+        }
+    }
+
+    /**
+     * Log every link: its URL, its title if it has one, its source and
+     * its known/crawl/index flags.
+     *
+     * @param array $linkInfos Link info objects flagged by filterLinks()
+     *
+     * @return void
+     */
+    protected function showLinks($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            Log::msg($linkInfo->url);
+            if ($linkInfo->title) {
+                Log::msg('   title: ' . $linkInfo->title);
             }
+            // Source and flags are shown for untitled links, too
+            Log::msg('  source: ' . $linkInfo->source);
+            Log::msg(
+                '   known: ' . intval($linkInfo->known)
+                . ', crawl: ' . intval($linkInfo->crawl)
+                . ', index: ' . intval($linkInfo->index)
+            );
         }
     }
+
+    /**
+     * Switch between showing the extracted links and queueing them.
+     *
+     * @param boolean $showLinksOnly True to only show the links in run()
+     *
+     * @return void
+     */
+    public function setShowLinksOnly($showLinksOnly)
+    {
+        $this->showLinksOnly = $showLinksOnly;
+    }
 }
 ?>