rework crawler; add atom link extraction
[phinde.git] / src / phinde / Crawler.php
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
new file mode 100644 (file)
index 0000000..53320ec
--- /dev/null
@@ -0,0 +1,70 @@
+<?php
+namespace phinde;
+
+/**
+ * Fetches a single URL, extracts the links found in the response and
+ * queues every link that is not yet known.
+ */
+class Crawler
+{
+    /**
+     * Search index wrapper; enqueue() calls isKnown() and markQueued() on it.
+     *
+     * @var Elasticsearch
+     */
+    protected $es;
+
+    /**
+     * Job queue; enqueue() calls addToIndex() and addToCrawl() on it.
+     *
+     * @var Queue
+     */
+    protected $queue;
+
+    /**
+     * Maps a lowercase MIME type (without parameters) to the name of the
+     * link extractor class that handles responses of that type.
+     *
+     * @var array
+     */
+    public static $supportedIndexTypes = array(
+        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
+        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
+        'text/html'             => '\\phinde\\LinkExtractor\\Html',
+    );
+
+    /**
+     * Creates the Elasticsearch wrapper from the global
+     * $GLOBALS['phinde']['elasticsearch'] setting and a fresh queue.
+     */
+    public function __construct()
+    {
+        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+        $this->queue = new Queue();
+    }
+
+    /**
+     * Fetches the URL, extracts its links and enqueues the unknown ones.
+     *
+     * @param string $url URL to crawl
+     *
+     * @return void
+     *
+     * @throws \Exception When the response status code is not 200
+     */
+    public function crawl($url)
+    {
+        $res       = $this->fetch($url);
+        $linkInfos = $this->extractLinks($res);
+        $this->enqueue($linkInfos);
+    }
+
+    /**
+     * Requests the URL and returns the response.
+     *
+     * @param string $url URL to request
+     *
+     * @return \HTTP_Request2_Response Response object; the type is assumed
+     *                                 from extractLinks()'s type hint
+     *
+     * @throws \Exception When the response status code is not 200
+     */
+    protected function fetch($url)
+    {
+        $req = new HttpRequest($url);
+        $res = $req->send();
+
+        $status = $res->getStatus();
+        if ($status !== 200) {
+            throw new \Exception(
+                "Response code is not 200 but "
+                . $status . ", stopping"
+            );
+        }
+        return $res;
+    }
+
+    /**
+     * Picks the link extractor registered for the response's MIME type and
+     * returns whatever its extract() method yields.
+     *
+     * Prints a notice and returns an empty array when no extractor is
+     * registered for the MIME type.
+     *
+     * @param \HTTP_Request2_Response $res Response to extract links from
+     *
+     * @return array Link info objects; empty for unsupported MIME types
+     */
+    protected function extractLinks(\HTTP_Request2_Response $res)
+    {
+        // The header may be absent; the cast keeps explode() from
+        // receiving null.
+        $contentType = (string) $res->getHeader('content-type');
+
+        // Drop parameters such as "; charset=utf-8". Media types are
+        // case-insensitive and may be surrounded by whitespace, so
+        // normalize before the lookup.
+        $mimetype = strtolower(trim(explode(';', $contentType)[0]));
+
+        if (!isset(static::$supportedIndexTypes[$mimetype])) {
+            echo "MIME type not supported for indexing: $mimetype\n";
+            return array();
+        }
+
+        $class = static::$supportedIndexTypes[$mimetype];
+        $extractor = new $class();
+        return $extractor->extract($res);
+    }
+
+    /**
+     * Queues every link that the search index does not know yet.
+     *
+     * Each new link is marked as queued and added to the index queue; it is
+     * added to the crawl queue only if Helper::isUrlAllowed() accepts it.
+     *
+     * @param array $linkInfos Objects with "url", "title" and "source"
+     *                         properties
+     *
+     * @return void
+     */
+    protected function enqueue($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            if ($this->es->isKnown($linkInfo->url)) {
+                continue;
+            }
+
+            // NOTE(review): presumably this makes isKnown() return true
+            // for later duplicates of the same URL - confirm.
+            $this->es->markQueued($linkInfo->url);
+            $this->queue->addToIndex(
+                $linkInfo->url, $linkInfo->title, $linkInfo->source
+            );
+            if (Helper::isUrlAllowed($linkInfo->url)) {
+                $this->queue->addToCrawl($linkInfo->url);
+            }
+        }
+    }
+}
+?>