Big patch merging crawling+indexing into one command, new json document structure
[phinde.git] / src / phinde / Fetcher.php
diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php
new file mode 100644 (file)
index 0000000..b5644af
--- /dev/null
@@ -0,0 +1,93 @@
+<?php
+namespace phinde;
+
+/**
+ * Fetches a URL via HTTP and loads/stores the matching elasticsearch
+ * document.
+ */
+class Fetcher
+{
+    /**
+     * Elasticsearch helper, configured with
+     * $GLOBALS['phinde']['elasticsearch']
+     */
+    protected $es;
+
+    public function __construct()
+    {
+        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+    }
+
+    /**
+     * Fetch the given URL.
+     *
+     * When the elasticsearch document for the URL carries a stored
+     * redirect location, that location is fetched instead.
+     * Unless $force is set, a conditional request is sent based on the
+     * document's "processed" time.
+     *
+     * @param string  $url     URL to fetch
+     * @param array   $actions Action classes (names or objects) that provide
+     *                         a static $supportedTypes array of MIME types
+     * @param boolean $force   Fetch even when the document has been
+     *                         processed already (no If-Modified-Since)
+     *
+     * @return Retrieved|false Retrieved HTTP response and elasticsearch
+     *                         document, or false when the server answered
+     *                         with "304 Not Modified"
+     *
+     * @throws \Exception When the response status is neither 200 nor 304
+     */
+    public function fetch($url, $actions, $force = false)
+    {
+        $esDoc = $this->es->get($url);
+        if (isset($esDoc->status->location)
+            && $esDoc->status->location != ''
+        ) {
+            //TODO: what if location redirects change?
+            $url = $esDoc->status->location;
+            $esDoc = $this->es->get($url);
+        }
+
+        //collect the MIME types of *all* actions, not only the last one
+        $types = array();
+        foreach ($actions as $action) {
+            $types = array_merge($types, $action::$supportedTypes);
+        }
+        $types = array_unique($types);
+
+        $req = new HttpRequest($url);
+        $req->setHeader('accept', implode(',', $types));
+        if (!$force && $esDoc
+            && isset($esDoc->status->processed)
+            && $esDoc->status->processed != ''
+        ) {
+            $nCrawlTime = strtotime($esDoc->status->processed);
+            if ($nCrawlTime !== false) {
+                //HTTP dates have to end with "GMT"; date format "r"
+                // would give "+0000" instead
+                $req->setHeader(
+                    'If-Modified-Since',
+                    gmdate('D, d M Y H:i:s', $nCrawlTime) . ' GMT'
+                );
+            }
+        }
+
+        $res = $req->send();
+        if ($res->getStatus() === 304) {
+            //not modified since last time, so don't crawl again
+            echo "Not modified since last fetch\n";
+            return false;
+        } else if ($res->getStatus() !== 200) {
+            throw new \Exception(
+                "Response code is not 200 but "
+                . $res->getStatus() . ", stopping"
+            );
+        }
+
+        $effUrl = $res->getEffectiveUrl();
+        if ($effUrl != $url) {
+            //we got redirected: remember that and continue with the
+            // document of the redirect target
+            $this->storeRedirect($url, $effUrl);
+            $url = $effUrl;
+            $esDoc = $this->es->get($url);
+        }
+        //FIXME: etag, hash on content
+
+        $retrieved = new Retrieved();
+        $retrieved->httpRes = $res;
+        $retrieved->esDoc   = $esDoc;
+        $retrieved->url     = $url;
+        return $retrieved;
+    }
+
+    /**
+     * Store a document for $url that only notes the redirect target
+     * in its status->location property.
+     *
+     * @param string $url    URL that redirects
+     * @param string $target URL that $url redirects to
+     *
+     * @return void
+     */
+    protected function storeRedirect($url, $target)
+    {
+        $esDoc = new \stdClass();
+        $esDoc->status = (object) array(
+            'location' => $target
+        );
+        $esDoc->url = $url;
+        $this->storeDoc($url, $esDoc);
+    }
+
+    /**
+     * Send the document as JSON to elasticsearch with a PUT request.
+     * status->processed is set to the current time before sending.
+     *
+     * @param string $url   URL the document belongs to; used to build
+     *                      the document ID
+     * @param object $esDoc Document to store
+     *
+     * @return void
+     */
+    public function storeDoc($url, $esDoc)
+    {
+        echo "Store $url\n";
+        if (!isset($esDoc->status)) {
+            //documents without status would make the assignment below fail
+            $esDoc->status = new \stdClass();
+        }
+        $esDoc->status->processed = gmdate('c');
+        $r = new Elasticsearch_Request(
+            $GLOBALS['phinde']['elasticsearch'] . 'document/'
+            . Elasticsearch::getDocId($url),
+            \HTTP_Request2::METHOD_PUT
+        );
+        $r->setBody(json_encode($esDoc));
+        $r->send();
+    }
+}
+?>