wip elastic6 github/elastic6
author Christian Weiske <cweiske@cweiske.de>
Sat, 29 Feb 2020 20:16:44 +0000 (21:16 +0100)
committer Christian Weiske <cweiske@cweiske.de>
Sat, 29 Feb 2020 20:16:44 +0000 (21:16 +0100)
data/elasticsearch-mapping.json
src/phinde/Elasticsearch.php
src/phinde/Elasticsearch/Request.php
src/phinde/Fetcher.php
src/phinde/Helper.php
src/phinde/Indexer.php

index aad27d27e31a5a30460eda67e457ad2f6196ae4c..8cc4d2abed45c4b8161196fc9ffe38d86f4ced01 100644 (file)
@@ -9,7 +9,7 @@
                             "type": "integer"
                         },
                         "location": {
-                            "type": "string"
+                            "type": "text"
                         },
                         "processed": {
                             "type": "date"
                     }
                 },
                 "type": {
-                    "type": "string",
-                    "index": "not_analyzed"
+                    "type": "keyword"
                 },
                 "subtype": {
-                    "type": "string",
-                    "index": "not_analyzed"
+                    "type": "keyword"
                 },
                 "mimetype": {
-                    "type": "string",
-                    "index": "not_analyzed"
+                    "type": "keyword"
                 },
                 "url": {
-                    "type": "string",
-                    "index": "not_analyzed",
+                    "type": "keyword",
                     "boost": 1.5
                 },
                 "schemalessUrl": {
-                    "type": "string",
-                    "index": "not_analyzed",
+                    "type": "keyword",
                     "boost": 1.5
                 },
                 "domain": {
-                    "type": "string",
-                    "index": "not_analyzed",
+                    "type": "keyword",
                     "boost": 1.8
                 },
                 "source": {
-                    "type": "string",
-                    "index": "not_analyzed"
+                    "type": "keyword"
                 },
                 "language": {
-                    "type": "string",
-                    "index": "not_analyzed"
+                    "type": "keyword"
                 },
                 "title": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 2
                 },
                 "h1": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.8
                 },
                 "h2": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.7
                 },
                 "h3": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.6
                 },
                 "h4": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.5
                 },
                 "h5": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.4
                 },
                 "h6": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.3
                 },
                 "text": {
-                    "type": "string",
+                    "type": "text",
                     "boost": 1.0
                 },
                 "tags": {
-                    "type": "string",
+                    "type": "keyword",
                     "boost": 1.5
                 }
             }
index 5ca2180c8b66a936d731f8e34691af2385a95174..8a6d8c784900b92e48d13dfc329f32a3d0b5ee5a 100644 (file)
@@ -167,6 +167,7 @@ class Elasticsearch
             ),
             'highlight' => array(
                 'pre_tags' => array('<em class="hl">'),
+                'post_tags' => array('</em>'),
                 'order' => 'score',
                 'encoder' => 'html',
                 'fields' => array(
@@ -231,8 +232,12 @@ class Elasticsearch
 
         //unset($doc['_source']);
 
-        //ini_set('xdebug.var_display_max_depth', 10);
-        //echo json_encode($doc);die();
+        if (false) {
+            ini_set('xdebug.var_display_max_depth', 10);
+            header('Content-type: application/json');
+            echo json_encode($doc, JSON_PRETTY_PRINT);die();
+        }
+
         $r->setBody(json_encode($doc));
         $res = $r->send();
         return json_decode($res->getBody());
index 7bb6add07f36efa7e0c6230a70f2221ab36cd7d4..1f9cd99923dee58267dc20cef5f425abc207d656 100644 (file)
@@ -30,6 +30,24 @@ class Elasticsearch_Request extends \HTTP_Request2
             . $error
         );
     }
-}
 
+    /**
+     * Sets the request body and forces a JSON content type on the request.
+     *
+     * @param mixed $body       Either a string with the body or filename
+     *                          containing body or pointer to an open file or
+     *                          object with multipart body data
+     * @param bool  $isFilename Whether first parameter is a filename
+     *
+     * @return \HTTP_Request2 Return value of the parent setBody() call
+     * @throws \HTTP_Request2_LogicException
+     *
+     * @link https://www.elastic.co/blog/strict-content-type-checking-for-elasticsearch-rest-requests
+     */
+    public function setBody($body, $isFilename = false)
+    {
+        $this->setHeader('content-type', 'application/json'); // header is set before delegating, see @link
+        return parent::setBody($body, $isFilename);
+    }
+}
 ?>
index dccb118c0ac52f0bd47fd73029b65cb7b5780a69..63f5a435c8ffbec5ca5a87e41769934d624512d5 100644 (file)
@@ -60,6 +60,21 @@ class Fetcher
         }
         //FIXME: etag, hash on content
 
+        if ($esDoc === null) {
+            //not known yet
+            $esDoc = Helper::baseDoc($url);
+        }
+
+        $lm = $res->getHeader('last-modified');
+        if ($lm !== null && strtotime($lm) !== false) { // strtotime() is false for unparseable dates
+            $esDoc->status->modate = gmdate('c', strtotime($lm));
+        } else {
+            $esDoc->status->modate = gmdate('c'); // no usable header: fall back to the current time
+        }
+        if ($esDoc->status->crdate == '') {
+            $esDoc->status->crdate = $esDoc->status->modate;
+        }
+
         $retrieved = new Retrieved();
         $retrieved->httpRes = $res;
         $retrieved->esDoc   = $esDoc;
index aeb8ba5d4e8c08874963fc5cf2b28843677d753f..55c8bbd2350ece6d165da9ee7dd7fa5fcc3494df 100644 (file)
@@ -81,9 +81,15 @@ class Helper
     public static function baseDoc($url)
     {
         $esDoc = new \stdClass();
-        $esDoc->status = new \stdClass();
+        $esDoc->status = (object) array( // all status keys exist up front; '' marks "not set yet"
+            'findable'  => false,
+            'modate'    => '',
+            'crdate'    => '',
+            'processed' => '',
+        );
         $esDoc->url = $url;
         $esDoc->schemalessUrl = Helper::noSchema($url);
+        $esDoc->domain        = parse_url($url, PHP_URL_HOST); // host component of $url
         return $esDoc;
     }
 }
index bdd5236a72d973b1eaa6b03cd355163deb9149cd..4efef4284f3c0e1eca1d009298b7d100af592941 100644 (file)
@@ -77,12 +77,9 @@ class Indexer
             $xpContext = $doc->getElementById('content');
         }
 
-        $esDoc->url = $url;
-        $esDoc->schemalessUrl = Helper::noSchema($url);
         $esDoc->type = 'html';
         $esDoc->subtype = '';
         $esDoc->mimetype = $mimetype;
-        $esDoc->domain   = parse_url($url, PHP_URL_HOST);
 
         //$esDoc->source = 'FIXME';
         //$esDoc->sourcetitle = 'FIXME';