From 18d36cb052c42c20edda2814545c9bdf3fb1cbc5 Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Sat, 29 Feb 2020 21:16:44 +0100 Subject: [PATCH] wip --- data/elasticsearch-mapping.json | 44 ++++++++++++---------------- src/phinde/Elasticsearch.php | 9 ++++-- src/phinde/Elasticsearch/Request.php | 20 ++++++++++++- src/phinde/Fetcher.php | 15 ++++++++++ src/phinde/Helper.php | 8 ++++- src/phinde/Indexer.php | 3 -- 6 files changed, 66 insertions(+), 33 deletions(-) diff --git a/data/elasticsearch-mapping.json b/data/elasticsearch-mapping.json index aad27d2..8cc4d2a 100644 --- a/data/elasticsearch-mapping.json +++ b/data/elasticsearch-mapping.json @@ -9,7 +9,7 @@ "type": "integer" }, "location": { - "type": "string" + "type": "text" }, "processed": { "type": "date" @@ -26,74 +26,66 @@ } }, "type": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "subtype": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "mimetype": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "url": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "boost": 1.5 }, "schemalessUrl": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "boost": 1.5 }, "domain": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "boost": 1.8 }, "source": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "language": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "title": { - "type": "string", + "type": "text", "boost": 2 }, "h1": { - "type": "string", + "type": "text", "boost": 1.8 }, "h2": { - "type": "string", + "type": "text", "boost": 1.7 }, "h3": { - "type": "string", + "type": "text", "boost": 1.6 }, "h4": { - "type": "string", + "type": "text", "boost": 1.5 }, "h5": { - "type": "string", + "type": "text", "boost": 1.4 }, "h6": { - "type": "string", + "type": "text", "boost": 1.3 }, "text": { - "type": "string", + "type": "text", "boost": 1.0 }, "tags": { - "type": "string", + "type": "keyword", "boost": 1.5 } } diff --git a/src/phinde/Elasticsearch.php b/src/phinde/Elasticsearch.php index 5ca2180..8a6d8c7 100644 --- a/src/phinde/Elasticsearch.php +++ b/src/phinde/Elasticsearch.php @@ -167,6 +167,7 @@ class Elasticsearch ), 'highlight' => array( 'pre_tags' => array(''), + 'post_tags' => array(''), 'order' => 'score', 'encoder' => 'html', 'fields' => array( @@ -231,8 +232,12 @@ class Elasticsearch //unset($doc['_source']); - //ini_set('xdebug.var_display_max_depth', 10); - //echo json_encode($doc);die(); + if (false) { + ini_set('xdebug.var_display_max_depth', 10); + header('Content-type: application/json'); + echo json_encode($doc, JSON_PRETTY_PRINT);die(); + } + $r->setBody(json_encode($doc)); $res = $r->send(); return json_decode($res->getBody()); diff --git a/src/phinde/Elasticsearch/Request.php b/src/phinde/Elasticsearch/Request.php index 7bb6add..1f9cd99 100644 --- a/src/phinde/Elasticsearch/Request.php +++ b/src/phinde/Elasticsearch/Request.php @@ -30,6 +30,24 @@ class Elasticsearch_Request extends \HTTP_Request2 . $error ); } -} + /** + * Sets the request body - inject content type + * + * @param mixed $body Either a string with the body or filename + * containing body or pointer to an open file or + * object with multipart body data + * @param bool $isFilename Whether first parameter is a filename + * + * @return HTTP_Request2 + * @throws HTTP_Request2_LogicException + * + * @link https://www.elastic.co/blog/strict-content-type-checking-for-elasticsearch-rest-requests + */ + public function setBody($body, $isFilename = false) + { + $this->setHeader('content-type', 'application/json'); + return parent::setBody($body, $isFilename); + } +} ?> diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php index dccb118..63f5a43 100644 --- a/src/phinde/Fetcher.php +++ b/src/phinde/Fetcher.php @@ -60,6 +60,21 @@ class Fetcher } //FIXME: etag, hash on content + if ($esDoc === null) { + //not known yet + $esDoc = Helper::baseDoc($url); + } + + $lm = $res->getHeader('last-modified'); + if ($lm !== null) { + $esDoc->status->modate = gmdate('c', strtotime($lm)); + } else { + $esDoc->status->modate = gmdate('c'); + } + if ($esDoc->status->crdate == '') { + $esDoc->status->crdate = $esDoc->status->modate; + } + $retrieved = new Retrieved(); $retrieved->httpRes = $res; $retrieved->esDoc = $esDoc; diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php index aeb8ba5..55c8bbd 100644 --- a/src/phinde/Helper.php +++ b/src/phinde/Helper.php @@ -81,9 +81,15 @@ class Helper public static function baseDoc($url) { $esDoc = new \stdClass(); - $esDoc->status = new \stdClass(); + $esDoc->status = (object) array( + 'findable' => false, + 'modate' => '', + 'crdate' => '', + 'processed' => '', + ); $esDoc->url = $url; $esDoc->schemalessUrl = Helper::noSchema($url); + $esDoc->domain = parse_url($url, PHP_URL_HOST); return $esDoc; } } diff --git a/src/phinde/Indexer.php b/src/phinde/Indexer.php index bdd5236..4efef42 100644 --- a/src/phinde/Indexer.php +++ b/src/phinde/Indexer.php @@ -77,12 +77,9 @@ class Indexer $xpContext = $doc->getElementById('content'); } - $esDoc->url = $url; - $esDoc->schemalessUrl = Helper::noSchema($url); $esDoc->type = 'html'; $esDoc->subtype = ''; $esDoc->mimetype = $mimetype; - $esDoc->domain = parse_url($url, PHP_URL_HOST); //$esDoc->source = 'FIXME'; //$esDoc->sourcetitle = 'FIXME'; -- 2.30.2