about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- data/elasticsearch-mapping.json | 44
-rw-r--r-- src/phinde/Elasticsearch.php | 9
-rw-r--r-- src/phinde/Elasticsearch/Request.php | 20
-rw-r--r-- src/phinde/Fetcher.php | 15
-rw-r--r-- src/phinde/Helper.php | 8
-rw-r--r-- src/phinde/Indexer.php | 3
6 files changed, 66 insertions, 33 deletions
diff --git a/data/elasticsearch-mapping.json b/data/elasticsearch-mapping.json
index aad27d2..8cc4d2a 100644
--- a/data/elasticsearch-mapping.json
+++ b/data/elasticsearch-mapping.json
@@ -9,7 +9,7 @@
"type": "integer"
},
"location": {
- "type": "string"
+ "type": "text"
},
"processed": {
"type": "date"
@@ -26,74 +26,66 @@
}
},
"type": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"subtype": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"mimetype": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"url": {
- "type": "string",
- "index": "not_analyzed",
+ "type": "keyword",
"boost": 1.5
},
"schemalessUrl": {
- "type": "string",
- "index": "not_analyzed",
+ "type": "keyword",
"boost": 1.5
},
"domain": {
- "type": "string",
- "index": "not_analyzed",
+ "type": "keyword",
"boost": 1.8
},
"source": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"language": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"title": {
- "type": "string",
+ "type": "text",
"boost": 2
},
"h1": {
- "type": "string",
+ "type": "text",
"boost": 1.8
},
"h2": {
- "type": "string",
+ "type": "text",
"boost": 1.7
},
"h3": {
- "type": "string",
+ "type": "text",
"boost": 1.6
},
"h4": {
- "type": "string",
+ "type": "text",
"boost": 1.5
},
"h5": {
- "type": "string",
+ "type": "text",
"boost": 1.4
},
"h6": {
- "type": "string",
+ "type": "text",
"boost": 1.3
},
"text": {
- "type": "string",
+ "type": "text",
"boost": 1.0
},
"tags": {
- "type": "string",
+ "type": "keyword",
"boost": 1.5
}
}
diff --git a/src/phinde/Elasticsearch.php b/src/phinde/Elasticsearch.php
index 5ca2180..8a6d8c7 100644
--- a/src/phinde/Elasticsearch.php
+++ b/src/phinde/Elasticsearch.php
@@ -167,6 +167,7 @@ class Elasticsearch
),
'highlight' => array(
'pre_tags' => array('<em class="hl">'),
+ 'post_tags' => array('</em>'),
'order' => 'score',
'encoder' => 'html',
'fields' => array(
@@ -231,8 +232,12 @@ class Elasticsearch
//unset($doc['_source']);
- //ini_set('xdebug.var_display_max_depth', 10);
- //echo json_encode($doc);die();
+ if (false) {
+ ini_set('xdebug.var_display_max_depth', 10);
+ header('Content-type: application/json');
+ echo json_encode($doc, JSON_PRETTY_PRINT);die();
+ }
+
$r->setBody(json_encode($doc));
$res = $r->send();
return json_decode($res->getBody());
diff --git a/src/phinde/Elasticsearch/Request.php b/src/phinde/Elasticsearch/Request.php
index 7bb6add..1f9cd99 100644
--- a/src/phinde/Elasticsearch/Request.php
+++ b/src/phinde/Elasticsearch/Request.php
@@ -30,6 +30,24 @@ class Elasticsearch_Request extends \HTTP_Request2
. $error
);
}
-}
+ /**
+ * Sets the request body and forces a JSON content type
+ *
+ * The header "content-type: application/json" is set on every call,
+ * whatever kind of body is passed in. Afterwards both parameters are
+ * handed on unchanged to HTTP_Request2::setBody(), and its return
+ * value is returned to the caller.
+ *
+ * The linked article describes the Elasticsearch content type checking
+ * that this header is meant to satisfy.
+ *
+ * @param mixed $body Either a string with the body or filename
+ * containing body or pointer to an open file or
+ * object with multipart body data
+ * @param bool $isFilename Whether first parameter is a filename
+ *
+ * @return HTTP_Request2
+ * @throws HTTP_Request2_LogicException
+ *
+ * @link https://www.elastic.co/blog/strict-content-type-checking-for-elasticsearch-rest-requests
+ */
+ public function setBody($body, $isFilename = false)
+ {
+ $this->setHeader('content-type', 'application/json'); // unconditional: also applies to file bodies
+ return parent::setBody($body, $isFilename);
+ }
+}
?>
diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php
index dccb118..63f5a43 100644
--- a/src/phinde/Fetcher.php
+++ b/src/phinde/Fetcher.php
@@ -60,6 +60,21 @@ class Fetcher
}
//FIXME: etag, hash on content
+ if ($esDoc === null) {
+ //not known yet
+ $esDoc = Helper::baseDoc($url);
+ }
+
+ $lm = $res->getHeader('last-modified');
+ if ($lm !== null) {
+ $esDoc->status->modate = gmdate('c', strtotime($lm));
+ } else {
+ $esDoc->status->modate = gmdate('c');
+ }
+ if ($esDoc->status->crdate == '') {
+ $esDoc->status->crdate = $esDoc->status->modate;
+ }
+
$retrieved = new Retrieved();
$retrieved->httpRes = $res;
$retrieved->esDoc = $esDoc;
diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php
index aeb8ba5..55c8bbd 100644
--- a/src/phinde/Helper.php
+++ b/src/phinde/Helper.php
@@ -81,9 +81,15 @@ class Helper
public static function baseDoc($url)
{
$esDoc = new \stdClass(); // skeleton document; only status, url, schemalessUrl and domain are filled here
- $esDoc->status = new \stdClass();
+ $esDoc->status = (object) array( // status starts with all four keys present
+ 'findable' => false,
+ 'modate' => '',
+ 'crdate' => '', // empty string = not set yet; Fetcher compares crdate against ''
+ 'processed' => '',
+ );
$esDoc->url = $url;
$esDoc->schemalessUrl = Helper::noSchema($url);
+ $esDoc->domain = parse_url($url, PHP_URL_HOST); // host component only; presumably $url is absolute - TODO confirm
return $esDoc;
}
}
diff --git a/src/phinde/Indexer.php b/src/phinde/Indexer.php
index bdd5236..4efef42 100644
--- a/src/phinde/Indexer.php
+++ b/src/phinde/Indexer.php
@@ -77,12 +77,9 @@ class Indexer
$xpContext = $doc->getElementById('content');
}
- $esDoc->url = $url;
- $esDoc->schemalessUrl = Helper::noSchema($url);
$esDoc->type = 'html';
$esDoc->subtype = '';
$esDoc->mimetype = $mimetype;
- $esDoc->domain = parse_url($url, PHP_URL_HOST);
//$esDoc->source = 'FIXME';
//$esDoc->sourcetitle = 'FIXME';