"type": "integer"
},
"location": {
- "type": "string"
+ "type": "text"
},
"processed": {
"type": "date"
}
},
"type": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"subtype": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"mimetype": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"url": {
- "type": "string",
- "index": "not_analyzed",
+ "type": "keyword",
"boost": 1.5
},
"schemalessUrl": {
- "type": "string",
- "index": "not_analyzed",
+ "type": "keyword",
"boost": 1.5
},
"domain": {
- "type": "string",
- "index": "not_analyzed",
+ "type": "keyword",
"boost": 1.8
},
"source": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"language": {
- "type": "string",
- "index": "not_analyzed"
+ "type": "keyword"
},
"title": {
- "type": "string",
+ "type": "text",
"boost": 2
},
"h1": {
- "type": "string",
+ "type": "text",
"boost": 1.8
},
"h2": {
- "type": "string",
+ "type": "text",
"boost": 1.7
},
"h3": {
- "type": "string",
+ "type": "text",
"boost": 1.6
},
"h4": {
- "type": "string",
+ "type": "text",
"boost": 1.5
},
"h5": {
- "type": "string",
+ "type": "text",
"boost": 1.4
},
"h6": {
- "type": "string",
+ "type": "text",
"boost": 1.3
},
"text": {
- "type": "string",
+ "type": "text",
"boost": 1.0
},
"tags": {
- "type": "string",
+ "type": "keyword",
"boost": 1.5
}
}
),
'highlight' => array(
'pre_tags' => array('<em class="hl">'),
+ 'post_tags' => array('</em>'),
'order' => 'score',
'encoder' => 'html',
'fields' => array(
//unset($doc['_source']);
- //ini_set('xdebug.var_display_max_depth', 10);
- //echo json_encode($doc);die();
+ if (false) {
+ ini_set('xdebug.var_display_max_depth', 10);
+ header('Content-type: application/json');
+ echo json_encode($doc, JSON_PRETTY_PRINT);die();
+ }
+
$r->setBody(json_encode($doc));
$res = $r->send();
return json_decode($res->getBody());
. $error
);
}
-}
+ /**
+ * Sets the request body and forces a JSON content type header.
+ *
+ * Every call sets the "content-type" request header to
+ * "application/json" before delegating to the parent implementation,
+ * whatever kind of body is passed in. The linked article describes
+ * Elasticsearch's strict content type checking, which is presumably
+ * why the header is injected here.
+ *
+ * @param mixed $body Either a string with the body or filename
+ * containing body or pointer to an open file or
+ * object with multipart body data
+ * @param bool $isFilename Whether first parameter is a filename
+ *
+ * @return HTTP_Request2 Whatever parent::setBody() returns
+ * @throws HTTP_Request2_LogicException
+ *
+ * @link https://www.elastic.co/blog/strict-content-type-checking-for-elasticsearch-rest-requests
+ */
+ public function setBody($body, $isFilename = false)
+ {
+ $this->setHeader('content-type', 'application/json'); // set on every call, before the parent sees the body
+ return parent::setBody($body, $isFilename);
+ }
+}
?>
}
//FIXME: etag, hash on content
+ if ($esDoc === null) {
+ //not known yet
+ $esDoc = Helper::baseDoc($url);
+ }
+
+ $lm = $res->getHeader('last-modified');
+ if ($lm !== null) {
+ $esDoc->status->modate = gmdate('c', strtotime($lm));
+ } else {
+ $esDoc->status->modate = gmdate('c');
+ }
+ if ($esDoc->status->crdate == '') {
+ $esDoc->status->crdate = $esDoc->status->modate;
+ }
+
$retrieved = new Retrieved();
$retrieved->httpRes = $res;
$retrieved->esDoc = $esDoc;
public static function baseDoc($url)
{
$esDoc = new \stdClass();
- $esDoc->status = new \stdClass();
+ $esDoc->status = (object) array( // every status property is defined up front with a default
+ 'findable' => false, // new documents start out as not findable
+ 'modate' => '', // empty placeholder until a value is assigned
+ 'crdate' => '', // empty placeholder until a value is assigned
+ 'processed' => '', // empty placeholder until a value is assigned
+ );
$esDoc->url = $url;
$esDoc->schemalessUrl = Helper::noSchema($url);
+ $esDoc->domain = parse_url($url, PHP_URL_HOST); // host part of $url; null if the URL has no host, false if it cannot be parsed
return $esDoc;
}
}
$xpContext = $doc->getElementById('content');
}
- $esDoc->url = $url;
- $esDoc->schemalessUrl = Helper::noSchema($url);
$esDoc->type = 'html';
$esDoc->subtype = '';
$esDoc->mimetype = $mimetype;
- $esDoc->domain = parse_url($url, PHP_URL_HOST);
//$esDoc->source = 'FIXME';
//$esDoc->sourcetitle = 'FIXME';