//var_dump($linkTitle, $linkUrl);
$es->markQueued($linkUrl);
addToIndex($linkUrl, $linkTitle, $url);
- if (isUrlAllowed($linkUrl)) {
+ if (Helper::isUrlAllowed($linkUrl)) {
addToCrawl($linkUrl);
}
$alreadySeen[$linkUrl] = true;
//language
//there may be "en-US" and "de-DE"
-$indexDoc->language = strtolower(
- substr(
- $doc->documentElement->attributes->getNamedItem('lang')->textContent,
- 0, 2
- )
-);
+$xlang = $doc->documentElement->attributes->getNamedItem('lang');
+if ($xlang) {
+ $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2));
+}
//FIXME: fallback, autodetection
//FIXME: check noindex
<?php
namespace phinde;
//configure the elasticsearch index
-set_include_path(__DIR__ . '/../src/' . PATH_SEPARATOR . get_include_path());
-require_once __DIR__ . '/../data/config.php';
-require_once 'HTTP/Request2.php';
-require_once 'Elasticsearch/Request.php';
+require_once __DIR__ . '/../src/init.php';
//delete old index
$r = new Elasticsearch_Request(
},
"url": {
"type": "string",
- "index": "not_analyzed"
+ "index": "not_analyzed",
+ "boost": 1.5
+ },
+ "schemalessUrl": {
+ "type": "string",
+ "index": "not_analyzed",
+ "boost": 1.5
},
"domain": {
"type": "string",
- "index": "not_analyzed"
+ "index": "not_analyzed",
+ "boost": 1.8
},
"source": {
"type": "string",
"language": {
"type": "string",
"index": "not_analyzed"
+ },
+ "title": {
+ "type": "string",
+ "boost": 2
+ },
+ "h1": {
+ "type": "string",
+ "boost": 1.8
+ },
+ "h2": {
+ "type": "string",
+ "boost": 1.7
+ },
+ "h3": {
+ "type": "string",
+ "boost": 1.6
+ },
+ "h4": {
+ "type": "string",
+ "boost": 1.5
+ },
+ "h5": {
+ "type": "string",
+ "boost": 1.4
+ },
+ "h6": {
+ "type": "string",
+ "boost": 1.3
+ },
+ "text": {
+ "type": "string",
+ "boost": 1.0
+ },
+ "tags": {
+ "type": "string",
+ "boost": 1.5
}
}
}
{
- 'type': 'html',
- 'subtype': 'rsvp',
- 'mimetype': 'application/xhtml+xml',
- 'url': 'http://example.org/foo.htm',
- 'domain': 'example.org',
- 'source': [
- 'http://example.org/linkfarm.htm'
+ "status": "indexed",
+ "type": "html",
+ "subtype": "rsvp",
+ "mimetype": "application/xhtml+xml",
+ "url": "http://example.org/foo.htm",
+ "schemalessUrl": "example.org/foo.htm",
+ "domain": "example.org",
+ "source": [
+ "http://example.org/linkfarm.htm"
],
- 'sourcetitle': [
- 'Click here',
- 'Something about bar',
+ "sourcetitle": [
+ "Click here",
+ "Something about bar"
],
- 'language': 'en',
- 'author': {
- 'name': 'Alice Example',
- 'url': 'http://example.org/alice.htm',
+ "language": "en",
+ "author": {
+ "name": "Alice Example",
+ "url": "http://example.org/alice.htm"
},
- 'title': 'Sample HTML page',
- 'h1': [
- 'Headline 1',
- 'Another headline 1'
+ "title": "Sample HTML page",
+ "h1": [
+ "Headline 1",
+ "Another headline 1"
],
- 'h2': [
- 'Subheadline'
+ "h2": [
+ "Subheadline"
],
- 'h3': [
- 'Subsub',
- 'Another Subsub'
+ "h3": [
+ "Subsub",
+ "Another Subsub"
],
- 'h4': [],
- 'h5': [],
- 'h6': [],
- 'text': [
- 'HTML converted to plain text',
- 'Another paragraph in the text'
+ "h4": [],
+ "h5": [],
+ "h6": [],
+ "text": [
+ "HTML converted to plain text",
+ "Another paragraph in the text"
],
- 'tags': ['example', 'test', 'documentation'],
- 'crdate': '2016-01-30T12:23:42+01:00',
- 'modate': '2016-01-30T12:23:42+01:00',
+ "tags": ["example", "test", "documentation"],
+ "crdate": "2016-01-30T12:23:42+01:00",
+ "modate": "2016-01-30T12:23:42+01:00"
}