about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christian Weiske <cweiske@cweiske.de> 2016-02-03 22:37:15 +0100
committer Christian Weiske <cweiske@cweiske.de> 2016-02-03 22:37:15 +0100
commit f156497be5637d7815ae57370e8b175ce6960a19 (patch)
tree 6363a52913e006e90b223f17f6aa39206f153ffd
parent 915b66fe6ca517610a41acec0a71597e7cee0807 (diff)
download phinde-f156497be5637d7815ae57370e8b175ce6960a19.tar.gz
download phinde-f156497be5637d7815ae57370e8b175ce6960a19.zip
fix indexing, boost config
-rwxr-xr-x bin/crawl.php 2
-rwxr-xr-x bin/index.php 10
-rwxr-xr-x bin/setup.php 5
-rw-r--r-- data/elasticsearch-mapping.json 47
-rw-r--r-- docs/elasticsearch/doc-html.json 66
5 files changed, 85 insertions, 45 deletions
diff --git a/bin/crawl.php b/bin/crawl.php
index 17b1fc3..e39a622 100755
--- a/bin/crawl.php
+++ b/bin/crawl.php
@@ -86,7 +86,7 @@ foreach ($links as $link) {
//var_dump($linkTitle, $linkUrl);
$es->markQueued($linkUrl);
addToIndex($linkUrl, $linkTitle, $url);
- if (isUrlAllowed($linkUrl)) {
+ if (Helper::isUrlAllowed($linkUrl)) {
addToCrawl($linkUrl);
}
$alreadySeen[$linkUrl] = true;
diff --git a/bin/index.php b/bin/index.php
index c6de5a9..374923c 100755
--- a/bin/index.php
+++ b/bin/index.php
@@ -182,12 +182,10 @@ if ($arXpdates->length) {
//language
//there may be "en-US" and "de-DE"
-$indexDoc->language = strtolower(
- substr(
- $doc->documentElement->attributes->getNamedItem('lang')->textContent,
- 0, 2
- )
-);
+$xlang = $doc->documentElement->attributes->getNamedItem('lang');
+if ($xlang) {
+ $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2));
+}
//FIXME: fallback, autodetection
//FIXME: check noindex
diff --git a/bin/setup.php b/bin/setup.php
index 7dacedd..1e6c66d 100755
--- a/bin/setup.php
+++ b/bin/setup.php
@@ -2,10 +2,7 @@
<?php
namespace phinde;
//configure the elasticsearch index
-set_include_path(__DIR__ . '/../src/' . PATH_SEPARATOR . get_include_path());
-require_once __DIR__ . '/../data/config.php';
-require_once 'HTTP/Request2.php';
-require_once 'Elasticsearch/Request.php';
+require_once __DIR__ . '/../src/init.php';
//delete old index
$r = new Elasticsearch_Request(
diff --git a/data/elasticsearch-mapping.json b/data/elasticsearch-mapping.json
index ec9bcc9..d1e83ec 100644
--- a/data/elasticsearch-mapping.json
+++ b/data/elasticsearch-mapping.json
@@ -20,11 +20,18 @@
},
"url": {
"type": "string",
- "index": "not_analyzed"
+ "index": "not_analyzed",
+ "boost": 1.5
+ },
+ "schemalessUrl": {
+ "type": "string",
+ "index": "not_analyzed",
+ "boost": 1.5
},
"domain": {
"type": "string",
- "index": "not_analyzed"
+ "index": "not_analyzed",
+ "boost": 1.8
},
"source": {
"type": "string",
@@ -33,6 +40,42 @@
"language": {
"type": "string",
"index": "not_analyzed"
+ },
+ "title": {
+ "type": "string",
+ "boost": 2
+ },
+ "h1": {
+ "type": "string",
+ "boost": 1.8
+ },
+ "h2": {
+ "type": "string",
+ "boost": 1.7
+ },
+ "h3": {
+ "type": "string",
+ "boost": 1.6
+ },
+ "h4": {
+ "type": "string",
+ "boost": 1.5
+ },
+ "h5": {
+ "type": "string",
+ "boost": 1.4
+ },
+ "h6": {
+ "type": "string",
+ "boost": 1.3
+ },
+ "text": {
+ "type": "string",
+ "boost": 1.0
+ },
+ "tags": {
+ "type": "string",
+ "boost": 1.5
}
}
}
diff --git a/docs/elasticsearch/doc-html.json b/docs/elasticsearch/doc-html.json
index 43d4206..9f93b8e 100644
--- a/docs/elasticsearch/doc-html.json
+++ b/docs/elasticsearch/doc-html.json
@@ -1,41 +1,43 @@
{
- 'type': 'html',
- 'subtype': 'rsvp',
- 'mimetype': 'application/xhtml+xml',
- 'url': 'http://example.org/foo.htm',
- 'domain': 'example.org',
- 'source': [
- 'http://example.org/linkfarm.htm'
+ "status": "indexed",
+ "type": "html",
+ "subtype": "rsvp",
+ "mimetype": "application/xhtml+xml",
+ "url": "http://example.org/foo.htm",
+ "schemalessUrl": "example.org/foo.htm",
+ "domain": "example.org",
+ "source": [
+ "http://example.org/linkfarm.htm"
],
- 'sourcetitle': [
- 'Click here',
- 'Something about bar',
+ "sourcetitle": [
+ "Click here",
+ "Something about bar",
],
- 'language': 'en',
- 'author': {
- 'name': 'Alice Example',
- 'url': 'http://example.org/alice.htm',
+ "language": "en",
+ "author": {
+ "name": "Alice Example",
+ "url": "http://example.org/alice.htm",
},
- 'title': 'Sample HTML page',
- 'h1': [
- 'Headline 1',
- 'Another headline 1'
+ "title": "Sample HTML page",
+ "h1": [
+ "Headline 1",
+ "Another headline 1"
],
- 'h2': [
- 'Subheadline'
+ "h2": [
+ "Subheadline"
],
- 'h3': [
- 'Subsub',
- 'Another Subsub'
+ "h3": [
+ "Subsub",
+ "Another Subsub"
],
- 'h4': [],
- 'h5': [],
- 'h6': [],
- 'text': [
- 'HTML converted to plain text',
- 'Another paragraph in the text'
+ "h4": [],
+ "h5": [],
+ "h6": [],
+ "text": [
+ "HTML converted to plain text",
+ "Another paragraph in the text"
],
- 'tags': ['example', 'test', 'documentation'],
- 'crdate': '2016-01-30T12:23:42+01:00',
- 'modate': '2016-01-30T12:23:42+01:00',
+ "tags": ["example", "test", "documentation"],
+ "crdate": "2016-01-30T12:23:42+01:00",
+ "modate": "2016-01-30T12:23:42+01:00",
}