diff options
| field | value | date |
|---|---|---|
| author | Christian Weiske <cweiske@cweiske.de> | 2016-02-03 22:37:15 +0100 |
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-02-03 22:37:15 +0100 |
| commit | f156497be5637d7815ae57370e8b175ce6960a19 (patch) | |
| tree | 6363a52913e006e90b223f17f6aa39206f153ffd | |
| parent | 915b66fe6ca517610a41acec0a71597e7cee0807 (diff) | |
| download | phinde-f156497be5637d7815ae57370e8b175ce6960a19.tar.gz phinde-f156497be5637d7815ae57370e8b175ce6960a19.zip | |

fix indexing, boost config

| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | bin/crawl.php | 2 |
| -rwxr-xr-x | bin/index.php | 10 |
| -rwxr-xr-x | bin/setup.php | 5 |
| -rw-r--r-- | data/elasticsearch-mapping.json | 47 |
| -rw-r--r-- | docs/elasticsearch/doc-html.json | 66 |

5 files changed, 85 insertions, 45 deletions
diff --git a/bin/crawl.php b/bin/crawl.php index 17b1fc3..e39a622 100755 --- a/bin/crawl.php +++ b/bin/crawl.php @@ -86,7 +86,7 @@ foreach ($links as $link) { //var_dump($linkTitle, $linkUrl); $es->markQueued($linkUrl); addToIndex($linkUrl, $linkTitle, $url); - if (isUrlAllowed($linkUrl)) { + if (Helper::isUrlAllowed($linkUrl)) { addToCrawl($linkUrl); } $alreadySeen[$linkUrl] = true; diff --git a/bin/index.php b/bin/index.php index c6de5a9..374923c 100755 --- a/bin/index.php +++ b/bin/index.php @@ -182,12 +182,10 @@ if ($arXpdates->length) { //language //there may be "en-US" and "de-DE" -$indexDoc->language = strtolower( - substr( - $doc->documentElement->attributes->getNamedItem('lang')->textContent, - 0, 2 - ) -); +$xlang = $doc->documentElement->attributes->getNamedItem('lang'); +if ($xlang) { + $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2)); +} //FIXME: fallback, autodetection //FIXME: check noindex diff --git a/bin/setup.php b/bin/setup.php index 7dacedd..1e6c66d 100755 --- a/bin/setup.php +++ b/bin/setup.php @@ -2,10 +2,7 @@ <?php namespace phinde; //configure the elasticsearch index -set_include_path(__DIR__ . '/../src/' . PATH_SEPARATOR . get_include_path()); -require_once __DIR__ . '/../data/config.php'; -require_once 'HTTP/Request2.php'; -require_once 'Elasticsearch/Request.php'; +require_once __DIR__ . 
'/../src/init.php'; //delete old index $r = new Elasticsearch_Request( diff --git a/data/elasticsearch-mapping.json b/data/elasticsearch-mapping.json index ec9bcc9..d1e83ec 100644 --- a/data/elasticsearch-mapping.json +++ b/data/elasticsearch-mapping.json @@ -20,11 +20,18 @@ }, "url": { "type": "string", - "index": "not_analyzed" + "index": "not_analyzed", + "boost": 1.5 + }, + "schemalessUrl": { + "type": "string", + "index": "not_analyzed", + "boost": 1.5 }, "domain": { "type": "string", - "index": "not_analyzed" + "index": "not_analyzed", + "boost": 1.8 }, "source": { "type": "string", @@ -33,6 +40,42 @@ "language": { "type": "string", "index": "not_analyzed" + }, + "title": { + "type": "string", + "boost": 2 + }, + "h1": { + "type": "string", + "boost": 1.8 + }, + "h2": { + "type": "string", + "boost": 1.7 + }, + "h3": { + "type": "string", + "boost": 1.6 + }, + "h4": { + "type": "string", + "boost": 1.5 + }, + "h5": { + "type": "string", + "boost": 1.4 + }, + "h6": { + "type": "string", + "boost": 1.3 + }, + "text": { + "type": "string", + "boost": 1.0 + }, + "tags": { + "type": "string", + "boost": 1.5 } } } diff --git a/docs/elasticsearch/doc-html.json b/docs/elasticsearch/doc-html.json index 43d4206..9f93b8e 100644 --- a/docs/elasticsearch/doc-html.json +++ b/docs/elasticsearch/doc-html.json @@ -1,41 +1,43 @@ { - 'type': 'html', - 'subtype': 'rsvp', - 'mimetype': 'application/xhtml+xml', - 'url': 'http://example.org/foo.htm', - 'domain': 'example.org', - 'source': [ - 'http://example.org/linkfarm.htm' + "status": "indexed", + "type": "html", + "subtype": "rsvp", + "mimetype": "application/xhtml+xml", + "url": "http://example.org/foo.htm", + "schemalessUrl": "example.org/foo.htm", + "domain": "example.org", + "source": [ + "http://example.org/linkfarm.htm" ], - 'sourcetitle': [ - 'Click here', - 'Something about bar', + "sourcetitle": [ + "Click here", + "Something about bar", ], - 'language': 'en', - 'author': { - 'name': 'Alice Example', - 'url': 
'http://example.org/alice.htm', + "language": "en", + "author": { + "name": "Alice Example", + "url": "http://example.org/alice.htm", }, - 'title': 'Sample HTML page', - 'h1': [ - 'Headline 1', - 'Another headline 1' + "title": "Sample HTML page", + "h1": [ + "Headline 1", + "Another headline 1" ], - 'h2': [ - 'Subheadline' + "h2": [ + "Subheadline" ], - 'h3': [ - 'Subsub', - 'Another Subsub' + "h3": [ + "Subsub", + "Another Subsub" ], - 'h4': [], - 'h5': [], - 'h6': [], - 'text': [ - 'HTML converted to plain text', - 'Another paragraph in the text' + "h4": [], + "h5": [], + "h6": [], + "text": [ + "HTML converted to plain text", + "Another paragraph in the text" ], - 'tags': ['example', 'test', 'documentation'], - 'crdate': '2016-01-30T12:23:42+01:00', - 'modate': '2016-01-30T12:23:42+01:00', + "tags": ["example", "test", "documentation"], + "crdate": "2016-01-30T12:23:42+01:00", + "modate": "2016-01-30T12:23:42+01:00", } |
