fix indexing, boost config
author: Christian Weiske <cweiske@cweiske.de>
Wed, 3 Feb 2016 21:37:15 +0000 (22:37 +0100)
committer: Christian Weiske <cweiske@cweiske.de>
Wed, 3 Feb 2016 21:37:15 +0000 (22:37 +0100)
bin/crawl.php
bin/index.php
bin/setup.php
data/elasticsearch-mapping.json
docs/elasticsearch/doc-html.json

index 17b1fc3..e39a622 100755 (executable)
@@ -86,7 +86,7 @@ foreach ($links as $link) {
     //var_dump($linkTitle, $linkUrl);
     $es->markQueued($linkUrl);
     addToIndex($linkUrl, $linkTitle, $url);
-    if (isUrlAllowed($linkUrl)) {
+    if (Helper::isUrlAllowed($linkUrl)) {
         addToCrawl($linkUrl);
     }
     $alreadySeen[$linkUrl] = true;
index c6de5a9..374923c 100755 (executable)
@@ -182,12 +182,10 @@ if ($arXpdates->length) {
 
 //language
 //there may be "en-US" and "de-DE"
-$indexDoc->language = strtolower(
-    substr(
-        $doc->documentElement->attributes->getNamedItem('lang')->textContent,
-        0, 2
-    )
-);
+$xlang = $doc->documentElement->attributes->getNamedItem('lang');
+if ($xlang) {
+    $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2));
+}
 //FIXME: fallback, autodetection
 //FIXME: check noindex
 
index 7dacedd..1e6c66d 100755 (executable)
@@ -2,10 +2,7 @@
 <?php
 namespace phinde;
 //configure the elasticsearch index
-set_include_path(__DIR__ . '/../src/' . PATH_SEPARATOR . get_include_path());
-require_once __DIR__ . '/../data/config.php';
-require_once 'HTTP/Request2.php';
-require_once 'Elasticsearch/Request.php';
+require_once __DIR__ . '/../src/init.php';
 
 //delete old index
 $r = new Elasticsearch_Request(
index ec9bcc9..d1e83ec 100644 (file)
                 },
                 "url": {
                     "type": "string",
-                    "index": "not_analyzed"
+                    "index": "not_analyzed",
+                    "boost": 1.5
+                },
+                "schemalessUrl": {
+                    "type": "string",
+                    "index": "not_analyzed",
+                    "boost": 1.5
                 },
                 "domain": {
                     "type": "string",
-                    "index": "not_analyzed"
+                    "index": "not_analyzed",
+                    "boost": 1.8
                 },
                 "source": {
                     "type": "string",
                 "language": {
                     "type": "string",
                     "index": "not_analyzed"
+                },
+                "title": {
+                    "type": "string",
+                    "boost": 2
+                },
+                "h1": {
+                    "type": "string",
+                    "boost": 1.8
+                },
+                "h2": {
+                    "type": "string",
+                    "boost": 1.7
+                },
+                "h3": {
+                    "type": "string",
+                    "boost": 1.6
+                },
+                "h4": {
+                    "type": "string",
+                    "boost": 1.5
+                },
+                "h5": {
+                    "type": "string",
+                    "boost": 1.4
+                },
+                "h6": {
+                    "type": "string",
+                    "boost": 1.3
+                },
+                "text": {
+                    "type": "string",
+                    "boost": 1.0
+                },
+                "tags": {
+                    "type": "string",
+                    "boost": 1.5
                 }
             }
         }
index 43d4206..9f93b8e 100644 (file)
@@ -1,41 +1,43 @@
 {
-    'type': 'html',
-    'subtype': 'rsvp',
-    'mimetype': 'application/xhtml+xml',
-    'url': 'http://example.org/foo.htm',
-    'domain': 'example.org',
-    'source': [
-        'http://example.org/linkfarm.htm'
+    "status": "indexed",
+    "type": "html",
+    "subtype": "rsvp",
+    "mimetype": "application/xhtml+xml",
+    "url": "http://example.org/foo.htm",
+    "schemalessUrl": "example.org/foo.htm",
+    "domain": "example.org",
+    "source": [
+        "http://example.org/linkfarm.htm"
     ],
-    'sourcetitle': [
-        'Click here',
-        'Something about bar',
+    "sourcetitle": [
+        "Click here",
+        "Something about bar",
     ],
-    'language': 'en',
-    'author': {
-        'name': 'Alice Example',
-        'url': 'http://example.org/alice.htm',
+    "language": "en",
+    "author": {
+        "name": "Alice Example",
+        "url": "http://example.org/alice.htm",
     },
-    'title': 'Sample HTML page',
-    'h1': [
-        'Headline 1',
-        'Another headline 1'
+    "title": "Sample HTML page",
+    "h1": [
+        "Headline 1",
+        "Another headline 1"
     ],
-    'h2': [
-        'Subheadline'
+    "h2": [
+        "Subheadline"
     ],
-    'h3': [
-        'Subsub',
-        'Another Subsub'
+    "h3": [
+        "Subsub",
+        "Another Subsub"
     ],
-    'h4': [],
-    'h5': [],
-    'h6': [],
-    'text': [
-        'HTML converted to plain text',
-        'Another paragraph in the text'
+    "h4": [],
+    "h5": [],
+    "h6": [],
+    "text": [
+        "HTML converted to plain text",
+        "Another paragraph in the text"
     ],
-    'tags': ['example', 'test', 'documentation'],
-    'crdate': '2016-01-30T12:23:42+01:00',
-    'modate': '2016-01-30T12:23:42+01:00',
+    "tags": ["example", "test", "documentation"],
+    "crdate": "2016-01-30T12:23:42+01:00",
+    "modate": "2016-01-30T12:23:42+01:00",
 }