From f156497be5637d7815ae57370e8b175ce6960a19 Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Wed, 3 Feb 2016 22:37:15 +0100 Subject: [PATCH] fix indexing, boost config --- bin/crawl.php | 2 +- bin/index.php | 10 ++--- bin/setup.php | 5 +-- data/elasticsearch-mapping.json | 47 ++++++++++++++++++++++- docs/elasticsearch/doc-html.json | 66 ++++++++++++++++---------------- 5 files changed, 85 insertions(+), 45 deletions(-) diff --git a/bin/crawl.php b/bin/crawl.php index 17b1fc3..e39a622 100755 --- a/bin/crawl.php +++ b/bin/crawl.php @@ -86,7 +86,7 @@ foreach ($links as $link) { //var_dump($linkTitle, $linkUrl); $es->markQueued($linkUrl); addToIndex($linkUrl, $linkTitle, $url); - if (isUrlAllowed($linkUrl)) { + if (Helper::isUrlAllowed($linkUrl)) { addToCrawl($linkUrl); } $alreadySeen[$linkUrl] = true; diff --git a/bin/index.php b/bin/index.php index c6de5a9..374923c 100755 --- a/bin/index.php +++ b/bin/index.php @@ -182,12 +182,10 @@ if ($arXpdates->length) { //language //there may be "en-US" and "de-DE" -$indexDoc->language = strtolower( - substr( - $doc->documentElement->attributes->getNamedItem('lang')->textContent, - 0, 2 - ) -); +$xlang = $doc->documentElement->attributes->getNamedItem('lang'); +if ($xlang) { + $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2)); +} //FIXME: fallback, autodetection //FIXME: check noindex diff --git a/bin/setup.php b/bin/setup.php index 7dacedd..1e6c66d 100755 --- a/bin/setup.php +++ b/bin/setup.php @@ -2,10 +2,7 @@