X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/18dc4a1560c24ffd70d659ef1a96caf90ec9a4c8..686f1cec3fd35782c30d20f891fec2f434e5d02f:/bin/index.php diff --git a/bin/index.php b/bin/index.php index dd32dea..5985a3e 100755 --- a/bin/index.php +++ b/bin/index.php @@ -14,26 +14,48 @@ if ($argc < 2) { exit(1); } -$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); +function removeTags($doc, $tag) { + $elems = array(); + foreach ($doc->getElementsbyTagName($tag) as $elem) { + $elems[] = $elem; + } + foreach ($elems as $elem) { + $elem->parentNode->removeChild($elem); + } +} $url = $argv[1]; -$existingDoc = $es->get($url); -if ($existingDoc && $existingDoc->status == 'indexed') { - echo "URL already indexed: $url\n"; - exit(0); -} -//FIXME: sourcetitle, sourcelink $req = new \HTTP_Request2($url); $req->setConfig('follow_redirects', true); $req->setConfig('connect_timeout', 5); $req->setConfig('timeout', 10); $req->setConfig('ssl_verify_peer', false); +//FIXME: size limit + +$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); +$existingDoc = $es->get($url); +if ($existingDoc && $existingDoc->status == 'indexed') { + $nMoDate = strtotime($existingDoc->modate); + $refreshtime = $GLOBALS['phinde']['refreshtime']; + if (time() - $nMoDate < $refreshtime) { + echo "URL already indexed less than $refreshtime seconds ago: $url\n"; + exit(0); + } + + $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); +} +//FIXME: sourcetitle, sourcelink + $res = $req->send(); //FIXME: try-catch -//FIXME: delete if 401 gone or 404 when updating -if ($res->getStatus() !== 200) { +if ($res->getStatus() === 304) { + //not modified since last time + //FIXME: store "last try" time + exit(0); +} else if ($res->getStatus() !== 200) { + //FIXME: delete if 401 gone or 404 when updating echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; //FIXME: update status exit(3); @@ -49,6 +71,7 @@ if (!in_array($mimetype, $supportedIndexTypes)) { //FIXME: update index only if changed since last index time //FIXME: extract base url from html +//FIXME: check if effective url needs updating $url = $res->getEffectiveUrl(); $base = new \Net_URL2($url); @@ -58,7 +81,47 @@ $indexDoc = new \stdClass(); $doc = new \DOMDocument(); //@ to hide parse warning messages in invalid html @$doc->loadHTML($res->getBody()); -$sx = simplexml_import_dom($doc); +$dx = new \DOMXPath($doc); + +$xbase = $dx->evaluate('/html/head/base[@href]')->item(0); +if ($xbase) { + $base = $base->resolve( + $xbase->attributes->getNamedItem('href')->textContent + ); +} + +$meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') + ->item(0); +if ($meta) { + $robots = $meta->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $robots) as $value) { + if (trim($value) == 'noindex') { + echo "URL does not want to be indexed: $url\n"; + exit(0); + } + } +} + +//remove script tags +removeTags($doc, 'script'); +removeTags($doc, 'style'); +removeTags($doc, 'nav'); + +//default content: +$xpContext = $doc->getElementsByTagName('body')->item(0); +//FIXME: follow meta refresh, no body +// example: https://www.gnu.org/software/coreutils/ + +//use microformats content if it exists +$xpElems = $dx->query( + "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]" +); +if ($xpElems->length) { + $xpContext = $xpElems->item(0); +} else if ($doc->getElementById('content')) { + //if there is an element with ID "content", we'll use this + $xpContext = $doc->getElementById('content'); +} $indexDoc->url = $url; $indexDoc->schemalessUrl = Helper::noSchema($url); @@ -72,61 +135,81 @@ $indexDoc->domain = parse_url($url, PHP_URL_HOST); $indexDoc->author = new \stdClass(); -$arSxElems = $sx->xpath('/html/head/meta[@name="author"]'); -if (count($arSxElems)) { - $indexDoc->author->name = trim($arSxElems[0]['content']); +$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]'); +if ($arXpElems->length) { + $indexDoc->author->name = trim( + $arXpElems->item(0)->attributes->getNamedItem('content')->textContent + ); } -$arSxElems = $sx->xpath('/html/head/link[@rel="author"]'); -if (count($arSxElems)) { - $indexDoc->author->url = (string) $base->resolve($arSxElems[0]['href']); +$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]'); +if ($arXpElems->length) { + $indexDoc->author->url = trim( + $base->resolve( + $arXpElems->item(0)->attributes->getNamedItem('href')->textContent + ) + ); +} + + +$arXpElems = $dx->query('/html/head/title'); +if ($arXpElems->length) { + $indexDoc->title = trim( + $arXpElems->item(0)->textContent + ); } -$indexDoc->title = (string) $sx->head->title; foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { $indexDoc->$headlinetype = array(); - //FIXME: limit to h-entry children - foreach ($sx->xpath('//' . $headlinetype) as $xheadline) { + foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) { array_push( $indexDoc->$headlinetype, - trim(dom_import_simplexml($xheadline)->textContent) + trim($xheadline->textContent) ); } } -//FIXME: limit to h-entry e-content +//FIXME: split paragraphs //FIXME: insert space after br -//FIXME: remove javascript $indexDoc->text = array(); -foreach ($doc->getElementsByTagName('body') as $body) { - $indexDoc->text[] = trim( - str_replace( - array("\r\n", "\n", "\r", ' '), - ' ', - $body->textContent - ) - ); -} +$indexDoc->text[] = trim( + str_replace( + array("\r\n", "\n", "\r", ' '), + ' ', + $xpContext->textContent + ) +); //tags $tags = array(); -foreach ($sx->xpath('/html/head/meta[@name="keywords"]') as $xkeywords) { - foreach (explode(',', $xkeywords['content']) as $keyword) { +foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) { + $keywords = $xkeywords->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $keywords) as $keyword) { $tags[trim($keyword)] = true; } } $indexDoc->tags = array_keys($tags); //dates -$arSxdates = $sx->xpath('/html/head/meta[@name="DC.date.created"]'); -if (count($arSxdates)) { - $indexDoc->crdate = date('c', strtotime((string) $arSxdates[0]['content'])); +$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]'); +if ($arXpdates->length) { + $indexDoc->crdate = date( + 'c', + strtotime( + $arXpdates->item(0)->attributes->getNamedItem('content')->textContent + ) + ); } //FIXME: keep creation date from database, or use modified date if we // do not have it there -$arSxdates = $sx->xpath('/html/head/meta[@name="DC.date.modified"]'); -if (count($arSxdates)) { - $indexDoc->modate = date('c', strtotime((string) $arSxdates[0]['content'])); +$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]'); +if ($arXpdates->length) { + $indexDoc->modate = date( + 'c', + strtotime( + $arXpdates->item(0)->attributes->getNamedItem('content')->textContent + ) + ); } else { $lm = $res->getHeader('last-modified'); if ($lm !== null) { @@ -139,12 +222,14 @@ if (count($arSxdates)) { //language //there may be "en-US" and "de-DE" -$indexDoc->language = strtolower(substr((string) $sx['lang'], 0, 2)); +$xlang = $doc->documentElement->attributes->getNamedItem('lang'); +if ($xlang) { + $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2)); +} //FIXME: fallback, autodetection //FIXME: check noindex - -//var_dump($indexDoc); +//var_dump($indexDoc);die(); $indexDoc->status = 'indexed';