X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/915b66fe6ca517610a41acec0a71597e7cee0807..686f1cec3fd35782c30d20f891fec2f434e5d02f:/bin/index.php

Note: the line breaks and indentation of this patch were rebuilt from a
whitespace-collapsed copy; blank context lines were placed so that every hunk
matches its line counts. The If-Modified-Since line uses gmdate() with a
literal "GMT" suffix instead of date('r'), because HTTP dates must be
expressed in GMT and date('r') emits a local numeric UTC offset.

diff --git a/bin/index.php b/bin/index.php
index c6de5a9..5985a3e 100755
--- a/bin/index.php
+++ b/bin/index.php
@@ -14,26 +14,48 @@ if ($argc < 2) {
     exit(1);
 }
 
-$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+function removeTags($doc, $tag) {
+    $elems = array();
+    foreach ($doc->getElementsbyTagName($tag) as $elem) {
+        $elems[] = $elem;
+    }
+    foreach ($elems as $elem) {
+        $elem->parentNode->removeChild($elem);
+    }
+}
 
 $url = $argv[1];
-$existingDoc = $es->get($url);
-if ($existingDoc && $existingDoc->status == 'indexed') {
-    echo "URL already indexed: $url\n";
-    exit(0);
-}
-//FIXME: sourcetitle, sourcelink
 
 $req = new \HTTP_Request2($url);
 $req->setConfig('follow_redirects', true);
 $req->setConfig('connect_timeout', 5);
 $req->setConfig('timeout', 10);
 $req->setConfig('ssl_verify_peer', false);
+//FIXME: size limit
+
+$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+$existingDoc = $es->get($url);
+if ($existingDoc && $existingDoc->status == 'indexed') {
+    $nMoDate = strtotime($existingDoc->modate);
+    $refreshtime = $GLOBALS['phinde']['refreshtime'];
+    if (time() - $nMoDate < $refreshtime) {
+        echo "URL already indexed less than $refreshtime seconds ago: $url\n";
+        exit(0);
+    }
+
+    $req->setHeader('If-Modified-Since: ' . gmdate('D, d M Y H:i:s', $nMoDate) . ' GMT');
+}
+//FIXME: sourcetitle, sourcelink
+
 $res = $req->send();
 //FIXME: try-catch
 
-//FIXME: delete if 401 gone or 404 when updating
-if ($res->getStatus() !== 200) {
+if ($res->getStatus() === 304) {
+    //not modified since last time
+    //FIXME: store "last try" time
+    exit(0);
+} else if ($res->getStatus() !== 200) {
+    //FIXME: delete if 401 gone or 404 when updating
     echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
     //FIXME: update status
     exit(3);
@@ -49,6 +71,7 @@ if (!in_array($mimetype, $supportedIndexTypes)) {
 
 //FIXME: update index only if changed since last index time
 //FIXME: extract base url from html
+//FIXME: check if effective url needs updating
 $url = $res->getEffectiveUrl();
 $base = new \Net_URL2($url);
 
@@ -60,17 +83,34 @@ $doc = new \DOMDocument();
 @$doc->loadHTML($res->getBody());
 $dx = new \DOMXPath($doc);
 
-//remove script tags
-$elems = array();
-foreach ($doc->getElementsbyTagName('script') as $elem) {
-    $elems[] = $elem;
+$xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
+if ($xbase) {
+    $base = $base->resolve(
+        $xbase->attributes->getNamedItem('href')->textContent
+    );
 }
-foreach ($elems as $elem) {
-    $elem->parentNode->removeChild($elem);
+
+$meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
+    ->item(0);
+if ($meta) {
+    $robots = $meta->attributes->getNamedItem('content')->textContent;
+    foreach (explode(',', $robots) as $value) {
+        if (trim($value) == 'noindex') {
+            echo "URL does not want to be indexed: $url\n";
+            exit(0);
+        }
+    }
 }
 
+//remove script tags
+removeTags($doc, 'script');
+removeTags($doc, 'style');
+removeTags($doc, 'nav');
+
 //default content:
 $xpContext = $doc->getElementsByTagName('body')->item(0);
+//FIXME: follow meta refresh, no body
+// example: https://www.gnu.org/software/coreutils/
 
 //use microformats content if it exists
 $xpElems = $dx->query(
@@ -95,13 +135,13 @@ $indexDoc->domain = parse_url($url, PHP_URL_HOST);
 
 $indexDoc->author = new \stdClass();
 
-$arXpElems = $dx->query('/html/head/meta[@name="author"]');
+$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]');
 if ($arXpElems->length) {
     $indexDoc->author->name = trim(
         $arXpElems->item(0)->attributes->getNamedItem('content')->textContent
     );
 }
-$arXpElems = $dx->query('/html/head/link[@rel="author"]');
+$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]');
 if ($arXpElems->length) {
     $indexDoc->author->url = trim(
         $base->resolve(
@@ -128,7 +168,7 @@ foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) {
     }
 }
 
-//FIXME: limit to h-entry e-content
+//FIXME: split paragraphs
 //FIXME: insert space after br
 $indexDoc->text = array();
 $indexDoc->text[] = trim(
@@ -141,7 +181,7 @@ $indexDoc->text[] = trim(
 
 //tags
 $tags = array();
-foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
+foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) {
     $keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
     foreach (explode(',', $keywords) as $keyword) {
         $tags[trim($keyword)] = true;
@@ -150,7 +190,7 @@ foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
 $indexDoc->tags = array_keys($tags);
 
 //dates
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]');
 if ($arXpdates->length) {
     $indexDoc->crdate = date(
         'c',
@@ -162,7 +202,7 @@ if ($arXpdates->length) {
 //FIXME: keep creation date from database, or use modified date if we
 // do not have it there
 
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]');
 if ($arXpdates->length) {
     $indexDoc->modate = date(
         'c',
@@ -182,12 +222,10 @@ if ($arXpdates->length) {
 
 //language
 //there may be "en-US" and "de-DE"
-$indexDoc->language = strtolower(
-    substr(
-        $doc->documentElement->attributes->getNamedItem('lang')->textContent,
-        0, 2
-    )
-);
+$xlang = $doc->documentElement->attributes->getNamedItem('lang');
+if ($xlang) {
+    $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2));
+}
 //FIXME: fallback, autodetection
 //FIXME: check noindex
 