X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/915b66fe6ca517610a41acec0a71597e7cee0807..d7651fd96dcfa2829519504e4c8ec1ce511cd57f:/bin/index.php diff --git a/bin/index.php b/bin/index.php deleted file mode 100755 index c6de5a9..0000000 --- a/bin/index.php +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env php -get($url); -if ($existingDoc && $existingDoc->status == 'indexed') { - echo "URL already indexed: $url\n"; - exit(0); -} -//FIXME: sourcetitle, sourcelink - -$req = new \HTTP_Request2($url); -$req->setConfig('follow_redirects', true); -$req->setConfig('connect_timeout', 5); -$req->setConfig('timeout', 10); -$req->setConfig('ssl_verify_peer', false); -$res = $req->send(); -//FIXME: try-catch - -//FIXME: delete if 401 gone or 404 when updating -if ($res->getStatus() !== 200) { - echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; - //FIXME: update status - exit(3); -} - -$mimetype = explode(';', $res->getHeader('content-type'))[0]; -if (!in_array($mimetype, $supportedIndexTypes)) { - echo "MIME type not supported for indexing: $mimetype\n"; - //FIXME: update status - exit(4); -} - - -//FIXME: update index only if changed since last index time -//FIXME: extract base url from html -$url = $res->getEffectiveUrl(); -$base = new \Net_URL2($url); - -$indexDoc = new \stdClass(); - -//FIXME: MIME type switch -$doc = new \DOMDocument(); -//@ to hide parse warning messages in invalid html -@$doc->loadHTML($res->getBody()); -$dx = new \DOMXPath($doc); - -//remove script tags -$elems = array(); -foreach ($doc->getElementsbyTagName('script') as $elem) { - $elems[] = $elem; -} -foreach ($elems as $elem) { - $elem->parentNode->removeChild($elem); -} - -//default content: -$xpContext = $doc->getElementsByTagName('body')->item(0); - -//use microformats content if it exists -$xpElems = $dx->query( - "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]" -); -if ($xpElems->length) { - $xpContext = $xpElems->item(0); -} else if ($doc->getElementById('content')) { - //if there is an element with ID "content", we'll use this - $xpContext = $doc->getElementById('content'); -} - -$indexDoc->url = $url; -$indexDoc->schemalessUrl = Helper::noSchema($url); -$indexDoc->type = 'html'; -$indexDoc->subtype = ''; -$indexDoc->mimetype = $mimetype; -$indexDoc->domain = parse_url($url, PHP_URL_HOST); - -//$indexDoc->source = 'FIXME'; -//$indexDoc->sourcetitle = 'FIXME'; - -$indexDoc->author = new \stdClass(); - -$arXpElems = $dx->query('/html/head/meta[@name="author"]'); -if ($arXpElems->length) { - $indexDoc->author->name = trim( - $arXpElems->item(0)->attributes->getNamedItem('content')->textContent - ); -} -$arXpElems = $dx->query('/html/head/link[@rel="author"]'); -if ($arXpElems->length) { - $indexDoc->author->url = trim( - $base->resolve( - $arXpElems->item(0)->attributes->getNamedItem('href')->textContent - ) - ); -} - - -$arXpElems = $dx->query('/html/head/title'); -if ($arXpElems->length) { - $indexDoc->title = trim( - $arXpElems->item(0)->textContent - ); -} - -foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { - $indexDoc->$headlinetype = array(); - foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) { - array_push( - $indexDoc->$headlinetype, - trim($xheadline->textContent) - ); - } -} - -//FIXME: limit to h-entry e-content -//FIXME: insert space after br -$indexDoc->text = array(); -$indexDoc->text[] = trim( - str_replace( - array("\r\n", "\n", "\r", ' '), - ' ', - $xpContext->textContent - ) -); - -//tags -$tags = array(); -foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) { - $keywords = $xkeywords->attributes->getNamedItem('content')->textContent; - foreach (explode(',', $keywords) as $keyword) { - $tags[trim($keyword)] = true; - } -} -$indexDoc->tags = array_keys($tags); - -//dates -$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created"]'); -if ($arXpdates->length) { - $indexDoc->crdate = date( - 'c', - strtotime( - $arXpdates->item(0)->attributes->getNamedItem('content')->textContent - ) - ); -} -//FIXME: keep creation date from database, or use modified date if we -// do not have it there - -$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified"]'); -if ($arXpdates->length) { - $indexDoc->modate = date( - 'c', - strtotime( - $arXpdates->item(0)->attributes->getNamedItem('content')->textContent - ) - ); -} else { - $lm = $res->getHeader('last-modified'); - if ($lm !== null) { - $indexDoc->modate = date('c', strtotime($lm)); - } else { - //use current time since we don't have any other data - $indexDoc->modate = date('c'); - } -} - -//language -//there may be "en-US" and "de-DE" -$indexDoc->language = strtolower( - substr( - $doc->documentElement->attributes->getNamedItem('lang')->textContent, - 0, 2 - ) -); -//FIXME: fallback, autodetection -//FIXME: check noindex - -//var_dump($indexDoc);die(); - -$indexDoc->status = 'indexed'; - -//FIXME: update index if it exists already -$r = new Elasticsearch_Request( - $GLOBALS['phinde']['elasticsearch'] . 'document/' . rawurlencode($url), - \HTTP_Request2::METHOD_PUT -); -$r->setBody(json_encode($indexDoc)); -$r->send(); - - -?>