exit(1);
}
+function removeTags($doc, $tag) { // Removes every element named $tag from $doc in place; $doc must provide getElementsByTagName() (presumably a DOMDocument - confirm at call sites). No return value.
+ $elems = array(); // snapshot of the matching elements, filled before anything is removed
+ foreach ($doc->getElementsbyTagName($tag) as $elem) { // first pass only collects; presumably because the returned node list is live and would shift if nodes were removed while iterating - verify against the DOM docs
+ $elems[] = $elem;
+ }
+ foreach ($elems as $elem) { // second pass: delete from the snapshot
+ $elem->parentNode->removeChild($elem); // detaches the element from its parent node
+ }
+}
+
$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
$url = $argv[1];
echo "URL already indexed: $url\n";
exit(0);
}
+//FIXME: size limit
//FIXME: sourcetitle, sourcelink
$req = new \HTTP_Request2($url);
//FIXME: update index only if changed since last index time
//FIXME: extract base url from html
+//FIXME: check if effective url needs updating
$url = $res->getEffectiveUrl();
$base = new \Net_URL2($url);
$dx = new \DOMXPath($doc);
//remove script tags
-$elems = array();
-foreach ($doc->getElementsbyTagName('script') as $elem) {
- $elems[] = $elem;
-}
-foreach ($elems as $elem) {
- $elem->parentNode->removeChild($elem);
-}
+removeTags($doc, 'script');
+removeTags($doc, 'style');
+removeTags($doc, 'nav');
//default content: <body>
$xpContext = $doc->getElementsByTagName('body')->item(0);
+//FIXME: follow meta refresh, no body
+// example: https://www.gnu.org/software/coreutils/
//use microformats content if it exists
$xpElems = $dx->query(
$indexDoc->author = new \stdClass();
-$arXpElems = $dx->query('/html/head/meta[@name="author"]');
+$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]');
if ($arXpElems->length) {
$indexDoc->author->name = trim(
$arXpElems->item(0)->attributes->getNamedItem('content')->textContent
);
}
-$arXpElems = $dx->query('/html/head/link[@rel="author"]');
+$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]');
if ($arXpElems->length) {
$indexDoc->author->url = trim(
$base->resolve(
}
}
-//FIXME: limit to h-entry e-content
+//FIXME: split paragraphs
//FIXME: insert space after br
$indexDoc->text = array();
$indexDoc->text[] = trim(
//tags
$tags = array();
-foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
+foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) {
$keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $keywords) as $keyword) {
$tags[trim($keyword)] = true;
$indexDoc->tags = array_keys($tags);
//dates
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]');
if ($arXpdates->length) {
$indexDoc->crdate = date(
'c',
//FIXME: keep creation date from database, or use modified date if we
// do not have it there
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]');
if ($arXpdates->length) {
$indexDoc->modate = date(
'c',
//language
//there may be "en-US" and "de-DE"
-$indexDoc->language = strtolower(
- substr(
- $doc->documentElement->attributes->getNamedItem('lang')->textContent,
- 0, 2
- )
-);
+$xlang = $doc->documentElement->attributes->getNamedItem('lang');
+if ($xlang) {
+ $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2));
+}
//FIXME: fallback, autodetection
//FIXME: check noindex