git.cweiske.de
/
phinde.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
add date sorting
[phinde.git]
/
bin
/
index.php
diff --git a/bin/index.php b/bin/index.php
index c6de5a9deea02cd5325618cbc9ccb07228de406e..7550ad3c0667778e15852715c828b7cf5b162939 100755 (executable)
--- a/bin/index.php
+++ b/bin/index.php
@@ -14,6 +14,16 @@ if ($argc < 2) {
exit(1);
}
exit(1);
}
+function removeTags($doc, $tag) {
+ $elems = array();
+ foreach ($doc->getElementsbyTagName($tag) as $elem) {
+ $elems[] = $elem;
+ }
+ foreach ($elems as $elem) {
+ $elem->parentNode->removeChild($elem);
+ }
+}
+
$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
$url = $argv[1];
$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
$url = $argv[1];
@@ -22,6 +32,7 @@ if ($existingDoc && $existingDoc->status == 'indexed') {
echo "URL already indexed: $url\n";
exit(0);
}
echo "URL already indexed: $url\n";
exit(0);
}
+//FIXME: size limit
//FIXME: sourcetitle, sourcelink
$req = new \HTTP_Request2($url);
//FIXME: sourcetitle, sourcelink
$req = new \HTTP_Request2($url);
@@ -49,6 +60,7 @@ if (!in_array($mimetype, $supportedIndexTypes)) {
//FIXME: update index only if changed since last index time
//FIXME: extract base url from html
//FIXME: update index only if changed since last index time
//FIXME: extract base url from html
+//FIXME: check if effective url needs updating
$url = $res->getEffectiveUrl();
$base = new \Net_URL2($url);
$url = $res->getEffectiveUrl();
$base = new \Net_URL2($url);
@@ -61,16 +73,14 @@ $doc = new \DOMDocument();
$dx = new \DOMXPath($doc);
//remove script tags
$dx = new \DOMXPath($doc);
//remove script tags
-$elems = array();
-foreach ($doc->getElementsbyTagName('script') as $elem) {
- $elems[] = $elem;
-}
-foreach ($elems as $elem) {
- $elem->parentNode->removeChild($elem);
-}
+removeTags($doc, 'script');
+removeTags($doc, 'style');
+removeTags($doc, 'nav');
//default content: <body>
$xpContext = $doc->getElementsByTagName('body')->item(0);
//default content: <body>
$xpContext = $doc->getElementsByTagName('body')->item(0);
+//FIXME: follow meta refresh, no body
+// example: https://www.gnu.org/software/coreutils/
//use microformats content if it exists
$xpElems = $dx->query(
//use microformats content if it exists
$xpElems = $dx->query(
@@ -95,13 +105,13 @@ $indexDoc->domain = parse_url($url, PHP_URL_HOST);
$indexDoc->author = new \stdClass();
$indexDoc->author = new \stdClass();
-$arXpElems = $dx->query('/html/head/meta[@name="author"]');
+$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]');
if ($arXpElems->length) {
$indexDoc->author->name = trim(
$arXpElems->item(0)->attributes->getNamedItem('content')->textContent
);
}
if ($arXpElems->length) {
$indexDoc->author->name = trim(
$arXpElems->item(0)->attributes->getNamedItem('content')->textContent
);
}
-$arXpElems = $dx->query('/html/head/link[@rel="author"]');
+$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]');
if ($arXpElems->length) {
$indexDoc->author->url = trim(
$base->resolve(
if ($arXpElems->length) {
$indexDoc->author->url = trim(
$base->resolve(
@@ -128,7 +138,7 @@ foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) {
}
}
}
}
-//FIXME: limit to h-entry e-content
+//FIXME: split paragraphs
//FIXME: insert space after br
$indexDoc->text = array();
$indexDoc->text[] = trim(
//FIXME: insert space after br
$indexDoc->text = array();
$indexDoc->text[] = trim(
@@ -141,7 +151,7 @@ $indexDoc->text[] = trim(
//tags
$tags = array();
//tags
$tags = array();
-foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
+foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) {
$keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $keywords) as $keyword) {
$tags[trim($keyword)] = true;
$keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $keywords) as $keyword) {
$tags[trim($keyword)] = true;
@@ -150,7 +160,7 @@ foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
$indexDoc->tags = array_keys($tags);
//dates
$indexDoc->tags = array_keys($tags);
//dates
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]');
if ($arXpdates->length) {
$indexDoc->crdate = date(
'c',
if ($arXpdates->length) {
$indexDoc->crdate = date(
'c',
@@ -162,7 +172,7 @@ if ($arXpdates->length) {
//FIXME: keep creation date from database, or use modified date if we
// do not have it there
//FIXME: keep creation date from database, or use modified date if we
// do not have it there
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]');
if ($arXpdates->length) {
$indexDoc->modate = date(
'c',
if ($arXpdates->length) {
$indexDoc->modate = date(
'c',
@@ -182,12 +192,10 @@ if ($arXpdates->length) {
//language
//there may be "en-US" and "de-DE"
//language
//there may be "en-US" and "de-DE"
-$indexDoc->language = strtolower(
- substr(
- $doc->documentElement->attributes->getNamedItem('lang')->textContent,
- 0, 2
- )
-);
+$xlang = $doc->documentElement->attributes->getNamedItem('lang');
+if ($xlang) {
+ $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2));
+}
//FIXME: fallback, autodetection
//FIXME: check noindex
//FIXME: fallback, autodetection
//FIXME: check noindex