git.cweiske.de
/
phinde.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
9f0bdf5
)
check for content attributes
author
Christian Weiske
<cweiske@cweiske.de>
Thu, 4 Feb 2016 22:55:41 +0000
(23:55 +0100)
committer
Christian Weiske
<cweiske@cweiske.de>
Thu, 4 Feb 2016 22:55:41 +0000
(23:55 +0100)
bin/index.php
patch
|
blob
|
history
diff --git a/bin/index.php b/bin/index.php
index 04cc9ac01c39b2244c465a1702ce37ae809694bc..7550ad3c0667778e15852715c828b7cf5b162939 100755
(executable)
--- a/bin/index.php
+++ b/bin/index.php
@@ -32,6 +32,7 @@
if ($existingDoc && $existingDoc->status == 'indexed') {
echo "URL already indexed: $url\n";
exit(0);
}
echo "URL already indexed: $url\n";
exit(0);
}
+//FIXME: size limit
//FIXME: sourcetitle, sourcelink
$req = new \HTTP_Request2($url);
//FIXME: sourcetitle, sourcelink
$req = new \HTTP_Request2($url);
@@ -59,6 +60,7 @@
if (!in_array($mimetype, $supportedIndexTypes)) {
//FIXME: update index only if changed since last index time
//FIXME: extract base url from html
//FIXME: update index only if changed since last index time
//FIXME: extract base url from html
+//FIXME: check if effective url needs updating
$url = $res->getEffectiveUrl();
$base = new \Net_URL2($url);
$url = $res->getEffectiveUrl();
$base = new \Net_URL2($url);
@@ -77,6 +79,8 @@
removeTags($doc, 'nav');
//default content: <body>
$xpContext = $doc->getElementsByTagName('body')->item(0);
//default content: <body>
$xpContext = $doc->getElementsByTagName('body')->item(0);
+//FIXME: follow meta refresh, no body
+// example: https://www.gnu.org/software/coreutils/
//use microformats content if it exists
$xpElems = $dx->query(
//use microformats content if it exists
$xpElems = $dx->query(
@@ -101,13 +105,13 @@
$indexDoc->domain = parse_url($url, PHP_URL_HOST);
$indexDoc->author = new \stdClass();
$indexDoc->author = new \stdClass();
-$arXpElems = $dx->query('/html/head/meta[@name="author"]');
+$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]');
if ($arXpElems->length) {
$indexDoc->author->name = trim(
$arXpElems->item(0)->attributes->getNamedItem('content')->textContent
);
}
if ($arXpElems->length) {
$indexDoc->author->name = trim(
$arXpElems->item(0)->attributes->getNamedItem('content')->textContent
);
}
-$arXpElems = $dx->query('/html/head/link[@rel="author"]');
+$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]');
if ($arXpElems->length) {
$indexDoc->author->url = trim(
$base->resolve(
if ($arXpElems->length) {
$indexDoc->author->url = trim(
$base->resolve(
@@ -147,7 +151,7 @@
$indexDoc->text[] = trim(
//tags
$tags = array();
//tags
$tags = array();
-foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
+foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) {
$keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $keywords) as $keyword) {
$tags[trim($keyword)] = true;
$keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $keywords) as $keyword) {
$tags[trim($keyword)] = true;
@@ -156,7 +160,7 @@
foreach ($dx->query('/html/head/meta[@name="keywords"]') as $xkeywords) {
$indexDoc->tags = array_keys($tags);
//dates
$indexDoc->tags = array_keys($tags);
//dates
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]');
if ($arXpdates->length) {
$indexDoc->crdate = date(
'c',
if ($arXpdates->length) {
$indexDoc->crdate = date(
'c',
@@ -168,7 +172,7 @@
if ($arXpdates->length) {
//FIXME: keep creation date from database, or use modified date if we
// do not have it there
//FIXME: keep creation date from database, or use modified date if we
// do not have it there
-$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified"]');
+$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]');
if ($arXpdates->length) {
$indexDoc->modate = date(
'c',
if ($arXpdates->length) {
$indexDoc->modate = date(
'c',