git.cweiske.de
/
phinde.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Configuration for default sort order
[phinde.git]
/
src
/
phinde
/
LinkExtractor
/
Html.php
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
index 0d6f3d8aa1159bd9053801464033abf063fad09b..b3a9ea65170f8f50bd5a09492c72eb7e028829f3 100644
(file)
--- a/src/phinde/LinkExtractor/Html.php
+++ b/src/phinde/LinkExtractor/Html.php
@@ -2,12 +2,13 @@
namespace phinde\LinkExtractor;
use phinde\LinkInfo;
namespace phinde\LinkExtractor;
use phinde\LinkInfo;
+use phinde\Helper;
class Html
{
public function extract(\HTTP_Request2_Response $res)
{
class Html
{
public function extract(\HTTP_Request2_Response $res)
{
- $url = $res->getEffectiveUrl();
+ $url = Helper::removeAnchor($res->getEffectiveUrl());
$linkInfos = array();
$linkInfos = array();
@@ -21,10 +22,17 @@ class Html
$dx = new \DOMXPath($doc);
$dx = new \DOMXPath($doc);
- $meta = $dx->evaluate('/html/head/meta[@name="robots" and @value]')
+ $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
+ if ($xbase) {
+ $base = $base->resolve(
+ $xbase->attributes->getNamedItem('href')->textContent
+ );
+ }
+
+ $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
->item(0);
if ($meta) {
->item(0);
if ($meta) {
- $robots = $meta->attributes->getNamedItem('value')->textContent;
+ $robots = $meta->attributes->getNamedItem('content')->textContent;
foreach (explode(',', $robots) as $value) {
if (trim($value) == 'nofollow') {
//we shall not follow the links
foreach (explode(',', $robots) as $value) {
if (trim($value) == 'nofollow') {
//we shall not follow the links
@@ -36,10 +44,10 @@ class Html
$links = $dx->evaluate('//a');
//FIXME: link rel, img, video
$links = $dx->evaluate('//a');
//FIXME: link rel, img, video
- $alreadySeen = array();
+ $alreadySeen = array($url => true);
foreach ($links as $link) {
foreach ($links as $link) {
- $linkTitle = $link->textContent;
+ $linkTitle = Helper::sanitizeTitle($link->textContent);
$href = '';
foreach ($link->attributes as $attribute) {
if ($attribute->name == 'href') {
$href = '';
foreach ($link->attributes as $attribute) {
if ($attribute->name == 'href') {