X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8..d8c39f2b0571b9734259b2f9dc218eed24412332:/src/phinde/LinkExtractor/Html.php diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php index 538d6c4..a6fa8ef 100644 --- a/src/phinde/LinkExtractor/Html.php +++ b/src/phinde/LinkExtractor/Html.php @@ -2,6 +2,7 @@ namespace phinde\LinkExtractor; use phinde\LinkInfo; +use phinde\Helper; class Html { @@ -19,18 +20,38 @@ class Html //FIXME: extract base url from html $base = new \Net_URL2($url); - $xpath = new \DOMXPath($doc); - $links = $xpath->evaluate('//a'); + $dx = new \DOMXPath($doc); + + $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') + ->item(0); + if ($meta) { + $robots = $meta->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $robots) as $value) { + if (trim($value) == 'nofollow') { + //we shall not follow the links + return array(); + } + } + } + + $links = $dx->evaluate('//a'); //FIXME: link rel, img, video - $alreadySeen = array(); + $alreadySeen = array($url => true); foreach ($links as $link) { - $linkTitle = $link->textContent; + $linkTitle = Helper::sanitizeTitle($link->textContent); $href = ''; foreach ($link->attributes as $attribute) { if ($attribute->name == 'href') { $href = $attribute->textContent; + } else if ($attribute->name == 'rel') { + foreach (explode(',', $attribute->textContent) as $value) { + if (trim($value) == 'nofollow') { + //we shall not follow this link + continue 3; + } + } } } if ($href == '' || $href{0} == '#') { @@ -54,7 +75,6 @@ class Html } //FIXME: check target type - //FIXME: check nofollow $linkInfos[] = new LinkInfo( $linkUrl, $linkTitle, $url );