aboutsummaryrefslogtreecommitdiff
path: root/src/phinde/LinkExtractor/Html.php
diff options
context:
space:
mode:
author Christian Weiske <cweiske@cweiske.de> 2016-02-10 17:26:15 +0100
committer Christian Weiske <cweiske@cweiske.de> 2016-02-10 17:26:15 +0100
commit 298d81c60b6103cd29e9c219d243a5a8a8289f6f (patch)
tree c8eff43781a748a2f5a24f6a2892b62b233024fb /src/phinde/LinkExtractor/Html.php
parent b018834e5c337be762cf9809e69e341061f4638a (diff)
download phinde-298d81c60b6103cd29e9c219d243a5a8a8289f6f.tar.gz
phinde-298d81c60b6103cd29e9c219d243a5a8a8289f6f.zip
crawler supports "nofollow" now
Diffstat (limited to 'src/phinde/LinkExtractor/Html.php')
-rw-r--r-- src/phinde/LinkExtractor/Html.php 25
1 files changed, 22 insertions, 3 deletions
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
index 538d6c4..0d6f3d8 100644
--- a/src/phinde/LinkExtractor/Html.php
+++ b/src/phinde/LinkExtractor/Html.php
@@ -19,8 +19,21 @@ class Html
//FIXME: extract base url from html
$base = new \Net_URL2($url);
- $xpath = new \DOMXPath($doc);
- $links = $xpath->evaluate('//a');
+ $dx = new \DOMXPath($doc);
+
+ $meta = $dx->evaluate('/html/head/meta[@name="robots" and @value]')
+ ->item(0);
+ if ($meta) {
+ $robots = $meta->attributes->getNamedItem('value')->textContent;
+ foreach (explode(',', $robots) as $value) {
+ if (trim($value) == 'nofollow') {
+ //we shall not follow the links
+ return array();
+ }
+ }
+ }
+
+ $links = $dx->evaluate('//a');
//FIXME: link rel, img, video
$alreadySeen = array();
@@ -31,6 +44,13 @@ class Html
foreach ($link->attributes as $attribute) {
if ($attribute->name == 'href') {
$href = $attribute->textContent;
+ } else if ($attribute->name == 'rel') {
+ foreach (explode(',', $attribute->textContent) as $value) {
+ if (trim($value) == 'nofollow') {
+ //we shall not follow this link
+ continue 3;
+ }
+ }
}
}
if ($href == '' || $href{0} == '#') {
@@ -54,7 +74,6 @@ class Html
}
//FIXME: check target type
- //FIXME: check nofollow
$linkInfos[] = new LinkInfo(
$linkUrl, $linkTitle, $url
);