From 298d81c60b6103cd29e9c219d243a5a8a8289f6f Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Wed, 10 Feb 2016 17:26:15 +0100 Subject: [PATCH 1/1] crawler supports "nofollow" now --- src/phinde/Crawler.php | 1 + src/phinde/LinkExtractor/Html.php | 25 ++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index f3158aa..ced40b8 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -57,6 +57,7 @@ class Crawler protected function enqueue($linkInfos) { + var_dump($linkInfos);die(); foreach ($linkInfos as $linkInfo) { if ($this->es->isKnown($linkInfo->url)) { continue; diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php index 538d6c4..0d6f3d8 100644 --- a/src/phinde/LinkExtractor/Html.php +++ b/src/phinde/LinkExtractor/Html.php @@ -19,8 +19,21 @@ class Html //FIXME: extract base url from html $base = new \Net_URL2($url); - $xpath = new \DOMXPath($doc); - $links = $xpath->evaluate('//a'); + $dx = new \DOMXPath($doc); + + $meta = $dx->evaluate('/html/head/meta[@name="robots" and @value]') + ->item(0); + if ($meta) { + $robots = $meta->attributes->getNamedItem('value')->textContent; + foreach (explode(',', $robots) as $value) { + if (trim($value) == 'nofollow') { + //we shall not follow the links + return array(); + } + } + } + + $links = $dx->evaluate('//a'); //FIXME: link rel, img, video $alreadySeen = array(); @@ -31,6 +44,13 @@ class Html foreach ($link->attributes as $attribute) { if ($attribute->name == 'href') { $href = $attribute->textContent; + } else if ($attribute->name == 'rel') { + foreach (explode(',', $attribute->textContent) as $value) { + if (trim($value) == 'nofollow') { + //we shall not follow this link + continue 3; + } + } } } if ($href == '' || $href{0} == '#') { @@ -54,7 +74,6 @@ class Html } //FIXME: check target type - //FIXME: check nofollow $linkInfos[] = new LinkInfo( $linkUrl, $linkTitle, $url ); -- 2.30.2