X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/d8c39f2b0571b9734259b2f9dc218eed24412332..8b9ae4fc9a3f8402001dd1a054658d5e1246efff:/src/phinde/LinkExtractor/Html.php diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php index a6fa8ef..b3a9ea6 100644 --- a/src/phinde/LinkExtractor/Html.php +++ b/src/phinde/LinkExtractor/Html.php @@ -8,7 +8,7 @@ class Html { public function extract(\HTTP_Request2_Response $res) { - $url = $res->getEffectiveUrl(); + $url = Helper::removeAnchor($res->getEffectiveUrl()); $linkInfos = array(); @@ -22,6 +22,13 @@ class Html $dx = new \DOMXPath($doc); + $xbase = $dx->evaluate('/html/head/base[@href]')->item(0); + if ($xbase) { + $base = $base->resolve( + $xbase->attributes->getNamedItem('href')->textContent + ); + } + $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') ->item(0); if ($meta) {