X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/d8c39f2b0571b9734259b2f9dc218eed24412332..a1c8309e4b3d8d5468c2defbc8bbae95633aff90:/src/phinde/LinkExtractor/Html.php

diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
index a6fa8ef..299ed91 100644
--- a/src/phinde/LinkExtractor/Html.php
+++ b/src/phinde/LinkExtractor/Html.php
@@ -8,7 +8,7 @@ class Html
 {
     public function extract(\HTTP_Request2_Response $res)
     {
-        $url = $res->getEffectiveUrl();
+        $url = Helper::removeAnchor($res->getEffectiveUrl());
 
         $linkInfos = array();
 
@@ -22,6 +22,13 @@ class Html
 
         $dx = new \DOMXPath($doc);
 
+        $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
+        if ($xbase) {
+            $base = $base->resolve(
+                $xbase->attributes->getNamedItem('href')->textContent
+            );
+        }
+
         $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
             ->item(0);
         if ($meta) {
@@ -54,7 +61,7 @@ class Html
                     }
                 }
             }
-            if ($href == '' || $href{0} == '#') {
+            if ($href == '' || $href[0] == '#') {
                 //link on this page
                 continue;
             }