X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/fd98bb30be8970309c52d3fc3a1585d7454b370a..f98e891b454e5677bdf61f476e366b01af713b50:/src/phinde/LinkExtractor/Html.php diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php index 4acd19f..b3a9ea6 100644 --- a/src/phinde/LinkExtractor/Html.php +++ b/src/phinde/LinkExtractor/Html.php @@ -2,12 +2,13 @@ namespace phinde\LinkExtractor; use phinde\LinkInfo; +use phinde\Helper; class Html { public function extract(\HTTP_Request2_Response $res) { - $url = $res->getEffectiveUrl(); + $url = Helper::removeAnchor($res->getEffectiveUrl()); $linkInfos = array(); @@ -21,6 +22,13 @@ class Html $dx = new \DOMXPath($doc); + $xbase = $dx->evaluate('/html/head/base[@href]')->item(0); + if ($xbase) { + $base = $base->resolve( + $xbase->attributes->getNamedItem('href')->textContent + ); + } + $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') ->item(0); if ($meta) { @@ -36,10 +44,10 @@ class Html $links = $dx->evaluate('//a'); //FIXME: link rel, img, video - $alreadySeen = array(); + $alreadySeen = array($url => true); foreach ($links as $link) { - $linkTitle = $link->textContent; + $linkTitle = Helper::sanitizeTitle($link->textContent); $href = ''; foreach ($link->attributes as $attribute) { if ($attribute->name == 'href') {