remove anchor from source URLs
[phinde.git] / src / phinde / LinkExtractor / Html.php
index a6fa8efef8fe9725722bf94bbd137671b73e3afe..b3a9ea65170f8f50bd5a09492c72eb7e028829f3 100644 (file)
@@ -8,7 +8,7 @@ class Html
 {
     public function extract(\HTTP_Request2_Response $res)
     {
-        $url = $res->getEffectiveUrl();
+        $url = Helper::removeAnchor($res->getEffectiveUrl());
 
         $linkInfos = array();
 
@@ -22,6 +22,13 @@ class Html
 
         $dx = new \DOMXPath($doc);
 
+        $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
+        if ($xbase) {
+            $base = $base->resolve(
+                $xbase->attributes->getNamedItem('href')->textContent
+            );
+        }
+
         $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
             ->item(0);
         if ($meta) {