X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/aa1b08f7f5ab5a0ab723f7c698858a37ec3cba40..HEAD:/src/phinde/HubUrlExtractor.php diff --git a/src/phinde/HubUrlExtractor.php b/src/phinde/HubUrlExtractor.php index 81a612c..da29650 100644 --- a/src/phinde/HubUrlExtractor.php +++ b/src/phinde/HubUrlExtractor.php @@ -1,6 +1,11 @@ getRequest(); $req->setUrl($url); $req->setMethod(\HTTP_Request2::METHOD_HEAD); + $req->setConfig('follow_redirects', false); $res = $req->send(); if (intval($res->getStatus() / 100) >= 4 && $res->getStatus() != 405 //method not supported/allowed ) { - return null; + return []; } $url = $res->getEffectiveUrl(); @@ -40,6 +51,15 @@ class HubUrlExtractor return $this->absolutifyUrls($urls, $base); } + if ($res->isRedirect()) { + //we tried header links and that failed, now follow the redirect + if ($redirects > 5) { + return []; + } + $redirectUrl = (string) $base->resolve($res->getHeader('location')); + return $this->getUrls($redirectUrl, $redirects + 1); + } + list($type) = explode(';', $res->getHeader('Content-type')); if ($type != 'text/html' && $type != 'text/xml' && $type != 'application/xhtml+xml' @@ -66,6 +86,8 @@ class HubUrlExtractor return $this->absolutifyUrls($urls, $base); } + $urls = [];//do not mix header and content links + $body = $res->getBody(); $doc = $this->loadHtml($body, $res); @@ -111,15 +133,16 @@ class HubUrlExtractor if ($type == 'canonical') { $type = 'self'; } - if ($type == 'hub' || $type == 'self' - && !isset($urls[$type]) - ) { - $urls[$type] = $uri; + if ($type == 'self' && !isset($urls['self'])) { + $urls['self'] = $uri; + } else if ($type == 'hub') { + $urls['hub'][] = $uri; } } } - //FIXME: base href + // extraction is not necessary; RFC 5988 says: + // Note that any base IRI from the message's content is not applied. return $this->absolutifyUrls($urls, $base); } @@ -138,10 +161,8 @@ class HubUrlExtractor $links = $http->parseLinks($res->getHeader('Link')); foreach ($links as $link) { if (isset($link['_uri']) && isset($link['rel'])) { - if (!isset($urls['hub']) - && array_search('hub', $link['rel']) !== false - ) { - $urls['hub'] = $link['_uri']; + if (array_search('hub', $link['rel']) !== false) { + $urls['hub'][] = $link['_uri']; } if (!isset($urls['self']) && array_search('self', $link['rel']) !== false @@ -221,7 +242,7 @@ class HubUrlExtractor /** * Make the list of urls absolute * - * @param array $urls Array of maybe relative URLs + * @param array $urls Array of maybe relative URLs, or array of URLs * @param object $base Base URL to resolve the relatives against * * @return array List of absolute URLs @@ -229,7 +250,13 @@ class HubUrlExtractor protected function absolutifyUrls($urls, \Net_URL2 $base) { foreach ($urls as $key => $url) { - $urls[$key] = (string) $base->resolve($url); + if (is_array($url)) { + foreach ($url as $singleKey => $singleUrl) { + $urls[$key][$singleKey] = (string) $base->resolve($singleUrl); + } + } else { + $urls[$key] = (string) $base->resolve($url); + } } return $urls; }