getRequest(); $req->setUrl($url); $req->setMethod(\HTTP_Request2::METHOD_HEAD); $res = $req->send(); if (intval($res->getStatus() / 100) >= 4 && $res->getStatus() != 405 //method not supported/allowed ) { return []; } $url = $res->getEffectiveUrl(); $base = new \Net_URL2($url); $urls = $this->extractHeader($res); if (count($urls) === 2) { return $this->absolutifyUrls($urls, $base); } list($type) = explode(';', $res->getHeader('Content-type')); if ($type != 'text/html' && $type != 'text/xml' && $type != 'application/xhtml+xml' && $type != 'application/atom+xml' && $type != 'application/rss+xml' && $res->getStatus() != 405//HEAD method not allowed ) { //we will not be able to extract links from the content return $urls; } //HEAD failed, do a normal GET $req->setMethod(\HTTP_Request2::METHOD_GET); $res = $req->send(); if (intval($res->getStatus() / 100) >= 4) { return $urls; } //yes, maybe the server does return this header now // e.g. PHP's Phar::webPhar() does not work with HEAD // https://bugs.php.net/bug.php?id=51918 $urls = array_merge($this->extractHeader($res), $urls); if (count($urls) === 2) { return $this->absolutifyUrls($urls, $base); } $urls = [];//do not mix header and content links $body = $res->getBody(); $doc = $this->loadHtml($body, $res); $xpath = new \DOMXPath($doc); $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml'); $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom'); if ($type === 'application/atom+xml') { $tagQuery = '/atom:feed/atom:link['; } else if ($type === 'application/rss+xml') { $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and '; } else { $tagQuery = '/*[self::html or self::h:html]' . '/*[self::head or self::h:head]' . '/*[(self::link or self::h:link)' . ' and'; } $nodeList = $xpath->query( $tagQuery . ' (' . ' contains(concat(" ", normalize-space(@rel), " "), " hub ")' . ' or' . ' contains(concat(" ", normalize-space(@rel), " "), " canonical ")' . ' or' . 
' contains(concat(" ", normalize-space(@rel), " "), " self ")'
            . ' )'
            . ']'
        );

        if ($nodeList->length == 0) {
            //topic has no links
            return $urls;
        }

        foreach ($nodeList as $link) {
            $uri = $link->attributes->getNamedItem('href')->nodeValue;
            $types = explode(
                ' ', $link->attributes->getNamedItem('rel')->nodeValue
            );
            foreach ($types as $type) {
                if ($type == 'canonical') {
                    $type = 'self';
                }
                if ($type == 'self' && !isset($urls['self'])) {
                    $urls['self'] = $uri;
                } else if ($type == 'hub') {
                    $urls['hub'][] = $uri;
                }
            }
        }

        // extraction is not necessary; RFC 5988 says:
        // Note that any base IRI from the message's content is not applied.
        return $this->absolutifyUrls($urls, $base);
    }

    /**
     * Extract hub and self URLs from the HTTP "Link" response header.
     *
     * Only the first "self" link is kept; every "hub" link is collected.
     *
     * @param \HTTP_Request2_Response $res HTTP response
     *
     * @return array Array with at most two keys: "hub" (list of URLs)
     *               and "self" (single URL)
     */
    protected function extractHeader(\HTTP_Request2_Response $res)
    {
        $urls = [];

        $header = $res->getHeader('Link');
        if ($header === null || $header === '') {
            //no Link header sent: do not hand null to the parser
            return $urls;
        }

        $http = new \HTTP2();
        foreach ($http->parseLinks($header) as $link) {
            if (!isset($link['_uri'], $link['rel'])) {
                continue;
            }

            if (in_array('hub', $link['rel'], true)) {
                $urls['hub'][] = $link['_uri'];
            }
            if (!isset($urls['self'])
                && in_array('self', $link['rel'], true)
            ) {
                $urls['self'] = $link['_uri'];
            }
        }

        return $urls;
    }

    /**
     * Load a DOMDocument from the given HTML or XML.
     *
     * The parser is chosen from the response's Content-Type header:
     * XML media types are parsed with loadXML(), everything else with
     * loadHTML(). libxml parse errors are collected internally and
     * discarded; the previous libxml error mode is restored afterwards.
     *
     * @param string                  $sourceBody Content of $source URI
     * @param \HTTP_Request2_Response $res        HTTP response from
     *                                            fetching $source
     *
     * @return \DOMDocument DOM document object with HTML/XML loaded;
     *                      an empty document when the body is empty
     */
    protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
    {
        $doc = new \DOMDocument();

        $sourceBody = (string) $sourceBody;
        if ($sourceBody === '') {
            //loadHTML()/loadXML() reject empty input (ValueError on PHP 8),
            //and an empty document simply yields no links
            return $doc;
        }

        libxml_clear_errors();
        $old = libxml_use_internal_errors(true);

        //media types are case-insensitive and may carry whitespace
        //before the parameter separator; the header may also be missing
        $typeParts = explode(';', (string) $res->getHeader('content-type'));
        $type = strtolower(trim($typeParts[0]));

        $xmlTypes = [
            'application/xhtml+xml',
            'application/xml',
            'text/xml',
            'application/atom+xml',
            'application/rss+xml',
        ];
        if (in_array($type, $xmlTypes, true)) {
            $doc->loadXML($sourceBody);
        } else {
            $doc->loadHTML($sourceBody);
        }

        libxml_clear_errors();
        libxml_use_internal_errors($old);

        return $doc;
    }

    /**
     * Returns a clone of the HTTP request template that can be used
     * for one HTTP request.
     *
     * When no template has been set yet, a new HttpRequest object is
     * created and stored as the template first.
     *
     * @return \HTTP_Request2 Clone of the setRequestTemplate() object
     */
    public function getRequest()
    {
        if ($this->request === null) {
            $this->setRequestTemplate(new HttpRequest());
        }

        //we need to clone because previous requests could have
        //set internal variables like POST data that we don't want now
        return clone $this->request;
    }

    /**
     * Sets a custom HTTP request object that will be used as the
     * template for HTTP requests; getRequest() hands out clones of it.
     *
     * @param \HTTP_Request2 $request Request object
     *
     * @return self
     */
    public function setRequestTemplate(\HTTP_Request2 $request)
    {
        $this->request = $request;
        return $this;
    }

    /**
     * Make the list of urls absolute
     *
     * Each value of $urls is either a single URL or a list of URLs
     * (as used for the "hub" key); both shapes are resolved against
     * $base and the array structure and keys are preserved.
     *
     * @param array     $urls Array of maybe relative URLs, or array of URLs
     * @param \Net_URL2 $base Base URL to resolve the relatives against
     *
     * @return array List of absolute URLs
     */
    protected function absolutifyUrls($urls, \Net_URL2 $base)
    {
        foreach ($urls as $key => $entry) {
            if (!is_array($entry)) {
                $urls[$key] = (string) $base->resolve($entry);
                continue;
            }

            foreach ($entry as $index => $relative) {
                $urls[$key][$index] = (string) $base->resolve($relative);
            }
        }

        return $urls;
    }
}
?>