7 * HTTP request object that's used to do the requests
/**
 * Get the hub and self/canonical URL of a given topic URL.
 * Uses link headers and parses HTML, Atom and RSS link rels.
 *
 * A HEAD request is sent first because it does not transfer so much data.
 * A GET request follows only if the headers did not provide both URLs and
 * the content type (or a 405 answer to HEAD) indicates that links may be
 * found in the response body.
 *
 * URLs that have been found once are never overwritten: values from the
 * HTTP link headers are kept over links in the document, and the first
 * matching link in the document is kept over later ones.
 *
 * @param string $url Topic URL
 *
 * @return array|null Array of absolute URLs with keys: hub, self.
 *                    Keys that could not be determined are missing.
 */
public function getUrls($url)
{
    //at first, try a HEAD request that does not transfer so much data
    $req = $this->getRequest();
    $req->setUrl($url);
    $req->setMethod(\HTTP_Request2::METHOD_HEAD);
    $res = $req->send();

    if (intval($res->getStatus() / 100) >= 4
        && $res->getStatus() != 405 //method not supported/allowed
    ) {
        // NOTE(review): the value returned for a failed request was not
        // visible in the reviewed excerpt; null is assumed - confirm
        // against the callers.
        return null;
    }

    //redirects may have been followed; relative links are resolved
    // against the final URL
    $url  = $res->getEffectiveUrl();
    $base = new \Net_URL2($url);

    $urls = $this->extractHeader($res);
    if (count($urls) === 2) {
        return $this->absolutifyUrls($urls, $base);
    }

    //the header may be missing (null) and may carry parameters,
    // whitespace or upper case letters: "Text/HTML ; charset=utf-8"
    list($type) = explode(';', (string) $res->getHeader('Content-type'));
    $type = strtolower(trim($type));
    if ($type != 'text/html' && $type != 'text/xml'
        && $type != 'application/xhtml+xml'
        && $type != 'application/atom+xml'
        && $type != 'application/rss+xml'
        && $res->getStatus() != 405//HEAD method not allowed
    ) {
        //we will not be able to extract links from the content
        return $this->absolutifyUrls($urls, $base);
    }

    //HEAD failed, do a normal GET
    $req->setMethod(\HTTP_Request2::METHOD_GET);
    $res = $req->send();
    if (intval($res->getStatus() / 100) >= 4) {
        return $this->absolutifyUrls($urls, $base);
    }

    //yes, maybe the server does return this header now
    // e.g. PHP's Phar::webPhar() does not work with HEAD
    // https://bugs.php.net/bug.php?id=51918
    //values found in the HEAD response take precedence
    $urls = array_merge($this->extractHeader($res), $urls);
    if (count($urls) === 2) {
        return $this->absolutifyUrls($urls, $base);
    }

    $body = $res->getBody();
    $doc  = $this->loadHtml($body, $res);

    $xpath = new \DOMXPath($doc);
    $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
    $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');

    //each variant leaves the predicate open; it is completed below
    if ($type === 'application/atom+xml') {
        $tagQuery = '/atom:feed/atom:link[';

    } else if ($type === 'application/rss+xml') {
        $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';

    } else {
        $tagQuery = '/*[self::html or self::h:html]'
            . '/*[self::head or self::h:head]'
            . '/*[(self::link or self::h:link)'
            . ' and ';
    }
    $nodeList = $xpath->query(
        $tagQuery
        . '@href'
        . ' and ('
        . ' contains(concat(" ", normalize-space(@rel), " "), " hub ")'
        . ' or'
        . ' contains(concat(" ", normalize-space(@rel), " "), " canonical ")'
        . ' or'
        . ' contains(concat(" ", normalize-space(@rel), " "), " self ")'
        . ')'
        . ']'
    );

    if ($nodeList === false || $nodeList->length == 0) {
        //topic has no links
        return $this->absolutifyUrls($urls, $base);
    }

    foreach ($nodeList as $link) {
        $uri = $link->attributes->getNamedItem('href')->nodeValue;
        //split on any whitespace, as normalize-space() in the query does
        $rels = preg_split(
            '/\s+/', $link->attributes->getNamedItem('rel')->nodeValue,
            -1, PREG_SPLIT_NO_EMPTY
        );
        foreach ($rels as $rel) {
            if ($rel === 'canonical') {
                $rel = 'self';
            }
            //the parentheses are essential: without them "&&" binds
            // tighter than "||" and an already known hub URL would be
            // overwritten by every further hub link
            if (($rel === 'hub' || $rel === 'self')
                && !isset($urls[$rel])
            ) {
                $urls[$rel] = $uri;
            }
        }
    }

    //FIXME: base href
    return $this->absolutifyUrls($urls, $base);
}
/**
 * Extract the hub and self URLs from the HTTP "Link" response headers.
 *
 * For each of the two relations, the first link that carries it is used.
 *
 * @param object $res HTTP response
 *
 * @return array Array with maximal two keys: hub and self
 */
protected function extractHeader(\HTTP_Request2_Response $res)
{
    $urls = array();

    $header = $res->getHeader('Link');
    if ($header === null || $header === '') {
        //no Link header in the response, nothing to parse
        return $urls;
    }

    $http  = new \HTTP2();
    $links = $http->parseLinks($header);
    foreach ($links as $link) {
        if (!isset($link['_uri']) || !isset($link['rel'])) {
            continue;
        }
        foreach (array('hub', 'self') as $rel) {
            //strict comparison: only the exact relation name matches
            if (!isset($urls[$rel])
                && in_array($rel, (array) $link['rel'], true)
            ) {
                $urls[$rel] = $link['_uri'];
            }
        }
    }

    return $urls;
}
/**
 * Load a DOMDocument from the given HTML or XML
 *
 * The parser is selected by the content type header of the response:
 * XML types are loaded with loadXML(), everything else with loadHTML().
 * libxml errors are collected internally and discarded, and the previous
 * libxml error handling setting is restored afterwards.
 *
 * @param string $sourceBody Content of $source URI
 * @param object $res        HTTP response from fetching $source
 *
 * @return \DOMDocument DOM document object with HTML/XML loaded.
 *                      Empty document when there was no content.
 */
protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
{
    $doc = new \DOMDocument();

    $sourceBody = (string) $sourceBody;
    if ($sourceBody === '') {
        //DOMDocument does not accept an empty source: it emits a warning
        // on older PHP versions and throws a ValueError on PHP 8
        return $doc;
    }

    //the header may be missing (null) and may carry parameters,
    // whitespace or upper case letters
    $typeParts = explode(';', (string) $res->getHeader('content-type'));
    $type      = strtolower(trim($typeParts[0]));
    $xmlTypes  = array(
        'application/xhtml+xml',
        'application/xml',
        'text/xml',
        'application/atom+xml',
        'application/rss+xml',
    );

    libxml_clear_errors();
    $old = libxml_use_internal_errors(true);

    //LIBXML_NONET: the content comes from a remote server, do not let
    // the parser fetch anything from the network on its behalf
    if (in_array($type, $xmlTypes, true)) {
        $doc->loadXML($sourceBody, LIBXML_NONET);
    } else {
        $doc->loadHTML($sourceBody, LIBXML_NONET);
    }

    libxml_clear_errors();
    libxml_use_internal_errors($old);

    return $doc;
}
/**
 * Provides a fresh copy of the HTTP request template, to be used for
 * a single HTTP request.
 *
 * When no template has been set, a default one that follows redirects
 * is created and stored via setRequestTemplate().
 *
 * @return HTTP_Request2 Clone of the setRequestTemplate() object
 */
public function getRequest()
{
    if (null === $this->request) {
        $template = new \HTTP_Request2();
        $template->setConfig('follow_redirects', true);
        $this->setRequestTemplate($template);
    }

    //hand out a copy only: earlier requests may have left internal
    //state such as POST data behind, which must not leak into this one
    $copy = clone $this->request;

    return $copy;
}
/**
 * Registers the HTTP request object that serves as template for all
 * HTTP requests; getRequest() hands out clones of it.
 *
 * @param object $request Request object
 *
 * @return void
 */
public function setRequestTemplate(\HTTP_Request2 $request)
{
    //stored as-is; cloning happens in getRequest()
    $this->request = $request;
}
223 * Make the list of urls absolute
225 * @param array $urls Array of maybe relative URLs
226 * @param object $base Base URL to resolve the relatives against
228 * @return array List of absolute URLs
230 protected function absolutifyUrls($urls, \Net_URL2 $base)
232 foreach ($urls as $key => $url) {
233 $urls[$key] = (string) $base->resolve($url);