5 * Perform WebSub discovery for "hub" and "self" URLs
7 * @link https://www.w3.org/TR/websub/#discovery
12 * HTTP request object that's used to do the requests
19 * Get the hub and self/canonical URL of a given topic URL.
20 * Uses link headers and parses HTML link rels.
22 * @param string $url Topic URL
24 * @return array Array of URLs with keys: hub, self.
25 * - "self" value is the URL
26 * - "hub" value is an array of URLs
27 * Keys may be present, but need not be, if the URL
28 * does not advertise them.
30 public function getUrls($url)
32 //at first, try a HEAD request that does not transfer so much data
33 $req = $this->getRequest();
35 $req->setMethod(\HTTP_Request2::METHOD_HEAD);
38 if (intval($res->getStatus() / 100) >= 4
39 && $res->getStatus() != 405 //method not supported/allowed
44 $url = $res->getEffectiveUrl();
45 $base = new \Net_URL2($url);
47 $urls = $this->extractHeader($res);
48 if (count($urls) === 2) {
49 return $this->absolutifyUrls($urls, $base);
52 list($type) = explode(';', $res->getHeader('Content-type'));
53 if ($type != 'text/html' && $type != 'text/xml'
54 && $type != 'application/xhtml+xml'
55 && $type != 'application/atom+xml'
56 && $type != 'application/rss+xml'
57 && $res->getStatus() != 405//HEAD method not allowed
59 //we will not be able to extract links from the content
63 //HEAD failed, do a normal GET
64 $req->setMethod(\HTTP_Request2::METHOD_GET);
66 if (intval($res->getStatus() / 100) >= 4) {
70 //yes, maybe the server does return this header now
71 // e.g. PHP's Phar::webPhar() does not work with HEAD
72 // https://bugs.php.net/bug.php?id=51918
73 $urls = array_merge($this->extractHeader($res), $urls);
74 if (count($urls) === 2) {
75 return $this->absolutifyUrls($urls, $base);
78 $urls = [];//do not mix header and content links
80 $body = $res->getBody();
81 $doc = $this->loadHtml($body, $res);
83 $xpath = new \DOMXPath($doc);
84 $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
85 $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
87 if ($type === 'application/atom+xml') {
88 $tagQuery = '/atom:feed/atom:link[';
90 } else if ($type === 'application/rss+xml') {
91 $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
94 $tagQuery = '/*[self::html or self::h:html]'
95 . '/*[self::head or self::h:head]'
96 . '/*[(self::link or self::h:link)'
99 $nodeList = $xpath->query(
102 . ' contains(concat(" ", normalize-space(@rel), " "), " hub ")'
104 . ' contains(concat(" ", normalize-space(@rel), " "), " canonical ")'
106 . ' contains(concat(" ", normalize-space(@rel), " "), " self ")'
111 if ($nodeList->length == 0) {
116 foreach ($nodeList as $link) {
117 $uri = $link->attributes->getNamedItem('href')->nodeValue;
119 ' ', $link->attributes->getNamedItem('rel')->nodeValue
121 foreach ($types as $type) {
122 if ($type == 'canonical') {
125 if ($type == 'self' && !isset($urls['self'])) {
126 $urls['self'] = $uri;
127 } else if ($type == 'hub') {
128 $urls['hub'][] = $uri;
133 //<base href=".."> extraction is not necessary; RFC 5988 says:
134 // Note that any base IRI from the message's content is not applied.
135 return $this->absolutifyUrls($urls, $base);
139 * Extract the hub and self URLs from the HTTP "Link" response headers.
141 * @param object $res HTTP response
143 * @return array Array with at most two keys: hub and self
145 protected function extractHeader(\HTTP_Request2_Response $res)
147 $http = new \HTTP2();
150 $links = $http->parseLinks($res->getHeader('Link'));
151 foreach ($links as $link) {
152 if (isset($link['_uri']) && isset($link['rel'])) {
153 if (array_search('hub', $link['rel']) !== false) {
154 $urls['hub'][] = $link['_uri'];
156 if (!isset($urls['self'])
157 && array_search('self', $link['rel']) !== false
159 $urls['self'] = $link['_uri'];
167 * Load a DOMDocument from the given HTML or XML
169 * @param string $sourceBody Content of $source URI
170 * @param object $res HTTP response from fetching $source
172 * @return \DOMDocument DOM document object with HTML/XML loaded
174 protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
176 $doc = new \DOMDocument();
178 libxml_clear_errors();
179 $old = libxml_use_internal_errors(true);
181 $typeParts = explode(';', $res->getHeader('content-type'));
182 $type = $typeParts[0];
183 if ($type == 'application/xhtml+xml'
184 || $type == 'application/xml'
185 || $type == 'text/xml'
186 || $type == 'application/atom+xml'
187 || $type == 'application/rss+xml'
189 $doc->loadXML($sourceBody);
191 $doc->loadHTML($sourceBody);
194 libxml_clear_errors();
195 libxml_use_internal_errors($old);
201 * Returns the HTTP request object clone that can be used
202 * for one HTTP request.
204 * @return \HTTP_Request2 Clone of the setRequestTemplate() object
206 public function getRequest()
208 if ($this->request === null) {
209 $request = new HttpRequest();
210 $this->setRequestTemplate($request);
213 //we need to clone because previous requests could have
214 //set internal variables like POST data that we don't want now
215 return clone $this->request;
219 * Sets a custom HTTP request object that will be used to do HTTP requests
221 * @param object $request Request object
225 public function setRequestTemplate(\HTTP_Request2 $request)
227 $this->request = $request;
232 * Make the list of urls absolute
234 * @param array $urls Array of possibly relative URLs; each value is a URL or an array of URLs
235 * @param object $base Base URL to resolve the relatives against
237 * @return array List of absolute URLs
239 protected function absolutifyUrls($urls, \Net_URL2 $base)
241 foreach ($urls as $key => $url) {
242 if (is_array($url)) {
243 foreach ($url as $singleKey => $singleUrl) {
244 $urls[$key][$singleKey] = (string) $base->resolve($singleUrl);
247 $urls[$key] = (string) $base->resolve($url);