5 * Perform WebSub discovery for "hub" and "self" URLs
7 * @link https://www.w3.org/TR/websub/#discovery
12 * HTTP request object that's used to do the requests
19 * Get the hub and self/canonical URL of a given topic URL.
20 * Uses link headers and parses HTML link rels.
22 * @param string $url Topic URL
23 * @param int $redirects Number of redirects that were followed
25 * @return array Array of URLs with keys: hub, self.
26 * - "self" value is the URL
27 * - "hub" value is an array of URLs
28 * Either key may be absent if the URL
29 * does not advertise it.
// Discover the "hub" and "self" URLs advertised for $url. Sources are tried
// in this order: Link headers of a HEAD response, Link headers of a GET
// response, and finally link elements inside the GET response body.
// NOTE(review): this excerpt omits several original lines (the embedded line
// numbers skip), e.g. where $res is assigned and what the error branches
// return - confirm against the full file before relying on these notes.
31 public function getUrls($url, $redirects = 0)
33 //at first, try a HEAD request that does not transfer so much data
34 $req = $this->getRequest();
36 $req->setMethod(\HTTP_Request2::METHOD_HEAD);
// Redirects are not followed automatically; a redirect response is handled
// further down by recursing with the resolved Location header.
37 $req->setConfig('follow_redirects', false);
// Any 4xx/5xx status other than 405 enters this branch (its body is not
// visible here). A 405 falls through so that the GET below can be tried.
40 if (intval($res->getStatus() / 100) >= 4
41 && $res->getStatus() != 405 //method not supported/allowed
// From here on, relative URLs are resolved against the effective URL
// reported by the response.
46 $url = $res->getEffectiveUrl();
47 $base = new \Net_URL2($url);
// extractHeader() only ever sets the keys "hub" and "self", so a count of
// two means both were advertised in the headers and discovery is complete.
49 $urls = $this->extractHeader($res);
50 if (count($urls) === 2) {
51 return $this->absolutifyUrls($urls, $base);
54 if ($res->isRedirect()) {
55 //we tried header links and that failed, now follow the redirect
// NOTE(review): three original lines are missing before the next statement;
// whether $redirects is checked against a limit cannot be seen here.
// The Location value may be relative, so it is resolved against $base
// before recursing with an incremented redirect counter.
59 $redirectUrl = (string) $base->resolve($res->getHeader('location'));
60 return $this->getUrls($redirectUrl, $redirects + 1);
// Take the media type in front of the first ";" of the Content-type header.
63 list($type) = explode(';', $res->getHeader('Content-type'));
// Only HTML, XML, XHTML, Atom and RSS bodies are candidates for link
// extraction. A 405 answer to HEAD is exempt from this check.
64 if ($type != 'text/html' && $type != 'text/xml'
65 && $type != 'application/xhtml+xml'
66 && $type != 'application/atom+xml'
67 && $type != 'application/rss+xml'
68 && $res->getStatus() != 405//HEAD method not allowed
70 //we will not be able to extract links from the content
74 //HEAD failed, do a normal GET
75 $req->setMethod(\HTTP_Request2::METHOD_GET);
// NOTE(review): the statement that re-sends the request is not visible here.
77 if (intval($res->getStatus() / 100) >= 4) {
81 //yes, maybe the server does return this header now
82 // e.g. PHP's Phar::webPhar() does not work with HEAD
83 // https://bugs.php.net/bug.php?id=51918
// array_merge() lets the later argument win on equal string keys, so a key
// already found in the HEAD response ($urls) overrides the GET header value.
84 $urls = array_merge($this->extractHeader($res), $urls);
85 if (count($urls) === 2) {
86 return $this->absolutifyUrls($urls, $base);
89 $urls = [];//do not mix header and content links
91 $body = $res->getBody();
92 $doc = $this->loadHtml($body, $res);
// Register prefixes for the XHTML and Atom namespaces used by the queries.
94 $xpath = new \DOMXPath($doc);
95 $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
96 $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
// Pick the XPath prefix selecting candidate link elements for the content
// type: Atom feed links, RSS channel links, or links in the (X)HTML head.
// NOTE(review): the tail of each prefix and the start of the query() call
// are not visible in this excerpt.
98 if ($type === 'application/atom+xml') {
99 $tagQuery = '/atom:feed/atom:link[';
101 } else if ($type === 'application/rss+xml') {
102 $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
105 $tagQuery = '/*[self::html or self::h:html]'
106 . '/*[self::head or self::h:head]'
107 . '/*[(self::link or self::h:link)'
// The rel predicates pad the normalized attribute with spaces so that only
// whole space-separated tokens match: "hub", "canonical" and "self".
110 $nodeList = $xpath->query(
113 . ' contains(concat(" ", normalize-space(@rel), " "), " hub ")'
115 . ' contains(concat(" ", normalize-space(@rel), " "), " canonical ")'
117 . ' contains(concat(" ", normalize-space(@rel), " "), " self ")'
122 if ($nodeList->length == 0) {
127 foreach ($nodeList as $link) {
// NOTE(review): getNamedItem() returns null for a missing attribute, so this
// assumes every matched element has href and rel - confirm that the partly
// hidden query guarantees it.
128 $uri = $link->attributes->getNamedItem('href')->nodeValue;
130 ' ', $link->attributes->getNamedItem('rel')->nodeValue
// $type is reused as the loop variable here and no longer holds the content
// type read above.
132 foreach ($types as $type) {
// NOTE(review): the body of this branch is not visible; presumably
// "canonical" is treated like "self" - confirm.
133 if ($type == 'canonical') {
// Only the first "self" link is kept, while every "hub" link is collected.
136 if ($type == 'self' && !isset($urls['self'])) {
137 $urls['self'] = $uri;
138 } else if ($type == 'hub') {
139 $urls['hub'][] = $uri;
144 //<base href=".."> extraction is not necessary; RFC 5988 says:
145 // Note that any base IRI from the message's content is not applied.
146 return $this->absolutifyUrls($urls, $base);
150 * Extract hub and self URLs from the HTTP "Link" response headers.
152 * @param object $res HTTP response
154 * @return array Array with at most two keys: hub and self
// Collect the "hub" and "self" link relations from the Link header of $res.
// NOTE(review): the initialisation of $urls and the return statement are not
// visible in this excerpt.
156 protected function extractHeader(\HTTP_Request2_Response $res)
158 $http = new \HTTP2();
// NOTE(review): getHeader('Link') presumably yields null when the header is
// absent - confirm that parseLinks() tolerates that input.
161 $links = $http->parseLinks($res->getHeader('Link'));
162 foreach ($links as $link) {
// Skip parsed entries that lack either the target URI or the rel list.
163 if (isset($link['_uri']) && isset($link['rel'])) {
// Every link carrying the "hub" relation is collected ...
164 if (array_search('hub', $link['rel']) !== false) {
165 $urls['hub'][] = $link['_uri'];
// ... but only the first link carrying the "self" relation is kept.
167 if (!isset($urls['self'])
168 && array_search('self', $link['rel']) !== false
170 $urls['self'] = $link['_uri'];
178 * Load a DOMDocument from the given HTML or XML
180 * @param string $sourceBody Content of $source URI
181 * @param object $res HTTP response from fetching $source
183 * @return \DOMDocument DOM document object with HTML/XML loaded
// Parse $sourceBody into a DOMDocument. The content-type header of $res
// decides between DOMDocument::loadXML() and DOMDocument::loadHTML().
// NOTE(review): the line separating the two load calls and the return
// statement are not visible in this excerpt.
185 protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
187 $doc = new \DOMDocument();
// Switch libxml to internal error collection while parsing, remembering the
// previous mode in $old.
189 libxml_clear_errors();
190 $old = libxml_use_internal_errors(true);
// Media type in front of the first ";" of the header value. The comparison
// below is case-sensitive and does not trim whitespace.
192 $typeParts = explode(';', $res->getHeader('content-type'));
193 $type = $typeParts[0];
// XML-based media types are handed to the XML parser.
194 if ($type == 'application/xhtml+xml'
195 || $type == 'application/xml'
196 || $type == 'text/xml'
197 || $type == 'application/atom+xml'
198 || $type == 'application/rss+xml'
200 $doc->loadXML($sourceBody);
202 $doc->loadHTML($sourceBody);
// Discard the errors collected during parsing and restore the previous
// libxml error mode.
205 libxml_clear_errors();
206 libxml_use_internal_errors($old);
212 * Returns the HTTP request object clone that can be used
213 * for one HTTP request.
215 * @return HTTP_Request2 Clone of the setRequestTemplate() object
// Return a clone of the stored request template, so each HTTP request starts
// from an unmodified copy.
217 public function getRequest()
// On first use without a configured template, install a default HttpRequest.
219 if ($this->request === null) {
220 $request = new HttpRequest();
221 $this->setRequestTemplate($request);
224 //we need to clone because previous requests could have
225 //set internal variables like POST data that we don't want now
226 return clone $this->request;
230 * Sets a custom HTTP request object that will be used to do HTTP requests
232 * @param object $request Request object
// Store $request as the template that getRequest() clones for each request.
// NOTE(review): the lines following the assignment are not visible in this
// excerpt, so a return value cannot be confirmed.
236 public function setRequestTemplate(\HTTP_Request2 $request)
238 $this->request = $request;
243 * Make the list of urls absolute
245 * @param array $urls Array of possibly relative URLs; each value is a URL or an array of URLs
246 * @param object $base Base URL to resolve the relatives against
248 * @return array List of absolute URLs
// Resolve every entry of $urls against $base. An entry is either a single
// URL or a list of URLs, and results are written back under the same keys.
// NOTE(review): the line separating the two cases and the return statement
// are not visible in this excerpt.
250 protected function absolutifyUrls($urls, \Net_URL2 $base)
252 foreach ($urls as $key => $url) {
// List values (such as "hub") are resolved element by element.
253 if (is_array($url)) {
254 foreach ($url as $singleKey => $singleUrl) {
255 $urls[$key][$singleKey] = (string) $base->resolve($singleUrl);
// Single value (such as "self").
258 $urls[$key] = (string) $base->resolve($url);