<?php
namespace phinde;
+/**
+ * Perform WebSub discovery for "hub" and "self" URLs
+ *
+ * @link https://www.w3.org/TR/websub/#discovery
+ */
class HubUrlExtractor
{
/**
* Get the hub and self/canonical URL of a given topic URL.
* Uses link headers and parses HTML link rels.
*
- * @param string $url Topic URL
+ * @param string $url Topic URL
+ * @param int $redirects Number of redirects that were followed
*
- * @return array Array of URLs with keys: hub, self
+ * @return array Array of URLs with keys: hub, self.
+ * - "self" value is the URL
+ * - "hub" value is an array of URLs
+ * Keys may be present, but need not be if the URL
+ * does not advertise them.
*/
- public function getUrls($url)
+ public function getUrls($url, $redirects = 0)
{
//at first, try a HEAD request that does not transfer so much data
$req = $this->getRequest();
$req->setUrl($url);
$req->setMethod(\HTTP_Request2::METHOD_HEAD);
+ $req->setConfig('follow_redirects', false);
$res = $req->send();
if (intval($res->getStatus() / 100) >= 4
&& $res->getStatus() != 405 //method not supported/allowed
) {
- return null;
+ return [];
}
$url = $res->getEffectiveUrl();
return $this->absolutifyUrls($urls, $base);
}
+ if ($res->isRedirect()) {
+ //we tried header links and that failed, now follow the redirect
+ if ($redirects > 5) {
+ return [];
+ }
+ $redirectUrl = (string) $base->resolve($res->getHeader('location'));
+ return $this->getUrls($redirectUrl, $redirects + 1);
+ }
+
list($type) = explode(';', $res->getHeader('Content-type'));
if ($type != 'text/html' && $type != 'text/xml'
&& $type != 'application/xhtml+xml'
- //FIXME: atom, rss
+ && $type != 'application/atom+xml'
+ && $type != 'application/rss+xml'
&& $res->getStatus() != 405//HEAD method not allowed
) {
//we will not be able to extract links from the content
return $this->absolutifyUrls($urls, $base);
}
- //FIXME: atom/rss
+ $urls = [];//do not mix header and content links
+
$body = $res->getBody();
$doc = $this->loadHtml($body, $res);
$xpath = new \DOMXPath($doc);
$xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
+ $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
+
+ if ($type === 'application/atom+xml') {
+ $tagQuery = '/atom:feed/atom:link[';
+
+ } else if ($type === 'application/rss+xml') {
+ $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
+ } else {
+ $tagQuery = '/*[self::html or self::h:html]'
+ . '/*[self::head or self::h:head]'
+ . '/*[(self::link or self::h:link)'
+ . ' and';
+ }
$nodeList = $xpath->query(
- '/*[self::html or self::h:html]'
- . '/*[self::head or self::h:head]'
- . '/*[(self::link or self::h:link)'
- . ' and'
+ $tagQuery
. ' ('
. ' contains(concat(" ", normalize-space(@rel), " "), " hub ")'
. ' or'
if ($type == 'canonical') {
$type = 'self';
}
- if ($type == 'hub' || $type == 'self'
- && !isset($urls[$type])
- ) {
- $urls[$type] = $uri;
+ if ($type == 'self' && !isset($urls['self'])) {
+ $urls['self'] = $uri;
+ } else if ($type == 'hub') {
+ $urls['hub'][] = $uri;
}
}
}
- //FIXME: base href
+ //<base href=".."> extraction is not necessary; RFC 5988 says:
+ // Note that any base IRI from the message's content is not applied.
return $this->absolutifyUrls($urls, $base);
}
$links = $http->parseLinks($res->getHeader('Link'));
foreach ($links as $link) {
if (isset($link['_uri']) && isset($link['rel'])) {
- if (!isset($urls['hub'])
- && array_search('hub', $link['rel']) !== false
- ) {
- $urls['hub'] = $link['_uri'];
+ if (array_search('hub', $link['rel']) !== false) {
+ $urls['hub'][] = $link['_uri'];
}
if (!isset($urls['self'])
&& array_search('self', $link['rel']) !== false
if ($type == 'application/xhtml+xml'
|| $type == 'application/xml'
|| $type == 'text/xml'
+ || $type == 'application/atom+xml'
+ || $type == 'application/rss+xml'
) {
$doc->loadXML($sourceBody);
} else {
public function getRequest()
{
if ($this->request === null) {
- $request = new \HTTP_Request2();
- $request->setConfig('follow_redirects', true);
+ $request = new HttpRequest();
$this->setRequestTemplate($request);
}
/**
* Make the list of urls absolute
*
- * @param array $urls Array of maybe relative URLs
+ * @param array $urls Array of maybe relative URLs; a value may itself be an array of such URLs
* @param object $base Base URL to resolve relative URLs against
*
* @return array List of absolute URLs
protected function absolutifyUrls($urls, \Net_URL2 $base)
{
// Replace every entry of $urls with its resolution against $base, cast to
// string. An entry that is itself an array is resolved element by element,
// keeping its inner keys.
foreach ($urls as $key => $url) {
- $urls[$key] = (string) $base->resolve($url);
+ if (is_array($url)) {
+ foreach ($url as $singleKey => $singleUrl) {
+ $urls[$key][$singleKey] = (string) $base->resolve($singleUrl);
+ }
+ } else {
+ $urls[$key] = (string) $base->resolve($url);
+ }
}
// Same outer keys as the input; only the values were rewritten.
return $urls;
}