src/phinde/HubUrlExtractor.php

   1 <?php
   2 namespace phinde;
   3
   4 /**
   5  * Perform WebSub discovery for "hub" and "self" URLs
   6  *
   7  * @link https://www.w3.org/TR/websub/#discovery
   8  */
   9 class HubUrlExtractor
  10 {
  11     /**
  12      * HTTP request object that's used to do the requests
  13      *
  14      * @var \HTTP_Request2
  15      */
  16     protected $request;
  17
  18     /**
  19      * Get the hub and self/canonical URL of a given topic URL.
  20      * Uses link headers and parses HTML link rels.
  21      *
  22      * @param string $url       Topic URL
  23      * @param int    $redirects Number of redirects that were followed
  24      *
  25      * @return array Array of URLs with keys: hub, self.
  26      *               - "self" value is the URL
  27      *               - "hub"  value is an array of URLs
  28      *               Keys may be there but most not if the URL
  29      *               does not advertise them.
  30      */
  31     public function getUrls($url, $redirects = 0)
  32     {
  33         //at first, try a HEAD request that does not transfer so much data
  34         $req = $this->getRequest();
  35         $req->setUrl($url);
  36         $req->setMethod(\HTTP_Request2::METHOD_HEAD);
  37         $req->setConfig('follow_redirects', false);
  38         $res = $req->send();
  39
  40         if (intval($res->getStatus() / 100) >= 4
  41             && $res->getStatus() != 405 //method not supported/allowed
  42         ) {
  43             return [];
  44         }
  45
  46         $url  = $res->getEffectiveUrl();
  47         $base = new \Net_URL2($url);
  48
  49         $urls = $this->extractHeader($res);
  50         if (count($urls) === 2) {
  51             return $this->absolutifyUrls($urls, $base);
  52         }
  53
  54         if ($res->isRedirect()) {
  55             //we tried header links and that failed, now follow the redirect
  56             if ($redirects > 5) {
  57                 return [];
  58             }
  59             $redirectUrl = (string) $base->resolve($res->getHeader('location'));
  60             return $this->getUrls($redirectUrl, $redirects + 1);
  61         }
  62
  63         list($type) = explode(';', $res->getHeader('Content-type'));
  64         if ($type != 'text/html' && $type != 'text/xml'
  65             && $type != 'application/xhtml+xml'
  66             && $type != 'application/atom+xml'
  67             && $type != 'application/rss+xml'
  68             && $res->getStatus() != 405//HEAD method not allowed
  69         ) {
  70             //we will not be able to extract links from the content
  71             return $urls;
  72         }
  73
  74         //HEAD failed, do a normal GET
  75         $req->setMethod(\HTTP_Request2::METHOD_GET);
  76         $res = $req->send();
  77         if (intval($res->getStatus() / 100) >= 4) {
  78             return $urls;
  79         }
  80
  81         //yes, maybe the server does return this header now
  82         // e.g. PHP's Phar::webPhar() does not work with HEAD
  83         // https://bugs.php.net/bug.php?id=51918
  84         $urls = array_merge($this->extractHeader($res), $urls);
  85         if (count($urls) === 2) {
  86             return $this->absolutifyUrls($urls, $base);
  87         }
  88
  89         $urls = [];//do not mix header and content links
  90
  91         $body = $res->getBody();
  92         $doc = $this->loadHtml($body, $res);
  93
  94         $xpath = new \DOMXPath($doc);
  95         $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
  96         $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
  97
  98         if ($type === 'application/atom+xml') {
  99             $tagQuery = '/atom:feed/atom:link[';
 100
 101         } else if ($type === 'application/rss+xml') {
 102             $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
 103
 104         } else {
 105             $tagQuery = '/*[self::html or self::h:html]'
 106                 . '/*[self::head or self::h:head]'
 107                 . '/*[(self::link or self::h:link)'
 108                 . ' and';
 109         }
 110         $nodeList = $xpath->query(
 111             $tagQuery
 112             . ' ('
 113             . '  contains(concat(" ", normalize-space(@rel), " "), " hub ")'
 114             . '  or'
 115             . '  contains(concat(" ", normalize-space(@rel), " "), " canonical ")'
 116             . '  or'
 117             . '  contains(concat(" ", normalize-space(@rel), " "), " self ")'
 118             . ' )'
 119             . ']'
 120         );
 121
 122         if ($nodeList->length == 0) {
 123             //topic has no links
 124             return $urls;
 125         }
 126
 127         foreach ($nodeList as $link) {
 128             $uri  = $link->attributes->getNamedItem('href')->nodeValue;
 129             $types = explode(
 130                 ' ', $link->attributes->getNamedItem('rel')->nodeValue
 131             );
 132             foreach ($types as $type) {
 133                 if ($type == 'canonical') {
 134                     $type = 'self';
 135                 }
 136                 if ($type == 'self' && !isset($urls['self'])) {
 137                     $urls['self'] = $uri;
 138                 } else if ($type == 'hub') {
 139                     $urls['hub'][] = $uri;
 140                 }
 141             }
 142         }
 143
 144         //<base href=".."> extraction is not necessary; RFC 5988 says:
 145         // Note that any base IRI from the message's content is not applied.
 146         return $this->absolutifyUrls($urls, $base);
 147     }
 148
 149     /**
 150      * Extract hub url from the HTTP response headers.
 151      *
 152      * @param object $res HTTP response
 153      *
 154      * @return array Array with maximal two keys: hub and self
 155      */
 156     protected function extractHeader(\HTTP_Request2_Response $res)
 157     {
 158         $http = new \HTTP2();
 159
 160         $urls = array();
 161         $links = $http->parseLinks($res->getHeader('Link'));
 162         foreach ($links as $link) {
 163             if (isset($link['_uri']) && isset($link['rel'])) {
 164                 if (array_search('hub', $link['rel']) !== false) {
 165                     $urls['hub'][] = $link['_uri'];
 166                 }
 167                 if (!isset($urls['self'])
 168                     && array_search('self', $link['rel']) !== false
 169                 ) {
 170                     $urls['self'] = $link['_uri'];
 171                 }
 172             }
 173         }
 174         return $urls;
 175     }
 176
 177     /**
 178      * Load a DOMDocument from the given HTML or XML
 179      *
 180      * @param string $sourceBody Content of $source URI
 181      * @param object $res        HTTP response from fetching $source
 182      *
 183      * @return \DOMDocument DOM document object with HTML/XML loaded
 184      */
 185     protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
 186     {
 187         $doc = new \DOMDocument();
 188
 189         libxml_clear_errors();
 190         $old = libxml_use_internal_errors(true);
 191
 192         $typeParts = explode(';', $res->getHeader('content-type'));
 193         $type = $typeParts[0];
 194         if ($type == 'application/xhtml+xml'
 195             || $type == 'application/xml'
 196             || $type == 'text/xml'
 197             || $type == 'application/atom+xml'
 198             || $type == 'application/rss+xml'
 199         ) {
 200             $doc->loadXML($sourceBody);
 201         } else {
 202             $doc->loadHTML($sourceBody);
 203         }
 204
 205         libxml_clear_errors();
 206         libxml_use_internal_errors($old);
 207
 208         return $doc;
 209     }
 210
 211     /**
 212      * Returns the HTTP request object clone that can be used
 213      * for one HTTP request.
 214      *
 215      * @return HTTP_Request2 Clone of the setRequest() object
 216      */
 217     public function getRequest()
 218     {
 219         if ($this->request === null) {
 220             $request = new HttpRequest();
 221             $this->setRequestTemplate($request);
 222         }
 223
 224         //we need to clone because previous requests could have
 225         //set internal variables like POST data that we don't want now
 226         return clone $this->request;
 227     }
 228
 229     /**
 230      * Sets a custom HTTP request object that will be used to do HTTP requests
 231      *
 232      * @param object $request Request object
 233      *
 234      * @return self
 235      */
 236     public function setRequestTemplate(\HTTP_Request2 $request)
 237     {
 238         $this->request = $request;
 239         return $this;
 240     }
 241
 242     /**
 243      * Make the list of urls absolute
 244      *
 245      * @param array  $urls Array of maybe relative URLs, or array of URLs
 246      * @param object $base Base URL to resolve the relatives against
 247      *
 248      * @return array List of absolute URLs
 249      */
 250     protected function absolutifyUrls($urls, \Net_URL2 $base)
 251     {
 252         foreach ($urls as $key => $url) {
 253             if (is_array($url)) {
 254                 foreach ($url as $singleKey => $singleUrl) {
 255                     $urls[$key][$singleKey] = (string) $base->resolve($singleUrl);
 256                 }
 257             } else {
 258                 $urls[$key] = (string) $base->resolve($url);
 259             }
 260         }
 261         return $urls;
 262     }
 263 }
 264 ?>