src/phinde/HubUrlExtractor.php

   1 <?php
   2 namespace phinde;
   3
   4 /**
   5  * Perform WebSub discovery for "hub" and "self" URLs
   6  *
   7  * @link https://www.w3.org/TR/websub/#discovery
   8  */
   9 class HubUrlExtractor
  10 {
  11     /**
  12      * HTTP request object that's used to do the requests
  13      *
  14      * @var \HTTP_Request2
  15      */
  16     protected $request;
  17
  18     /**
  19      * Get the hub and self/canonical URL of a given topic URL.
  20      * Uses link headers and parses HTML link rels.
  21      *
  22      * @param string $url Topic URL
  23      *
  24      * @return array Array of URLs with keys: hub, self.
  25      *               - "self" value is the URL
  26      *               - "hub"  value is an array of URLs
  27      *               Keys may be there but most not if the URL
  28      *               does not advertise them.
  29      */
  30     public function getUrls($url)
  31     {
  32         //at first, try a HEAD request that does not transfer so much data
  33         $req = $this->getRequest();
  34         $req->setUrl($url);
  35         $req->setMethod(\HTTP_Request2::METHOD_HEAD);
  36         $res = $req->send();
  37
  38         if (intval($res->getStatus() / 100) >= 4
  39             && $res->getStatus() != 405 //method not supported/allowed
  40         ) {
  41             return [];
  42         }
  43
  44         $url  = $res->getEffectiveUrl();
  45         $base = new \Net_URL2($url);
  46
  47         $urls = $this->extractHeader($res);
  48         if (count($urls) === 2) {
  49             return $this->absolutifyUrls($urls, $base);
  50         }
  51
  52         list($type) = explode(';', $res->getHeader('Content-type'));
  53         if ($type != 'text/html' && $type != 'text/xml'
  54             && $type != 'application/xhtml+xml'
  55             && $type != 'application/atom+xml'
  56             && $type != 'application/rss+xml'
  57             && $res->getStatus() != 405//HEAD method not allowed
  58         ) {
  59             //we will not be able to extract links from the content
  60             return $urls;
  61         }
  62
  63         //HEAD failed, do a normal GET
  64         $req->setMethod(\HTTP_Request2::METHOD_GET);
  65         $res = $req->send();
  66         if (intval($res->getStatus() / 100) >= 4) {
  67             return $urls;
  68         }
  69
  70         //yes, maybe the server does return this header now
  71         // e.g. PHP's Phar::webPhar() does not work with HEAD
  72         // https://bugs.php.net/bug.php?id=51918
  73         $urls = array_merge($this->extractHeader($res), $urls);
  74         if (count($urls) === 2) {
  75             return $this->absolutifyUrls($urls, $base);
  76         }
  77
  78         $urls = [];//do not mix header and content links
  79
  80         $body = $res->getBody();
  81         $doc = $this->loadHtml($body, $res);
  82
  83         $xpath = new \DOMXPath($doc);
  84         $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
  85         $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
  86
  87         if ($type === 'application/atom+xml') {
  88             $tagQuery = '/atom:feed/atom:link[';
  89
  90         } else if ($type === 'application/rss+xml') {
  91             $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
  92
  93         } else {
  94             $tagQuery = '/*[self::html or self::h:html]'
  95                 . '/*[self::head or self::h:head]'
  96                 . '/*[(self::link or self::h:link)'
  97                 . ' and';
  98         }
  99         $nodeList = $xpath->query(
 100             $tagQuery
 101             . ' ('
 102             . '  contains(concat(" ", normalize-space(@rel), " "), " hub ")'
 103             . '  or'
 104             . '  contains(concat(" ", normalize-space(@rel), " "), " canonical ")'
 105             . '  or'
 106             . '  contains(concat(" ", normalize-space(@rel), " "), " self ")'
 107             . ' )'
 108             . ']'
 109         );
 110
 111         if ($nodeList->length == 0) {
 112             //topic has no links
 113             return $urls;
 114         }
 115
 116         foreach ($nodeList as $link) {
 117             $uri  = $link->attributes->getNamedItem('href')->nodeValue;
 118             $types = explode(
 119                 ' ', $link->attributes->getNamedItem('rel')->nodeValue
 120             );
 121             foreach ($types as $type) {
 122                 if ($type == 'canonical') {
 123                     $type = 'self';
 124                 }
 125                 if ($type == 'self' && !isset($urls['self'])) {
 126                     $urls['self'] = $uri;
 127                 } else if ($type == 'hub') {
 128                     $urls['hub'][] = $uri;
 129                 }
 130             }
 131         }
 132
 133         //<base href=".."> extraction is not necessary; RFC 5988 says:
 134         // Note that any base IRI from the message's content is not applied.
 135         return $this->absolutifyUrls($urls, $base);
 136     }
 137
 138     /**
 139      * Extract hub url from the HTTP response headers.
 140      *
 141      * @param object $res HTTP response
 142      *
 143      * @return array Array with maximal two keys: hub and self
 144      */
 145     protected function extractHeader(\HTTP_Request2_Response $res)
 146     {
 147         $http = new \HTTP2();
 148
 149         $urls = array();
 150         $links = $http->parseLinks($res->getHeader('Link'));
 151         foreach ($links as $link) {
 152             if (isset($link['_uri']) && isset($link['rel'])) {
 153                 if (array_search('hub', $link['rel']) !== false) {
 154                     $urls['hub'][] = $link['_uri'];
 155                 }
 156                 if (!isset($urls['self'])
 157                     && array_search('self', $link['rel']) !== false
 158                 ) {
 159                     $urls['self'] = $link['_uri'];
 160                 }
 161             }
 162         }
 163         return $urls;
 164     }
 165
 166     /**
 167      * Load a DOMDocument from the given HTML or XML
 168      *
 169      * @param string $sourceBody Content of $source URI
 170      * @param object $res        HTTP response from fetching $source
 171      *
 172      * @return \DOMDocument DOM document object with HTML/XML loaded
 173      */
 174     protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
 175     {
 176         $doc = new \DOMDocument();
 177
 178         libxml_clear_errors();
 179         $old = libxml_use_internal_errors(true);
 180
 181         $typeParts = explode(';', $res->getHeader('content-type'));
 182         $type = $typeParts[0];
 183         if ($type == 'application/xhtml+xml'
 184             || $type == 'application/xml'
 185             || $type == 'text/xml'
 186             || $type == 'application/atom+xml'
 187             || $type == 'application/rss+xml'
 188         ) {
 189             $doc->loadXML($sourceBody);
 190         } else {
 191             $doc->loadHTML($sourceBody);
 192         }
 193
 194         libxml_clear_errors();
 195         libxml_use_internal_errors($old);
 196
 197         return $doc;
 198     }
 199
 200     /**
 201      * Returns the HTTP request object clone that can be used
 202      * for one HTTP request.
 203      *
 204      * @return HTTP_Request2 Clone of the setRequest() object
 205      */
 206     public function getRequest()
 207     {
 208         if ($this->request === null) {
 209             $request = new HttpRequest();
 210             $this->setRequestTemplate($request);
 211         }
 212
 213         //we need to clone because previous requests could have
 214         //set internal variables like POST data that we don't want now
 215         return clone $this->request;
 216     }
 217
 218     /**
 219      * Sets a custom HTTP request object that will be used to do HTTP requests
 220      *
 221      * @param object $request Request object
 222      *
 223      * @return self
 224      */
 225     public function setRequestTemplate(\HTTP_Request2 $request)
 226     {
 227         $this->request = $request;
 228         return $this;
 229     }
 230
 231     /**
 232      * Make the list of urls absolute
 233      *
 234      * @param array  $urls Array of maybe relative URLs, or array of URLs
 235      * @param object $base Base URL to resolve the relatives against
 236      *
 237      * @return array List of absolute URLs
 238      */
 239     protected function absolutifyUrls($urls, \Net_URL2 $base)
 240     {
 241         foreach ($urls as $key => $url) {
 242             if (is_array($url)) {
 243                 foreach ($url as $singleKey => $singleUrl) {
 244                     $urls[$key][$singleKey] = (string) $base->resolve($singleUrl);
 245                 }
 246             } else {
 247                 $urls[$key] = (string) $base->resolve($url);
 248             }
 249         }
 250         return $urls;
 251     }
 252 }
 253 ?>