src/phinde/HubUrlExtractor.php

   1 <?php
   2 namespace phinde;
   3
   4 class HubUrlExtractor
   5 {
   6     /**
   7      * HTTP request object that's used to do the requests
   8      *
   9      * @var \HTTP_Request2
  10      */
  11     protected $request;
  12
  13     /**
  14      * Get the hub and self/canonical URL of a given topic URL.
  15      * Uses link headers and parses HTML link rels.
  16      *
  17      * @param string $url Topic URL
  18      *
  19      * @return array Array of URLs with keys: hub, self
  20      */
  21     public function getUrls($url)
  22     {
  23         //at first, try a HEAD request that does not transfer so much data
  24         $req = $this->getRequest();
  25         $req->setUrl($url);
  26         $req->setMethod(\HTTP_Request2::METHOD_HEAD);
  27         $res = $req->send();
  28
  29         if (intval($res->getStatus() / 100) >= 4
  30             && $res->getStatus() != 405 //method not supported/allowed
  31         ) {
  32             return null;
  33         }
  34
  35         $url  = $res->getEffectiveUrl();
  36         $base = new \Net_URL2($url);
  37
  38         $urls = $this->extractHeader($res);
  39         if (count($urls) === 2) {
  40             return $this->absolutifyUrls($urls, $base);
  41         }
  42
  43         list($type) = explode(';', $res->getHeader('Content-type'));
  44         if ($type != 'text/html' && $type != 'text/xml'
  45             && $type != 'application/xhtml+xml'
  46             && $type != 'application/atom+xml'
  47             && $type != 'application/rss+xml'
  48             && $res->getStatus() != 405//HEAD method not allowed
  49         ) {
  50             //we will not be able to extract links from the content
  51             return $urls;
  52         }
  53
  54         //HEAD failed, do a normal GET
  55         $req->setMethod(\HTTP_Request2::METHOD_GET);
  56         $res = $req->send();
  57         if (intval($res->getStatus() / 100) >= 4) {
  58             return $urls;
  59         }
  60
  61         //yes, maybe the server does return this header now
  62         // e.g. PHP's Phar::webPhar() does not work with HEAD
  63         // https://bugs.php.net/bug.php?id=51918
  64         $urls = array_merge($this->extractHeader($res), $urls);
  65         if (count($urls) === 2) {
  66             return $this->absolutifyUrls($urls, $base);
  67         }
  68
  69         $body = $res->getBody();
  70         $doc = $this->loadHtml($body, $res);
  71
  72         $xpath = new \DOMXPath($doc);
  73         $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
  74         $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
  75
  76         if ($type === 'application/atom+xml') {
  77             $tagQuery = '/atom:feed/atom:link[';
  78
  79         } else if ($type === 'application/rss+xml') {
  80             $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
  81
  82         } else {
  83             $tagQuery = '/*[self::html or self::h:html]'
  84                 . '/*[self::head or self::h:head]'
  85                 . '/*[(self::link or self::h:link)'
  86                 . ' and';
  87         }
  88         $nodeList = $xpath->query(
  89             $tagQuery
  90             . ' ('
  91             . '  contains(concat(" ", normalize-space(@rel), " "), " hub ")'
  92             . '  or'
  93             . '  contains(concat(" ", normalize-space(@rel), " "), " canonical ")'
  94             . '  or'
  95             . '  contains(concat(" ", normalize-space(@rel), " "), " self ")'
  96             . ' )'
  97             . ']'
  98         );
  99
 100         if ($nodeList->length == 0) {
 101             //topic has no links
 102             return $urls;
 103         }
 104
 105         foreach ($nodeList as $link) {
 106             $uri  = $link->attributes->getNamedItem('href')->nodeValue;
 107             $types = explode(
 108                 ' ', $link->attributes->getNamedItem('rel')->nodeValue
 109             );
 110             foreach ($types as $type) {
 111                 if ($type == 'canonical') {
 112                     $type = 'self';
 113                 }
 114                 if ($type == 'hub' || $type == 'self'
 115                     && !isset($urls[$type])
 116                 ) {
 117                     $urls[$type] = $uri;
 118                 }
 119             }
 120         }
 121
 122         //FIXME: base href
 123         return $this->absolutifyUrls($urls, $base);
 124     }
 125
 126     /**
 127      * Extract hub url from the HTTP response headers.
 128      *
 129      * @param object $res HTTP response
 130      *
 131      * @return array Array with maximal two keys: hub and self
 132      */
 133     protected function extractHeader(\HTTP_Request2_Response $res)
 134     {
 135         $http = new \HTTP2();
 136
 137         $urls = array();
 138         $links = $http->parseLinks($res->getHeader('Link'));
 139         foreach ($links as $link) {
 140             if (isset($link['_uri']) && isset($link['rel'])) {
 141                 if (!isset($urls['hub'])
 142                     && array_search('hub', $link['rel']) !== false
 143                 ) {
 144                     $urls['hub'] = $link['_uri'];
 145                 }
 146                 if (!isset($urls['self'])
 147                     && array_search('self', $link['rel']) !== false
 148                 ) {
 149                     $urls['self'] = $link['_uri'];
 150                 }
 151             }
 152         }
 153         return $urls;
 154     }
 155
 156     /**
 157      * Load a DOMDocument from the given HTML or XML
 158      *
 159      * @param string $sourceBody Content of $source URI
 160      * @param object $res        HTTP response from fetching $source
 161      *
 162      * @return \DOMDocument DOM document object with HTML/XML loaded
 163      */
 164     protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res)
 165     {
 166         $doc = new \DOMDocument();
 167
 168         libxml_clear_errors();
 169         $old = libxml_use_internal_errors(true);
 170
 171         $typeParts = explode(';', $res->getHeader('content-type'));
 172         $type = $typeParts[0];
 173         if ($type == 'application/xhtml+xml'
 174             || $type == 'application/xml'
 175             || $type == 'text/xml'
 176             || $type == 'application/atom+xml'
 177             || $type == 'application/rss+xml'
 178         ) {
 179             $doc->loadXML($sourceBody);
 180         } else {
 181             $doc->loadHTML($sourceBody);
 182         }
 183
 184         libxml_clear_errors();
 185         libxml_use_internal_errors($old);
 186
 187         return $doc;
 188     }
 189
 190     /**
 191      * Returns the HTTP request object clone that can be used
 192      * for one HTTP request.
 193      *
 194      * @return HTTP_Request2 Clone of the setRequest() object
 195      */
 196     public function getRequest()
 197     {
 198         if ($this->request === null) {
 199             $request = new HttpRequest();
 200             $this->setRequestTemplate($request);
 201         }
 202
 203         //we need to clone because previous requests could have
 204         //set internal variables like POST data that we don't want now
 205         return clone $this->request;
 206     }
 207
 208     /**
 209      * Sets a custom HTTP request object that will be used to do HTTP requests
 210      *
 211      * @param object $request Request object
 212      *
 213      * @return self
 214      */
 215     public function setRequestTemplate(\HTTP_Request2 $request)
 216     {
 217         $this->request = $request;
 218         return $this;
 219     }
 220
 221     /**
 222      * Make the list of urls absolute
 223      *
 224      * @param array  $urls Array of maybe relative URLs
 225      * @param object $base Base URL to resolve the relatives against
 226      *
 227      * @return array List of absolute URLs
 228      */
 229     protected function absolutifyUrls($urls, \Net_URL2 $base)
 230     {
 231         foreach ($urls as $key => $url) {
 232             $urls[$key] = (string) $base->resolve($url);
 233         }
 234         return $urls;
 235     }
 236 }
 237 ?>