Update dependencies to latest version

[phinde.git] / src / phinde / HubUrlExtractor.php
diff --git a/src/phinde/HubUrlExtractor.php b/src/phinde/HubUrlExtractor.php

index e2d328a0f2eaa29b02f345fd4e1594b92aa450ab..da29650cf0b4363927f48778a5ca372db2754d1d 100644 (file)
--- a/src/phinde/HubUrlExtractor.php
+++ b/src/phinde/HubUrlExtractor.php
@@ -1,6 +1,11 @@
  <?php
  namespace phinde;
  
+/**
+ * Perform WebSub discovery for "hub" and "self" URLs
+ *
+ * @link https://www.w3.org/TR/websub/#discovery
+ */
  class HubUrlExtractor
  {
      /**
@@ -14,22 +19,28 @@ class HubUrlExtractor
       * Get the hub and self/canonical URL of a given topic URL.
       * Uses link headers and parses HTML link rels.
       *
-     * @param string $url Topic URL
+     * @param string $url       Topic URL
+     * @param int    $redirects Number of redirects that were followed
       *
-     * @return array Array of URLs with keys: hub, self
+     * @return array Array of URLs with keys: hub, self.
+     *               - "self" value is the URL
+     *               - "hub"  value is an array of URLs
+     *               Either key may be missing if the URL
+     *               does not advertise it.
       */
-    public function getUrls($url)
+    public function getUrls($url, $redirects = 0)
      {
          //at first, try a HEAD request that does not transfer so much data
          $req = $this->getRequest();
          $req->setUrl($url);
          $req->setMethod(\HTTP_Request2::METHOD_HEAD);
+        $req->setConfig('follow_redirects', false);
          $res = $req->send();
  
          if (intval($res->getStatus() / 100) >= 4
              && $res->getStatus() != 405 //method not supported/allowed
          ) {
-            return null;
+            return [];
          }
  
          $url  = $res->getEffectiveUrl();
@@ -40,10 +51,20 @@ class HubUrlExtractor
              return $this->absolutifyUrls($urls, $base);
          }
  
+        if ($res->isRedirect()) {
+            //we tried header links and that failed, now follow the redirect
+            if ($redirects > 5) {
+                return [];
+            }
+            $redirectUrl = (string) $base->resolve($res->getHeader('location'));
+            return $this->getUrls($redirectUrl, $redirects + 1);
+        }
+
          list($type) = explode(';', $res->getHeader('Content-type'));
          if ($type != 'text/html' && $type != 'text/xml'
              && $type != 'application/xhtml+xml'
-            //FIXME: atom, rss
+            && $type != 'application/atom+xml'
+            && $type != 'application/rss+xml'
              && $res->getStatus() != 405//HEAD method not allowed
          ) {
              //we will not be able to extract links from the content
@@ -65,18 +86,29 @@ class HubUrlExtractor
              return $this->absolutifyUrls($urls, $base);
          }
  
-        //FIXME: atom/rss
+        $urls = [];//do not mix header and content links
+
          $body = $res->getBody();
          $doc = $this->loadHtml($body, $res);
  
          $xpath = new \DOMXPath($doc);
          $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
+        $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
+
+        if ($type === 'application/atom+xml') {
+            $tagQuery = '/atom:feed/atom:link[';
+
+        } else if ($type === 'application/rss+xml') {
+            $tagQuery = '/rss/channel/*[(self::link or self::atom:link) and ';
  
+        } else {
+            $tagQuery = '/*[self::html or self::h:html]'
+                . '/*[self::head or self::h:head]'
+                . '/*[(self::link or self::h:link)'
+                . ' and';
+        }
          $nodeList = $xpath->query(
-            '/*[self::html or self::h:html]'
-            . '/*[self::head or self::h:head]'
-            . '/*[(self::link or self::h:link)'
-            . ' and'
+            $tagQuery
              . ' ('
              . '  contains(concat(" ", normalize-space(@rel), " "), " hub ")'
              . '  or'
@@ -101,15 +133,16 @@ class HubUrlExtractor
                  if ($type == 'canonical') {
                      $type = 'self';
                  }
-                if ($type == 'hub' || $type == 'self'
-                    && !isset($urls[$type])
-                ) {
-                    $urls[$type] = $uri;
+                if ($type == 'self' && !isset($urls['self'])) {
+                    $urls['self'] = $uri;
+                } else if ($type == 'hub') {
+                    $urls['hub'][] = $uri;
                  }
              }
          }
  
-        //FIXME: base href
+        //<base href=".."> extraction is not necessary; RFC 5988 says:
+        // Note that any base IRI from the message's content is not applied.
          return $this->absolutifyUrls($urls, $base);
      }
  
@@ -128,10 +161,8 @@ class HubUrlExtractor
          $links = $http->parseLinks($res->getHeader('Link'));
          foreach ($links as $link) {
              if (isset($link['_uri']) && isset($link['rel'])) {
-                if (!isset($urls['hub'])
-                    && array_search('hub', $link['rel']) !== false
-                ) {
-                    $urls['hub'] = $link['_uri'];
+                if (array_search('hub', $link['rel']) !== false) {
+                    $urls['hub'][] = $link['_uri'];
                  }
                  if (!isset($urls['self'])
                      && array_search('self', $link['rel']) !== false
@@ -163,6 +194,8 @@ class HubUrlExtractor
          if ($type == 'application/xhtml+xml'
              || $type == 'application/xml'
              || $type == 'text/xml'
+            || $type == 'application/atom+xml'
+            || $type == 'application/rss+xml'
          ) {
              $doc->loadXML($sourceBody);
          } else {
@@ -184,8 +217,7 @@ class HubUrlExtractor
      public function getRequest()
      {
          if ($this->request === null) {
-            $request = new \HTTP_Request2();
-            $request->setConfig('follow_redirects', true);
+            $request = new HttpRequest();
              $this->setRequestTemplate($request);
          }
  
@@ -210,7 +242,7 @@ class HubUrlExtractor
      /**
       * Make the list of urls absolute
       *
-     * @param array  $urls Array of maybe relative URLs
+     * @param array  $urls Maybe-relative URLs; a value may also be an array of URLs
       * @param object $base Base URL to resolve the relatives against
       *
       * @return array List of absolute URLs
@@ -218,7 +250,13 @@ class HubUrlExtractor
      protected function absolutifyUrls($urls, \Net_URL2 $base)
      {
          foreach ($urls as $key => $url) {
-            $urls[$key] = (string) $base->resolve($url);
+            if (is_array($url)) {
+                foreach ($url as $singleKey => $singleUrl) {
+                    $urls[$key][$singleKey] = (string) $base->resolve($singleUrl);
+                }
+            } else {
+                $urls[$key] = (string) $base->resolve($url);
+            }
          }
          return $urls;
      }