Add atom and rss feed link url extraction
author Christian Weiske <cweiske@cweiske.de>
Thu, 5 Mar 2020 20:26:57 +0000 (21:26 +0100)
committer Christian Weiske <cweiske@cweiske.de>
Thu, 5 Mar 2020 20:26:57 +0000 (21:26 +0100)
src/phinde/HubUrlExtractor.php
tests/HubUrlExtractorTest.php [new file with mode: 0644]

index e2d328a..b33abfe 100644 (file)
@@ -43,7 +43,8 @@ class HubUrlExtractor
         list($type) = explode(';', $res->getHeader('Content-type'));
         if ($type != 'text/html' && $type != 'text/xml'
             && $type != 'application/xhtml+xml'
-            //FIXME: atom, rss
+            && $type != 'application/atom+xml'
+            && $type != 'application/rss+xml'
             && $res->getStatus() != 405//HEAD method not allowed
         ) {
             //we will not be able to extract links from the content
@@ -65,18 +66,27 @@ class HubUrlExtractor
             return $this->absolutifyUrls($urls, $base);
         }
 
-        //FIXME: atom/rss
         $body = $res->getBody();
         $doc = $this->loadHtml($body, $res);
 
         $xpath = new \DOMXPath($doc);
         $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
+        $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
 
+        if ($type === 'application/atom+xml') {
+            $tagQuery = '/atom:feed/atom:link[';
+
+        } else if ($type === 'application/rss+xml') {
+            $tagQuery = '/rss/channel/link[';
+
+        } else {
+            $tagQuery = '/*[self::html or self::h:html]'
+                . '/*[self::head or self::h:head]'
+                . '/*[(self::link or self::h:link)'
+                . ' and';
+        }
         $nodeList = $xpath->query(
-            '/*[self::html or self::h:html]'
-            . '/*[self::head or self::h:head]'
-            . '/*[(self::link or self::h:link)'
-            . ' and'
+            $tagQuery
             . ' ('
             . '  contains(concat(" ", normalize-space(@rel), " "), " hub ")'
             . '  or'
@@ -163,6 +173,8 @@ class HubUrlExtractor
         if ($type == 'application/xhtml+xml'
             || $type == 'application/xml'
             || $type == 'text/xml'
+            || $type == 'application/atom+xml'
+            || $type == 'application/rss+xml'
         ) {
             $doc->loadXML($sourceBody);
         } else {
diff --git a/tests/HubUrlExtractorTest.php b/tests/HubUrlExtractorTest.php
new file mode 100644 (file)
index 0000000..4c0a44b
--- /dev/null
@@ -0,0 +1,225 @@
+<?php
+class HubUrlExtractorTest extends \PHPUnit\Framework\TestCase
+{
+    public function testGetUrlsHEAD()
+    {
+        $mock = new HTTP_Request2_Adapter_Mock();
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: text/html\r\n"
+            . "Link: <https://hub.example.com/>; rel=\"hub\"\r\n"
+            . "Link: <http://example.com/feed>; rel=\"self\"\r\n"
+            . "\r\n",
+            'http://example.org/'
+        );
+        //only one response is queued; both URLs are expected from its Link headers
+        $extractor = new phinde\HubUrlExtractor();
+        $extractor->setRequestTemplate(
+            new HTTP_Request2(null, null, ['adapter' => $mock])
+        );
+
+        $this->assertEquals(
+            [
+                'hub'  => 'https://hub.example.com/',
+                'self' => 'http://example.com/feed',
+            ],
+            $extractor->getUrls('http://example.org/')
+        );
+    }
+    
+    public function testGetUrlsHtml()
+    {
+        $mock = new HTTP_Request2_Adapter_Mock();
+        //HEAD response: no Link headers
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: text/html\r\n"
+            . "\r\n",
+            'http://example.org/'
+        );
+        //second response (presumably GET): links are in the HTML <head>
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: text/html\r\n"
+            . "\r\n"
+            . <<<HTM
+<html>
+ <head>
+  <link rel='hub' href='https://hub.example.com/'/>
+  <link rel='self' href='http://example.com/feed'/>
+ </head>
+</html>
+HTM,
+            'http://example.org/'
+        );
+        
+        $extractor = new phinde\HubUrlExtractor();
+        $extractor->setRequestTemplate(
+            new HTTP_Request2(null, null, ['adapter' => $mock])
+        );
+
+        $this->assertEquals(
+            [
+                'hub'  => 'https://hub.example.com/',
+                'self' => 'http://example.com/feed',
+            ],
+            $extractor->getUrls('http://example.org/')
+        );
+    }
+    
+    public function testGetUrlsXHtml()
+    {
+        $mock = new HTTP_Request2_Adapter_Mock();
+        //HEAD response: no Link headers
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: application/xhtml+xml\r\n"
+            . "\r\n",
+            'http://example.org/'
+        );
+        //second response (presumably GET): XML body, no XHTML namespace declared
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: application/xhtml+xml\r\n"
+            . "\r\n"
+            . <<<HTM
+<html>
+ <head>
+  <link rel='hub' href='https://hub.example.com/'/>
+  <link rel='self' href='http://example.com/feed'/>
+ </head>
+</html>
+HTM,
+            'http://example.org/'
+        );
+        
+        $extractor = new phinde\HubUrlExtractor();
+        $extractor->setRequestTemplate(
+            new HTTP_Request2(null, null, ['adapter' => $mock])
+        );
+
+        $this->assertEquals(
+            [
+                'hub'  => 'https://hub.example.com/',
+                'self' => 'http://example.com/feed',
+            ],
+            $extractor->getUrls('http://example.org/')
+        );
+    }
+    
+    public function testGetUrlsAtom()
+    {
+        $mock = new HTTP_Request2_Adapter_Mock();
+        //HEAD response: no Link headers
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: application/atom+xml\r\n"
+            . "\r\n",
+            'http://example.org/'
+        );
+        //second response (presumably GET): links are children of the Atom <feed>
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: application/atom+xml\r\n"
+            . "\r\n"
+            . <<<HTM
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <link href="http://example.org/"/>
+ <link rel="self" href="http://example.com/feed"/>
+ <link rel="hub" href="https://hub.example.com/"/>
+</feed>
+HTM,
+            'http://example.org/'
+        );
+        
+        $extractor = new phinde\HubUrlExtractor();
+        $extractor->setRequestTemplate(
+            new HTTP_Request2(null, null, ['adapter' => $mock])
+        );
+
+        $this->assertEquals(
+            [
+                'hub'  => 'https://hub.example.com/',
+                'self' => 'http://example.com/feed',
+            ],
+            $extractor->getUrls('http://example.org/')
+        );
+    }
+    
+    public function testGetUrlsRss2()
+    {
+        $mock = new HTTP_Request2_Adapter_Mock();
+        //HEAD response: no Link headers
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: application/rss+xml\r\n"
+            . "\r\n",
+            'http://example.org/'
+        );
+        //second response (presumably GET): un-namespaced <link rel> children of <channel>
+        $this->addResponse(
+            $mock,
+            "HTTP/1.0 200 OK\r\n"
+            . "Content-type: application/rss+xml\r\n"
+            . "\r\n"
+            . <<<HTM
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0">
+ <channel>
+  <link>http://www.example.com/main.html</link>
+  <link rel="self" href="http://example.com/feed"/>
+  <link rel="hub" href="https://hub.example.com/"/>
+ </channel>
+</rss>
+HTM,
+            'http://example.org/'
+        );
+        
+        $extractor = new phinde\HubUrlExtractor();
+        $extractor->setRequestTemplate(
+            new HTTP_Request2(null, null, ['adapter' => $mock])
+        );
+
+        $this->assertEquals(
+            [
+                'hub'  => 'https://hub.example.com/',
+                'self' => 'http://example.com/feed',
+            ],
+            $extractor->getUrls('http://example.org/')
+        );
+    }
+    //turns a raw response string into a response object and queues it on $mock
+    protected function addResponse($mock, $responseContent, $effectiveUrl)
+    {
+        $mock->addResponse(
+            static::createResponseFromString($responseContent, $effectiveUrl)
+        );
+    }
+    //builds an HTTP_Request2_Response from a raw "head, empty line, body" string
+    public static function createResponseFromString($str, $effectiveUrl)
+    {
+        $parts       = preg_split('!(\r?\n){2}!m', $str, 2); //[0] = head, [1] = body (if any)
+        $headerLines = explode("\n", $parts[0]);
+        $response    = new HTTP_Request2_Response(
+            array_shift($headerLines), true, $effectiveUrl //first line, e.g. "HTTP/1.0 200 OK"
+        );
+        foreach ($headerLines as $headerLine) {
+            $response->parseHeaderLine($headerLine);
+        }
+        $response->parseHeaderLine(''); //empty line, presumably signals end of headers
+        if (isset($parts[1])) {
+            $response->appendBody($parts[1]);
+        }
+        return $response;
+    }
+}
+?>