From: Christian Weiske Date: Thu, 5 Mar 2020 20:26:57 +0000 (+0100) Subject: Add atom and rss feed link url extraction X-Git-Url: https://git.cweiske.de/phinde.git/commitdiff_plain/ab2ebeda104555928ef044c662b1e672c067e218 Add atom and rss feed link url extraction --- diff --git a/src/phinde/HubUrlExtractor.php b/src/phinde/HubUrlExtractor.php index e2d328a..b33abfe 100644 --- a/src/phinde/HubUrlExtractor.php +++ b/src/phinde/HubUrlExtractor.php @@ -43,7 +43,8 @@ class HubUrlExtractor list($type) = explode(';', $res->getHeader('Content-type')); if ($type != 'text/html' && $type != 'text/xml' && $type != 'application/xhtml+xml' - //FIXME: atom, rss + && $type != 'application/atom+xml' + && $type != 'application/rss+xml' && $res->getStatus() != 405//HEAD method not allowed ) { //we will not be able to extract links from the content @@ -65,18 +66,27 @@ class HubUrlExtractor return $this->absolutifyUrls($urls, $base); } - //FIXME: atom/rss $body = $res->getBody(); $doc = $this->loadHtml($body, $res); $xpath = new \DOMXPath($doc); $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml'); + $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom'); + if ($type === 'application/atom+xml') { + $tagQuery = '/atom:feed/atom:link['; + + } else if ($type === 'application/rss+xml') { + $tagQuery = '/rss/channel/link['; + + } else { + $tagQuery = '/*[self::html or self::h:html]' + . '/*[self::head or self::h:head]' + . '/*[(self::link or self::h:link)' + . ' and'; + } $nodeList = $xpath->query( - '/*[self::html or self::h:html]' - . '/*[self::head or self::h:head]' - . '/*[(self::link or self::h:link)' - . ' and' + $tagQuery . ' (' . ' contains(concat(" ", normalize-space(@rel), " "), " hub ")' . ' or' @@ -163,6 +173,8 @@ class HubUrlExtractor if ($type == 'application/xhtml+xml' || $type == 'application/xml' || $type == 'text/xml' + || $type == 'application/atom+xml' + || $type == 'application/rss+xml' ) { $doc->loadXML($sourceBody); } else { diff --git a/tests/HubUrlExtractorTest.php b/tests/HubUrlExtractorTest.php new file mode 100644 index 0000000..4c0a44b --- /dev/null +++ b/tests/HubUrlExtractorTest.php @@ -0,0 +1,225 @@ +addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: text/html\r\n" + . "Link: ; rel=\"hub\"\r\n" + . "Link: ; rel=\"self\"\r\n" + . "\r\n", + 'http://example.org/' + ); + + $extractor = new phinde\HubUrlExtractor(); + $extractor->setRequestTemplate( + new HTTP_Request2(null, null, ['adapter' => $mock]) + ); + + $this->assertEquals( + [ + 'hub' => 'https://hub.example.com/', + 'self' => 'http://example.com/feed', + ], + $extractor->getUrls('http://example.org/') + ); + } + + public function testGetUrlsHtml() + { + $mock = new HTTP_Request2_Adapter_Mock(); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: text/html\r\n" + . "\r\n", + 'http://example.org/' + ); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: text/html\r\n" + . "\r\n" + . << + + + + + +HTM, + 'http://example.org/' + ); + + $extractor = new phinde\HubUrlExtractor(); + $extractor->setRequestTemplate( + new HTTP_Request2(null, null, ['adapter' => $mock]) + ); + + $this->assertEquals( + [ + 'hub' => 'https://hub.example.com/', + 'self' => 'http://example.com/feed', + ], + $extractor->getUrls('http://example.org/') + ); + } + + public function testGetUrlsXHtml() + { + $mock = new HTTP_Request2_Adapter_Mock(); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: application/xhtml+xml\r\n" + . "\r\n", + 'http://example.org/' + ); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: application/xhtml+xml\r\n" + . "\r\n" + . << + + + + + +HTM, + 'http://example.org/' + ); + + $extractor = new phinde\HubUrlExtractor(); + $extractor->setRequestTemplate( + new HTTP_Request2(null, null, ['adapter' => $mock]) + ); + + $this->assertEquals( + [ + 'hub' => 'https://hub.example.com/', + 'self' => 'http://example.com/feed', + ], + $extractor->getUrls('http://example.org/') + ); + } + + public function testGetUrlsAtom() + { + $mock = new HTTP_Request2_Adapter_Mock(); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: application/atom+xml\r\n" + . "\r\n", + 'http://example.org/' + ); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: application/atom+xml\r\n" + . "\r\n" + . << + + + + + +HTM, + 'http://example.org/' + ); + + $extractor = new phinde\HubUrlExtractor(); + $extractor->setRequestTemplate( + new HTTP_Request2(null, null, ['adapter' => $mock]) + ); + + $this->assertEquals( + [ + 'hub' => 'https://hub.example.com/', + 'self' => 'http://example.com/feed', + ], + $extractor->getUrls('http://example.org/') + ); + } + + public function testGetUrlsRss2() + { + $mock = new HTTP_Request2_Adapter_Mock(); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: application/rss+xml\r\n" + . "\r\n", + 'http://example.org/' + ); + //HEAD + $this->addResponse( + $mock, + "HTTP/1.0 200 OK\r\n" + . "Content-type: application/rss+xml\r\n" + . "\r\n" + . << + + + http://www.example.com/main.html + + + + +HTM, + 'http://example.org/' + ); + + $extractor = new phinde\HubUrlExtractor(); + $extractor->setRequestTemplate( + new HTTP_Request2(null, null, ['adapter' => $mock]) + ); + + $this->assertEquals( + [ + 'hub' => 'https://hub.example.com/', + 'self' => 'http://example.com/feed', + ], + $extractor->getUrls('http://example.org/') + ); + } + + protected function addResponse($mock, $responseContent, $effectiveUrl) + { + $mock->addResponse( + static::createResponseFromString($responseContent, $effectiveUrl) + ); + } + + public static function createResponseFromString($str, $effectiveUrl) + { + $parts = preg_split('!(\r?\n){2}!m', $str, 2); + $headerLines = explode("\n", $parts[0]); + $response = new HTTP_Request2_Response( + array_shift($headerLines), true, $effectiveUrl + ); + foreach ($headerLines as $headerLine) { + $response->parseHeaderLine($headerLine); + } + $response->parseHeaderLine(''); + if (isset($parts[1])) { + $response->appendBody($parts[1]); + } + return $response; + } +} +?>