From 9928b96a20f17fe532dd0ac26914f83cbe34867a Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Thu, 24 Nov 2016 22:09:28 +0100 Subject: [PATCH] websub subcriptions work --- bin/subscribe.php | 75 +++++++++++ data/config.php.dist | 10 +- data/schema.sql | 14 ++ src/phinde/HttpRequest.php | 4 +- src/phinde/HubUrlExtractor.php | 226 +++++++++++++++++++++++++++++++++ src/phinde/Subscriptions.php | 153 +++++++++++++++++++++- www/push-subscription.php | 131 +++++++++++-------- 7 files changed, 548 insertions(+), 65 deletions(-) create mode 100755 bin/subscribe.php create mode 100644 data/schema.sql create mode 100644 src/phinde/HubUrlExtractor.php diff --git a/bin/subscribe.php b/bin/subscribe.php new file mode 100755 index 0000000..768ee10 --- /dev/null +++ b/bin/subscribe.php @@ -0,0 +1,75 @@ +#!/usr/bin/env php +description = 'Subscribe to URL updates'; +$cc->version = '0.0.1'; +$cc->addArgument( + 'url', + array( + 'description' => 'URL to process', + 'multiple' => false + ) +); +try { + $res = $cc->parse(); +} catch (\Exception $e) { + $cc->displayError($e->getMessage()); +} + +$url = $res->args['url']; +$url = Helper::addSchema($url); +$urlObj = new \Net_URL2($url); +$url = $urlObj->getNormalizedURL(); +if (!Helper::isUrlAllowed($url)) { + Log::error("Domain is not allowed; not crawling"); + exit(2); +} + +$subDb = new Subscriptions(); + +list($topic, $hub) = $subDb->detectHub($url); +if ($hub === null) { + Log::error('No hub URL found for topic'); + exit(10); +} +if ($topic != $url) { + Log::info('Topic URL differs from URL: ' . $topic); +} + +$sub = $subDb->get($topic); +if ($sub !== false) { + Log::error('Topic exists already in subscription table'); + Log::info('Current status: ' . $sub->sub_status); + exit(3); +} +$subDb->create($topic); +$sub = $subDb->get($topic); + +$callbackUrl = $GLOBALS['phinde']['baseurl'] . 'push-subscription.php' + . '?hub.topic=' . urlencode($topic) + . '&capkey=' . 
urlencode($sub->sub_capkey); +$req = new HttpRequest($hub, 'POST'); +$req->addPostParameter('hub.callback', $callbackUrl); +$req->addPostParameter('hub.mode', 'subscribe'); +$req->addPostParameter('hub.topic', $topic); +$req->addPostParameter('hub.lease_seconds', $sub->sub_lease_seconds); +$req->addPostParameter('hub.secret', $sub->sub_secret); +$res = $req->send(); + +if (intval($res->getStatus()) == 202) { + Log::info('Subscription initiated'); + exit(0); +} + +Log::error( + 'Error: Subscription response status code was not 202 but ' + . $res->getStatus() +); +Log::error($res->getBody()); +?> diff --git a/data/config.php.dist b/data/config.php.dist index 19a3221..5c3eba8 100644 --- a/data/config.php.dist +++ b/data/config.php.dist @@ -15,10 +15,6 @@ $GLOBALS['phinde'] = array( //list of regexes for URLs that should not be crawled 'crawlBlacklist' => array( ), - //list of URLs that should be subscribed to with PubSubHubbub - 'subscriptions' => array( - 'http://www.example.org/feed', - ), //verbose output 'debug' => true, //time in seconds after which URLs may be re-indexed @@ -36,5 +32,9 @@ $GLOBALS['phinde'] = array( 'hitTemplate' => 'hit.htm', //default sort order: "score" or "date" 'defaultSort' => 'score', + //database for PuSH subscriptions + 'db_dsn' => 'mysql:host=localhost;dbname=phinde', + 'db_user' => 'FIXME', + 'db_pass' => 'FIXME', ); -?> \ No newline at end of file +?> diff --git a/data/schema.sql b/data/schema.sql new file mode 100644 index 0000000..7f9312a --- /dev/null +++ b/data/schema.sql @@ -0,0 +1,14 @@ +CREATE TABLE `subscriptions` ( + `sub_id` int NOT NULL AUTO_INCREMENT PRIMARY KEY, + `sub_topic` varchar(4096) NOT NULL, + `sub_status` enum('subscribing','active','unsubscribing','unsubscribed','expired','denied') NOT NULL, + `sub_lease_seconds` int NOT NULL, + `sub_expires` datetime NOT NULL, + `sub_secret` varchar(256) NOT NULL, + `sub_capkey` varchar(128) NOT NULL, + `sub_created` datetime NOT NULL, + `sub_updated` datetime NOT NULL, + 
`sub_pings` int NOT NULL, + `sub_lastping` datetime NOT NULL, + `sub_statusmessage` varchar(512) NOT NULL +) COMMENT='' COLLATE 'utf8_general_ci'; diff --git a/src/phinde/HttpRequest.php b/src/phinde/HttpRequest.php index e68bd84..4635973 100644 --- a/src/phinde/HttpRequest.php +++ b/src/phinde/HttpRequest.php @@ -3,9 +3,9 @@ namespace phinde; class HttpRequest extends \HTTP_Request2 { - public function __construct($url) + public function __construct($url = null, $method = 'GET') { - parent::__construct($url); + parent::__construct($url, $method); $this->setConfig('follow_redirects', true); $this->setConfig('connect_timeout', 5); $this->setConfig('timeout', 10); diff --git a/src/phinde/HubUrlExtractor.php b/src/phinde/HubUrlExtractor.php new file mode 100644 index 0000000..e2d328a --- /dev/null +++ b/src/phinde/HubUrlExtractor.php @@ -0,0 +1,226 @@ +getRequest(); + $req->setUrl($url); + $req->setMethod(\HTTP_Request2::METHOD_HEAD); + $res = $req->send(); + + if (intval($res->getStatus() / 100) >= 4 + && $res->getStatus() != 405 //method not supported/allowed + ) { + return null; + } + + $url = $res->getEffectiveUrl(); + $base = new \Net_URL2($url); + + $urls = $this->extractHeader($res); + if (count($urls) === 2) { + return $this->absolutifyUrls($urls, $base); + } + + list($type) = explode(';', $res->getHeader('Content-type')); + if ($type != 'text/html' && $type != 'text/xml' + && $type != 'application/xhtml+xml' + //FIXME: atom, rss + && $res->getStatus() != 405//HEAD method not allowed + ) { + //we will not be able to extract links from the content + return $urls; + } + + //HEAD failed, do a normal GET + $req->setMethod(\HTTP_Request2::METHOD_GET); + $res = $req->send(); + if (intval($res->getStatus() / 100) >= 4) { + return $urls; + } + + //yes, maybe the server does return this header now + // e.g. 
PHP's Phar::webPhar() does not work with HEAD + // https://bugs.php.net/bug.php?id=51918 + $urls = array_merge($this->extractHeader($res), $urls); + if (count($urls) === 2) { + return $this->absolutifyUrls($urls, $base); + } + + //FIXME: atom/rss + $body = $res->getBody(); + $doc = $this->loadHtml($body, $res); + + $xpath = new \DOMXPath($doc); + $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml'); + + $nodeList = $xpath->query( + '/*[self::html or self::h:html]' + . '/*[self::head or self::h:head]' + . '/*[(self::link or self::h:link)' + . ' and' + . ' (' + . ' contains(concat(" ", normalize-space(@rel), " "), " hub ")' + . ' or' + . ' contains(concat(" ", normalize-space(@rel), " "), " canonical ")' + . ' or' + . ' contains(concat(" ", normalize-space(@rel), " "), " self ")' + . ' )' + . ']' + ); + + if ($nodeList->length == 0) { + //topic has no links + return $urls; + } + + foreach ($nodeList as $link) { + //a link element without href names no URL, so there is nothing to extract + $href = $link->attributes->getNamedItem('href'); + if ($href === null) { + continue; + } + $uri = $href->nodeValue; + $types = explode( + ' ', $link->attributes->getNamedItem('rel')->nodeValue + ); + foreach ($types as $type) { + if ($type == 'canonical') { + $type = 'self'; + } + //parentheses are required: "&&" binds tighter than "||", so without + //them every rel="hub" link overwrote an already known hub URL + if (($type == 'hub' || $type == 'self') + && !isset($urls[$type]) + ) { + $urls[$type] = $uri; + } + } + } + + //FIXME: base href + return $this->absolutifyUrls($urls, $base); + } + + /** + * Extract hub url from the HTTP response headers. 
+ * + * @param object $res HTTP response + * + * @return array Array with maximal two keys: hub and self + */ + protected function extractHeader(\HTTP_Request2_Response $res) + { + $http = new \HTTP2(); + + $urls = array(); + $links = $http->parseLinks($res->getHeader('Link')); + foreach ($links as $link) { + if (isset($link['_uri']) && isset($link['rel'])) { + if (!isset($urls['hub']) + && array_search('hub', $link['rel']) !== false + ) { + $urls['hub'] = $link['_uri']; + } + if (!isset($urls['self']) + && array_search('self', $link['rel']) !== false + ) { + $urls['self'] = $link['_uri']; + } + } + } + return $urls; + } + + /** + * Load a DOMDocument from the given HTML or XML + * + * @param string $sourceBody Content of $source URI + * @param object $res HTTP response from fetching $source + * + * @return \DOMDocument DOM document object with HTML/XML loaded + */ + protected static function loadHtml($sourceBody, \HTTP_Request2_Response $res) + { + $doc = new \DOMDocument(); + + libxml_clear_errors(); + $old = libxml_use_internal_errors(true); + + $typeParts = explode(';', $res->getHeader('content-type')); + $type = $typeParts[0]; + if ($type == 'application/xhtml+xml' + || $type == 'application/xml' + || $type == 'text/xml' + ) { + $doc->loadXML($sourceBody); + } else { + $doc->loadHTML($sourceBody); + } + + libxml_clear_errors(); + libxml_use_internal_errors($old); + + return $doc; + } + + /** + * Returns the HTTP request object clone that can be used + * for one HTTP request. 
+ * + * @return HTTP_Request2 Clone of the setRequest() object + */ + public function getRequest() + { + if ($this->request === null) { + $request = new \HTTP_Request2(); + $request->setConfig('follow_redirects', true); + $this->setRequestTemplate($request); + } + + //we need to clone because previous requests could have + //set internal variables like POST data that we don't want now + return clone $this->request; + } + + /** + * Sets a custom HTTP request object that will be used to do HTTP requests + * + * @param object $request Request object + * + * @return self + */ + public function setRequestTemplate(\HTTP_Request2 $request) + { + $this->request = $request; + return $this; + } + + /** + * Make the list of urls absolute + * + * @param array $urls Array of maybe relative URLs + * @param object $base Base URL to resolve the relatives against + * + * @return array List of absolute URLs + */ + protected function absolutifyUrls($urls, \Net_URL2 $base) + { + foreach ($urls as $key => $url) { + $urls[$key] = (string) $base->resolve($url); + } + return $urls; + } +} +?> diff --git a/src/phinde/Subscriptions.php b/src/phinde/Subscriptions.php index 9db4b16..4d00ab8 100644 --- a/src/phinde/Subscriptions.php +++ b/src/phinde/Subscriptions.php @@ -1,12 +1,159 @@ db = new \PDO( + $GLOBALS['phinde']['db_dsn'], + $GLOBALS['phinde']['db_user'], + $GLOBALS['phinde']['db_pass'] + ); + $this->db->setAttribute(\PDO::ATTR_ERRMODE, \PDO::ERRMODE_EXCEPTION); + } + + /** + * Fetch a topic + * + * @param string $topic Topic URL + * + * @return false|object False if the row does not exist + */ + public function get($topic) + { + $stmt = $this->db->prepare( + 'SELECT * FROM subscriptions' + . ' WHERE sub_topic = :topic' + ); + $stmt->execute([':topic' => $topic]); + + //fetchObject() itself returns FALSE on failure + return $stmt->fetchObject(); + } + + /** + * Create a new subscription entry in database. + * Automatically generates secret, capkey and lease seconds. 
+ * + * This method does NOT: + * - check for duplicates (do it yourself) + * - return the object (fetch it yourself) + * - send subscription requests to the hub + * + * @param string $topic URL to subscribe to + * + * @return void + */ + public function create($topic) + { + $stmt = $this->db->prepare( + 'INSERT INTO subscriptions' + . ' (sub_topic, sub_status, sub_lease_seconds, sub_expires' + . ', sub_secret, sub_capkey, sub_created, sub_updated' + . ', sub_pings, sub_lastping, sub_statusmessage)' + . ' VALUES ' + . ' (:topic, "subscribing", :lease_seconds, "0000-00-00 00:00:00"' + . ', :secret, :capkey, NOW(), NOW()' + . ', 0, "0000-00-00 00:00:00", "")' + ); + $stmt->execute( + [ + ':topic' => $topic, + ':lease_seconds' => 86400 * 30, + ':secret' => bin2hex(openssl_random_pseudo_bytes(16)), + ':capkey' => bin2hex(openssl_random_pseudo_bytes(16)), + ] + ); + } + + /** + * A subscription has been confirmed by the hub - mark it as active. + * + * @param integer $subId Subscription ID + * @param integer $leaseSeconds Number of seconds until subscription expires + * + * @return void + */ + public function subscribed($subId, $leaseSeconds) + { + $this->db->prepare( + 'UPDATE subscriptions' + . ' SET sub_status = "active"' + . ' , sub_lease_seconds = :leaseSeconds' + . ' , sub_expires = :expires' + . ' , sub_updated = NOW()' + . ' WHERE sub_id = :id' + )->execute( + [ + ':leaseSeconds' => $leaseSeconds, + ':expires' => gmdate('Y-m-d H:i:s', time() + $leaseSeconds), + ':id' => $subId, + ] + ); + } + + /** + * Mark a subscription as "unsubscribed" + * + * @param integer $subId Subscription ID + * + * @return void + */ + public function unsubscribed($subId) + { + $this->db->prepare( + 'UPDATE subscriptions' + . ' SET sub_status = "unsubscribed"' + . ' , sub_updated = NOW()' + . ' WHERE sub_id = :id' + )->execute([':id' => $subId]); + } + + public function denied($subId, $reason) + { + $this->db->prepare( + 'UPDATE subscriptions' + . ' SET sub_status = "denied"' + . 
' , sub_statusmessage = :reason' + . ' , sub_updated = NOW()' + . ' WHERE sub_id = :id' + )->execute([':id' => $subId, ':reason' => $reason]); + } + + public function pinged($subId) + { + $this->db->prepare( + 'UPDATE subscriptions' + . ' SET sub_pings = sub_pings + 1' + . ' , sub_lastping = NOW()' + . ' , sub_updated = NOW()' + . ' WHERE sub_id = :id' + )->execute([':id' => $subId]); + } + + /** + * Detect the hub for the given topic URL + * + * @param string $url Topic URL + * + * @return array Topic URL and hub URL. Hub URL is NULL if there is none. + */ + public function detectHub($url) { - //FIXME - return false; + $hue = new HubUrlExtractor(); + $hue->setRequestTemplate(new HttpRequest()); + $urls = $hue->getUrls($url); + //we violate the spec by not requiring a self URL + $topicUrl = isset($urls['self']) ? $urls['self'] : $url; + $hubUrl = isset($urls['hub']) ? $urls['hub'] : null; + + return array($topicUrl, $hubUrl); } } ?> diff --git a/www/push-subscription.php b/www/push-subscription.php index 014f15d..6fc20f6 100644 --- a/www/push-subscription.php +++ b/www/push-subscription.php @@ -4,88 +4,102 @@ namespace phinde; * Handles PuSH subscription responses */ header('HTTP/1.0 500 Internal Server Error'); -require 'www-header.php'; - -//PHP converts dots to underscore, so hub.mode becomes hub_mode -if (!isset($_GET['hub_mode'])) { - header('HTTP/1.0 400 Bad Request'); - echo "Parameter missing: hub.mode\n"; - exit(1); -} -$hubMode = $_GET['hub_mode']; +require_once 'www-header.php'; +//PHP converts dots to underscore, so hub.topic becomes hub_topic if (!isset($_GET['hub_topic'])) { - header('HTTP/1.0 400 Bad Request'); - echo "Parameter missing: hub.topic\n"; - exit(1); + err('Parameter missing: hub.topic', '400 Bad Request'); } if (!isValidUrl($_GET['hub_topic'])) { - header('HTTP/1.0 400 Bad Request'); - echo "Invalid parameter value for hub.topic: Invalid URL\n"; - exit(1); + err( + 'Invalid parameter value for hub.topic: Invalid URL', + '400 Bad Request' 
+ ); +} $hubTopic = $_GET['hub_topic']; $subDb = new Subscriptions(); +$sub = $subDb->get($hubTopic); +if ($sub === false) { + //we do not have this topic in our database + err('We know nothing about this hub.topic', '404 Not Found'); +} -if ($hubMode == 'denied') { - //TODO: Inspect Location header to retry subscription - //TODO: remove subscription - return; -} else if ($hubMode == 'subscribe') { - //FIXME - $pos = array_search($hubTopic, $GLOBALS['phinde']['subscriptions']); - if ($pos === false) { - //we do not want to subscribe - header('HTTP/1.0 404 Not Found'); - echo "We are not interested in this hub.topic\n"; - exit(1); - } +//capability key verification so third parties can't forge requests +// see https://www.w3.org/TR/capability-urls/ +if (!isset($_GET['capkey'])) { + err('Parameter missing: capkey', '400 Bad Request'); +} +//hash_equals() compares in constant time so the key cannot be guessed +//byte by byte via response timing; is_string() rejects capkey[]=... arrays +if (!is_string($_GET['capkey']) + || !hash_equals($sub->sub_capkey, $_GET['capkey']) +) { + err('Invalid parameter value for capkey', '400 Bad Request'); +} + +if ($_SERVER['REQUEST_METHOD'] == 'POST') { + $queue = new Queue(); + $queue->addToProcessList($hubTopic, ['index', 'crawl']); + $subDb->pinged($sub->sub_id); + header('HTTP/1.0 200 OK'); + echo "URL queued.\n"; + exit(); +} + +if (!isset($_GET['hub_mode'])) { + err('Parameter missing: hub.mode', '400 Bad Request'); +} +$hubMode = $_GET['hub_mode']; + +if ($hubMode == 'subscribe') { if (!isset($_GET['hub_challenge'])) { - header('HTTP/1.0 400 Bad Request'); - echo "Parameter missing: hub.challenge\n"; - exit(1); + err('Parameter missing: hub.challenge', '400 Bad Request'); } $hubChallenge = $_GET['hub_challenge']; if (!isset($_GET['hub_lease_seconds'])) { - header('HTTP/1.0 400 Bad Request'); - echo "Parameter missing: hub.lease_seconds\n"; - exit(1); + err('Parameter missing: hub.lease_seconds', '400 Bad Request'); + } + if (!is_numeric($_GET['hub_lease_seconds'])) { + err('Invalid value for hub.lease_seconds', '400 Bad Request'); } - $hubLeaseSeconds = $_GET['hub_lease_seconds']; + $hubLeaseSeconds = 
intval($_GET['hub_lease_seconds']); - //FIXME: store in database + $subDb->subscribed($sub->sub_id, $hubLeaseSeconds); header('HTTP/1.0 200 OK'); header('Content-type: text/plain'); echo $hubChallenge; - exit(0); + exit(); } else if ($hubMode == 'unsubscribe') { - $sub = $subDb->get($hubTopic); - if ($sub === false) { - //we do not know this subscription - header('HTTP/1.0 404 Not Found'); - echo "We are not subscribed to this hub.topic\n"; - exit(1); - } - $pos = array_search($hubTopic, $GLOBALS['phinde']['subscriptions']); - if ($pos !== false) { + if ($sub->sub_status != 'unsubscribing') { //we do not want to unsubscribe - header('HTTP/1.0 404 Not Found'); - echo "We do not want to unsubscribe from this hub.topic\n"; - exit(1); + err( + 'We do not want to unsubscribe from this hub.topic', + '404 Not Found' + ); + } + if (!isset($_GET['hub_challenge'])) { + err('Parameter missing: hub.challenge', '400 Bad Request'); } - $sub->remove($hubTopic); + $hubChallenge = $_GET['hub_challenge']; + + $subDb->unsubscribed($sub->sub_id); + header('HTTP/1.0 200 OK'); header('Content-type: text/plain'); - echo "Unsubscribed.\n"; - exit(0); + echo $hubChallenge; + exit(); + +} else if ($hubMode == 'denied') { + //TODO: Inspect Location header to retry subscription (still valid?) + $reason = ''; + //only accept a plain string; hub.reason[]=... would arrive as an array + if (isset($_GET['hub_reason']) && is_string($_GET['hub_reason'])) { + $reason = $_GET['hub_reason']; + } + $subDb->denied($sub->sub_id, $reason); + //acknowledge the notification; otherwise the 500 status line set + //at the top of this script would be sent to the hub + header('HTTP/1.0 200 OK'); + exit(); + } else { - header('HTTP/1.0 400 Bad Request'); - echo "Invalid parameter value for hub.mode\n"; - exit(1); + err('Invalid parameter value for hub.mode', '400 Bad Request'); } @@ -101,4 +115,11 @@ function isValidUrl($url) } return false; } + +/** + * Send an error status line and a plain message, then stop the script. + * + * @param string $msg Error message written to the response body + * @param string $statusline HTTP status code and reason phrase + * + * @return void + */ +function err($msg, $statusline) +{ + header('HTTP/1.0 ' . $statusline); + echo $msg . "\n"; + exit(1); +} ?> \ No newline at end of file -- 2.30.2