From: Christian Weiske
Date: Mon, 29 Aug 2016 18:30:45 +0000 (+0200)
Subject: Send If-Modified-Since header on crawling and indexing
X-Git-Tag: v0.2.0~37
X-Git-Url: https://git.cweiske.de/phinde.git/commitdiff_plain/90d2e0f8c2a80f4d5db5ae20248337d3a0594411?hp=197c95e9b9805206306100415c54702bb5d9f2ce

Send If-Modified-Since header on crawling and indexing

The header value is formatted as an HTTP-date (IMF-fixdate, always GMT)
instead of date('r'), whose numeric zone offset is not a valid HTTP-date
and may cause servers to ignore the conditional request.
---

diff --git a/bin/index.php b/bin/index.php
index d110423..5a55427 100755
--- a/bin/index.php
+++ b/bin/index.php
@@ -24,27 +24,39 @@ function removeTags($doc, $tag) {
     }
 }
 
-$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
-
 $url = $argv[1];
-$existingDoc = $es->get($url);
-if ($existingDoc && $existingDoc->status == 'indexed') {
-    echo "URL already indexed: $url\n";
-    exit(0);
-}
-//FIXME: size limit
-//FIXME: sourcetitle, sourcelink
 
 $req = new \HTTP_Request2($url);
 $req->setConfig('follow_redirects', true);
 $req->setConfig('connect_timeout', 5);
 $req->setConfig('timeout', 10);
 $req->setConfig('ssl_verify_peer', false);
+//FIXME: size limit
+
+$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+$existingDoc = $es->get($url);
+if ($existingDoc && $existingDoc->status == 'indexed') {
+    $nMoDate = strtotime($existingDoc->modate);
+    $refreshtime = $GLOBALS['phinde']['refreshtime'];
+    if (time() - $nMoDate < $refreshtime) {
+        echo "URL already indexed less than $refreshtime seconds ago: $url\n";
+        exit(0);
+    }
+
+    //HTTP-date must be GMT in IMF-fixdate format; date('r') emits a numeric zone offset
+    $req->setHeader('If-Modified-Since: ' . gmdate('D, d M Y H:i:s', $nMoDate) . ' GMT');
+}
+//FIXME: sourcetitle, sourcelink
+
 $res = $req->send();
 //FIXME: try-catch
 
-//FIXME: delete if 401 gone or 404 when updating
-if ($res->getStatus() !== 200) {
+if ($res->getStatus() === 304) {
+    //not modified since last time
+    //FIXME: store "last try" time
+    exit(0);
+} else if ($res->getStatus() !== 200) {
+    //FIXME: delete if 410 gone or 404 when updating
     echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
     //FIXME: update status
     exit(3);
diff --git a/data/config.php.dist b/data/config.php.dist
index 7eb8ccf..b4d7d5c 100644
--- a/data/config.php.dist
+++ b/data/config.php.dist
@@ -13,5 +13,7 @@ $GLOBALS['phinde'] = array(
     'subscriptions' => array(
         'http://www.example.org/feed',
     ),
+    //time in seconds after which URLs may be re-indexed
+    'refreshtime' => 86400,
 );
 ?>
\ No newline at end of file
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 9b14878..43d9459 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -25,7 +25,11 @@ class Crawler
 
     public function crawl($url)
     {
-        $res       = $this->fetch($url);
+        $res = $this->fetch($url);
+        if ($res === false) {
+            return;
+        }
+
         $linkInfos = $this->extractLinks($res);
         if ($this->showLinksOnly) {
             $this->showLinks($linkInfos);
@@ -36,13 +40,24 @@ class Crawler
 
     protected function fetch($url)
     {
+        $existingDoc = $this->es->get($url);
+
         $req = new HttpRequest($url);
         $req->setHeader(
             'accept',
             implode(',', array_keys(static::$supportedIndexTypes))
         );
+        if ($existingDoc) {
+            $nMoDate = strtotime($existingDoc->modate);
+            //HTTP-date must be GMT in IMF-fixdate format; date('r') emits a numeric zone offset
+            $req->setHeader('If-Modified-Since: ' . gmdate('D, d M Y H:i:s', $nMoDate) . ' GMT');
+        }
+
         $res = $req->send();
-        if ($res->getStatus() !== 200) {
+        if ($res->getStatus() === 304) {
+            //not modified since last time, so don't crawl again
+            return false;
+        } else if ($res->getStatus() !== 200) {
             throw new \Exception(
                 "Response code is not 200 but " . $res->getStatus()
                 . ", stopping"