diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2016-08-29 20:30:45 +0200 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-08-29 20:30:45 +0200 |
| commit | 90d2e0f8c2a80f4d5db5ae20248337d3a0594411 (patch) | |
| tree | 88b04450c9ef821399feb57f89c8b753a22c615a /src | |
| parent | 197c95e9b9805206306100415c54702bb5d9f2ce (diff) | |
| download | phinde-90d2e0f8c2a80f4d5db5ae20248337d3a0594411.tar.gz phinde-90d2e0f8c2a80f4d5db5ae20248337d3a0594411.zip | |
Send If-Modified-Since header on crawling and indexing
Diffstat (limited to 'src')
| -rw-r--r-- | src/phinde/Crawler.php | 18 |
1 files changed, 16 insertions, 2 deletions
```diff
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 9b14878..43d9459 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -25,7 +25,11 @@ class Crawler
 
     public function crawl($url)
     {
-        $res = $this->fetch($url);
+        $res = $this->fetch($url);
+        if ($res === false) {
+            return;
+        }
+
         $linkInfos = $this->extractLinks($res);
         if ($this->showLinksOnly) {
             $this->showLinks($linkInfos);
@@ -36,13 +40,23 @@ class Crawler
 
     protected function fetch($url)
     {
+        $existingDoc = $this->es->get($url);
+
         $req = new HttpRequest($url);
         $req->setHeader(
             'accept',
             implode(',', array_keys(static::$supportedIndexTypes))
         );
+        if ($existingDoc) {
+            $nMoDate = strtotime($existingDoc->modate);
+            $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
+        }
+
         $res = $req->send();
-        if ($res->getStatus() !== 200) {
+        if ($res->getStatus() === 304) {
+            //not modified since last time, so don't crawl again
+            return false;
+        } else if ($res->getStatus() !== 200) {
             throw new \Exception(
                 "Response code is not 200 but "
                 . $res->getStatus() . ", stopping"
```
