diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2020-03-08 23:54:16 +0100 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2020-03-08 23:54:16 +0100 |
| commit | 68906273b5caf36fb72f9bda40b0fcb5c728568d (patch) | |
| tree | 88646d1a94bcb1560ffba1ba834dfcf54a26042b | |
| parent | e9a75e18860a20a7510229d9467a1af87ba4f7b3 (diff) | |
| download | phinde-68906273b5caf36fb72f9bda40b0fcb5c728568d.tar.gz phinde-68906273b5caf36fb72f9bda40b0fcb5c728568d.zip | |
Support location redirect changes
| -rw-r--r-- | src/phinde/Fetcher.php | 22 |
1 files changed, 15 insertions, 7 deletions
diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php index 666d7ed..c80032d 100644 --- a/src/phinde/Fetcher.php +++ b/src/phinde/Fetcher.php @@ -17,14 +17,15 @@ class Fetcher { $url = Helper::rewriteUrl($url); - $esDoc = $this->es->get($url); + $esDoc = $this->es->get($url); + $locUrl = null; if (isset($esDoc->status->location) && $esDoc->status->location != '' ) { - //TODO: what if location redirects change? - $url = $esDoc->status->location; - $url = Helper::rewriteUrl($url); - $esDoc = $this->es->get($url); + //Location redirect: Use modified time of known target + $locUrl = $esDoc->status->location; + $locUrl = Helper::rewriteUrl($locUrl); + $esDoc = $this->es->get($locUrl); } $types = array(); @@ -44,8 +45,17 @@ class Fetcher } $res = $req->send(); + $effUrl = Helper::removeAnchor($res->getEffectiveUrl()); + $effUrl = Helper::rewriteUrl($effUrl); + if ($res->getStatus() === 304) { //not modified since last time, so don't crawl again + if ($locUrl !== null && $effUrl != $locUrl) { + //location URL changed, and we used the wrong crawl timestamp + $this->storeRedirect($url, $effUrl); + return $this->fetch($url, $actions, $force); + } + Log::info("Not modified since last fetch"); return false; } else if ($res->getStatus() !== 200) { @@ -55,8 +65,6 @@ class Fetcher ); } - $effUrl = Helper::removeAnchor($res->getEffectiveUrl()); - $effUrl = Helper::rewriteUrl($effUrl); if ($effUrl != $url) { $this->storeRedirect($url, $effUrl); $url = $effUrl; |
