aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christian Weiske <cweiske@cweiske.de> 2020-03-08 23:54:16 +0100
committer Christian Weiske <cweiske@cweiske.de> 2020-03-08 23:54:16 +0100
commit 68906273b5caf36fb72f9bda40b0fcb5c728568d (patch)
tree 88646d1a94bcb1560ffba1ba834dfcf54a26042b
parent e9a75e18860a20a7510229d9467a1af87ba4f7b3 (diff)
download phinde-68906273b5caf36fb72f9bda40b0fcb5c728568d.tar.gz
phinde-68906273b5caf36fb72f9bda40b0fcb5c728568d.zip
Support location redirect changes
-rw-r--r-- src/phinde/Fetcher.php 22
1 files changed, 15 insertions, 7 deletions
diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php
index 666d7ed..c80032d 100644
--- a/src/phinde/Fetcher.php
+++ b/src/phinde/Fetcher.php
@@ -17,14 +17,15 @@ class Fetcher
{
$url = Helper::rewriteUrl($url);
- $esDoc = $this->es->get($url);
+ $esDoc = $this->es->get($url);
+ $locUrl = null;
if (isset($esDoc->status->location)
&& $esDoc->status->location != ''
) {
- //TODO: what if location redirects change?
- $url = $esDoc->status->location;
- $url = Helper::rewriteUrl($url);
- $esDoc = $this->es->get($url);
+ //Location redirect: Use modified time of known target
+ $locUrl = $esDoc->status->location;
+ $locUrl = Helper::rewriteUrl($locUrl);
+ $esDoc = $this->es->get($locUrl);
}
$types = array();
@@ -44,8 +45,17 @@ class Fetcher
}
$res = $req->send();
+ $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
+ $effUrl = Helper::rewriteUrl($effUrl);
+
if ($res->getStatus() === 304) {
//not modified since last time, so don't crawl again
+ if ($locUrl !== null && $effUrl != $locUrl) {
+ //location URL changed, and we used the wrong crawl timestamp
+ $this->storeRedirect($url, $effUrl);
+ return $this->fetch($url, $actions, $force);
+ }
+
Log::info("Not modified since last fetch");
return false;
} else if ($res->getStatus() !== 200) {
@@ -55,8 +65,6 @@ class Fetcher
);
}
- $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
- $effUrl = Helper::rewriteUrl($effUrl);
if ($effUrl != $url) {
$this->storeRedirect($url, $effUrl);
$url = $effUrl;