From 68906273b5caf36fb72f9bda40b0fcb5c728568d Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Sun, 8 Mar 2020 23:54:16 +0100 Subject: [PATCH] Support location redirect changes --- src/phinde/Fetcher.php | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php index 666d7ed..c80032d 100644 --- a/src/phinde/Fetcher.php +++ b/src/phinde/Fetcher.php @@ -17,14 +17,15 @@ class Fetcher { $url = Helper::rewriteUrl($url); - $esDoc = $this->es->get($url); + $esDoc = $this->es->get($url); + $locUrl = null; if (isset($esDoc->status->location) && $esDoc->status->location != '' ) { - //TODO: what if location redirects change? - $url = $esDoc->status->location; - $url = Helper::rewriteUrl($url); - $esDoc = $this->es->get($url); + //Location redirect: Use modified time of known target + $locUrl = $esDoc->status->location; + $locUrl = Helper::rewriteUrl($locUrl); + $esDoc = $this->es->get($locUrl); } $types = array(); @@ -44,8 +45,17 @@ class Fetcher } $res = $req->send(); + $effUrl = Helper::removeAnchor($res->getEffectiveUrl()); + $effUrl = Helper::rewriteUrl($effUrl); + if ($res->getStatus() === 304) { //not modified since last time, so don't crawl again + if ($locUrl !== null && $effUrl != $locUrl) { + //location URL changed, and we used the wrong crawl timestampx + $this->storeRedirect($url, $effUrl); + return $this->fetch($url, $actions, $force); + } + Log::info("Not modified since last fetch"); return false; } else if ($res->getStatus() !== 200) { @@ -55,8 +65,6 @@ class Fetcher ); } - $effUrl = Helper::removeAnchor($res->getEffectiveUrl()); - $effUrl = Helper::rewriteUrl($effUrl); if ($effUrl != $url) { $this->storeRedirect($url, $effUrl); $url = $effUrl; -- 2.30.2