From: Christian Weiske Date: Sat, 29 Feb 2020 21:08:30 +0000 (+0100) Subject: Add URL rewrites/replacements X-Git-Url: https://git.cweiske.de/phinde.git/commitdiff_plain/d3cdabcac7feb8c62451ac12a22256c0eff16873 Add URL rewrites/replacements --- diff --git a/data/config.php.dist b/data/config.php.dist index cef499b..38c0432 100644 --- a/data/config.php.dist +++ b/data/config.php.dist @@ -13,6 +13,10 @@ $GLOBALS['phinde'] = array( //list of regexes for URLs that should not be crawled 'crawlBlacklist' => array( ), + //modify URLs with regex + 'urlRewrites' => array( + // '^http://example.org/' => 'https://example.org/', + ), //verbose output 'debug' => true, //full path to log file diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index 1f63e60..4d596b4 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -53,6 +53,7 @@ class Crawler { $filteredLinkInfos = array(); foreach ($linkInfos as $linkInfo) { + $linkInfo->url = Helper::rewriteUrl($linkInfo->url); $allowed = Helper::isUrlAllowed($linkInfo->url); $crawl = $allowed; $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed; diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php index dccb118..7cf11b7 100644 --- a/src/phinde/Fetcher.php +++ b/src/phinde/Fetcher.php @@ -15,12 +15,15 @@ class Fetcher */ public function fetch($url, $actions, $force = false) { + $url = Helper::rewriteUrl($url); + $esDoc = $this->es->get($url); if (isset($esDoc->status->location) && $esDoc->status->location != '' ) { //TODO: what if location redirects change? $url = $esDoc->status->location; + $url = Helper::rewriteUrl($url); $esDoc = $this->es->get($url); } @@ -53,6 +56,7 @@ class Fetcher } $effUrl = Helper::removeAnchor($res->getEffectiveUrl()); + $effUrl = Helper::rewriteUrl($effUrl); if ($effUrl != $url) { $this->storeRedirect($url, $effUrl); $url = $effUrl; diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php index aeb8ba5..d22b9c8 100644 --- a/src/phinde/Helper.php +++ b/src/phinde/Helper.php @@ -3,6 +3,20 @@ namespace phinde; class Helper { + public static function rewriteUrl($url) + { + if (!isset($GLOBALS['phinde']['urlRewrites']) + || count($GLOBALS['phinde']['urlRewrites']) == 0 + ) { + return $url; + } + + foreach ($GLOBALS['phinde']['urlRewrites'] as $pattern => $replacement) { + $url = preg_replace('#' . $pattern . '#', $replacement, $url); + } + return $url; + } + public static function isUrlAllowed($url) { $urlDomain = parse_url($url, PHP_URL_HOST);