aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Weiske <cweiske@cweiske.de>2020-02-29 22:08:30 +0100
committerChristian Weiske <cweiske@cweiske.de>2020-02-29 22:08:30 +0100
commitd3cdabcac7feb8c62451ac12a22256c0eff16873 (patch)
tree4a81723834ed307b0397d95602844b0c4d3ee1ba
parent8512ec548a4f8896aa37678f44de5b88a5a85b24 (diff)
downloadphinde-d3cdabcac7feb8c62451ac12a22256c0eff16873.tar.gz
phinde-d3cdabcac7feb8c62451ac12a22256c0eff16873.zip
Add URL rewrites/replacements
-rw-r--r--data/config.php.dist4
-rw-r--r--src/phinde/Crawler.php1
-rw-r--r--src/phinde/Fetcher.php4
-rw-r--r--src/phinde/Helper.php14
4 files changed, 23 insertions, 0 deletions
diff --git a/data/config.php.dist b/data/config.php.dist
index cef499b..38c0432 100644
--- a/data/config.php.dist
+++ b/data/config.php.dist
@@ -13,6 +13,10 @@ $GLOBALS['phinde'] = array(
//list of regexes for URLs that should not be crawled
'crawlBlacklist' => array(
),
+ //rewrite URLs: regex pattern (no delimiters; escape "#" as "\#") => replacement
+ 'urlRewrites' => array(
+ // '^http://example.org/' => 'https://example.org/',
+ ),
//verbose output
'debug' => true,
//full path to log file
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index 1f63e60..4d596b4 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -53,6 +53,7 @@ class Crawler
{
$filteredLinkInfos = array();
foreach ($linkInfos as $linkInfo) {
+ $linkInfo->url = Helper::rewriteUrl($linkInfo->url);
$allowed = Helper::isUrlAllowed($linkInfo->url);
$crawl = $allowed;
$index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php
index dccb118..7cf11b7 100644
--- a/src/phinde/Fetcher.php
+++ b/src/phinde/Fetcher.php
@@ -15,12 +15,15 @@ class Fetcher
*/
public function fetch($url, $actions, $force = false)
{
+ $url = Helper::rewriteUrl($url);
+
$esDoc = $this->es->get($url);
if (isset($esDoc->status->location)
&& $esDoc->status->location != ''
) {
//TODO: what if location redirects change?
$url = $esDoc->status->location;
+ $url = Helper::rewriteUrl($url);
$esDoc = $this->es->get($url);
}
@@ -53,6 +56,7 @@ class Fetcher
}
$effUrl = Helper::removeAnchor($res->getEffectiveUrl());
+ $effUrl = Helper::rewriteUrl($effUrl);
if ($effUrl != $url) {
$this->storeRedirect($url, $effUrl);
$url = $effUrl;
diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php
index aeb8ba5..d22b9c8 100644
--- a/src/phinde/Helper.php
+++ b/src/phinde/Helper.php
@@ -3,6 +3,20 @@ namespace phinde;
class Helper
{
+ /**
+  * Apply the configured URL rewrite rules to a URL.
+  *
+  * Rules are read from $GLOBALS['phinde']['urlRewrites'], an array of
+  * regex pattern => replacement. Each pattern is wrapped in "#" delimiters
+  * without modifiers, so a literal "#" inside a pattern must be written
+  * as "\#". Rules are applied in configuration order, each one working on
+  * the result of the previous one.
+  *
+  * @param string $url URL to rewrite
+  *
+  * @return string Rewritten URL; the unchanged input when no rules are
+  *                configured
+  */
+ public static function rewriteUrl($url)
+ {
+     //empty() covers both "not set" and "no rules"; the is_array() check
+     // keeps a misconfigured scalar value from reaching foreach
+     if (empty($GLOBALS['phinde']['urlRewrites'])
+         || !is_array($GLOBALS['phinde']['urlRewrites'])
+     ) {
+         return $url;
+     }
+
+     foreach ($GLOBALS['phinde']['urlRewrites'] as $pattern => $replacement) {
+         $rewritten = preg_replace('#' . $pattern . '#', $replacement, $url);
+         //preg_replace() returns null for a broken pattern or a PCRE
+         // runtime error (and raises its own warning). Keep the previous
+         // URL in that case instead of handing null to the callers.
+         if ($rewritten !== null) {
+             $url = $rewritten;
+         }
+     }
+     return $url;
+ }
+
public static function isUrlAllowed($url)
{
$urlDomain = parse_url($url, PHP_URL_HOST);