//list of regexes for URLs that should not be crawled
'crawlBlacklist' => array(
),
+ //rewrite URLs: regex pattern (written without delimiters) => replacement
+ 'urlRewrites' => array(
+ // '^http://example.org/' => 'https://example.org/',
+ ),
//verbose output
'debug' => true,
//full path to log file
{
$filteredLinkInfos = array();
foreach ($linkInfos as $linkInfo) {
+ $linkInfo->url = Helper::rewriteUrl($linkInfo->url);
$allowed = Helper::isUrlAllowed($linkInfo->url);
$crawl = $allowed;
$index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
*/
public function fetch($url, $actions, $force = false)
{
+ $url = Helper::rewriteUrl($url);
+
$esDoc = $this->es->get($url);
if (isset($esDoc->status->location)
&& $esDoc->status->location != ''
) {
//TODO: what if location redirects change?
$url = $esDoc->status->location;
+ $url = Helper::rewriteUrl($url);
$esDoc = $this->es->get($url);
}
}
$effUrl = Helper::removeAnchor($res->getEffectiveUrl());
+ $effUrl = Helper::rewriteUrl($effUrl);
if ($effUrl != $url) {
$this->storeRedirect($url, $effUrl);
$url = $effUrl;
class Helper
{
/**
 * Apply the configured URL rewrite rules to a URL.
 *
 * Rules are read from $GLOBALS['phinde']['urlRewrites'], an array that maps
 * a regular expression (written without delimiters) to its replacement.
 * All rules are applied in configuration order, each one working on the
 * result of the previous one.
 *
 * @param string $url URL to rewrite
 *
 * @return string Rewritten URL; the unmodified URL when no rules are
 *                configured
 */
public static function rewriteUrl($url)
{
    if (empty($GLOBALS['phinde']['urlRewrites'])
        || !is_array($GLOBALS['phinde']['urlRewrites'])
    ) {
        return $url;
    }

    foreach ($GLOBALS['phinde']['urlRewrites'] as $pattern => $replacement) {
        // "#" is the regex delimiter here but also a regular URL character
        // (fragment separator). Escape every "#" that is not escaped yet,
        // so that it cannot terminate the expression early.
        $pattern = preg_replace(
            '/(?<!\\\\)((?:\\\\\\\\)*)#/', '$1\\#', $pattern
        );

        $rewritten = preg_replace('#' . $pattern . '#', $replacement, $url);

        // preg_replace() returns null on failure (e.g. a broken pattern);
        // keep the URL as it is instead of handing null to the callers.
        if ($rewritten !== null) {
            $url = $rewritten;
        }
    }
    return $url;
}
+
public static function isUrlAllowed($url)
{
$urlDomain = parse_url($url, PHP_URL_HOST);