From d8c39f2b0571b9734259b2f9dc218eed24412332 Mon Sep 17 00:00:00 2001
From: Christian Weiske
Date: Thu, 11 Feb 2016 17:37:12 +0100
Subject: [PATCH] sanitize title better

---
 src/phinde/Helper.php             | 25 +++++++++++++++++++++++++
 src/phinde/LinkExtractor/Html.php |  5 +++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php
index 40ea751..312c5e5 100644
--- a/src/phinde/Helper.php
+++ b/src/phinde/Helper.php
@@ -30,5 +30,30 @@ class Helper
         }
         return 'http://' . $url;
     }
+
+    /**
+     * Normalize a title to a single line of text.
+     *
+     * Carriage returns are removed, every run of spaces, tabs and
+     * newlines is collapsed to one space, and surrounding whitespace
+     * is trimmed.
+     *
+     * @param string $str Raw title, e.g. the text content of a link
+     *
+     * @return string Sanitized single-line title
+     */
+    public static function sanitizeTitle($str)
+    {
+        $str = str_replace("\r", '', (string) $str);
+        // A regex collapses whitespace runs of any length; chained
+        // fixed-string replacements only handle a bounded length.
+        $collapsed = preg_replace('/[ \t\n]+/', ' ', $str);
+        if ($collapsed === null) {
+            // preg_replace() signals an engine error with null;
+            // fall back to the input rather than dropping the title
+            $collapsed = $str;
+        }
+        return trim($collapsed);
+    }
 }
 ?>
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
index 4acd19f..a6fa8ef 100644
--- a/src/phinde/LinkExtractor/Html.php
+++ b/src/phinde/LinkExtractor/Html.php
@@ -2,6 +2,7 @@
 namespace phinde\LinkExtractor;
 
 use phinde\LinkInfo;
+use phinde\Helper;
 
 class Html
 {
@@ -36,10 +37,10 @@ class Html
         $links = $dx->evaluate('//a');
         //FIXME: link rel, img, video
 
-        $alreadySeen = array();
+        $alreadySeen = array($url => true);
 
         foreach ($links as $link) {
-            $linkTitle = $link->textContent;
+            $linkTitle = Helper::sanitizeTitle($link->textContent);
             $href = '';
             foreach ($link->attributes as $attribute) {
                 if ($attribute->name == 'href') {
-- 
2.30.2