namespace phinde\LinkExtractor;
use phinde\LinkInfo;
+use phinde\Helper;
class Html
{
public function extract(\HTTP_Request2_Response $res)
{
- $url = $res->getEffectiveUrl();
+ $url = Helper::removeAnchor($res->getEffectiveUrl());
$linkInfos = array();
$dx = new \DOMXPath($doc);
+ $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
+ if ($xbase) {
+ $base = $base->resolve(
+ $xbase->attributes->getNamedItem('href')->textContent
+ );
+ }
+
$meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
->item(0);
if ($meta) {
$links = $dx->evaluate('//a');
//FIXME: link rel, img, video
- $alreadySeen = array();
+ $alreadySeen = array($url => true);
foreach ($links as $link) {
- $linkTitle = $link->textContent;
+ $linkTitle = Helper::sanitizeTitle($link->textContent);
$href = '';
foreach ($link->attributes as $attribute) {
if ($attribute->name == 'href') {