aboutsummaryrefslogtreecommitdiff
path: root/src/phinde/LinkExtractor/Html.php
diff options
context:
space:
mode:
author Christian Weiske <cweiske@cweiske.de> 2016-02-10 14:56:20 +0100
committer Christian Weiske <cweiske@cweiske.de> 2016-02-10 14:56:20 +0100
commit cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8 (patch)
tree 8cc7ee5d841f868e38ccc0b54d8cc6d33a852ed7 /src/phinde/LinkExtractor/Html.php
parent f67e8f0bc3f51f2d280a86a8c7cffa68d812efe1 (diff)
download phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.tar.gz
phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.zip
rework crawler; add atom link extraction
Diffstat (limited to 'src/phinde/LinkExtractor/Html.php')
-rw-r--r-- src/phinde/LinkExtractor/Html.php 67
1 files changed, 67 insertions, 0 deletions
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
new file mode 100644
index 0000000..538d6c4
--- /dev/null
+++ b/src/phinde/LinkExtractor/Html.php
@@ -0,0 +1,67 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/** Extracts the http(s) targets of all <a> elements of an HTML response. */
+class Html
+{
+    /**
+     * Collect every distinct outgoing link of the document.
+     * Hrefs are resolved against the effective URL; fragments are stripped.
+     *
+     * @param \HTTP_Request2_Response $res Response carrying the HTML body
+     * @return LinkInfo[] One entry per distinct absolute http/https URL
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url  = $res->getEffectiveUrl();
+        $body = $res->getBody();
+        if ($body === '') {
+            //loadHTML() rejects empty input (warning on PHP 7, ValueError on 8)
+            return array();
+        }
+
+        //FIXME: mime type switch for cdata
+        $doc = new \DOMDocument();
+        //buffer parse errors of invalid html in libxml instead of using @
+        $oldErrorMode = libxml_use_internal_errors(true);
+        $doc->loadHTML($body);
+        libxml_clear_errors();
+        libxml_use_internal_errors($oldErrorMode);
+
+        //FIXME: extract base url from html
+        $base  = new \Net_URL2($url);
+        $xpath = new \DOMXPath($doc);
+        //FIXME: link rel, img, video
+        $links = $xpath->evaluate('//a');
+
+        $linkInfos   = array();
+        $alreadySeen = array();
+        foreach ($links as $link) {
+            $href = trim($link->getAttribute('href'));
+            if ($href === '' || $href[0] === '#') {
+                //no target at all, or a link on this page
+                continue;
+            }
+
+            $linkUrlObj = $base->resolve($href);
+            $linkUrlObj->setFragment(false);
+            $linkUrl = (string) $linkUrlObj;
+            $scheme  = $linkUrlObj->getScheme();
+            if (isset($alreadySeen[$linkUrl])
+                || !in_array($scheme, array('http', 'https'), true)
+            ) {
+                continue;
+            }
+
+            //FIXME: check target type
+            //FIXME: check nofollow
+            $linkInfos[] = new LinkInfo($linkUrl, $link->textContent, $url);
+            $alreadySeen[$linkUrl] = true;
+        }
+
+        return $linkInfos;
+    }
+}
+?>