aboutsummaryrefslogtreecommitdiff
path: root/src/phinde/LinkExtractor/Atom.php
diff options
context:
space:
mode:
author: Christian Weiske <cweiske@cweiske.de> 2016-02-10 14:56:20 +0100
committer: Christian Weiske <cweiske@cweiske.de> 2016-02-10 14:56:20 +0100
commit: cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8 (patch)
tree: 8cc7ee5d841f868e38ccc0b54d8cc6d33a852ed7 /src/phinde/LinkExtractor/Atom.php
parent: f67e8f0bc3f51f2d280a86a8c7cffa68d812efe1 (diff)
download: phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.tar.gz
phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.zip
rework crawler; add atom link extraction
Diffstat (limited to 'src/phinde/LinkExtractor/Atom.php')
-rw-r--r-- src/phinde/LinkExtractor/Atom.php 35
1 files changed, 35 insertions, 0 deletions
diff --git a/src/phinde/LinkExtractor/Atom.php b/src/phinde/LinkExtractor/Atom.php
new file mode 100644
index 0000000..bb4d90b
--- /dev/null
+++ b/src/phinde/LinkExtractor/Atom.php
@@ -0,0 +1,35 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the links to the entries of an Atom feed.
+ */
+class Atom
+{
+    /**
+     * Collect one LinkInfo per distinct "alternate" link found in the
+     * feed's entries.
+     *
+     * Link hrefs are resolved against the effective URL of the response.
+     * Each LinkInfo is built from the resolved link URL, the title of the
+     * entry the link belongs to, and the feed URL.
+     *
+     * @param \HTTP_Request2_Response $res HTTP response whose body is the feed
+     *
+     * @return LinkInfo[] Links in document order; empty when the body
+     *                    cannot be parsed as XML
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url  = $res->getEffectiveUrl();
+        $base = new \Net_URL2($url);
+
+        $linkInfos   = array();
+        $alreadySeen = array();
+
+        // simplexml_load_string() returns false for malformed XML;
+        // there is nothing to extract in that case.
+        $sx = simplexml_load_string($res->getBody());
+        if ($sx === false) {
+            return $linkInfos;
+        }
+
+        foreach ($sx->entry as $entry) {
+            $linkTitle = (string) $entry->title;
+            foreach ($entry->link as $xlink) {
+                // RFC 4287 4.2.7.2: a link without "rel" attribute has to
+                // be interpreted as rel="alternate".
+                $rel = (string) $xlink['rel'];
+                if ($rel !== '' && $rel !== 'alternate') {
+                    continue;
+                }
+
+                // A link without a target gives us nothing to crawl.
+                $href = (string) $xlink['href'];
+                if ($href === '') {
+                    continue;
+                }
+
+                $linkUrl = (string) $base->resolve($href);
+                if (isset($alreadySeen[$linkUrl])) {
+                    continue;
+                }
+
+                // Only remember URLs that were actually emitted, so that a
+                // non-alternate link cannot hide an alternate one that
+                // points to the same URL.
+                $alreadySeen[$linkUrl] = true;
+                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
+            }
+        }
+
+        return $linkInfos;
+    }
+}
+?>