From 91cd1aeb11c7708e09283e79b2db5406c8b378cc Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Tue, 18 Jun 2013 08:00:50 +0200 Subject: [PATCH] comment + link extraction --- src/stapibas/Content/Extractor.php | 38 ++++- src/stapibas/Content/Extractor/Base.php | 117 +++++++++++++ src/stapibas/Content/Extractor/Comment.php | 90 +--------- src/stapibas/Content/Extractor/Link.php | 67 ++++++++ tests/stapibas/Content/Extractor/LinkTest.php | 80 +++++++++ .../data/shadowbox-popup-positioning.htm | 159 ++++++++++++++++++ 6 files changed, 462 insertions(+), 89 deletions(-) create mode 100644 src/stapibas/Content/Extractor/Base.php create mode 100644 src/stapibas/Content/Extractor/Link.php create mode 100644 tests/stapibas/Content/Extractor/LinkTest.php create mode 100644 tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm diff --git a/src/stapibas/Content/Extractor.php b/src/stapibas/Content/Extractor.php index 0ede389..c54344b 100644 --- a/src/stapibas/Content/Extractor.php +++ b/src/stapibas/Content/Extractor.php @@ -46,7 +46,19 @@ class Content_Extractor $doc->loadHTML($contentRow->pc_fulltext); } - //FIXME: delete old content + //delete old content + $this->db->exec( + 'DELETE FROM rboomarks WHERE' + . ' rb_pc_id = ' . $this->db->quote($contentRow->pc_id) + ); + $this->db->exec( + 'DELETE FROM rcomments WHERE' + . ' rc_pc_id = ' . $this->db->quote($contentRow->pc_id) + ); + $this->db->exec( + 'DELETE FROM rlinks WHERE' + . ' rl_pc_id = ' . $this->db->quote($contentRow->pc_id) + ); $ce = new Content_Extractor_Comment($this->deps->log); $data = $ce->extract($doc, $contentRow->p_source, $contentRow->p_target); @@ -67,7 +79,29 @@ class Content_Extractor ); return; } - //FIXME: bookmark, link + + //FIXME: bookmark + + $ce = new Content_Extractor_Link($this->deps->log); + $data = $ce->extract($doc, $contentRow->p_source, $contentRow->p_target); + if ($data !== null) { + $this->log->info('Link found'); + var_dump($data); + $this->db->exec( + 'INSERT INTO rlinks SET' + . ' rl_pc_id = ' . $this->db->quote($contentRow->pc_id) + . ', rl_source = ' . $this->db->quote($contentRow->p_source) + . ', rl_target = ' . $this->db->quote($contentRow->p_target) + . ', rl_title = ' . $this->db->quote($data['title']) + . ', rl_author_name = ' . $this->db->quote($data['author_name']) + . ', rl_author_url = ' . $this->db->quote($data['author_url']) + . ', rl_author_image = ' . $this->db->quote($data['author_image']) + . ', rc_updated = NOW()' + ); + return; + } + + $this->log->info('Nothing found'); } diff --git a/src/stapibas/Content/Extractor/Base.php b/src/stapibas/Content/Extractor/Base.php new file mode 100644 index 0000000..9288120 --- /dev/null +++ b/src/stapibas/Content/Extractor/Base.php @@ -0,0 +1,117 @@ +log = $log; + } + + protected function extractAuthorData($hentry, $xpath, &$data, $source) + { + $data['author_name'] = null; + $data['author_image'] = null; + $data['author_url'] = null; + + $authors = $xpath->evaluate( + './/*[' . $this->xpc('p-author') . ']' + ); + if ($authors->length != 1) { + //no p-author, so use page author data + $data['author_name'] = $this->getFirst( + '/*[self::html or self::h:html]/*[self::head or self::h:head]' + . '/*[(self::meta or self::h:meta) and @name="author"]', + 'content', $hentry, $xpath + ); + + $data['author_url'] = + $this->absUrl( + $this->getFirst( + '/*[self::html or self::h:html]/*[self::head or self::h:head]' + . '/*[(self::link or self::h:link) and @rel="author"]', + 'href', $hentry, $xpath + ), + $source + ); + return; + } + + $author = $authors->item(0); + + $data['author_name'] = $this->getFirst( + './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']', + null, $author, $xpath + ); + $data['author_image'] = $this->getFirst( + './/*[' . $this->xpc('u-photo') . ']', + 'src', $author, $xpath + ); + $data['author_url'] = $this->absUrl( + $this->getFirst( + './/*[' . $this->xpc('u-url') . ']', + 'href', $author, $xpath + ), + $source + ); + } + + protected function getFirst($xpathExpr, $attrName, $elem, $xpath) + { + $items = $xpath->evaluate($xpathExpr, $elem); + if (!$items instanceof \DOMNodeList || $items->length == 0) { + return null; + } + + if ($attrName === false) { + return $items->item(0); + } else if ($attrName == null) { + return $items->item(0)->nodeValue; + } else { + return $items->item(0)->attributes->getNamedItem($attrName)->nodeValue; + } + } + + protected function innerHtml($element) + { + $innerHTML = ''; + $children = $element->childNodes; + foreach ($children as $child) { + $tmp_dom = new \DOMDocument(); + $tmp_dom->appendChild($tmp_dom->importNode($child, true)); + $innerHTML .= rtrim($tmp_dom->saveHTML(), "\n"); + } + return trim($innerHTML); + } + + protected function getXpath($node) + { + $xpath = new \DOMXPath($node); + $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml'); + return $xpath; + } + + protected function xpc($class) + { + return 'contains(' + . 'concat(" ", normalize-space(@class), " "),' + . '" ' . $class . ' "' + . ')'; + } + + protected function xpq($str) + { + return '"' . htmlspecialchars($str, ENT_QUOTES) . '"'; + } + + protected function absUrl($url, $source) + { + if ($url === null) { + return null; + } + $sourceUrl = new \Net_URL2($source); + return (string)$sourceUrl->resolve($url); + } + +} +?> diff --git a/src/stapibas/Content/Extractor/Comment.php b/src/stapibas/Content/Extractor/Comment.php index 4c848c0..ce7403c 100644 --- a/src/stapibas/Content/Extractor/Comment.php +++ b/src/stapibas/Content/Extractor/Comment.php @@ -1,13 +1,8 @@ log = $log; - } - /** * Try to extract comment data from HTML * @@ -22,7 +17,7 @@ class Content_Extractor_Comment $xpath = $this->getXpath($doc); $hentries = $xpath->query( '//*[(' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ') and ' - . '//a[' + . '//*[(self::a or self::h:a) and ' . $this->xpc('u-in-reply-to') . ' and @href=' . $this->xpq($target) . ']' . ']' @@ -38,7 +33,7 @@ class Content_Extractor_Comment ); $hentry = $hentries->item(0); - $this->extractAuthorData($hentry, $xpath, $data, $doc); + $this->extractAuthorData($hentry, $xpath, $data, $source); $content = $this->getFirst( './/*[' . $this->xpc('e-content') . ']', false, $hentry, $xpath ); @@ -51,84 +46,5 @@ class Content_Extractor_Comment return $data; } - - protected function extractAuthorData($hentry, $xpath, &$data, $d) - { - $data['author_name'] = null; - $data['author_image'] = null; - $data['author_url'] = null; - - $authors = $xpath->evaluate( - './/*[' . $this->xpc('p-author') . ']' - ); - if ($authors->length != 1) { - return false; - } - - $author = $authors->item(0); - - $data['author_name'] = $this->getFirst( - './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']', - null, $author, $xpath - ); - $data['author_image'] = $this->getFirst( - './/*[' . $this->xpc('u-photo') . ']', - 'src', $author, $xpath - ); - $data['author_url'] = $this->getFirst( - './/*[' . $this->xpc('u-url') . ']', - 'href', $author, $xpath - ); - } - - protected function getFirst($xpathExpr, $attrName, $elem, $xpath) - { - $items = $xpath->evaluate($xpathExpr, $elem); - if (!$items instanceof \DOMNodeList || $items->length == 0) { - return null; - } - - if ($attrName === false) { - return $items->item(0); - } else if ($attrName == null) { - return $items->item(0)->nodeValue; - } else { - return $items->item(0)->attributes->getNamedItem($attrName)->nodeValue; - } - } - - protected function innerHtml($element) - { - $innerHTML = ''; - $children = $element->childNodes; - foreach ($children as $child) { - $tmp_dom = new \DOMDocument(); - $tmp_dom->appendChild($tmp_dom->importNode($child, true)); - $innerHTML .= rtrim($tmp_dom->saveHTML(), "\n"); - } - return trim($innerHTML); - } - - protected function getXpath($node) - { - $xpath = new \DOMXPath($node); - $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml'); - return $xpath; - } - - protected function xpc($class) - { - return 'contains(' - . 'concat(" ", normalize-space(@class), " "),' - . '" ' . $class . ' "' - . ')'; - } - - protected function xpq($str) - { - return '"' . htmlspecialchars($str, ENT_QUOTES) . '"'; - } - } - ?> diff --git a/src/stapibas/Content/Extractor/Link.php b/src/stapibas/Content/Extractor/Link.php new file mode 100644 index 0000000..91bdb31 --- /dev/null +++ b/src/stapibas/Content/Extractor/Link.php @@ -0,0 +1,67 @@ +getXpath($doc); + $hentries = $xpath->query( + '//*[(' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ')' + . ' and //*[' . $this->xpc('e-content') . ']' + . ']' + ); + + $sourceUrl = new \Net_URL2($source); + $found = false; + + foreach ($hentries as $hentry) { + $links = $xpath->query('.//*[self::a or self::h:a]', $hentry); + foreach ($links as $link) { + $url = (string)$sourceUrl->resolve( + $link->attributes->getNamedItem('href')->nodeValue + ); + if ($url == $target) { + $found = true; + break 2; + } + } + } + + if (!$found) { + return null; + } + + $data = array('title' => null); + $hentry = $hentries->item(0); + + $this->extractAuthorData($hentry, $xpath, $data, $source); + $data['title'] = trim( + $this->getFirst( + './/*[' . $this->xpc('p-name') . ']', null, $hentry, $xpath + ) + ); + if ($data['title'] === null) { + //use page title + $data['title'] = trim( + $this->getFirst( + '/*[self::html or self::h:html]/*[self::head or self::h:head]' + . '/*[self::title or self::h:title]', + null, $hentry, $xpath + ) + ); + } + + return $data; + } +} +?> diff --git a/tests/stapibas/Content/Extractor/LinkTest.php b/tests/stapibas/Content/Extractor/LinkTest.php new file mode 100644 index 0000000..6b5d3ee --- /dev/null +++ b/tests/stapibas/Content/Extractor/LinkTest.php @@ -0,0 +1,80 @@ +loadHtmlFile(__DIR__ . '/data/shadowbox-popup-positioning.htm'); + $source = 'http://www.bogo/tagebuch/shadowbox-popup-positioning.htm'; + $target = 'http://www.bogo/tagebuch/demo/shadowbox-manual-positioning/static.html'; + + $logger = new Logger(); + $logger->debug = true; + $cel = new Content_Extractor_Link($logger); + $link = $cel->extract($doc, $source, $target); + + $this->assertNotNull($link, 'No extracted data'); + + $this->assertEquals( + 'Shadowbox: Manual popup positioning', + $link['title'] + ); + + $this->assertEquals('Christian Weiske', $link['author_name']); + $this->assertNull($link['author_image']); + $this->assertEquals('http://www.bogo/', $link['author_url']); + } + + public function testExtractXmlShadowBox() + { + $doc = new \DOMDocument(); + @$doc->load(__DIR__ . '/data/shadowbox-popup-positioning.htm'); + $source = 'http://www.bogo/tagebuch/shadowbox-popup-positioning.htm'; + $target = 'http://www.bogo/tagebuch/demo/shadowbox-manual-positioning/static.html'; + + $logger = new Logger(); + $logger->debug = true; + $cel = new Content_Extractor_Link($logger); + $link = $cel->extract($doc, $source, $target); + + $this->assertNotNull($link, 'No extracted data'); + + $this->assertEquals( + 'Shadowbox: Manual popup positioning', + $link['title'] + ); + + $this->assertEquals('Christian Weiske', $link['author_name']); + $this->assertNull($link['author_image']); + $this->assertEquals('http://www.bogo/', $link['author_url']); + } + + public function testExtractLaurent() + { + $doc = new \DOMDocument(); + @$doc->loadHtmlFile(__DIR__ . '/data/laurent-eschenauer.html'); + $source = 'http://eschnou.com/entry/testing-indieweb-federation-with-waterpigscouk-aaronpareckicom-and--62-24908.html'; + $target = 'http://indiewebcamp.com'; + + $logger = new Logger(); + $logger->debug = true; + $cel = new Content_Extractor_Link($logger); + $link = $cel->extract($doc, $source, $target); + + $this->assertNotNull($link, 'No extracted data'); + + $this->assertEquals( + 'Testing #indieweb federation with @waterpigs.co.uk, @aaronparecki.com and @indiewebcamp.com !', + $link['title'] + ); + + $this->assertNull($link['author_name']); + $this->assertNull($link['author_image']); + $this->assertNull($link['author_url']); + } + +} +?> diff --git a/tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm b/tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm new file mode 100644 index 0000000..d289aec --- /dev/null +++ b/tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm @@ -0,0 +1,159 @@ + + + + + Shadowbox: Manual popup positioning + + + + + + + + + + + + + + + + + + + + + + +
+

Shadowbox: Manual popup positioning

+ +
+ +
+

+ This article has originally been published on my employer's + blog: + + Shadowbox: Manual popup positioning @ netresearch + . +

+
+ +

+ Shadowbox can be used to display + images, videos or other HTML pages in a popup on your website. + Sometimes it is necessary to manually adjust the position of the overlay + window, for example when using it in an iframe with a very large + height setting. + Shadowbox itself does not offer a hook to modify the position, but with some + JavaScript trickery it is possible to manipulate the position nevertheless. +

+

+ The idea is - since we have no hook to register with - to replace the + original positioning method with our own. + Since JavaScript allows method renaming, this is fairly easy. +

+ + +

Static position

+

+ Shadowbox uses method setDimensions() to calculate and set position + and size of the popup window. + We rename it and put our own method at this place: +

+

+window.Shadowbox.setDimensionsOld = window.Shadowbox.setDimensions;
+window.Shadowbox.setDimensions = function (height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect) {
+    var S = window.Shadowbox;
+    window.Shadowbox.setDimensionsOld(height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect);
+    window.Shadowbox.dimensions.top = 10;
+    return window.Shadowbox.dimensions;
+}
+
+]]>
+

+ Now we have our shadowbox popup fixed at 10 pixels from the top of the page. +

+

+ Have a look at the + static positioning demo. +

+ + +

Dynamic position

+

+ When you have an iframe with some several thousand pixels in height, + you don't want to have a fixed position on top but a position near the mouse + cursor or the element that has been clicked. +

+

+ The following code positions the popup 10 pixels below the object that has + been clicked to open the overlay: +

+

+window.Shadowbox.setDimensionsOld = window.Shadowbox.setDimensions;
+window.Shadowbox.setDimensions = function (height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect) {
+    var S = window.Shadowbox;
+    window.Shadowbox.setDimensionsOld(height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect);
+    if (window.shadowboxClickObj && window.shadowboxClickObj.link) {
+        var offset = $(window.shadowboxClickObj.link).offset();
+        window.Shadowbox.dimensions.top = offset.top + 10;
+        $('#sb-container').css({position: 'absolute', 'height': $(document).height()});
+    }
+    return window.Shadowbox.dimensions
+}
+
+window.Shadowbox.skin.onOpenOld = window.Shadowbox.skin.onOpen;
+window.Shadowbox.skin.onOpen = function(obj, callback) {
+    window.shadowboxClickObj = obj;
+    window.Shadowbox.skin.onOpenOld(obj, callback);
+}
+
+]]>
+

+ Here, onOpen() needs to be overwritten as well because the clicked + object is not available anymore in setDimensions(). +

+

+ Have a look at the + dynamic positioning demo. +

+ +
+
+

+ Comments? Please + send an e-mail. +

+
+
+ -- 2.30.2