comment + link extraction
author Christian Weiske <cweiske@cweiske.de>
Tue, 18 Jun 2013 06:00:50 +0000 (08:00 +0200)
committer Christian Weiske <cweiske@cweiske.de>
Tue, 18 Jun 2013 06:03:07 +0000 (08:03 +0200)
src/stapibas/Content/Extractor.php
src/stapibas/Content/Extractor/Base.php [new file with mode: 0644]
src/stapibas/Content/Extractor/Comment.php
src/stapibas/Content/Extractor/Link.php [new file with mode: 0644]
tests/stapibas/Content/Extractor/LinkTest.php [new file with mode: 0644]
tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm [new file with mode: 0644]

index 0ede38959bc007d4c33d1f2e907d34cdc85af91e..c54344be9f7705bb79a2d020e02a39b91ed7de6d 100644 (file)
@@ -46,7 +46,19 @@ class Content_Extractor
             $doc->loadHTML($contentRow->pc_fulltext);
         }
 
-        //FIXME: delete old content
+        //delete old content: drop the rows in rbookmarks, rcomments and
+        // rlinks that reference this content row before extracting again
+        $this->db->exec(
+            'DELETE FROM rbookmarks WHERE'
+            . ' rb_pc_id = ' . $this->db->quote($contentRow->pc_id)
+        );
+        $this->db->exec(
+            'DELETE FROM rcomments WHERE'
+            . ' rc_pc_id = ' . $this->db->quote($contentRow->pc_id)
+        );
+        $this->db->exec(
+            'DELETE FROM rlinks WHERE'
+            . ' rl_pc_id = ' . $this->db->quote($contentRow->pc_id)
+        );
 
         $ce = new Content_Extractor_Comment($this->deps->log);
         $data = $ce->extract($doc, $contentRow->p_source, $contentRow->p_target);
@@ -67,7 +79,29 @@ class Content_Extractor
             );
             return;
         }
-        //FIXME: bookmark, link
+
+        //FIXME: bookmark
+
+        //try to extract a plain link; on success store it and stop here
+        $ce = new Content_Extractor_Link($this->deps->log);
+        $data = $ce->extract($doc, $contentRow->p_source, $contentRow->p_target);
+        if ($data !== null) {
+            $this->log->info('Link found');
+            $this->db->exec(
+                'INSERT INTO rlinks SET'
+                . '  rl_pc_id = ' . $this->db->quote($contentRow->pc_id)
+                . ', rl_source = ' . $this->db->quote($contentRow->p_source)
+                . ', rl_target = ' . $this->db->quote($contentRow->p_target)
+                . ', rl_title = ' . $this->db->quote($data['title'])
+                . ', rl_author_name = ' . $this->db->quote($data['author_name'])
+                . ', rl_author_url = ' . $this->db->quote($data['author_url'])
+                . ', rl_author_image = ' . $this->db->quote($data['author_image'])
+                //rlinks columns use the rl_ prefix (rc_ belongs to rcomments)
+                . ', rl_updated = NOW()'
+            );
+            return;
+        }
+
+        $this->log->info('Nothing found');
     }
 
 
diff --git a/src/stapibas/Content/Extractor/Base.php b/src/stapibas/Content/Extractor/Base.php
new file mode 100644 (file)
index 0000000..9288120
--- /dev/null
@@ -0,0 +1,117 @@
+<?php
+namespace stapibas;
+
+/**
+ * Shared helpers for the content extractors: XPath setup, class-name and
+ * string predicates, first-match lookup, author extraction and URL
+ * resolution.
+ */
+class Content_Extractor_Base
+{
+    /**
+     * Logger passed to the constructor.
+     *
+     * Declared explicitly instead of being created as a dynamic property;
+     * kept public so any existing outside access keeps working.
+     *
+     * @var Logger
+     */
+    public $log;
+
+    /**
+     * @param Logger $log Logger stored in $this->log
+     */
+    public function __construct(Logger $log)
+    {
+        $this->log = $log;
+    }
+
+    /**
+     * Fill the author_name, author_image and author_url keys of $data.
+     *
+     * If exactly one p-author element is found below $hentry, its
+     * p-name/fn text, u-photo "src" and u-url "href" are used.
+     * Otherwise the page-level <meta name="author"> and <link rel="author">
+     * are used and author_image stays null.
+     *
+     * @param object $hentry Entry node the author is looked up in
+     * @param object $xpath  XPath object of the document
+     * @param array  &$data  Result array, modified in place
+     * @param string $source URL the document has been loaded from; used as
+     *                       base when resolving relative author URLs
+     *
+     * @return void
+     */
+    protected function extractAuthorData($hentry, $xpath, &$data, $source)
+    {
+        $data['author_name']  = null;
+        $data['author_image'] = null;
+        $data['author_url']   = null;
+
+        //the expression is relative, so it needs $hentry as context node;
+        // without one it is evaluated against the whole document
+        $authors = $xpath->evaluate(
+            './/*[' . $this->xpc('p-author') . ']',
+            $hentry
+        );
+        if ($authors->length != 1) {
+            //no p-author, so use page author data
+            $headPath = '/*[self::html or self::h:html]'
+                . '/*[self::head or self::h:head]';
+
+            $data['author_name'] = $this->getFirst(
+                $headPath . '/*[(self::meta or self::h:meta) and @name="author"]',
+                'content', $hentry, $xpath
+            );
+            $data['author_url'] = $this->absUrl(
+                $this->getFirst(
+                    $headPath . '/*[(self::link or self::h:link) and @rel="author"]',
+                    'href', $hentry, $xpath
+                ),
+                $source
+            );
+            return;
+        }
+
+        $author = $authors->item(0);
+
+        $data['author_name'] = $this->getFirst(
+            './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']',
+            null, $author, $xpath
+        );
+        $data['author_image'] = $this->getFirst(
+            './/*[' . $this->xpc('u-photo') . ']',
+            'src', $author, $xpath
+        );
+        $data['author_url'] = $this->absUrl(
+            $this->getFirst(
+                './/*[' . $this->xpc('u-url') . ']',
+                'href', $author, $xpath
+            ),
+            $source
+        );
+    }
+
+    /**
+     * Evaluate an XPath expression and return data of the first match.
+     *
+     * @param string $xpathExpr XPath expression
+     * @param mixed  $attrName  false: return the node itself,
+     *                          null: return the node value,
+     *                          string: return the value of that attribute
+     * @param object $elem      Context node for the expression
+     * @param object $xpath     XPath object of the document
+     *
+     * @return mixed NULL if nothing matched or the requested attribute does
+     *               not exist, otherwise node or string as described above
+     */
+    protected function getFirst($xpathExpr, $attrName, $elem, $xpath)
+    {
+        $items = $xpath->evaluate($xpathExpr, $elem);
+        if (!$items instanceof \DOMNodeList || $items->length == 0) {
+            return null;
+        }
+
+        $first = $items->item(0);
+        if ($attrName === false) {
+            return $first;
+        }
+        if ($attrName == null) {
+            return $first->nodeValue;
+        }
+
+        //the matched node may lack the attribute (getNamedItem() gives null)
+        if ($first->attributes === null) {
+            return null;
+        }
+        $attr = $first->attributes->getNamedItem($attrName);
+        if ($attr === null) {
+            return null;
+        }
+        return $attr->nodeValue;
+    }
+
+    /**
+     * Serialize the children of an element to an HTML string.
+     *
+     * Every child is imported into a temporary document and saved as HTML;
+     * trailing newlines of each piece and surrounding whitespace of the
+     * final result are removed.
+     *
+     * @param object $element Node whose child nodes get serialized
+     *
+     * @return string HTML of all child nodes
+     */
+    protected function innerHtml($element)
+    {
+        $innerHTML = '';
+        $children = $element->childNodes;
+        foreach ($children as $child) {
+            $tmp_dom = new \DOMDocument();
+            $tmp_dom->appendChild($tmp_dom->importNode($child, true));
+            $innerHTML .= rtrim($tmp_dom->saveHTML(), "\n");
+        }
+        return trim($innerHTML);
+    }
+
+    /**
+     * Create an XPath object with the prefix "h" bound to the XHTML
+     * namespace, so expressions can match both plain and namespaced elements.
+     *
+     * @param object $node Document to create the XPath object for
+     *
+     * @return \DOMXPath
+     */
+    protected function getXpath($node)
+    {
+        $xpath = new \DOMXPath($node);
+        $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
+        return $xpath;
+    }
+
+    /**
+     * Build an XPath condition that is true when the "class" attribute
+     * contains the given class name as whitespace-separated token.
+     *
+     * @param string $class Class name; inserted into the expression as-is
+     *
+     * @return string XPath condition
+     */
+    protected function xpc($class)
+    {
+        return 'contains('
+            . 'concat(" ", normalize-space(@class), " "),'
+            . '" ' . $class . ' "'
+            . ')';
+    }
+
+    /**
+     * Quote a string for usage as XPath string literal.
+     *
+     * XPath 1.0 literals have no escape sequences and do not decode HTML
+     * entities, so the value must not be HTML-escaped: it is wrapped in
+     * the quote character it does not contain, and split with concat()
+     * when it contains both kinds.
+     *
+     * @param string $str Raw string
+     *
+     * @return string XPath expression evaluating to $str
+     */
+    protected function xpq($str)
+    {
+        $str = (string)$str;
+        if (strpos($str, '"') === false) {
+            return '"' . $str . '"';
+        }
+        if (strpos($str, "'") === false) {
+            return "'" . $str . "'";
+        }
+        //a"b'c becomes concat("a", '"', "b'c")
+        return 'concat("' . str_replace('"', '", \'"\', "', $str) . '")';
+    }
+
+    /**
+     * Resolve a possibly relative URL against the source URL.
+     *
+     * @param string $url    URL to resolve, may be NULL
+     * @param string $source URL used as base
+     *
+     * @return string Resolved URL, NULL if $url is NULL
+     */
+    protected function absUrl($url, $source)
+    {
+        if ($url === null) {
+            return null;
+        }
+        $sourceUrl = new \Net_URL2($source);
+        return (string)$sourceUrl->resolve($url);
+    }
+
+}
+?>
index 4c848c0934c29479be68ae0fc614c1c8febb34fe..ce7403c8d1959a1cc5a04469774bbf9d4af8a657 100644 (file)
@@ -1,13 +1,8 @@
 <?php
 namespace stapibas;
 
-class Content_Extractor_Comment
+class Content_Extractor_Comment extends Content_Extractor_Base
 {
-    public function __construct(Logger $log)
-    {
-        $this->log = $log;
-    }
-
     /**
      * Try to extract comment data from HTML
      *
@@ -22,7 +17,7 @@ class Content_Extractor_Comment
         $xpath = $this->getXpath($doc);
         $hentries = $xpath->query(
             '//*[(' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ') and '
-            . '//a['
+            . '//*[(self::a or self::h:a) and '
             . $this->xpc('u-in-reply-to') . ' and @href=' . $this->xpq($target)
             . ']'
             . ']'
@@ -38,7 +33,7 @@ class Content_Extractor_Comment
         );
         $hentry = $hentries->item(0);
 
-        $this->extractAuthorData($hentry, $xpath, $data, $doc);
+        $this->extractAuthorData($hentry, $xpath, $data, $source);
         $content = $this->getFirst(
             './/*[' . $this->xpc('e-content') . ']', false, $hentry, $xpath
         );
@@ -51,84 +46,5 @@ class Content_Extractor_Comment
 
         return $data;
     }
-
-    protected function extractAuthorData($hentry, $xpath, &$data, $d)
-    {
-        $data['author_name']  = null;
-        $data['author_image'] = null;
-        $data['author_url']   = null;
-
-        $authors = $xpath->evaluate(
-            './/*[' . $this->xpc('p-author') . ']'
-        );
-        if ($authors->length != 1) {
-            return false;
-        }
-
-        $author = $authors->item(0);
-
-        $data['author_name'] = $this->getFirst(
-            './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']',
-            null, $author, $xpath
-        );
-        $data['author_image'] = $this->getFirst(
-            './/*[' . $this->xpc('u-photo') . ']',
-            'src', $author, $xpath
-        );
-        $data['author_url'] = $this->getFirst(
-            './/*[' . $this->xpc('u-url') . ']',
-            'href', $author, $xpath
-        );
-    }
-
-    protected function getFirst($xpathExpr, $attrName, $elem, $xpath)
-    {
-        $items = $xpath->evaluate($xpathExpr, $elem);
-        if (!$items instanceof \DOMNodeList || $items->length == 0) {
-            return null;
-        }
-
-        if ($attrName === false) {
-            return $items->item(0);
-        } else if ($attrName == null) {
-            return $items->item(0)->nodeValue;
-        } else {
-            return $items->item(0)->attributes->getNamedItem($attrName)->nodeValue;
-        }
-    }
-
-    protected function innerHtml($element)
-    {
-        $innerHTML = '';
-        $children = $element->childNodes;
-        foreach ($children as $child) {
-            $tmp_dom = new \DOMDocument();
-            $tmp_dom->appendChild($tmp_dom->importNode($child, true));
-            $innerHTML .= rtrim($tmp_dom->saveHTML(), "\n");
-        }
-        return trim($innerHTML);
-    }
-
-    protected function getXpath($node)
-    {
-        $xpath = new \DOMXPath($node);
-        $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
-        return $xpath;
-    }
-
-    protected function xpc($class)
-    {
-        return 'contains('
-            . 'concat(" ", normalize-space(@class), " "),'
-            . '" ' . $class . ' "'
-            . ')';
-    }
-
-    protected function xpq($str)
-    {
-        return '"' . htmlspecialchars($str, ENT_QUOTES) . '"';
-    }
-
 }
-
 ?>
diff --git a/src/stapibas/Content/Extractor/Link.php b/src/stapibas/Content/Extractor/Link.php
new file mode 100644 (file)
index 0000000..91bdb31
--- /dev/null
@@ -0,0 +1,67 @@
+<?php
+namespace stapibas;
+
+/**
+ * Finds a plain link to the target URL inside an h-entry and collects
+ * title and author data for it.
+ */
+class Content_Extractor_Link extends Content_Extractor_Base
+{
+    /**
+     * Try to extract link data from HTML
+     *
+     * The document counts as linking to $target when an <a> element inside
+     * an h-entry/hentry has a href that, resolved against $source, equals
+     * $target. Title and author data are taken from that entry.
+     *
+     * @param object $doc HTML
+     * @param string $source URL this HTML has been loaded from
+     * @param string $target URL that has to be linked to
+     *
+     * @return mixed NULL if nothing found, array with the keys title,
+     *               author_name, author_image and author_url if ok.
+     *               title is always a string, empty if none was found.
+     */
+    public function extract(\DOMDocument $doc, $source, $target)
+    {
+        $xpath = $this->getXpath($doc);
+        //NOTE(review): the e-content test starts with "//", so it is true as
+        // soon as any e-content element exists in the document, not only
+        // inside the entry - confirm whether ".//" was intended
+        $hentries = $xpath->query(
+            '//*[(' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ')'
+            . ' and //*[' . $this->xpc('e-content') . ']'
+            . ']'
+        );
+
+        $sourceUrl = new \Net_URL2($source);
+        $target    = (string)$target;
+        $hentry    = null;
+
+        foreach ($hentries as $candidate) {
+            $links = $xpath->query('.//*[self::a or self::h:a]', $candidate);
+            foreach ($links as $link) {
+                $href = $link->attributes->getNamedItem('href');
+                if ($href === null) {
+                    //anchor without href (e.g. <a name="..">) links nowhere
+                    continue;
+                }
+                $url = (string)$sourceUrl->resolve($href->nodeValue);
+                if ($url === $target) {
+                    //keep the entry that contains the link, not just the
+                    // first entry of the document
+                    $hentry = $candidate;
+                    break 2;
+                }
+            }
+        }
+
+        if ($hentry === null) {
+            return null;
+        }
+
+        $data = array('title' => null);
+        $this->extractAuthorData($hentry, $xpath, $data, $source);
+
+        //getFirst() returns NULL when there is no p-name; trim() would turn
+        // that into '' and hide it, so check for the empty string instead
+        $title = trim(
+            (string)$this->getFirst(
+                './/*[' . $this->xpc('p-name') . ']', null, $hentry, $xpath
+            )
+        );
+        if ($title === '') {
+            //no usable p-name: use page title
+            $title = trim(
+                (string)$this->getFirst(
+                    '/*[self::html or self::h:html]/*[self::head or self::h:head]'
+                    . '/*[self::title or self::h:title]',
+                    null, $hentry, $xpath
+                )
+            );
+        }
+        $data['title'] = $title;
+
+        return $data;
+    }
+}
+?>
diff --git a/tests/stapibas/Content/Extractor/LinkTest.php b/tests/stapibas/Content/Extractor/LinkTest.php
new file mode 100644 (file)
index 0000000..6b5d3ee
--- /dev/null
@@ -0,0 +1,80 @@
+<?php
+namespace stapibas;
+require_once 'stapibas/autoloader.php';
+
+/**
+ * Tests for Content_Extractor_Link using HTML fixtures from the data/
+ * directory next to this file.
+ */
+class Content_Extractor_LinkTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * Same fixture as testExtractXmlShadowBox(), loaded with the HTML parser.
+     */
+    public function testExtractShadowBox()
+    {
+        $this->assertShadowBoxLink(
+            $this->runExtractor(
+                __DIR__ . '/data/shadowbox-popup-positioning.htm',
+                false,
+                'http://www.bogo/tagebuch/shadowbox-popup-positioning.htm',
+                'http://www.bogo/tagebuch/demo/shadowbox-manual-positioning/static.html'
+            )
+        );
+    }
+
+    /**
+     * Same fixture as testExtractShadowBox(), loaded with the XML parser.
+     */
+    public function testExtractXmlShadowBox()
+    {
+        $this->assertShadowBoxLink(
+            $this->runExtractor(
+                __DIR__ . '/data/shadowbox-popup-positioning.htm',
+                true,
+                'http://www.bogo/tagebuch/shadowbox-popup-positioning.htm',
+                'http://www.bogo/tagebuch/demo/shadowbox-manual-positioning/static.html'
+            )
+        );
+    }
+
+    /**
+     * Fixture for which all three author fields are expected to be NULL.
+     */
+    public function testExtractLaurent()
+    {
+        $result = $this->runExtractor(
+            __DIR__ . '/data/laurent-eschenauer.html',
+            false,
+            'http://eschnou.com/entry/testing-indieweb-federation-with-waterpigscouk-aaronpareckicom-and--62-24908.html',
+            'http://indiewebcamp.com'
+        );
+
+        $this->assertNotNull($result, 'No extracted data');
+
+        $this->assertEquals(
+            'Testing #indieweb federation with @waterpigs.co.uk, @aaronparecki.com and @indiewebcamp.com !',
+            $result['title']
+        );
+
+        foreach (array('author_name', 'author_image', 'author_url') as $key) {
+            $this->assertNull($result[$key]);
+        }
+    }
+
+    /**
+     * Load a fixture and run the link extractor on it.
+     *
+     * @param string  $path   Full path of the fixture file
+     * @param boolean $asXml  TRUE to load with the XML parser, FALSE to load
+     *                        with the HTML parser
+     * @param string  $source URL the fixture pretends to be loaded from
+     * @param string  $target URL the extractor shall look for
+     *
+     * @return mixed Return value of Content_Extractor_Link::extract()
+     */
+    private function runExtractor($path, $asXml, $source, $target)
+    {
+        $dom = new \DOMDocument();
+        if ($asXml) {
+            @$dom->load($path);
+        } else {
+            @$dom->loadHtmlFile($path);
+        }
+
+        $log = new Logger();
+        $log->debug = true;
+
+        $extractor = new Content_Extractor_Link($log);
+        return $extractor->extract($dom, $source, $target);
+    }
+
+    /**
+     * Assertions shared by both shadowbox tests.
+     *
+     * @param mixed $result Return value of the extractor
+     *
+     * @return void
+     */
+    private function assertShadowBoxLink($result)
+    {
+        $this->assertNotNull($result, 'No extracted data');
+
+        $this->assertEquals(
+            'Shadowbox: Manual popup positioning',
+            $result['title']
+        );
+
+        $this->assertEquals('Christian Weiske', $result['author_name']);
+        $this->assertNull($result['author_image']);
+        $this->assertEquals('http://www.bogo/', $result['author_url']);
+    }
+
+}
+?>
diff --git a/tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm b/tests/stapibas/Content/Extractor/data/shadowbox-popup-positioning.htm
new file mode 100644 (file)
index 0000000..d289aec
--- /dev/null
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head profile="http://microformats.org/profile/rel-tag http://microformats.org/profile/h-entry">
+  <title>Shadowbox: Manual popup positioning</title>
+  <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+  <meta name="author" content="Christian Weiske" />
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  <meta http-equiv="content-language" content="en" />
+  <meta name="keywords" content="programming, web" />
+  <meta name="DC.date.created" content="2013-04-30T22:10:01+02:00" />
+  <meta name="DC.date.modified" content="2013-04-30T22:10:01+02:00" />
+  <link rel="license" type="text/html" href="http://creativecommons.org/licenses/by-nc-sa/3.0/" />
+  <link rel="license" type="application/rdf+xml" href="http://creativecommons.org/licenses/by-nc-sa/3.0/rdf" />
+  <link rel="canonical" href="http://www.netresearch.de/blog/shadowbox-manual-popup-positioning/" />
+   <link rel="stylesheet" type="text/css" href="tagebuch.css"/>
+  <link rel="contents" href="./" title="Sitemap"/>
+  <link rel="author" href="/" title="About the creator of this post"/>
+  <link rel="prev" href="php-redirection-limit-reached.htm" title="Next blog entry"/>
+  <link rel="next" href="gitorious-wildcard-search.htm" title="Previous blog entry"/>
+  <!--[if IE]>
+  <meta http-equiv="refresh" content="5; url=http://stackoverflow.com/q/9182692/282601">
+  <![endif]--></head>
+ <body class="h-entry hentry">
+ <div class="sidebar">
+  <!-- date -->
+  <p>
+   <span title="2013-04-30T22:10:01+02:00" class="dt-published published">
+    April 30, 2013   </span>
+     </p>
+
+  <ul class="prevnext">
+     <li class="next"><a href="gitorious-wildcard-search.htm">Gitorious: Enable wildcard search</a></li>
+   <li><a href="php-redirection-limit-reached.htm">PHP: Redirection limit reached</a></li>
+   <li class="up"><a href="./">Tagebuch</a></li>
+  </ul>
+
+  <h3>Tags</h3>
+  <ul class="tags">
+     <li><a rel="tag" class="p-category" href="tag/programming">programming</a>
+    <ul>
+     <li><a href="json-display.htm">Displaying JSON in your browser</a></li>
+    </ul>
+   </li>
+   <li><a rel="tag" class="p-category" href="tag/web">web</a>
+    <ul>
+     <li><a href="json-display.htm">Displaying JSON in your browser</a></li>
+    </ul>
+   </li>
+  </ul>
+ </div>
+
+ <div class="frame">
+  <h1 class="p-name entry-title">Shadowbox: Manual popup positioning</h1>
+
+  <div id="content" class="e-content entry-content">
+
+  <div class="warning">
+   <p>
+    This article has originally been published on my employer's
+    blog: 
+    <a href="http://www.netresearch.de/blog/shadowbox-manual-popup-positioning/">
+     Shadowbox: Manual popup positioning @ netresearch
+    </a>.
+   </p>
+  </div>
+
+  <p>
+   <a href="http://shadowbox-js.com/">Shadowbox</a> can be used to display
+   images, videos or other HTML pages in a popup on your website.
+   Sometimes it is necessary to manually adjust the position of the overlay
+   window, for example when using it in an iframe with a very large
+   height setting.
+   Shadowbox itself does not offer a hook to modify the position, but with some
+   JavaScript trickery it is possible to manipulate the position nevertheless.
+  </p>
+  <p>
+   The idea is - since we have no hook to register with - to replace the
+   original positioning method with our own.
+   Since JavaScript allows method renaming, this is fairly easy.
+  </p>
+
+
+  <h2 id="static-position">Static position<a class="anchorlink" href="#static-position"></a></h2>
+  <p>
+   Shadowbox uses method <tt>setDimensions()</tt> to calculate and set position
+   and size of the popup window.
+   We rename it and put our own method at this place:
+  </p>
+  <pre><code class="lang-js"><![CDATA[<script type="text/javascript">
+window.Shadowbox.setDimensionsOld = window.Shadowbox.setDimensions;
+window.Shadowbox.setDimensions = function (height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect) {
+    var S = window.Shadowbox;
+    window.Shadowbox.setDimensionsOld(height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect);
+    window.Shadowbox.dimensions.top = 10;
+    return window.Shadowbox.dimensions;
+}
+</script>
+]]></code></pre>
+  <p>
+   Now we have our shadowbox popup fixed at 10 pixels from the top of the page.
+  </p>
+  <p>
+   Have a look at the
+   <a href="demo/shadowbox-manual-positioning/static.html">static positioning demo</a>.
+  </p>
+
+
+  <h2 id="dynamic-position">Dynamic position<a class="anchorlink" href="#dynamic-position"></a></h2>
+  <p>
+   When you have an iframe with some several thousand pixels in height,
+   you don't want to have a fixed position on top but a position near the mouse
+   cursor or the element that has been clicked.
+  </p>
+  <p>
+  The following code positions the popup 10 pixels below the object that has
+  been clicked to open the overlay:
+  </p>
+  <pre><code class="lang-js"><![CDATA[<script type="text/javascript">
+window.Shadowbox.setDimensionsOld = window.Shadowbox.setDimensions;
+window.Shadowbox.setDimensions = function (height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect) {
+    var S = window.Shadowbox;
+    window.Shadowbox.setDimensionsOld(height, width, maxHeight, maxWidth, topBottom, leftRight, padding, preserveAspect);
+    if (window.shadowboxClickObj && window.shadowboxClickObj.link) {
+        var offset = $(window.shadowboxClickObj.link).offset();
+        window.Shadowbox.dimensions.top = offset.top + 10;
+        $('#sb-container').css({position: 'absolute', 'height': $(document).height()});
+    }
+    return window.Shadowbox.dimensions
+}
+
+window.Shadowbox.skin.onOpenOld = window.Shadowbox.skin.onOpen;
+window.Shadowbox.skin.onOpen = function(obj, callback) {
+    window.shadowboxClickObj = obj;
+    window.Shadowbox.skin.onOpenOld(obj, callback);
+}
+</script>
+]]></code></pre>
+  <p>
+   Here, <tt>onOpen()</tt> needs to be overwritten as well because the clicked
+  object is not available anymore in <tt>setDimensions()</tt>.
+  </p>
+  <p>
+   Have a look at the
+   <a href="demo/shadowbox-manual-positioning/dynamic.html">dynamic positioning demo</a>.
+  </p>
+  </div>
+  <div class="comments">
+   <p>
+    Comments? Please
+    <a href="&#x6D;&#x61;&#x69;&#x6C;&#x74;&#x6F;&#x3A;Christian%20Weiske%20%3C&#x63;&#x77;&#x65;&#x69;&#x73;&#x6B;&#x65;&#x0040;&#x63;&#x77;&#x65;&#x69;&#x73;&#x6B;&#x65;&#x2E;&#x64;&#x65;%3E?subject=Re:%20Shadowbox%3A%20Manual%20popup%20positioning">send an e-mail</a>.
+   </p>
+  </div>
+ </div></body>
+</html>