From: Christian Weiske Date: Mon, 17 Jun 2013 19:32:19 +0000 (+0200) Subject: first version of comment extraction X-Git-Url: https://git.cweiske.de/stapibas.git/commitdiff_plain/36c92cff442475d3ebeab443470ae34b77d76fd2 first version of comment extraction --- diff --git a/src/stapibas/Cli.php b/src/stapibas/Cli.php index 10b6957..52c7143 100644 --- a/src/stapibas/Cli.php +++ b/src/stapibas/Cli.php @@ -128,7 +128,8 @@ class Cli $cf = new Content_Fetcher($deps); $cf->updateAll(); - //FIXME + $cx = new Content_Extractor($deps); + $cx->updateAll(); } diff --git a/src/stapibas/Content/Extractor.php b/src/stapibas/Content/Extractor.php new file mode 100644 index 0000000..0ede389 --- /dev/null +++ b/src/stapibas/Content/Extractor.php @@ -0,0 +1,83 @@ +deps = $deps; + $this->db = $deps->db; + $this->log = $deps->log; + } + + /** + * Extracts content from all pingbackcontent entries and puts it + * into rbookmarks/rcomments/rlinks. + */ + public function updateAll() + { + $this->log->info('Extracting pingback content..'); + $res = $this->db->query( + 'SELECT * FROM pingbackcontent, pingbacks' + . ' WHERE p_id = pc_p_id' . $this->sqlNeedsUpdate() + ); + $items = 0; + while ($contentRow = $res->fetch(\PDO::FETCH_OBJ)) { + ++$items; + $this->extractContent($contentRow); + } + $this->log->info('Finished extracting %d pingback contents.', $items); + } + + protected function extractContent($contentRow) + { + $doc = new \DOMDocument(); + $typeParts = explode(';', $contentRow->pc_mime_type); + $type = $typeParts[0]; + if ($type == 'application/xhtml+xml' + || $type == 'application/xml' + || $type == 'text/xml' + ) { + $doc->loadXML($contentRow->pc_fulltext); + } else { + $doc->loadHTML($contentRow->pc_fulltext); + } + + //FIXME: delete old content + + $ce = new Content_Extractor_Comment($this->deps->log); + $data = $ce->extract($doc, $contentRow->p_source, $contentRow->p_target); + if ($data !== null) { + $this->log->info('Comment found'); + var_dump($data); + $this->db->exec( + 'INSERT INTO rcomments SET' + . ' rc_pc_id = ' . $this->db->quote($contentRow->pc_id) + . ', rc_source = ' . $this->db->quote($contentRow->p_source) + . ', rc_target = ' . $this->db->quote($contentRow->p_target) + . ', rc_title = ' . $this->db->quote($data['title']) + . ', rc_author_name = ' . $this->db->quote($data['author_name']) + . ', rc_author_url = ' . $this->db->quote($data['author_url']) + . ', rc_author_image = ' . $this->db->quote($data['author_image']) + . ', rc_content = ' . $this->db->quote($data['content']) + . ', rc_updated = NOW()' + ); + return; + } + //FIXME: bookmark, link + } + + + protected function sqlNeedsUpdate() + { + if ($this->deps->options['force']) { + return ''; + } + return ' AND pc_detected_type = 1'; + } + +} +?> diff --git a/src/stapibas/Content/Extractor/Comment.php b/src/stapibas/Content/Extractor/Comment.php new file mode 100644 index 0000000..4c848c0 --- /dev/null +++ b/src/stapibas/Content/Extractor/Comment.php @@ -0,0 +1,134 @@ +log = $log; + } + + /** + * Try to extract comment data from HTML + * + * @param object $doc HTML + * @param string $source URL this HTML has been loaded from + * @param string $target URL the reply should be to + * + * @return mixed NULL if nothing found, array if ok + */ + public function extract(\DOMDocument $doc, $source, $target) + { + $xpath = $this->getXpath($doc); + $hentries = $xpath->query( + '//*[(' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ') and ' + . '//a[' + . $this->xpc('u-in-reply-to') . ' and @href=' . $this->xpq($target) + . ']' + . ']' + ); + + if ($hentries->length == 0) { + return null; + } + + $data = array( + 'content' => null, + 'title' => null, + ); + $hentry = $hentries->item(0); + + $this->extractAuthorData($hentry, $xpath, $data, $doc); + $content = $this->getFirst( + './/*[' . $this->xpc('e-content') . ']', false, $hentry, $xpath + ); + if ($content) { + $data['content'] = $this->innerHtml($content); + } + $data['title'] = $this->getFirst( + './/*[' . $this->xpc('p-name') . ']', false, $hentry, $xpath + ); + + return $data; + } + + protected function extractAuthorData($hentry, $xpath, &$data, $d) + { + $data['author_name'] = null; + $data['author_image'] = null; + $data['author_url'] = null; + + $authors = $xpath->evaluate( + './/*[' . $this->xpc('p-author') . ']' + ); + if ($authors->length != 1) { + return false; + } + + $author = $authors->item(0); + + $data['author_name'] = $this->getFirst( + './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']', + null, $author, $xpath + ); + $data['author_image'] = $this->getFirst( + './/*[' . $this->xpc('u-photo') . ']', + 'src', $author, $xpath + ); + $data['author_url'] = $this->getFirst( + './/*[' . $this->xpc('u-url') . ']', + 'href', $author, $xpath + ); + } + + protected function getFirst($xpathExpr, $attrName, $elem, $xpath) + { + $items = $xpath->evaluate($xpathExpr, $elem); + if (!$items instanceof \DOMNodeList || $items->length == 0) { + return null; + } + + if ($attrName === false) { + return $items->item(0); + } else if ($attrName == null) { + return $items->item(0)->nodeValue; + } else { + return $items->item(0)->attributes->getNamedItem($attrName)->nodeValue; + } + } + + protected function innerHtml($element) + { + $innerHTML = ''; + $children = $element->childNodes; + foreach ($children as $child) { + $tmp_dom = new \DOMDocument(); + $tmp_dom->appendChild($tmp_dom->importNode($child, true)); + $innerHTML .= rtrim($tmp_dom->saveHTML(), "\n"); + } + return trim($innerHTML); + } + + protected function getXpath($node) + { + $xpath = new \DOMXPath($node); + $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml'); + return $xpath; + } + + protected function xpc($class) + { + return 'contains(' + . 'concat(" ", normalize-space(@class), " "),' + . '" ' . $class . ' "' + . ')'; + } + + protected function xpq($str) + { + return '"' . htmlspecialchars($str, ENT_QUOTES) . '"'; + } + +} + +?> diff --git a/tests/phpunit.xml b/tests/phpunit.xml new file mode 100644 index 0000000..7a4a8cd --- /dev/null +++ b/tests/phpunit.xml @@ -0,0 +1,12 @@ + + + + ../src/ + ../tests/ + + + + ../src/ + + + diff --git a/tests/stapibas/Content/Extractor/CommentTest.php b/tests/stapibas/Content/Extractor/CommentTest.php new file mode 100644 index 0000000..219fa40 --- /dev/null +++ b/tests/stapibas/Content/Extractor/CommentTest.php @@ -0,0 +1,43 @@ +loadHtmlFile(__DIR__ . '/data/aaron-parecki.html'); + $source = 'http://aaronparecki.com/replies/2013/04/19/2/indieweb'; + $target = 'http://eschnou.com/entry/testing-indieweb-federation-with-waterpigscouk-aaronpareckicom-and--62-24908.html'; + + $logger = new Logger(); + $logger->debug = true; + $cec = new Content_Extractor_Comment($logger); + $comment = $cec->extract($doc, $source, $target); + + $this->assertNotNull($comment, 'No extracted data'); + $this->assertEquals( + 'Aaron Parecki', + $comment['author_name'], + 'author name error' + ); + $this->assertEquals( + 'http://aaronparecki.com/images/aaronpk.png', + $comment['author_image'] + ); + $this->assertEquals( + 'http://aaronparecki.com/', + $comment['author_url'] + ); + + $this->assertEquals( + <<@eschnou It worked! Now here's a reply! #indieweb +HTM + , + $comment['content'] + ); + } +} +?> diff --git a/tests/stapibas/Content/Extractor/data/aaron-parecki.html b/tests/stapibas/Content/Extractor/data/aaron-parecki.html new file mode 100644 index 0000000..3d0b91b --- /dev/null +++ b/tests/stapibas/Content/Extractor/data/aaron-parecki.html @@ -0,0 +1,232 @@ + + + + + @eschnou It worked! Now here's a reply! #indieweb - Aaron Parecki + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ +
+
+
+ + + +
+ + +
+ +
@eschnou It worked! Now here's a reply! #indieweb
+ + + + +
+ + +
+
+ + + +
+
+
+ + +
+ +
+ + + +
+ +
+

© 1999-2013 by Aaron Parecki.

+

+ Except where otherwise noted, text content on this site is licensed under a Creative Commons Attribution 3.0 License. Creative Commons Attribution 3.0 +

+

+ This site is powered by p3k. +

+
+
+
+ + + + + + + + diff --git a/tests/stapibas/Content/Extractor/data/laurent-eschenauer.html b/tests/stapibas/Content/Extractor/data/laurent-eschenauer.html new file mode 100644 index 0000000..fdaae20 --- /dev/null +++ b/tests/stapibas/Content/Extractor/data/laurent-eschenauer.html @@ -0,0 +1,945 @@ + + + + + + Laurent Eschenauer | Testing #indieweb federation with @waterpigs.co.uk + + + + + + + + + + + + + + + + + + + + +
+
+ + + +
+ +
+ +
+ +
+ +
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + +
 Lifestream Mentions Pictures Videos Places Code Stories About 
+
+ +
+
+ +
+
+ +
+ + +
April 19, 2013
+
+ +
+
+ + + + + + 20:26 + + +
+
+
+ +
+ + + + + + + + + + + + + + +
+ + + + +
+
+ Testing #indieweb federation with @waterpigs.co.uk, @aaronparecki.com and @indiewebcamp.com !
+
+ +
+ Tags:
+ +
+
+
+
+ +
+
+

Comments

+ +
+
+
+
+ +
+ +
+ + Laurent Eschenauer It worked! Now here's a reply! #indieweb +
+
+
+
+
+
+ tantek çelik Gravtar +
+
+ +
+ on 20 Apr 13 at 0:57 CEST
+
+
+ Laurent, are the REPLIES FROM THE #INDIEWEB displayed automatically, are you manually adding them, or automatically queued and you're just manually approving them?
+
+
+
+
+
+ laurent eschenauer Gravtar +
+
+ +
+ on 20 Apr 13 at 8:15 CEST
+
+
+ @tantek.com It is automatic, when I receive a pingback I parse the source for mf2 content to find a hcard and hentry. No moderation, but I receive an email when someone comment/mention so I can react/delete if spammy.

Next for me is to also support webmentions, and enable a 'in-reply-to' flow.
+
+
+
+
+
+ +
+ +
+ Premier essai pour tenter de rejoindre une fédération #indieweb chez @eschnou #fra
+
+
+
+
+
+ +
+ +
+ Les réponses #indieweb chez @eschnou
+
+
+
+
+
+ Laurent Eschenauer Gravtar +
+
+ +
+ on 22 Apr 13 at 22:27 CEST
+
+
+ And now we have unified local/indieweb comments, time ordered ! Let's have a real distributed conversation :-)
+
+
+
+
+
+ Laurent Eschenauer Gravtar +
+
+ +
+ on 22 Apr 13 at 22:56 CEST
+
+
+ @aaronparecki.com This is a reply to your reply, making it a really distributed conversation :-)
+
+
+
+
+
+ +
+ +
+ Laurent Eschenauer great work getting #indieweb comments working :)
+
+
+
+
+
+ Laurent Eschenauer Gravtar +
+
+ +
+ on 23 Apr 13 at 9:39 CEST
+
+
+ @waterpigs.co.uk Thanks for helping out. Could not have done it without your php-mf2 library!
+
+
+
+
+
+ +
+ +
+ Historically, I consider this to be the #indieweb equivalent of this.
+
+
+
+
+
+ Matthias Pfefferle Gravtar +
+
+ +
+ on 24 Apr 13 at 13:26 CEST
+
+
+ Hey Laurent, how do you know where you have to "attach" the reply, or do you run this task by hand?
+
+
+
+
+
+ Laurent Eschenauer Gravtar +
+
+ +
+ on 24 Apr 13 at 14:04 CEST
+
+
+ @notizblog.org Everything is automatic. The pingback request as a 'target' which I map to an existing post (after a bit of regexp magic). In the 'source' item there should also be a link pointing to the target with a 'in-reply-to' tag. More details here: http://indiewebcamp.com/comment
+
+
+
+
+
+ +
+ +
+ WordPress and IndieWeb-Comments
+
+
+
+
+
+ Matthias Pfefferle Gravtar +
+
+ +
+ on 24 Apr 13 at 14:21 CEST
+
+
+ Nice! It seems there is a bug in my mf2 implementation and your storytlr is using the title instead of the post.
+
+
+
+
+
+ Laurent Eschenauer Gravtar +
+
+ +
+ on 24 Apr 13 at 14:25 CEST
+
+
+ There is also a few bugs in my side (also in my own comments timestamp as you can see :-). A bit more work needed, but we are getting there :-) Welcome to the conversation!
+
+
+
+
+
+ Matthias Pfefferle Gravtar +
+
+ +
+ on 24 Apr 13 at 15:55 CEST
+
+
+ BTW, is storytlr also sending pingpacks/webmentions in the comments section? If so, do you have a source url for any "comment" or do you support some kind of html fragments like superfeedr does http://blog.superfeedr.com/fragment-subscription/ ?
+
+
+
+
+
+ Laurent Eschenauer Gravtar +
+
+ +
+ on 24 Apr 13 at 21:19 CEST
+
+
+ @notizblog.org Yes, I send pingback for any mention within a comment (hence the @ at the begining of this comment) and the pingback is from a URl + fragment to the comment. I'm missing proper mf for the comment but it is coming.
+
+
+
+
+
+ +
+ +
+ Le Premier Fil de Discussion Fédéré de Commentaires #Indieweb
+
+
+
+
+
+ +
+ +
+ Hopefully h-card entities should get expanded in the reply context for this note (crosses fingers)
+
+
+
+
+
+ +
+ +
+ Testing, testing, is this thing on? + + + + + + + 21:46 on 2013-04-26
+
+
+
+
+
+ +
+ +
+ Ben Werdmuller + + + + + Really excited by indieweb comments. Impressive idea, and a pointer to what's possible with microformats and webmentions. http://eschnou.com/entry/testing-indieweb-federation-with-waterpigscouk-aaronpareckicom-and--62-24908.html + + + + 3s
+
+
+
+
+
+ +
+ +
+ Foot very much in mouth, here ends my #indieweb comment testing for the night. + + + + 4s
+
+
+
+
+
+ +
+ +
+ Another #indieweb creator has commented on Laurent Eschenauer’s famous thread with a new implementation — congratulations benwerd!
+
+
+
+
+
+ +
+ +
+ Just implemented the ability to send [WebMentions](http://webmention.org)/Pingback. A little late to the party but here goes... #indieweb
+
+
+
+
+
+ +
+
+ + +
+
+ This was linked to here: http://www.sandeep.io/44
+
+
+
+
+
+
+
+ + + + + + + + + +
+ +
+ +
+ +
+ +
+
+ + + +

+
+
+ +
+ +
+ + +
+
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + +