From c5fb118236c472237ac1fc43b0eb9a98d56b98f1 Mon Sep 17 00:00:00 2001
From: Christian Weiske
Date: Fri, 2 May 2014 18:39:17 +0200
Subject: [PATCH] extract gist titles

---
 src/phorkie/HtmlParser.php | 52 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/phorkie/HtmlParser.php b/src/phorkie/HtmlParser.php
index 6b5d26a..d613452 100644
--- a/src/phorkie/HtmlParser.php
+++ b/src/phorkie/HtmlParser.php
@@ -52,9 +52,14 @@ class HtmlParser
         } else {
             $path = ltrim($arUrl['path'], '/');
         }
-        //FIXME: title
-        $this->arGitUrls[][] = 'git://gist.github.com/'
-            . $path . '.git';
+        $title = $this->getHtmlTitle($url);
+        if ($title === null) {
+            $this->arGitUrls[][] = 'git://gist.github.com/'
+                . $path . '.git';
+        } else {
+            $this->arGitUrls[$title][] = 'git://gist.github.com/'
+                . $path . '.git';
+        }
         return true;
     }
 
@@ -157,5 +162,46 @@ class HtmlParser
             || $scheme == 'http' || $scheme == 'https';
     }
 
+    /**
+     * Extract the title from a HTML URL
+     *
+     * Fetches and parses the document at $url and returns the trimmed
+     * text of its head/title element.
+     *
+     * @param string $url URL to a HTML page
+     *
+     * @return string|null NULL on error or when the page has no
+     *                     non-empty title, title otherwise
+     */
+    public function getHtmlTitle($url)
+    {
+        // Buffer libxml parse errors instead of emitting warnings, and
+        // remember the previous setting so it can be restored below.
+        $previous = libxml_use_internal_errors(true);
+
+        // loadHTMLFile() is an instance method; calling it statically
+        // is deprecated in PHP 5 and a fatal error in PHP 8.
+        $doc    = new \DOMDocument();
+        $loaded = $doc->loadHTMLFile($url);
+
+        // Drop the buffered errors and restore the global libxml state.
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+
+        if ($loaded === false || $doc->documentElement === null) {
+            return null;
+        }
+
+        $sx = simplexml_import_dom($doc);
+        if (!$sx instanceof \SimpleXMLElement || !isset($sx->head->title)) {
+            return null;
+        }
+
+        $title = trim((string) $sx->head->title);
+        if ($title === '') {
+            return null;
+        }
+        return $title;
+    }
 }
 ?>
-- 
2.30.2