simple cache for rendered files
[phorkie.git] / src / phorkie / HtmlParser.php
index f75107415a10227093abe30ceb5469e00c0ba23f..38d8af858206627cc15b78e2c680b2ef3a72b225 100644 (file)
@@ -44,9 +44,23 @@ class HtmlParser
         if ($scheme == 'https' && isset($arUrl['host'])
             && $arUrl['host'] == 'gist.github.com'
         ) {
-            //FIXME: title
-            $this->arGitUrls[][] = 'git://gist.github.com/'
-                . ltrim($arUrl['path'], '/') . '.git';
+            //https://gist.github.com/cweiske/2400389
+            // clone URL: https://gist.github.com/2400389.git
+            //strip slashes on both ends so that a trailing "/" does not
+            // add an empty path segment
+            $gistPath = trim($arUrl['path'], '/');
+            $parts = explode('/', $gistPath);
+            //compare the number of segments, not count() of a boolean
+            if (count($parts) == 2) {
+                //we only want the number, not the user name
+                $path = $parts[1];
+            } else {
+                $path = $gistPath;
+            }
+            $title = $this->getHtmlTitle($url);
+            if ($title === null) {
+                $this->arGitUrls[][] = 'https://gist.github.com/'
+                    . $path . '.git';
+            } else {
+                $this->arGitUrls[$title][] = 'https://gist.github.com/'
+                    . $path . '.git';
+            }
             return true;
         }
 
@@ -83,6 +97,7 @@ class HtmlParser
         } else {
             $sx = simplexml_import_dom(\DOMDocument::loadHTML($html));
         }
+        //FIXME: handle network error
 
         $elems = $sx->xpath('//*[@rel="vcs-git"]');
         $titles = $sx->xpath('/html/head/title');
@@ -149,5 +164,26 @@ class HtmlParser
             || $scheme == 'http' || $scheme == 'https';
     }
 
+    /**
+     * Extract the title from a HTML URL
+     *
+     * Loads the document at $url with DOMDocument and returns the text of
+     * its <head><title> element. libxml errors are buffered internally
+     * while loading; the buffer is cleared and the previous libxml error
+     * handling mode is restored before returning.
+     *
+     * @param string $url URL to a HTML page
+     *
+     * @return string|null NULL on error, title otherwise
+     */
+    public function getHtmlTitle($url)
+    {
+        //buffer libxml errors internally and remember the previous mode
+        $oldErrorMode = libxml_use_internal_errors(true);
+
+        //loadHTMLFile() is an instance method; calling it statically is
+        // deprecated and no longer possible on PHP 8
+        $doc = new \DOMDocument();
+        $loaded = $doc->loadHTMLFile($url);
+
+        //drop the buffered errors and restore the caller's setting
+        libxml_clear_errors();
+        libxml_use_internal_errors($oldErrorMode);
+
+        if (!$loaded || $doc->documentElement === null) {
+            return null;
+        }
+        $sx = simplexml_import_dom($doc);
+        if (!$sx instanceof \SimpleXMLElement) {
+            return null;
+        }
+        $title = (string) $sx->head->title;
+        if ($title === '') {
+            return null;
+        }
+        return $title;
+    }
 }
 ?>