src/phorkie/HtmlParser.php

   1 <?php
   2 namespace phorkie;
   3
   4 class HtmlParser
   5 {
   6     /**
   7      * Contains error message when parse() failed
   8      */
   9     public $error;
  10
  11     /**
  12      * Array with keys (URL title) and values (arrays of urls)
  13      * Only supported URLs are included.
  14      *
  15      * @var array
  16      */
  17     protected $arGitUrls;
  18
  19
  20
  21     /**
  22      * Extract git URLs from the given URL, eventually fetching
  23      * HTML and extracting URLs from there.
  24      *
  25      * Sets $error and $arGitUrls class variables
  26      *
  27      * @param string $url  Git or HTTP URL
  28      * @param string $html HTML content of $url
  29      *
  30      * @return boolean True when all went well, false in case of an error
  31      * @uses   $error
  32      * @uses   $arGitUrls
  33      */
  34     public function extractGitUrls($url, $html = null)
  35     {
  36         if ($url == '') {
  37             $this->error = 'Empty fork URL';
  38             return false;
  39         }
  40
  41         $arUrl  = parse_url($url);
  42         $scheme = isset($arUrl['scheme']) ? $arUrl['scheme'] : '';
  43
  44         if ($scheme == 'https' && isset($arUrl['host'])
  45             && $arUrl['host'] == 'gist.github.com'
  46         ) {
  47             //https://gist.github.com/maddy2101/5764473
  48             $parts = explode('/', ltrim($arUrl['path'], '/'));
  49             if (count($parts == 2)) {
  50                 //we only want the number, not the user name
  51                 $path = $parts[1];
  52             } else {
  53                 $path = ltrim($arUrl['path'], '/');
  54             }
  55             $title = $this->getHtmlTitle($url);
  56             if ($title === null) {
  57                 $this->arGitUrls[][] = 'git://gist.github.com/'
  58                     . $path . '.git';
  59             } else {
  60                 $this->arGitUrls[$title][] = 'git://gist.github.com/'
  61                     . $path . '.git';
  62             }
  63             return true;
  64         }
  65
  66         switch ($scheme) {
  67         case 'git':
  68             //clearly a git url
  69             $this->arGitUrls = array(array($url));
  70             return true;
  71
  72         case 'ssh':
  73             //FIXME: maybe loosen this when we know how to skip the
  74             //"do you trust this server" question of ssh
  75             $this->error = 'ssh:// URLs are not supported';
  76             return false;
  77
  78         case 'http':
  79         case 'https':
  80             return $this->extractUrlsFromHtml($url, $html);
  81         }
  82
  83         $this->error = 'Unknown URLs scheme: ' . $scheme;
  84         return false;
  85     }
  86
  87     protected function extractUrlsFromHtml($url, $html = null)
  88     {
  89         //HTML is not necessarily well-formed, and Gitorious has many problems
  90         // in this regard
  91         //$sx = simplexml_load_file($url);
  92
  93         libxml_use_internal_errors(true);
  94         if ($html === null) {
  95             $sx = simplexml_import_dom(\DOMDocument::loadHTMLFile($url));
  96         } else {
  97             $sx = simplexml_import_dom(\DOMDocument::loadHTML($html));
  98         }
  99
 100         $elems = $sx->xpath('//*[@rel="vcs-git"]');
 101         $titles = $sx->xpath('/html/head/title');
 102         $pageTitle = $this->cleanPageTitle((string) reset($titles));
 103
 104         $count = $anonymous = 0;
 105         foreach ($elems as $elem) {
 106             if (!isset($elem['href'])) {
 107                 continue;
 108             }
 109             $str = (string)$elem;
 110             if (isset($elem['title'])) {
 111                 //<link href=".." rel="vcs-git" title="title" />
 112                 $title = (string)$elem['title'];
 113             } else if ($str != '') {
 114                 //<a href=".." rel="vcs-git">title</a>
 115                 $title = $str;
 116             } else if ($pageTitle != '') {
 117                 $title = $pageTitle;
 118             } else {
 119                 $title = 'Unnamed repository #' . ++$anonymous;
 120             }
 121             $url = (string)$elem['href'];
 122             if ($this->isSupported($url)) {
 123                 ++$count;
 124                 $this->arGitUrls[$title][] = $url;
 125             }
 126         }
 127
 128         if ($count > 0) {
 129             return true;
 130         }
 131
 132         $this->error = 'No git:// clone URL found';
 133         return false;
 134     }
 135
 136     public function getGitUrls()
 137     {
 138         return $this->arGitUrls;
 139     }
 140
 141     /**
 142      * Remove application names from HTML page titles
 143      *
 144      * @param string $title HTML page title
 145      *
 146      * @return string Cleaned HTML page title
 147      */
 148     protected function cleanPageTitle($title)
 149     {
 150         $title = trim($title);
 151         if (substr($title, -9) == '- phorkie') {
 152             $title = trim(substr($title, 0, -9));
 153         }
 154
 155         return $title;
 156     }
 157
 158     public function isSupported($url)
 159     {
 160         $scheme = parse_url($url, PHP_URL_SCHEME);
 161         return $scheme == 'git'
 162             || $scheme == 'http' || $scheme == 'https';
 163     }
 164
 165     /**
 166      * Extract the title from a HTML URL
 167      *
 168      * @param string $url URL to a HTML page
 169      *
 170      * @return string|null NULL on error, title otherwise
 171      */
 172     public function getHtmlTitle($url)
 173     {
 174         libxml_use_internal_errors(true);
 175         $doc = \DOMDocument::loadHTMLFile($url);
 176         if ($doc === false) {
 177             return null;
 178         }
 179         $sx = simplexml_import_dom($doc);
 180         $title = (string) $sx->head->title;
 181         if ($title == '') {
 182             return null;
 183         }
 184         return $title;
 185     }
 186 }
 187 ?>