7 * Contains error message when parse() failed
12 * Array with keys (URL title) and values (arrays of urls)
13 * Only supported URLs are included.
/**
 * Extract git URLs from the given URL, fetching HTML and
 * extracting URLs from there when the URL is a HTTP(S) page.
 *
 * Sets $error and $arGitUrls class variables
 *
 * @param string $url  Git or HTTP URL
 * @param string $html HTML content of $url
 *
 * @return boolean True when all went well, false in case of an error
 */
public function extractGitUrls($url, $html = null)
{
    if ((string) $url === '') {
        $this->error = 'Empty fork URL';
        return false;
    }

    //parse_url() returns false for seriously malformed URLs; isset()
    // copes with that and leaves the scheme empty
    $arUrl  = parse_url($url);
    $scheme = isset($arUrl['scheme']) ? $arUrl['scheme'] : '';

    if ($scheme == 'https' && isset($arUrl['host'])
        && $arUrl['host'] == 'gist.github.com'
    ) {
        //https://gist.github.com/cweiske/2400389
        // clone URL: https://gist.github.com/2400389.git
        //The path may be missing completely, and a trailing slash
        // must not end up in the clone URL
        $path  = isset($arUrl['path']) ? trim($arUrl['path'], '/') : '';
        $parts = explode('/', $path);
        if (count($parts) == 2) {
            //we only want the number, not the user name
            $path = $parts[1];
        }
        $cloneUrl = 'https://gist.github.com/' . $path . '.git';

        $title = $this->getHtmlTitle($url);
        if ($title === null) {
            //no title available: store under the next numeric key
            $this->arGitUrls[][] = $cloneUrl;
        } else {
            $this->arGitUrls[$title][] = $cloneUrl;
        }
        return true;
    }

    switch ($scheme) {
    case 'git':
        //clone directly
        $this->arGitUrls = array(array($url));
        return true;

    case 'ssh':
        //FIXME: maybe loosen this when we know how to skip the
        //"do you trust this server" question of ssh
        $this->error = 'ssh:// URLs are not supported';
        return false;

    case 'http':
    case 'https':
        return $this->extractUrlsFromHtml($url, $html);
    }

    $this->error = 'Unknown URLs scheme: ' . $scheme;
    return false;
}
/**
 * Collect all supported git clone URLs that a HTML page announces
 * via elements carrying rel="vcs-git".
 *
 * Sets $arGitUrls on success and $error on failure.
 *
 * @param string $url  URL of the HTML page; loaded when $html is null
 * @param string $html HTML content of $url, used instead of loading $url
 *
 * @return boolean True when at least one supported URL was found
 */
protected function extractUrlsFromHtml($url, $html = null)
{
    //HTML is not necessarily well-formed, and Gitorious has many problems
    // in this regard, so parse it leniently with DOMDocument and let
    // libxml collect the parse errors instead of emitting warnings
    libxml_use_internal_errors(true);

    //The load methods must be called on an instance; calling them
    // statically is deprecated in PHP 7 and an Error in PHP 8
    $doc = new \DOMDocument();
    if ($html === null) {
        $loaded = $doc->loadHTMLFile($url);
    } else {
        //loadHTML() rejects an empty string
        $loaded = (string) $html !== '' && $doc->loadHTML($html);
    }
    //do not let collected parse errors pile up in libxml's buffer
    libxml_clear_errors();

    $sx = $loaded ? simplexml_import_dom($doc) : null;
    //instanceof instead of a boolean cast: an empty SimpleXML element
    // without attributes evaluates to false
    if (!($sx instanceof \SimpleXMLElement)) {
        $this->error = 'Could not load HTML of fork URL';
        return false;
    }

    $elems  = $sx->xpath('//*[@rel="vcs-git"]');
    $titles = $sx->xpath('/html/head/title');
    $pageTitle = '';
    if (is_array($titles) && count($titles) > 0) {
        $pageTitle = $this->cleanPageTitle((string) reset($titles));
    }
    if (!is_array($elems)) {
        $elems = array();
    }

    $count = $anonymous = 0;
    foreach ($elems as $elem) {
        if (!isset($elem['href'])) {
            continue;
        }
        $str = (string) $elem;
        if (isset($elem['title'])) {
            //<link href=".." rel="vcs-git" title="title" />
            $title = (string) $elem['title'];
        } elseif ($str != '') {
            //<a href=".." rel="vcs-git">title</a>
            $title = $str;
        } elseif ($pageTitle != '') {
            $title = $pageTitle;
        } else {
            $title = 'Unnamed repository #' . ++$anonymous;
        }
        //separate variable so the $url parameter is not overwritten
        $href = (string) $elem['href'];
        if ($this->isSupported($href)) {
            ++$count;
            $this->arGitUrls[$title][] = $href;
        }
    }

    if ($count > 0) {
        return true;
    }

    $this->error = 'No git:// clone URL found';
    return false;
}
/**
 * Return the git URLs collected in the $arGitUrls class variable.
 *
 * @return array Array with keys (URL title) and values (arrays of urls)
 */
public function getGitUrls()
{
    $gitUrls = $this->arGitUrls;

    return $gitUrls;
}
/**
 * Strip the application name suffix from a HTML page title
 *
 * @param string $title HTML page title
 *
 * @return string Page title without surrounding whitespace and
 *                without a trailing "- phorkie"
 */
protected function cleanPageTitle($title)
{
    $suffix    = '- phorkie';
    $suffixLen = strlen($suffix);
    $cleaned   = trim($title);

    //nothing to strip unless the title ends with the application name
    if (substr($cleaned, -$suffixLen) != $suffix) {
        return $cleaned;
    }

    return trim(substr($cleaned, 0, -$suffixLen));
}
/**
 * Check whether the scheme of the given URL is one of
 * git, http or https.
 *
 * @param string $url URL to check
 *
 * @return boolean True when the URL scheme is git, http or https
 */
public function isSupported($url)
{
    $supportedSchemes = array('git', 'http', 'https');
    $urlScheme        = parse_url($url, PHP_URL_SCHEME);

    //parse_url() yields a string, null or false here, so the strict
    // lookup matches exactly the same values as a chain of == checks
    return in_array($urlScheme, $supportedSchemes, true);
}
167 * Extract the title from a HTML URL
169 * @param string $url URL to a HTML page
171 * @return string|null NULL on error, title otherwise
173 public function getHtmlTitle($url)
175 libxml_use_internal_errors(true);
176 $doc = \DOMDocument::loadHTMLFile($url);
177 if ($doc === false) {
180 $sx = simplexml_import_dom($doc);
181 $title = (string) $sx->head->title;