From: Christian Weiske
Date: Thu, 5 Nov 2015 06:40:03 +0000 (+0100)
Subject: Detect text files for unknown file types
X-Git-Tag: v0.7.0~1
X-Git-Url: https://git.cweiske.de/phorkie.git/commitdiff_plain/0b24764b8d1065fc57e219c431112860c7147dd0

Detect text files for unknown file types
---

diff --git a/README.rst b/README.rst
index 2e99e6c..dde1bda 100644
--- a/README.rst
+++ b/README.rst
@@ -42,6 +42,7 @@ Features
 - webhook support - get notified when pastes are created, edited or deleted
 - atom feed for new and updated pastes
 - notifies remote instances via linkbacks when a paste has been forked
+- text file detection for unknown file types
 
 
 ============
@@ -117,6 +118,8 @@ on your machine:
 
 - Git v1.7.5 or later
 - PHP v5.3.0 or later
+
+  - optionally the ``mbstring`` extension
 - PEAR v1.9.2 or later
 
 ::
diff --git a/src/phorkie/File.php b/src/phorkie/File.php
index 300e810..cf5daae 100644
--- a/src/phorkie/File.php
+++ b/src/phorkie/File.php
@@ -126,15 +126,21 @@ class File
     }
 
     /**
-     * @return string Mime type of file
+     * @return string Mime type of file, NULL if no type detected
      */
     public function getMimeType()
     {
         $ext = $this->getExt();
-        if (!isset($GLOBALS['phorkie']['languages'][$ext])) {
-            return null;
+        if (isset($GLOBALS['phorkie']['languages'][$ext])) {
+            return $GLOBALS['phorkie']['languages'][$ext]['mime'];
         }
-        return $GLOBALS['phorkie']['languages'][$ext]['mime'];
+
+        $mte = new \MIME_Type_Extension();
+        $type = $mte->getMIMEType($this->getFilename());
+        if (!\PEAR::isError($type)) {
+            return $type;
+        }
+        return null;
     }
 
     /**
@@ -159,22 +165,53 @@ class File
     {
         $ext = $this->getExt();
         if ($ext == '') {
-            //no file extension? then consider the size
-            $size = filesize($this->getFullPath());
-            //files <= 4kiB are considered to be text
-            return $size <= 4096;
+            return $this->isNonBinary();
         }
 
-        if (!isset($GLOBALS['phorkie']['languages'][$ext]['mime'])) {
-            return false;
+        $type = $this->getMimeType();
+        if ($type === null) {
+            return $this->isNonBinary();
         }
-
-        $type = $GLOBALS['phorkie']['languages'][$ext]['mime'];
         return substr($type, 0, 5) === 'text/'
             || $type == 'application/javascript'
             || substr($type, -4) == '+xml'
             || substr($type, -5) == '+json';
     }
+
+    /**
+     * Look at the file's bytes and guess if it's binary or not.
+     *
+     * @return boolean True if it's most likely plain text
+     */
+    public function isNonBinary()
+    {
+        $fp = fopen($this->getFullPath(), 'r');
+        if (!$fp) {
+            return false;
+        }
+
+        //When multibyte extension is not installed,
+        // we only allow files with ASCII characters.
+        // Files with UTF-8 characters will not be detected as text.
+        $hasMb = function_exists('mb_detect_encoding');
+
+        $pos = 0;
+        $data = '';
+        while (false !== ($char = fgetc($fp)) && ++$pos < 100) {
+            $data .= $char;
+            if (!$hasMb && ord($char) > 128) {
+                return false;
+            }
+        }
+        if (!$hasMb) {
+            return true;
+        }
+
+        if (mb_detect_encoding($data) === false) {
+            return false;
+        }
+        return true;
+    }
 }
 
 ?>