/*
 * Provenance (taken from the patch header this listing was extracted from):
 *   From 3b06ee0d381dc1be5f40ca98ad4278046d869d21
 *   From: Andreas Baumann
 *   Date: Sun, 17 Nov 2019 20:57:39 +0100
 *   Subject: checked in initial customized verison for Archlinux32
 *   create mode 100644 include/utf8/utils/bad.php (430 insertions,
 *   index 0000000..2704294); patch trailer: cgit v1.2.3-54-g00ecf
 *
 * NOTE(review): the beginning of this file is missing from the listing. Only
 * the tail of the definition that precedes utf8_bad_strip() is visible:
 *
 *     0) return $badList; return false; }
 *
 * That definition is NOT reproduced below because its start cannot be seen.
 * Restore it (and the opening PHP tag) from the upstream file before use.
 */

/**
* Strips out any bad bytes from a UTF-8 string and returns the rest.
* Thin wrapper: calls utf8_bad_replace() with an empty replacement string.
* @see utf8_bad_replace
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string
* @return string
* @package utf8
* @subpackage bad
*/
function utf8_bad_strip($original)
{
    return utf8_bad_replace($original, '');
}

/**
* Replace bad bytes with an alternative character - ASCII character
* recommended as replacement char.
*
* The string is scanned byte by byte. The lead byte decides how many
* continuation bytes must follow; each of those must be in the range
* 0x80 - 0xBF. A lead byte that cannot start a sequence, a sequence cut off
* by the end of the input, or a sequence with a wrong continuation byte
* yields one copy of $replace for the lead byte, and scanning resumes at the
* byte directly after that lead byte.
*
* NOTE(review): only the byte structure is checked. 5 and 6 byte sequences,
* non-shortest forms, surrogates and codepoints above 0x10FFFF are copied
* through unchanged; use utf8_bad_identify() to detect those.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string to search
* @param string to replace bad bytes with (defaults to '?') - use ASCII
* @return string
* @package utf8
* @subpackage bad
*/
function utf8_bad_replace($original, $replace = '?')
{
    $result = '';

    $strlen = strlen($original);
    for ($i = 0; $i < $strlen;) {
        $char = $original[$i++];
        $byte = ord($char);

        if ($byte < 0x80) {
            $bytes = 0; // 1 byte  (00000000 - 01111111)
        } elseif ($byte < 0xC0) {
            // Continuation byte (10000000 - 10111111) with no lead byte
            $result .= $replace;
            continue;
        } elseif ($byte < 0xE0) {
            $bytes = 1; // 2 bytes (11000000 - 11011111)
        } elseif ($byte < 0xF0) {
            $bytes = 2; // 3 bytes (11100000 - 11101111)
        } elseif ($byte < 0xF8) {
            $bytes = 3; // 4 bytes (11110000 - 11110111)
        } elseif ($byte < 0xFC) {
            $bytes = 4; // 5 bytes (11111000 - 11111011)
        } elseif ($byte < 0xFE) {
            $bytes = 5; // 6 bytes (11111100 - 11111101)
        } else {
            // 0xFE and 0xFF can never start a sequence
            $result .= $replace;
            continue;
        }

        // Check our input actually has enough data
        if ($i + $bytes > $strlen) {
            $result .= $replace;
            continue;
        }

        // Collect the continuation bytes ($bytes is 0 for plain ASCII)
        for ($j = 0; $j < $bytes; $j++) {
            $next = $original[$i + $j];
            $char .= $next;
            $byte = ord($next);

            // Every following byte must be 10000000 - 10111111
            if ($byte < 0x80 || $byte > 0xBF) {
                // $i is deliberately not advanced: the offending byte is
                // examined again as a possible start of a new sequence.
                $result .= $replace;
                continue 2;
            }
        }

        $i += $bytes;
        $result .= $char;
    }

    return $result;
}

/**
* Return code from utf8_bad_identify() when a five octet sequence is detected.
* Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
* do not represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET', 1);

/**
* Return code from utf8_bad_identify() when a six octet sequence is detected.
* Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
* do not represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET', 2);

/**
* Return code from utf8_bad_identify().
* Invalid octet for use as start of multi-byte UTF-8 sequence
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID', 3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT', 4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE', 5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE', 6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE', 7);

/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code on the first bad byte found.
*
* The second argument is an output parameter. On failure it holds a byte
* offset into $str: for UTF8_BAD_SEQINCOMPLETE it is one less than the
* offset at which the problem was noticed, for every other code it is the
* offset of the octet being examined when the problem was detected. It is
* set to NULL when the string is valid.
*
* @param string UTF-8 encoded string
* @param int (by reference) receives the byte offset described above, or NULL
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
*/
function utf8_bad_identify($str, &$i)
{
    $mState = 0; // Cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0; // Cached Unicode character
    $mBytes = 1; // Cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        // Square brackets: the $str{$i} form is a parse error on PHP 8.
        $in = ord($str[$i]);

        if ($mState === 0) {
            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (($in & 0x80) === 0) {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            } elseif (($in & 0xE0) === 0xC0) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (($in & 0xF0) === 0xE0) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (($in & 0xF8) === 0xF0) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (($in & 0xFC) === 0xF8) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 */
                return UTF8_BAD_5OCTET;
            } elseif (($in & 0xFE) === 0xFC) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;
            } else {
                // Current octet is neither in the US-ASCII range nor a legal
                // first octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;
            }
        } elseif (($in & 0xC0) === 0x80) {
            // Legal continuation: merge its six payload bits into the codepoint.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            // End of the multi-octet sequence. mUcs4 now contains the final
            // Unicode codepoint.
            if (--$mState === 0) {
                // From Unicode 3.1, non-shortest form is illegal
                if (($mBytes === 2 && $mUcs4 < 0x0080) ||
                    ($mBytes === 3 && $mUcs4 < 0x0800) ||
                    ($mBytes === 4 && $mUcs4 < 0x10000)) {
                    return UTF8_BAD_NONSHORT;
                }

                // From Unicode 3.2, surrogate characters are illegal
                if (($mUcs4 & 0xFFFFF800) === 0xD800) {
                    return UTF8_BAD_SURROGATE;
                }

                // Codepoints outside the Unicode range are illegal
                if ($mUcs4 > 0x10FFFF) {
                    return UTF8_BAD_UNIOUTRANGE;
                }

                // Initialize UTF8 cache
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
            }
        } else {
            // A continuation octet was expected but something else arrived:
            // incomplete multi-octet sequence. Step back one octet.
            $i--;
            return UTF8_BAD_SEQINCOMPLETE;
        }
    }

    // Input ended in the middle of a multi-octet sequence
    if ($mState !== 0) {
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = null;
    return false;
}

/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is. An unrecognised code raises
* an E_USER_WARNING through trigger_error() and yields FALSE.
* @param int return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
function utf8_bad_explain($code)
{
    // Every case returns, so no break statements are needed.
    switch ($code) {
        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';

        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';

        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';

        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';

        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';
    }

    trigger_error('Unknown error code: '.$code, E_USER_WARNING);

    return false;
}