0x80 == (0xC0 & ($in))) { // Legal continuation. $shift = ($mState - 1) * 6; $tmp = $in; $tmp = ($tmp & 0x0000003F) << $shift; $mUcs4 |= $tmp; /** * End of the multi-octet sequence. mUcs4 now contains the final * Unicode codepoint to be output */ if (0 == --$mState) { /* * Check for illegal sequences and codepoints. */ // From Unicode 3.1, non-shortest form is illegal if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000)) || (4 < $mBytes) || // From Unicode 3.2, surrogate characters are illegal (($mUcs4 & 0xFFFFF800) == 0xD800) || // Codepoints outside the Unicode range are illegal ($mUcs4 > 0x10FFFF)) { return FALSE; } //initialize UTF8 cache $mState = 0; $mUcs4 = 0; $mBytes = 1; } } else { /** *((0xC0 & (*in) != 0x80) && (mState != 0)) * Incomplete multi-octet sequence. */ return FALSE; } } } return TRUE; } //-------------------------------------------------------------------- /** * Tests whether a string complies as UTF-8. This will be much * faster than utf8_is_valid but will pass five and six octet * UTF-8 sequences, which are not supported by Unicode and * so cannot be displayed correctly in a browser. In other words * it is not as strict as utf8_is_valid but it's faster. If you use * is to validate user input, you place yourself at the risk that * attackers will be able to inject 5 and 6 byte sequences (which * may or may not be a significant risk, depending on what you are * are doing) * @see utf8_is_valid * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805 * @param string UTF-8 string to check * @return boolean TRUE if string is valid UTF-8 * @package utf8 */ function utf8_compliant($str) { if ( strlen($str) == 0 ) { return TRUE; } // If even just the first character can be matched, when the /u // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow // invalid, nothing at all will match, even if the string contains // some valid sequences return (preg_match('/^.{1}/us',$str,$ar) == 1); } Error