blob: 5a85a4f1a23c21183deb651d5ed06dd48e4d0242 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
<?php
/**
* PCRE regular expressions for UTF-8. Note: this file is not actually used by
* the rest of the library, but these regular expressions can be useful to have
* available.
* @version $Id: patterns.php,v 1.1 2006/02/25 14:20:02 harryf Exp $
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
* @subpackage patterns
*/
/**
 * PCRE pattern that matches only when the entire subject is well-formed UTF-8.
 *
 * The pattern is anchored with ^ and $ and repeats a single group zero or more
 * times, where the group accepts exactly one well-formed UTF-8 byte sequence.
 * The string carries no delimiters or modifiers, so the caller has to wrap it
 * (for example '/' . $UTF8_VALID . '/') before handing it to a preg_* function.
 * The character ranges describe raw bytes, so it is meant for byte-mode
 * matching, i.e. without the "u" modifier.
 *
 * Derived from the W3C FAQ "Multilingual Forms"; modified so that the whole
 * ASCII range, control characters included, is accepted.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_VALID = '^(' . implode('|', array(
    '[\x00-\x7F]',                        // ASCII (including control chars)
    '[\xC2-\xDF][\x80-\xBF]',             // 2-byte sequences, overlong forms excluded
    '\xE0[\xA0-\xBF][\x80-\xBF]',         // 3-byte, lead E0: overlong forms excluded
    '[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',  // 3-byte, unrestricted lead bytes
    '\xED[\x80-\x9F][\x80-\xBF]',         // 3-byte, lead ED: surrogates excluded
    '\xF0[\x90-\xBF][\x80-\xBF]{2}',      // 4-byte, planes 1-3
    '[\xF1-\xF3][\x80-\xBF]{3}',          // 4-byte, planes 4-15
    '\xF4[\x80-\x8F][\x80-\xBF]{2}',      // 4-byte, plane 16
)) . ')*$';
/**
 * PCRE pattern that matches one well-formed UTF-8 character at a time.
 *
 * The pattern is an unanchored alternation of eight branches, one per class of
 * UTF-8 byte sequence, and every branch sits in its own capture group (1-8 in
 * the order listed below). The string carries no delimiters or modifiers, so
 * the caller has to wrap it (for example '/' . $UTF8_MATCH . '/') before
 * handing it to a preg_* function. The character ranges describe raw bytes,
 * so it is meant for byte-mode matching, i.e. without the "u" modifier.
 *
 * Derived from the W3C FAQ "Multilingual Forms"; modified so that the whole
 * ASCII range, control characters included, is accepted.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_MATCH = implode('|', array(
    '([\x00-\x7F])',                        // ASCII (including control chars)
    '([\xC2-\xDF][\x80-\xBF])',             // 2-byte sequences, overlong forms excluded
    '(\xE0[\xA0-\xBF][\x80-\xBF])',         // 3-byte, lead E0: overlong forms excluded
    '([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})',  // 3-byte, unrestricted lead bytes
    '(\xED[\x80-\x9F][\x80-\xBF])',         // 3-byte, lead ED: surrogates excluded
    '(\xF0[\x90-\xBF][\x80-\xBF]{2})',      // 4-byte, planes 1-3
    '([\xF1-\xF3][\x80-\xBF]{3})',          // 4-byte, planes 4-15
    '(\xF4[\x80-\x8F][\x80-\xBF]{2})',      // 4-byte, plane 16
));
/**
 * PCRE pattern for locating bad bytes in a UTF-8 string.
 *
 * The outer group (capture group 1) tries each well-formed UTF-8 byte sequence
 * first; only if none of them fits does the final branch consume one single
 * byte instead. That final branch is wrapped in its own nested group (capture
 * group 2), so group 2 is filled only for bytes that are not part of a
 * well-formed sequence. The string carries no delimiters or modifiers, so the
 * caller has to wrap it (for example '/' . $UTF8_BAD . '/') before handing it
 * to a preg_* function. The character ranges describe raw bytes, so it is
 * meant for byte-mode matching, i.e. without the "u" modifier.
 *
 * Derived from the W3C FAQ "Multilingual Forms"; modified so that the whole
 * ASCII range, control characters included, is accepted.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_BAD = '(' . implode('|', array(
    '[\x00-\x7F]',                        // ASCII (including control chars)
    '[\xC2-\xDF][\x80-\xBF]',             // 2-byte sequences, overlong forms excluded
    '\xE0[\xA0-\xBF][\x80-\xBF]',         // 3-byte, lead E0: overlong forms excluded
    '[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',  // 3-byte, unrestricted lead bytes
    '\xED[\x80-\x9F][\x80-\xBF]',         // 3-byte, lead ED: surrogates excluded
    '\xF0[\x90-\xBF][\x80-\xBF]{2}',      // 4-byte, planes 1-3
    '[\xF1-\xF3][\x80-\xBF]{3}',          // 4-byte, planes 4-15
    '\xF4[\x80-\x8F][\x80-\xBF]{2}',      // 4-byte, plane 16
    '(.{1})',                             // fallback: a single invalid byte
)) . ')';
|