00001 <?php
00002 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
00003 # http://www.mediawiki.org/
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License as published by
00007 # the Free Software Foundation; either version 2 of the License, or
00008 # (at your option) any later version.
00009 #
00010 # This program is distributed in the hope that it will be useful,
00011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00013 # GNU General Public License for more details.
00014 #
00015 # You should have received a copy of the GNU General Public License along
00016 # with this program; if not, write to the Free Software Foundation, Inc.,
00017 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
00018 # http://www.gnu.org/copyleft/gpl.html
00019
00025 require_once dirname(__FILE__).'/UtfNormalUtil.php';
00026
# Lookup tables used by UtfNormal. They are declared global explicitly so
# that they land in the global scope even when this file is included from
# inside a function. They start out as null; UtfNormal::loadData() pulls in
# the data file the first time they are needed.
#
# Load compatibility decompositions on demand if they are needed.
global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp, $utfCompatibilityDecomp;
$utfCombiningClass = $utfCanonicalComp = $utfCanonicalDecomp = null;
$utfCompatibilityDecomp = null;

# Normalization mode identifiers, as passed to utf8_normalize().
define( 'UNORM_NONE', 1 );
define( 'UNORM_NFD', 2 );
define( 'UNORM_NFKD', 3 );
define( 'UNORM_NFC', 4 );
define( 'UNORM_DEFAULT', UNORM_NFC );
define( 'UNORM_NFKC', 5 );
define( 'UNORM_FCD', 6 );

# True when the utf8_normalize() extension function is available; UtfNormal
# then delegates to it instead of using the pure-PHP implementation.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
00048
/**
 * Unicode normalization routines for working with UTF-8 strings.
 *
 * When the utf8_normalize() extension function is available (NORMALIZE_ICU)
 * the public entry points delegate to it; otherwise a pure-PHP implementation
 * driven by lookup tables in UtfNormalData.inc / UtfNormalDataK.inc is used.
 *
 * All methods are static and operate on byte strings.
 */
class UtfNormal {
    /**
     * Clean up a string for safe use: replace invalid UTF-8 and some
     * disallowed characters with UTF8_REPLACEMENT, and return the result
     * in normalization form C.
     *
     * In the ICU path, the control characters 0x00-0x08, 0x0b, 0x0c,
     * 0x0e-0x1f and the sequences UTF8_FFFE / UTF8_FFFF are replaced before
     * the string is handed to utf8_normalize(). In the PHP path,
     * quickIsNFCVerify() repairs the string in place and the slow
     * normalization only runs when that check says it might be needed.
     *
     * @param string $string possibly invalid UTF-8 input
     * @return string cleaned, NFC-normalized string
     */
    public static function cleanUp( $string ) {
        if( NORMALIZE_ICU ) {
            # We exclude a few chars that ICU would not.
            $string = preg_replace(
                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
                UTF8_REPLACEMENT,
                $string );
            $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
            $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );

            # UnicodeString constructor fails if the string ends with a
            # head byte. Add a junk char at the end, we'll strip it off.
            # (Any genuine \x01 in the input was already replaced above.)
            return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
        } elseif( self::quickIsNFCVerify( $string ) ) {
            # Side effect -- $string has had UTF-8 errors cleaned up.
            return $string;
        } else {
            return self::NFC( $string );
        }
    }

    /**
     * Convert a UTF-8 string to normalization form C (canonical
     * composition). No validity checking is done in the PHP path.
     *
     * @param string $string valid UTF-8 string
     * @return string the string in form C
     */
    public static function toNFC( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFC );
        elseif( self::quickIsNFC( $string ) )
            return $string;
        else
            return self::NFC( $string );
    }

    /**
     * Convert a UTF-8 string to normalization form D (canonical
     * decomposition). Pure-ASCII input is returned untouched.
     *
     * @param string $string valid UTF-8 string
     * @return string the string in form D
     */
    public static function toNFD( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFD );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return self::NFD( $string );
        else
            return $string;
    }

    /**
     * Convert a UTF-8 string to normalization form KC (compatibility
     * decomposition followed by composition). Pure-ASCII input is
     * returned untouched.
     *
     * @param string $string valid UTF-8 string
     * @return string the string in form KC
     */
    public static function toNFKC( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFKC );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return self::NFKC( $string );
        else
            return $string;
    }

    /**
     * Convert a UTF-8 string to normalization form KD (compatibility
     * decomposition). Pure-ASCII input is returned untouched.
     *
     * @param string $string valid UTF-8 string
     * @return string the string in form KD
     */
    public static function toNFKD( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFKD );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return self::NFKD( $string );
        else
            return $string;
    }

    /**
     * Load the basic normalization tables if they have not been loaded
     * yet. The global $utfCombiningClass is used as the "already loaded"
     * marker; the data file is expected to populate it.
     */
    public static function loadData() {
        global $utfCombiningClass;
        if( !isset( $utfCombiningClass ) ) {
            require_once( dirname(__FILE__) . '/UtfNormalData.inc' );
        }
    }

    /**
     * Quick, conservative test for whether a string is already in form C.
     *
     * Returns true only when no character appears in $utfCheckNFC or
     * $utfCombiningClass; a false result means "not sure, run the full
     * normalization", not "definitely not normalized".
     *
     * @param string $string valid UTF-8 string
     * @return bool true if the string is certainly NFC
     */
    public static function quickIsNFC( $string ) {
        # ASCII is always valid NFC!
        # If it's pure ASCII, let it through.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        self::loadData();
        global $utfCheckNFC, $utfCombiningClass;
        $len = strlen( $string );
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( $utfCheckNFC[$c] ) ) {
                # If it's NO or MAYBE, bail and do the slow check.
                return false;
            }
            if( isset( $utfCombiningClass[$c] ) ) {
                # Combining character? We might have to do sorting, at least.
                return false;
            }
        }
        return true;
    }

    /**
     * Validate a string as UTF-8 and quick-check it for form C in a
     * single pass.
     *
     * The string is modified in place: some control characters and every
     * invalid, overlong, surrogate, U+FFFE/U+FFFF or out-of-range sequence
     * is replaced with UTF8_REPLACEMENT (stray tail bytes that continue an
     * already-reported broken sequence are simply dropped).
     *
     * @param string &$string UTF-8 string, cleaned up by reference
     * @return bool true if the (cleaned) string is certainly NFC; false if
     *              the full normalization still has to be run
     */
    public static function quickIsNFCVerify( &$string ) {
        # Screen out some characters that eg won't be allowed in XML
        $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

        # ASCII is always valid NFC!
        # If we're only ever given plain ASCII, we can avoid the overhead
        # of initializing the decomposition tables by skipping out early.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
        if( !isset( $checkit ) ) {
            # Load/build some scary lookup tables...
            self::loadData();
            global $utfCheckNFC, $utfCombiningClass;

            $utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass );

            # Head bytes for sequences which we should do further validity checks
            $checkit = array_flip( array_map( 'chr',
                array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
                       0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                       0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

            # Each UTF-8 head byte is followed by a certain
            # number of tail bytes.
            $tailBytes = array();
            for( $n = 0; $n < 256; $n++ ) {
                if( $n < 0xc0 ) {
                    $remaining = 0;
                } elseif( $n < 0xe0 ) {
                    $remaining = 1;
                } elseif( $n < 0xf0 ) {
                    $remaining = 2;
                } elseif( $n < 0xf8 ) {
                    $remaining = 3;
                } elseif( $n < 0xfc ) {
                    $remaining = 4;
                } elseif( $n < 0xfe ) {
                    $remaining = 5;
                } else {
                    $remaining = 0;
                }
                $tailBytes[chr($n)] = $remaining;
            }
        }

        # Chop the text into pure-ASCII and non-ASCII areas;
        # large ASCII parts can be handled much more quickly.
        # Don't chop up Unicode areas for punctuation, though,
        # that wastes energy.
        $matches = array();
        preg_match_all(
            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
            $string, $matches );

        $looksNormal = true;
        $base = 0;           # byte offset of the current chunk within $string
        $replace = array();  # list of array( replacement, start, length )
        foreach( $matches[1] as $str ) {
            $chunk = strlen( $str );

            if( $str[0] < "\x80" ) {
                # ASCII chunk: guaranteed to be valid UTF-8
                # and in normal form C, so skip over it.
                $base += $chunk;
                continue;
            }

            # We'll have to examine the chunk byte by byte to ensure
            # that it consists of valid UTF-8 sequences, and to see
            # if any of them might not be normalized.
            #
            # Since PHP is not the fastest language on earth, some of
            # this code is a little ugly with inner loop optimizations.

            $head = '';
            $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

            for( $i = -1; --$len; ) {
                if( $remaining = $tailBytes[$c = $str[++$i]] ) {
                    # UTF-8 head byte!
                    $sequence = $head = $c;
                    do {
                        # Look for the defined number of tail bytes...
                        if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
                            # Legal tail bytes are nice.
                            $sequence .= $c;
                        } else {
                            if( 0 == $len ) {
                                # Premature end of string!
                                # Drop a replacement character into output to
                                # represent the invalid UTF-8 sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen( $sequence ),
                                                    strlen( $sequence ) );
                                break 2;
                            } else {
                                # Illegal tail byte; abandon the sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i - strlen( $sequence ),
                                                    strlen( $sequence ) );
                                # Back up and reprocess this byte; it may itself
                                # be a legal ASCII or UTF-8 sequence head.
                                # $head deliberately stays set so that stray
                                # tail bytes which follow are dropped silently.
                                --$i;
                                ++$len;
                                continue 2;
                            }
                        }
                    } while( --$remaining );

                    if( isset( $checkit[$head] ) ) {
                        # Do some more detailed validity checks, for
                        # invalid characters and illegal sequences.
                        if( $head == "\xed" ) {
                            # 0xed is relatively frequent in Korean, which
                            # abuts the surrogate area, so we're doing
                            # this check separately to speed things up.

                            if( $sequence >= UTF8_SURROGATE_FIRST ) {
                                # Surrogates are legal only in UTF-16 code.
                                # They are totally forbidden here in UTF-8
                                # utopia.
                                $replace[] = array( UTF8_REPLACEMENT,
                                             $base + $i + 1 - strlen( $sequence ),
                                             strlen( $sequence ) );
                                $head = '';
                                continue;
                            }
                        } else {
                            # Slower, but rarer checks...
                            $n = ord( $head );
                            if(
                                # "Overlong sequences" are those that are syntactically
                                # correct but use more UTF-8 bytes than are necessary to
                                # encode a character. Naive string comparisons can be
                                # tricked into failing to see a match for an ASCII
                                # character, for instance, which can be a security hole
                                # if blacklist checks are being used.
                                   ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
                                || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
                                || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                                # (Grouped explicitly; && binds tighter than ||.)
                                || ($n == 0xef &&
                                       ($sequence == UTF8_FFFE
                                     || $sequence == UTF8_FFFF) )

                                # Unicode has been limited to 21 bits; longer
                                # sequences are not allowed.
                                || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {

                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen( $sequence ),
                                                    strlen( $sequence ) );
                                $head = '';
                                continue;
                            }
                        }
                    }

                    if( isset( $utfCheckOrCombining[$sequence] ) ) {
                        # If it's NO or MAYBE, we'll have to rip
                        # the string apart and put it back together.
                        # That's going to be mighty slow.
                        $looksNormal = false;
                    }

                    # The sequence is legal!
                    $head = '';
                } elseif( $c < "\x80" ) {
                    # ASCII byte.
                    $head = '';
                } elseif( $c < "\xc0" ) {
                    # Illegal tail bytes
                    if( $head == '' ) {
                        # Out of the blue!
                        $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    } else {
                        # Don't add if we're continuing a broken sequence;
                        # we already put a replacement character when we looked
                        # at the broken sequence.
                        $replace[] = array( '', $base + $i, 1 );
                    }
                } else {
                    # Miscellaneous freaks.
                    $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    $head = '';
                }
            }
            $base += $chunk;
        }
        if( count( $replace ) ) {
            # There were illegal UTF-8 sequences we need to fix up.
            # Splice the replacements into a fresh copy, left to right.
            $out = '';
            $last = 0;
            foreach( $replace as $rep ) {
                list( $replacement, $start, $length ) = $rep;
                if( $last < $start ) {
                    $out .= substr( $string, $last, $start - $last );
                }
                $out .= $replacement;
                $last = $start + $length;
            }
            if( $last < strlen( $string ) ) {
                $out .= substr( $string, $last );
            }
            $string = $out;
        }
        return $looksNormal;
    }

    # These take a string and run the normalization on them, without
    # checking for validity or any optimization etc. Input must be
    # VALID UTF-8!

    /**
     * Unconditionally normalize to form C: decompose, then compose.
     *
     * @param string $string valid UTF-8 string
     * @return string
     */
    public static function NFC( $string ) {
        return self::fastCompose( self::NFD( $string ) );
    }

    /**
     * Unconditionally normalize to form D: canonical decomposition
     * followed by sorting of combining characters.
     *
     * @param string $string valid UTF-8 string
     * @return string
     */
    public static function NFD( $string ) {
        self::loadData();
        global $utfCanonicalDecomp;
        return self::fastCombiningSort(
            self::fastDecompose( $string, $utfCanonicalDecomp ) );
    }

    /**
     * Unconditionally normalize to form KC: compatibility decomposition,
     * then composition.
     *
     * @param string $string valid UTF-8 string
     * @return string
     */
    public static function NFKC( $string ) {
        return self::fastCompose( self::NFKD( $string ) );
    }

    /**
     * Unconditionally normalize to form KD: compatibility decomposition
     * followed by sorting of combining characters. The compatibility
     * table is loaded on first use.
     *
     * @param string $string valid UTF-8 string
     * @return string
     */
    public static function NFKD( $string ) {
        global $utfCompatibilityDecomp;
        if( !isset( $utfCompatibilityDecomp ) ) {
            # Anchor the include at this file's directory, like loadData()
            # does, so it does not depend on include_path or the cwd.
            require_once( dirname(__FILE__) . '/UtfNormalDataK.inc' );
        }
        return self::fastCombiningSort(
            self::fastDecompose( $string, $utfCompatibilityDecomp ) );
    }


    /**
     * Perform decomposition of a UTF-8 string using the given mapping
     * table. Characters in the range UTF8_HANGUL_FIRST..UTF8_HANGUL_LAST
     * that are not in the table are decomposed arithmetically into jamo.
     * The output is not yet sorted by combining class.
     *
     * @param string $string valid UTF-8 string
     * @param array $map mapping from UTF-8 character to its replacement
     * @return string
     */
    public static function fastDecompose( $string, $map ) {
        self::loadData();
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # ASCII chars never decompose
                # THEY ARE IMMORTAL
                $out .= $c;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( $map[$c] ) ) {
                $out .= $map[$c];
                continue;
            } else {
                if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
                    # Decompose a hangul syllable into jamo;
                    # hardcoded for three-byte UTF-8 sequence.
                    # A lookup table would be slightly faster,
                    # but adds a lot of memory & disk needs.
                    #
                    $index = ( (ord( $c[0] ) & 0x0f) << 12
                             | (ord( $c[1] ) & 0x3f) <<  6
                             | (ord( $c[2] ) & 0x3f) )
                           - UNICODE_HANGUL_FIRST;
                    # Split the syllable index into leading / vowel /
                    # trailing jamo indexes.
                    $l = intval( $index / UNICODE_HANGUL_NCOUNT );
                    $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
                    $t = $index % UNICODE_HANGUL_TCOUNT;
                    $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
                    if( $t >= 25 ) {
                        # The trailing jamo spill over into the next
                        # 64-character UTF-8 block at index 25.
                        $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
                    } elseif( $t ) {
                        $out .= "\xe1\x86" . chr( 0xa7 + $t );
                    }
                    continue;
                }
            }
            $out .= $c;
        }
        return $out;
    }

    /**
     * Sort runs of combining characters into order of ascending combining
     * class, keeping the relative order of characters of the same class.
     * Needed after decomposition to reach a canonical ordering.
     *
     * @param string $string valid, decomposed UTF-8 string
     * @return string
     */
    public static function fastCombiningSort( $string ) {
        self::loadData();
        global $utfCombiningClass;
        $len = strlen( $string );
        $out = '';
        $combiners = array();  # combining class => concatenated characters
        $lastClass = -1;
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n >= 0x80 ) {
                if( $n >= 0xf0 ) {
                    $c = substr( $string, $i, 4 );
                    $i += 3;
                } elseif( $n >= 0xe0 ) {
                    $c = substr( $string, $i, 3 );
                    $i += 2;
                } elseif( $n >= 0xc0 ) {
                    $c = substr( $string, $i, 2 );
                    $i++;
                }
                if( isset( $utfCombiningClass[$c] ) ) {
                    # Collect the combiner in the bucket for its class.
                    $lastClass = $utfCombiningClass[$c];
                    if( isset( $combiners[$lastClass] ) ) {
                        $combiners[$lastClass] .= $c;
                    } else {
                        $combiners[$lastClass] = $c;
                    }
                    continue;
                }
            }
            # Not a combining character: flush any pending combiners in
            # class order before emitting it.
            if( $lastClass ) {
                ksort( $combiners );
                $out .= implode( '', $combiners );
                $combiners = array();
            }
            $out .= $c;
            $lastClass = 0;
        }
        if( $lastClass ) {
            ksort( $combiners );
            $out .= implode( '', $combiners );
        }
        return $out;
    }

    /**
     * Produce canonically composed sequences from a decomposed UTF-8
     * string, using the $utfCanonicalComp pair table plus arithmetic
     * composition of Hangul jamo into syllables.
     *
     * @param string $string valid UTF-8 string in decomposed, sorted form
     * @return string
     */
    public static function fastCompose( $string ) {
        self::loadData();
        global $utfCanonicalComp, $utfCombiningClass;
        $len = strlen( $string );
        $out = '';
        $lastClass = -1;
        $lastHangul = 0;
        $startChar = '';   # current starter, possibly already composed
        $combining = '';   # combiners after $startChar that did not compose
        $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
        $x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # No combining characters here...
                $out .= $startChar;
                $out .= $combining;
                $startChar = $c;
                $combining = '';
                $lastClass = 0;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            $pair = $startChar . $c;
            if( $n > 0x80 ) {
                if( isset( $utfCombiningClass[$c] ) ) {
                    # A combining char; see what we can do with it
                    $class = $utfCombiningClass[$c];
                    if( !empty( $startChar ) &&
                        $lastClass < $class &&
                        $class > 0 &&
                        isset( $utfCanonicalComp[$pair] ) ) {
                        $startChar = $utfCanonicalComp[$pair];
                        $class = 0;
                    } else {
                        $combining .= $c;
                    }
                    $lastClass = $class;
                    $lastHangul = 0;
                    continue;
                }
            }
            # New start char
            if( $lastClass == 0 ) {
                if( isset( $utfCanonicalComp[$pair] ) ) {
                    $startChar = $utfCanonicalComp[$pair];
                    $lastHangul = 0;
                    continue;
                }
                if( $n >= $x1 && $n <= $x2 ) {
                    # WARNING: Hangul code is painfully slow.
                    # The arithmetic is inlined because calling out to
                    # helper functions is even slower. Lookup tables are
                    # marginally faster, but require a lot of space.
                    #
                    if( $c >= UTF8_HANGUL_VBASE &&
                        $c <= UTF8_HANGUL_VEND &&
                        $startChar >= UTF8_HANGUL_LBASE &&
                        $startChar <= UTF8_HANGUL_LEND ) {
                        #
                        #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                        #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                        $lIndex = ord( $startChar[2] ) - 0x80;
                        $vIndex = ord( $c[2] ) - 0xa1;

                        $hangulPoint = UNICODE_HANGUL_FIRST +
                            UNICODE_HANGUL_TCOUNT *
                            (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

                        # Hardcode the limited-range UTF-8 conversion:
                        $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
                                     chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
                                     chr( $hangulPoint       & 0x3f | 0x80 );
                        $lastHangul = 0;
                        continue;
                    } elseif( $c >= UTF8_HANGUL_TBASE &&
                              $c <= UTF8_HANGUL_TEND &&
                              $startChar >= UTF8_HANGUL_FIRST &&
                              $startChar <= UTF8_HANGUL_LAST &&
                              !$lastHangul ) {
                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                        $tIndex = ord( $c[2] ) - 0xa7;
                        if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);

                        # A trailing index of zero would leave the syllable
                        # unchanged while still consuming the jamo, silently
                        # dropping it. Only merge for a positive index, as
                        # the Unicode composition algorithm requires;
                        # otherwise fall through and emit $c as a new start
                        # character. (Presumably only reachable when
                        # UTF8_HANGUL_TBASE encodes U+11A7 -- defined
                        # elsewhere.)
                        if( $tIndex > 0 ) {
                            # Increment the code point by $tIndex, without
                            # the function overhead of decoding and recoding UTF-8
                            #
                            $tail = ord( $startChar[2] ) + $tIndex;
                            if( $tail > 0xbf ) {
                                # Carry into the middle (and possibly head) byte.
                                $tail -= 0x40;
                                $mid = ord( $startChar[1] ) + 1;
                                if( $mid > 0xbf ) {
                                    $startChar[0] = chr( ord( $startChar[0] ) + 1 );
                                    $mid -= 0x40;
                                }
                                $startChar[1] = chr( $mid );
                            }
                            $startChar[2] = chr( $tail );

                            # If there's another jamo char after this, *don't* try to merge it.
                            $lastHangul = 1;
                            continue;
                        }
                    }
                }
            }
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            $lastHangul = 0;
        }
        $out .= $startChar . $combining;
        return $out;
    }

    /**
     * Copy a string byte by byte without changing it. Does no
     * normalization work at all.
     * NOTE(review): presumably a baseline for benchmarking the
     * per-byte loop overhead -- confirm against callers.
     *
     * @param string $string
     * @return string an identical copy of the input
     */
    public static function placebo( $string ) {
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $out .= $string[$i];
        }
        return $out;
    }
}