00001 <?php
00002 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
00003 # http://www.mediawiki.org/
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License as published by
00007 # the Free Software Foundation; either version 2 of the License, or
00008 # (at your option) any later version.
00009 #
00010 # This program is distributed in the hope that it will be useful,
00011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00013 # GNU General Public License for more details.
00014 #
00015 # You should have received a copy of the GNU General Public License along
00016 # with this program; if not, write to the Free Software Foundation, Inc.,
00017 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
00018 # http://www.gnu.org/copyleft/gpl.html
00019
# This is a command-line generator; stop immediately under any other SAPI.
if( PHP_SAPI !== 'cli' ) {
    exit( "Run me from the command line please.\n" );
}
00033
00034 require_once 'UtfNormalUtil.php';
00035
# Build the NFC quick-check table from DerivedNormalizationProps.txt.
# $checkNFC maps the UTF-8 encoding of every code point that appears on an
# "NFC_QC" line with value M or N to that value letter. Code points that
# never appear on such a line are simply absent from the table.
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if( !$in ) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    # Point at the file that is actually missing (the hint used to name
    # CompositionExclusions.txt here).
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while( false !== ( $line = fgets( $in ) ) ) {
    $matches = array();
    # Accepted shapes: "XXXX ; NFC_QC ; N" and "XXXX..YYYY ; NFC_QC ; M".
    # The range separator is two literal dots, so both are escaped.
    if( !preg_match( '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*NFC_QC\s*;\s*([MN])/', $line, $matches ) ) {
        continue;
    }
    $first = hexdec( $matches[1] );
    # No range end captured: the line describes a single code point.
    $last = ( $matches[2] === '' ) ? $first : hexdec( $matches[2] );
    $value = $matches[3];
    # Range bounds are converted once, outside the loop condition.
    for( $i = $first; $i <= $last; $i++ ) {
        $checkNFC[codepointToUtf8( $i )] = $value;
    }
}
fclose( $in );
00058
# Collect the code points listed in CompositionExclusions.txt.
# $exclude is keyed by the UTF-8 encoding of each listed code point; the
# value is always true so the table can be probed with empty()/isset().
$in = fopen( "CompositionExclusions.txt", "rt" );
if( $in === false ) {
    print "Can't open CompositionExclusions.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit(-1);
}
$exclude = array();
for( $line = fgets( $in ); $line !== false; $line = fgets( $in ) ) {
    # Only lines that start with a hexadecimal code point are entries.
    if( !preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
        continue;
    }
    $codepoint = $matches[1];
    $source = codepointToUtf8( hexdec( $codepoint ) );
    $exclude[$source] = true;
}
fclose( $in );
00075
# Read UnicodeData.txt and build the raw, single-level tables:
#   $combiningClass      - character => non-zero canonical combining class
#   $canonicalDecomp     - character => untagged (canonical) decomposition
#   $compatibilityDecomp - character => decomposition, tagged or untagged
#   $canonicalComp       - canonical decomposition => character, skipping
#                          characters present in $exclude
# Characters and decompositions are stored as UTF-8 strings.
$in = fopen( "UnicodeData.txt", "rt" );
if( !$in ) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while( false !== ( $line = fgets( $in ) ) ) {
    # split() was removed in PHP 7; the delimiter is a plain string, so
    # explode() yields the same fields.
    $columns = explode( ';', $line );
    if( count( $columns ) < 6 ) {
        # Blank or truncated line: the fields read below do not exist.
        continue;
    }
    $codepoint = $columns[0];
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8( hexdec( $codepoint ) );

    # Only non-zero combining classes are recorded.
    $canonicalCombiningClass = intval( $columns[3] );
    if( $canonicalCombiningClass !== 0 ) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if( $decompositionMapping === '' ) continue;
    if( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
        # A leading <tag> marks a compatibility decomposition; keep only
        # the code point sequence that follows the tag.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8( $decompositionMapping );

    # The compatibility table receives every mapping, canonical ones included.
    $compatibilityDecomp[$source] = $dest;
    if( $canonical ) {
        $canonicalDecomp[$source] = $dest;
        if( empty( $exclude[$source] ) ) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose( $in );
00129
# A decomposition may itself contain characters that decompose further.
# Rewrite every entry through callbackCanonical and repeat until one
# complete pass over the table changes nothing.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach( $canonicalDecomp as $char => $expansion ) {
        # Each multi-byte sequence (lead byte plus continuation bytes) is
        # handed to the callback for substitution.
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $expansion );
        if( $expanded !== $expansion ) {
            $changed++;
            $canonicalDecomp[$char] = $expanded;
        }
    }
    $pass++;
} while( $changed > 0 );
00147
# Same fixed-point expansion as for the canonical table, applied to the
# compatibility table through callbackCompat: keep making passes until a
# pass leaves every entry unchanged.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach( $compatibilityDecomp as $char => $expansion ) {
        # Each multi-byte sequence (lead byte plus continuation bytes) is
        # handed to the callback for substitution.
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $expansion );
        if( $expanded !== $expansion ) {
            $changed++;
            $compatibilityDecomp[$char] = $expanded;
        }
    }
    $pass++;
} while( $changed > 0 );

# Summary of the mappings counted while reading the character definitions.
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
00167
# Write the combining class, composition, canonical decomposition and
# quick-check tables to UtfNormalData.inc. Each table is serialize()d,
# passed through escapeSingleString() and embedded in a single-quoted
# literal that the generated file unserialize()s into a global.
$out = fopen( "UtfNormalData.inc", "wt" );
if( !$out ) {
    print "Can't create file UtfNormalData.inc\n";
    exit(-1);
}

$serCombining = escapeSingleString( serialize( $combiningClass ) );
$serComp = escapeSingleString( serialize( $canonicalComp ) );
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );

# The PHP open/close tags are assembled from two pieces each so they never
# appear verbatim in this script's own source.
$outdata = "<" . "?php
global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
\$utfCombiningClass = unserialize( '{$serCombining}' );
\$utfCanonicalComp = unserialize( '{$serComp}' );
\$utfCanonicalDecomp = unserialize( '{$serCanon}' );
\$utfCheckNFC = unserialize( '{$serCheckNFC}' );
?" . ">\n";

fputs( $out, $outdata );
fclose( $out );
print "Wrote out UtfNormalData.inc\n";
00193
00194
# Write the compatibility decomposition table to UtfNormalDataK.inc using
# a serialize()d string passed through escapeSingleString(), then end the
# script with exit status 0.
$out = fopen( "UtfNormalDataK.inc", "wt" );
if( !$out ) {
    print "Can't create file UtfNormalDataK.inc\n";
    exit(-1);
}

$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );

# The PHP open/close tags are assembled from two pieces each so they never
# appear verbatim in this script's own source.
$outdata = "<" . "?php
global \$utfCompatibilityDecomp;
\$utfCompatibilityDecomp = unserialize( '{$serCompat}' );
?" . ">\n";

fputs( $out, $outdata );
fclose( $out );
print "Wrote out UtfNormalDataK.inc\n";
exit(0);
00215
00216 # ---------------
00217
# preg_replace_callback() handler used while expanding canonical mappings.
# $matches[1] holds the captured character sequence. Returns that
# sequence's entry in the global $canonicalDecomp table when one is set,
# otherwise the sequence unchanged.
function callbackCanonical( $matches ) {
    global $canonicalDecomp;
    $char = $matches[1];
    return isset( $canonicalDecomp[$char] ) ? $canonicalDecomp[$char] : $char;
}
00225
# preg_replace_callback() handler used while expanding compatibility
# mappings. $matches[1] holds the captured character sequence. Returns that
# sequence's entry in the global $compatibilityDecomp table when one is
# set, otherwise the sequence unchanged.
function callbackCompat( $matches ) {
    global $compatibilityDecomp;
    $char = $matches[1];
    return isset( $compatibilityDecomp[$char] ) ? $compatibilityDecomp[$char] : $char;
}