00001 <?php
00002 
# This script only makes sense from a shell: bail out under any other SAPI.
if( 'cli' !== php_sapi_name() ) {
        echo "Please customize the settings and run me from the command line.";
        exit( -1 );
}

# Encoding of the source data (input side of the iconv() call in recodeText())
# and the directory holding the UseModWiki "page" and "keep" trees.
$wgImportEncoding = 'CP1252';
$wgRootDirectory = '/kalman/Projects/wiki2002/wiki/lib-http/db/wiki';

# Best effort only: failures to raise the limit are deliberately ignored.
@ini_set( 'memory_limit', '40M' );

# Base separator byte; some wikis may use a different character.
$wgFieldSeparator = "\xb3";

# Derived separators, in the order the readers apply them:
# $FS1 splits the file, $FS2 a section, $FS3 the text record.
$FS = $wgFieldSeparator;
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";
00047 
00048 # Unicode sanitization tools
00049 require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );
00050 
# Lookup table consulted by checkUserCache(), indexed by user name.
# It starts out empty and nothing in the visible source adds entries to it.
$usercache = array();

# Entry point: write the complete XML export to standard output.
importPages();
00054 
00055 # ------------------------------------------------------------------------------
00056 
/**
 * Print the whole export document to standard output.
 *
 * Emits the XML declaration and the opening <mediawiki> element, walks the
 * per-letter page directories ("A".."Z" plus "other") below
 * $wgRootDirectory/page, hands every directory that exists to
 * importPageDirectory(), and finally closes the root element.
 */
function importPages()
{
        global $wgRootDirectory;

        /* The final character of the XML declaration is interpolated from a
           variable so the closing-tag sequence is never spelled out here. */
        $gt = '>';
        echo <<<END
<?xml version="1.0" encoding="UTF-8" ?$gt
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

END;

        # One directory per initial letter, plus a catch-all bucket.
        $buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
        foreach( $buckets as $bucket ) {
                $path = $wgRootDirectory . '/page/' . $bucket;
                if( !is_dir( $path ) ) {
                        continue;
                }
                importPageDirectory( $path );
        }

        echo "</mediawiki>\n";
}
00087 
/**
 * Export every page found in a UseModWiki page directory, recursing into
 * subdirectories.
 *
 * Entries named "<something>.db" are passed to importPage() and the XML it
 * returns is echoed. Subdirectories are scanned recursively. Anything else
 * is reported in an XML comment and skipped.
 *
 * @param string $dir    Directory to scan.
 * @param string $prefix Title prefix (ending in "/") prepended to the page
 *                       names found in $dir; empty for a top-level directory.
 */
function importPageDirectory( $dir, $prefix = "" )
{
        echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";

        $handle = opendir( $dir );
        if( $handle === false ) {
                # Unreadable directory: report it and carry on with the rest.
                echo "<!-- Couldn't open directory '" . xmlCommentSafe( $dir ) . "'. Skipping. -->\n";
                return;
        }

        # Compare against false explicitly: an entry named "0" is falsy and
        # would otherwise end the scan early.
        while( ( $entry = readdir( $handle ) ) !== false ) {
                $m = array();
                if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
                        echo importPage( $prefix . $m[1] );
                } elseif( is_dir( "$dir/$entry" ) ) {
                        if( $entry != '.' && $entry != '..' ) {
                                # Keep the prefix accumulated so far, so that
                                # pages nested more than one level deep still
                                # get their full title.
                                importPageDirectory( "$dir/$entry", $prefix . "$entry/" );
                        }
                } else {
                        echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
                }
        }

        closedir( $handle );
}
00107 
00108 
00109 # ------------------------------------------------------------------------------
00110 
00111 
00112 
00113 
00114 
/**
 * Map a page title to its path below the "page" or "keep" directory.
 *
 * Titles whose first byte is an ASCII letter are filed under that letter in
 * upper case; every other title (including the empty one) goes to "other".
 *
 * @param string $title Page title.
 * @return string "<bucket>/<title>", without a file extension.
 */
function useModFilename( $title ) {
        $initial = substr( $title, 0, 1 );
        $bucket = preg_match( '/[A-Z]/i', $initial ) ? strtoupper( $initial ) : 'other';
        return $bucket . '/' . $title;
}
00122 
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is decoded in three steps with splitHash(): the file itself
 * ($FS1), its "text_default" section ($FS2) and that section's "data"
 * record ($FS3). The script terminates if the file is missing or unreadable.
 *
 * @param string $title Page title.
 * @return object Object with the properties text, summary, minor, ts,
 *                username and host. A field absent from the file is null.
 */
function fetchPage( $title )
{
        global $FS1,$FS2,$FS3, $wgRootDirectory;

        $fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
        $raw = file_exists( $fname ) ? file_get_contents( $fname ) : false;
        if( $raw === false ) {
                echo "Couldn't open file '$fname' for page '$title'.\n";
                die( -1 );
        }

        # Look every key up defensively: a truncated or malformed file must
        # not spray PHP warnings into the XML being written to stdout.
        $page = splitHash( $FS1, $raw );
        $section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : '' );
        $text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : '' );

        $fields = array();
        foreach( array( "text", "summary", "minor" ) as $key ) {
                $fields[$key] = isset( $text[$key] ) ? $text[$key] : null;
        }
        foreach( array( "ts", "username", "host" ) as $key ) {
                $fields[$key] = isset( $section[$key] ) ? $section[$key] : null;
        }
        return array2object( $fields );
}
00141 
/**
 * Load the old revisions of a page from its ".kp" file.
 *
 * The file is split on $FS1 and the leading chunk is discarded. Each
 * remaining chunk is decoded with splitHash() ($FS2, then $FS3 on its "data"
 * field). A revision is kept only if it has non-empty text, a non-empty
 * "minor" field and a positive numeric timestamp; every other chunk is
 * reported in an XML comment and dropped.
 *
 * @param string $title Page title.
 * @return array List of objects with the properties text, summary, minor,
 *               ts, username and host; empty if there is no ".kp" file.
 */
function fetchKeptPages( $title )
{
        global $FS1,$FS2,$FS3, $wgRootDirectory;

        $fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
        if( !file_exists( $fname ) ) return array();

        $raw = file_get_contents( $fname );
        if( $raw === false ) return array();

        $keptlist = explode( $FS1, $raw );
        array_shift( $keptlist ); # Drop the junk at beginning of file

        $revisions = array();
        foreach( $keptlist as $rev ) {
                $section = splitHash( $FS2, $rev );
                $text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : '' );

                # isset()/empty() keep malformed chunks from raising warnings,
                # and is_numeric() keeps a garbage timestamp out of arithmetic
                # (which throws on non-numeric strings in PHP 8).
                $usable = !empty( $text["text"] )
                        && isset( $text["minor"] ) && $text["minor"] != ""
                        && isset( $section["ts"] ) && is_numeric( $section["ts"] )
                        && $section["ts"] > 0;

                if( !$usable ) {
                        echo "<!-- skipped a bad old revision -->\n";
                        continue;
                }

                $revisions[] = array2object( array(
                        "text"     => $text["text"],
                        "summary"  => isset( $text["summary"] ) ? $text["summary"] : null,
                        "minor"    => $text["minor"],
                        "ts"       => $section["ts"],
                        "username" => isset( $section["username"] ) ? $section["username"] : null,
                        "host"     => isset( $section["host"] ) ? $section["host"] : null,
                        ) );
        }
        return $revisions;
}
00166 
/**
 * Decode a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * Pieces are consumed pairwise; a trailing piece without a partner is
 * ignored. If a key occurs more than once, the last value wins.
 *
 * @param string $sep Separator between the pieces.
 * @param string $str Encoded string.
 * @return array Key => value map.
 */
function splitHash ( $sep , $str ) {
        $pieces = explode( $sep, $str );
        $total = count( $pieces );
        $hash = array();
        # $k always points at the value; its key is the piece just before it.
        for( $k = 1; $k < $total; $k += 2 ) {
                $hash[$pieces[$k - 1]] = $pieces[$k];
        }
        return $hash;
}
00175 
00176 
00177 
00178 
00179 
00180 
/**
 * Work out the user id and display name to credit a revision to.
 *
 * @param string $name Stored user name; an empty value means the edit is
 *                     credited to $host instead.
 * @param string $host Host recorded for the edit.
 * @return array array( user id, user name ). The id is 0 when the edit has
 *               no user name or the name is not in $usercache. Underscores
 *               in the name are turned into spaces.
 */
function checkUserCache( $name, $host )
{
        global $usercache;

        if( !$name ) {
                return array( 0, $host );
        }

        # The cache is indexed by user name, so test the key. in_array()
        # searches the values and therefore never found a cached user.
        if( is_array( $usercache ) && isset( $usercache[$name] ) ) {
                $userid = $usercache[$name];
        } else {
                # If we haven't imported user accounts
                $userid = 0;
        }
        return array( $userid, str_replace( '_', ' ', $name ) );
}
00199 
/**
 * Build the <page> XML element for one page, with all of its revisions.
 *
 * The revisions are written in this order: the current revision from
 * fetchPage(); a synthetic "link fix" revision if mungeFormat() changes the
 * current text; then the old revisions from fetchKeptPages(). A progress
 * comment is echoed directly, while the page element itself is returned.
 *
 * @param string $title Page title as stored on disk.
 * @return string XML for the page.
 */
function importPage( $title )
{
        echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
        $page = fetchPage( $title );

        $newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

        # The current revision is always present, so the list is never empty.
        $revisions = array( $page );

        $munged = mungeFormat( $page->text );
        if( $munged != $page->text ) {
                # Record the link conversion as an extra minor edit made now,
                # leaving the original text untouched in the history.
                $revisions[] = array2object( array(
                        'text'     => $munged,
                        'minor'    => 1,
                        'username' => 'Conversion script',
                        'host'     => '127.0.0.1',
                        'ts'       => time(),
                        'summary'  => 'link fix',
                        ) );
        }

        # History
        $revisions = array_merge( $revisions, fetchKeptPages( $title ) );

        $xml = <<<END
        <page>
                <title>$newtitle</title>

END;

        foreach( $revisions as $rev ) {
                $text      = xmlsafe( recodeText( $rev->text ) );
                $minor     = ($rev->minor ? '<minor/>' : '');
                # Only the display name is exported; the user id is unused.
                list(  , $username ) = checkUserCache( $rev->username, $rev->host );
                $username  = xmlsafe( recodeText( $username ) );
                $timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
                $comment   = xmlsafe( recodeText( $rev->summary ) );

                $xml .= <<<END
                <revision>
                        <timestamp>$timestamp</timestamp>
                        <contributor><username>$username</username></contributor>
                        $minor
                        <comment>$comment</comment>
                        <text>$text</text>
                </revision>

END;
        }
        $xml .= "</page>\n\n";
        return $xml;
}
00264 
/**
 * Prepare stored wiki text for export: normalise line endings, convert from
 * $wgImportEncoding to UTF-8, and expand numeric character references.
 *
 * @param string $string Text in the source wiki's encoding.
 * @return string Converted text.
 */
function recodeText( $string ) {
        global $wgImportEncoding;

        $string = str_replace( "\r\n", "\n", (string)$string );

        # iconv() returns false when the input has a byte with no mapping in
        # the source encoding. Keep the unconverted text in that case rather
        # than silently replacing the whole string with nothing.
        $converted = @iconv( $wgImportEncoding, "UTF-8", $string );
        if( $converted !== false ) {
                $string = $converted;
        }

        # Expand numeric character references such as "&#1234;" or "&#x4D2;"
        return wfMungeToUtf8( $string );
}
00274 
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $codepoint Code point to encode.
 * @return string One to four bytes for values up to U+10FFFF. Larger values
 *                cannot be encoded and come back as the decimal character
 *                reference "&#N;".
 */
function wfUtf8Sequence($codepoint) {
        # One byte: 0xxxxxxx
        if($codepoint <     0x80) return chr($codepoint);

        # Two bytes: 110xxxxx 10xxxxxx
        if($codepoint <    0x800) return chr($codepoint >>  6 & 0x3f | 0xc0) .
                                         chr($codepoint       & 0x3f | 0x80);

        # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
                                         chr($codepoint >>  6 & 0x3f | 0x80) .
                                         chr($codepoint       & 0x3f | 0x80);

        # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        # The bound is one past U+10FFFF, the highest Unicode code point.
        if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
                                         chr($codepoint >> 12 & 0x3f | 0x80) .
                                         chr($codepoint >>  6 & 0x3f | 0x80) .
                                         chr($codepoint       & 0x3f | 0x80);

        # Beyond the Unicode range: leave it as a character reference
        return "&#$codepoint;";
}
00289 
/**
 * Replace decimal ("&#233;") and hexadecimal ("&#xE9;") character references
 * with the UTF-8 encoding of the character they name.
 *
 * References above U+10FFFF are left untouched. Named entities are not
 * handled.
 *
 * @param string $string Text that may contain numeric character references.
 * @return string Text with those references expanded.
 */
function wfMungeToUtf8($string) {
        # Callbacks replace the /e (eval) pattern modifier, which no longer
        # exists in PHP 7 and later.
        $string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
        $string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
        # Should also do named entities here
        return $string;
}

/**
 * preg_replace_callback() helper for wfMungeToUtf8(): expand one decimal
 * character reference.
 *
 * @param array $matches Full match in [0], the digits in [1].
 * @return string Replacement text.
 */
function wfMungeDecimalEntity( $matches ) {
        # Convert explicitly as base 10, so leading zeros ("&#0123;") are not
        # read as octal. Going through float keeps huge values comparable.
        $value = (float)$matches[1];
        if( $value > 0x10FFFF ) {
                return $matches[0];
        }
        return wfUtf8Sequence( (int)$value );
}

/**
 * preg_replace_callback() helper for wfMungeToUtf8(): expand one hexadecimal
 * character reference.
 *
 * @param array $matches Full match in [0], the hex digits in [1].
 * @return string Replacement text.
 */
function wfMungeHexEntity( $matches ) {
        # hexdec() returns a float for values too large for an integer.
        $value = hexdec( $matches[1] );
        if( $value > 0x10FFFF ) {
                return $matches[0];
        }
        return wfUtf8Sequence( (int)$value );
}
00296 
/**
 * Format a Unix timestamp as a UTC date and time, e.g. 2003-08-05T18:30:02Z.
 *
 * @param int $ts Seconds since the Unix epoch.
 * @return string "YYYY-MM-DDTHH:MM:SSZ".
 */
function timestamp2ISO8601( $ts ) {
        # "T" and "Z" are format characters to gmdate(), so both are escaped
        # to come out as literals.
        return gmdate( 'Y-m-d\TH:i:s\Z', $ts );
}
00301 
/**
 * Make a string safe to embed as XML character data.
 *
 * The text first goes through UtfNormal::cleanUp() and the result is then
 * escaped with htmlspecialchars() using its default flags.
 * NOTE(review): UtfNormal is defined in includes/normal/UtfNormal.php, which
 * is not visible here; presumably cleanUp() removes invalid UTF-8. Confirm.
 *
 * @param string $string Text to escape.
 * @return string Escaped text.
 */
function xmlsafe( $string ) {
        return htmlspecialchars( UtfNormal::cleanUp( $string ) );
}
00313 
/**
 * Prepare text for use inside an XML comment.
 *
 * The text is recoded and escaped like any other exported text, then every
 * double hyphen is broken up with backslashes, because "--" is not allowed
 * inside an XML comment.
 *
 * @param string $text Raw text in the source wiki's encoding.
 * @return string Text that contains no "--" sequence.
 */
function xmlCommentSafe( $text ) {
        $safe = xmlsafe( recodeText( $text ) );

        # A single pass is not enough for an odd run of hyphens: "---" turns
        # into "\-\--", which still ends in "--". Repeat until none are left.
        while( strpos( $safe, '--' ) !== false ) {
                $safe = str_replace( '--', '\\-\\-', $safe );
        }
        return $safe;
}
00317 
00318 
/**
 * Turn an associative array into a plain object with one property per key.
 *
 * @param array $arr Property name => value.
 * @return object stdClass instance holding exactly the given properties.
 */
function array2object( $arr ) {
        # Start from an empty object. Casting a number with (object) would
        # add an unwanted "scalar" property to every result.
        $o = new stdClass();
        foreach( $arr as $x => $y ) {
                $o->$x = $y;
        }
        return $o;
}
00326 
00327 
/**
 * Wrap UseModWiki-style CamelCase links (and "/Subpage" links) in double
 * square brackets.
 *
 * Works in three steps:
 *  1. <nowiki> sections, http/https/ftp URLs and existing [[...]] links are
 *     swapped for a placeholder token and remembered in the global $nowiki.
 *  2. CamelCase words and subpage links in what remains get bracketed.
 *  3. The placeholders are replaced by the remembered text, in order.
 *
 * @param string $text Wiki text in UseModWiki syntax.
 * @return string Text with the links bracketed.
 */
function mungeFormat( $text ) {
        global $nowiki;
        $nowiki = array();
        $staged = preg_replace_callback(
                '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
                'nowikiPlaceholder', $text );

        # This is probably not  100% correct, I'm just
        # glancing at the UseModWiki code.
        $upper   = "[A-Z]";
        $lower   = "[a-z_0-9]";
        $any     = "[A-Za-z_0-9]";
        $camel   = "(?:$upper+$lower+$upper+$any*)";
        $subpage = "(?:\\/$any+)";
        $substart = "(?:\\/$upper$any*)";

        $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
                '[[$1]]', $staged );

        # A callback replaces the /e (eval) pattern modifier, which no longer
        # exists in PHP 7 and later.
        $final = preg_replace_callback( '/' . preg_quote( placeholder(), '/' ) . '/s',
                'nowikiRestore', $munged );
        return $final;
}


/**
 * Token that temporarily stands in for protected text in mungeFormat().
 *
 * The string is single-quoted, so it is the literal 19 characters shown
 * (with backslashes), not text wrapped in 0xFF bytes.
 *
 * @param mixed $x Unused.
 * @return string The placeholder token.
 */
function placeholder( $x = null ) {
        return '\xffplaceholder\xff';
}

/**
 * preg_replace_callback() helper for mungeFormat(): remember the protected
 * text in $nowiki and put the placeholder token in its place.
 *
 * @param array $matches Regex match; [1] is the text to protect.
 * @return string The placeholder token.
 */
function nowikiPlaceholder( $matches ) {
        global $nowiki;
        $nowiki[] = $matches[1];
        return placeholder();
}

/**
 * preg_replace_callback() helper for mungeFormat(): put the next remembered
 * piece of protected text back in place of a placeholder token.
 *
 * @param array $matches Regex match; [0] is the placeholder token.
 * @return string The protected text, or the token itself if nothing is left.
 */
function nowikiRestore( $matches ) {
        global $nowiki;
        if( !$nowiki ) {
                # More tokens than saved pieces (the input contained the token
                # text itself): leave this one as it is.
                return $matches[0];
        }
        return array_shift( $nowiki );
}
00365 
00366