00001 <?php
00002
/**
 * Command-line test of blob compression over the revisions of a single page.
 *
 * Adds the page's revision texts to a blob object, serializes it, prints the
 * achieved ratio and timing, then unserializes it and checks every text
 * against its md5.
 */

// Names of the options that carry a value. Declared before the include below;
// presumably commandLine.inc consults this list while parsing argv -- confirm there.
$optionsWithArgs = array( 'start', 'limit', 'type' );
require( dirname( __FILE__ ) . '/../commandLine.inc' );

// The page title is the only mandatory positional argument.
if ( !isset( $args[0] ) ) {
	$usage = "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
	echo $usage;
	exit( 1 );
}
00010
// Resolve the page whose revisions will be fed to the compressor.
$title = Title::newFromText( $args[0] );
if ( !$title ) {
	// An unparseable title does not yield a Title object; stop here with a
	// readable message instead of failing later on $title->getNamespace().
	echo "Invalid page title: {$args[0]}\n";
	exit( 1 );
}

// --start: only revisions with a timestamp later than this date are read.
// Without it the Unix epoch is used, i.e. the whole history qualifies.
if ( isset( $options['start'] ) ) {
	$startUnix = strtotime( $options['start'] );
	if ( $startUnix === false ) {
		// strtotime() reports an unparseable date with false; reject it
		// rather than passing a non-timestamp on to wfTimestamp().
		echo "Unable to parse start date: {$options['start']}\n";
		exit( 1 );
	}
	$start = wfTimestamp( TS_MW, $startUnix );
	echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
} else {
	$start = '19700101000000';
}

// --limit: hard cap on the number of revisions read. Without it, at most
// 1000 revisions are read and $untilHappy makes the compression loop stop as
// soon as the blob's isHappy() check fails.
if ( isset( $options['limit'] ) ) {
	$limit = intval( $options['limit'] );
	if ( $limit <= 0 ) {
		echo "Invalid limit (expected a positive integer): {$options['limit']}\n";
		exit( 1 );
	}
	$untilHappy = false;
} else {
	$limit = 1000;
	$untilHappy = true;
}

// --type: name of the blob class that will be instantiated and measured.
$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';
00026
00027
// Read the page's revisions joined to their text rows over the DB_SLAVE
// connection. Every column is selected so that each result row can be handed
// to the Revision constructor unchanged.
$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
	array( 'page', 'revision', 'text' ),
	'*',
	array(
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
		'page_id=rev_page',
		'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
		'rev_text_id=old_id'
	),
	__FILE__,
	array(
		// LIMIT on its own leaves the choice and order of rows to the
		// database. Ordering by timestamp makes every run read the oldest
		// revisions after $start, in history order.
		'ORDER BY' => 'rev_timestamp',
		'LIMIT' => $limit
	)
);
00040
// $type comes straight from the command line; check that the class can be
// loaded so a mistyped --type gives a readable message instead of a fatal
// error on instantiation.
if ( !class_exists( $type ) ) {
	echo "Unknown blob class: $type\n";
	exit( 1 );
}

$blob = new $type;
$hashes = array(); // rev_id => md5 of the original revision text
$keys = array();   // rev_id => key returned by the blob's addItem()
$uncompressedSize = 0;

// Timing idiom: start with the negated start time, add the end time later.
$t = -microtime( true );
foreach ( $res as $row ) {
	$revision = new Revision( $row );
	$text = $revision->getText();
	$uncompressedSize += strlen( $text );
	$hashes[$row->rev_id] = md5( $text );
	$keys[$row->rev_id] = $blob->addItem( $text );
	// With no explicit --limit, stop once the blob reports it is no longer
	// "happy"; the revision that triggered this has already been added.
	if ( $untilHappy && !$blob->isHappy() ) {
		break;
	}
}

// serialize() runs before the timer is stopped, so its cost is included in
// the reported compression time.
$serialized = serialize( $blob );
$t += microtime( true );
00059 #print_r( $blob->mDiffMap );
00060
// Report the result: blob class, number of revisions added, the ratio of raw
// text bytes to serialized bytes, both sizes, and the elapsed time.
$compressedSize = strlen( $serialized );
$revisionCount = count( $hashes );
$ratio = $uncompressedSize / $compressedSize;

printf(
	"%s\nCompression ratio for %d revisions: %5.2f, %s -> %d\n",
	$type,
	$revisionCount,
	$ratio,
	$wgLang->formatSize( $uncompressedSize ),
	$compressedSize
);

// $t holds seconds at this point; convert to milliseconds for display.
$elapsedMs = $t * 1000;
printf( "Compression time: %5.2f ms\n", $elapsedMs );
00069
// Round-trip check: rebuild the blob from its serialized form and confirm
// that every stored revision comes back with the md5 recorded before
// compression. The whole pass, including unserialize(), is timed.
$t = -microtime( true );
$blob = unserialize( $serialized );
if ( !is_object( $blob ) ) {
	// unserialize() did not give back an object, so there is nothing to
	// call getItem() on.
	echo "Unable to unserialize the compressed blob\n";
	exit( 1 );
}
foreach ( $keys as $id => $key ) {
	$text = $blob->getItem( $key );
	// Strict comparison is required: with != two different digests that both
	// look numeric (for example "0e" followed only by digits) are compared
	// as numbers and treated as equal, hiding a real mismatch.
	if ( md5( $text ) !== $hashes[$id] ) {
		echo "Content hash mismatch for rev_id $id\n";
		#var_dump( $text );
	}
}
$t += microtime( true );
printf( "Decompression time: %5.2f ms\n", $t * 1000 );
00081