User:El~metawiki/History compression
History compression proposal: All revision texts are split into sections (the delimiter is "\n=="). Unchanged sections are stored only once. Sections are sorted by their headings. Then everything is compressed with deflate().
columns in the following table:
cd = concatenate-deflate (megabytes, old method)
smd = split-merge-deflate (megabytes, proposed method)
namespace | cd | smd | smd/cd |
---|---|---|---|
0 | 2599 | 1044 | 0.402 |
1 | 1126 | 164 | 0.146 |
2 | 140 | 59 | 0.421 |
3 | 771 | 102 | 0.132 |
4 | 5214 | 472 | 0.091 |
5 | 391 | 39 | 0.100 |
6 | 10 | 12 | 1.2 |
7 | 1 | 1 | 1 |
8 | 0 | 0 | |
9 | 0 | 0 | |
10 | 4 | 4 | 1 |
11 | 14 | 2 | 0.14 |
12 | 2 | 0 | 0 |
13 | 0 | 0 | |
14 | 2 | 2 | 1 |
15 | 7 | 1 | 0.14 |
10281 | 1902 | 0.185 |
dumps used: German Wikipedia, 2005-04-06
programs used:
extraction of data from the sql dumps: de:Bild:Mkzsf.ogg (set $ar=1)
compression (both methods):
#!/usr/bin/perl use Compress::Zlib; use Digest::MD5 qw(md5_base64); use bytes; %unesc = ("\\" => "\\", "n" => "\n", "r" => "\r", "0" => "\0", "Z" => "\x1A", "\"" => "\"", "'" => "'"); exit unless -f $ARGV[0]; open IN, "zcat $ARGV[0]|"; open SMD, ">$ARGV[0].smd"; open CD, ">$ARGV[0].cd"; $nstitle_last=""; $blobnr=0; while(<IN>) { chop; /^(\d+ \S+) \d+ \S+ \S+ \d \d \d+ \d (.*)$/; $nstitle=$1; $txt = $2; if(($nstitle ne $nstitle_last && $nRevisions>0) || $nRevisions==20) { $nSections = $section_nr; writeBlob() if $blobnr%10==0; $blobnr++; $section_nr=0; $heading_nr=0; $concat=""; %section = (); %heading_nr = (); %section_nr = (); @sections_md5 = (); $nRevisions=0; $index = ""; $rev_nr=0; } $nstitle_last = $nstitle; $nRevisions++; next if $blobnr%10>0; # only testing, no need to save every blob $txt =~ s/\\(.)/$unesc{$1}/g; $concat .= $txt; @index = (); @rev_sections = split(/\n==/, $txt); for($i=0; $i<@rev_sections; $i++) { if($i==0) { $section = $rev_sections[$i]; $heading = ""; } else { $section = "\n==".$rev_sections[$i]; $section =~ /^\n==[^\n]*?/s; $heading = $&; } $section = ($i==0?"":"\n==").$rev_sections[$i]; if($section =~ /^\n==[^\n]*?/s) { $heading = $&; } else { $heading = ""; } $section_md5 = md5_base64($section); $section{$section_md5} = $section; $heading_nr{$heading} = $heading_nr++ unless defined $heading_nr{$heading}; unless (defined $section_nr{$section_md5}) { $section_nr{$section_md5} = $section_nr++; push @{$sections_md5[$heading_nr{$heading}]}, $section_md5; } push @index, $section_nr{$section_md5}; } $index .= join(" ", @index) . "\n"; } close IN; close SMD; close CD; sub writeBlob { my $sections = ""; my $pos=0; my %poslen; foreach my $sections_md5 (@sections_md5) { foreach my $section_md5 (@$sections_md5) { my $len = length $section{$section_md5}; $sections .= $section{$section_md5}; $poslen[$section_nr{$section_md5}] = "$pos $len\n"; $pos += $len; } } my $poslen = ""; for(my $section_nr=0; $section_nr<$nSections; $section_nr++) { $poslen .= $poslen[$section_nr]; } my $index_len = length($index); my $poslen_len = length($poslen); my $blob = sprintf "%08d %08d %08d %08d %08d\n", $nRevisions, $nSections, $index_len, $poslen_len, 1; $blob .= "$index$poslen$sections"; my ($refinf, $status) = deflateInit(); my ($blobgz_a, $status) = $refinf->deflate($blob); my ($blobgz_b, $status) = $refinf->flush(); print SMD $blobgz_a.$blobgz_b; my ($cd_a, $cd_b); ($refinf, $status) = deflateInit(); ($cd_a, $status) = $refinf->deflate($concat); ($cd_b, $status) = $refinf->flush(); print CD $cd_a.$cd_b; }
decompression (This program decompresses exactly one Blob.):
#!/usr/bin/perl use bytes; use Compress::Zlib; open IN, $ARGV[0]; while(<IN>) { $buf .= $_; } close IN; ($refinf, $status) = inflateInit(); ($buf, $status) = $refinf->inflate($buf); #print $buf; ($nRevisions, $nSections, $index_len, $poslen_len, $defaultRevision) = $buf =~ /^(\d{8}) (\d{8}) (\d{8}) (\d{8}) (\d{8})\n/s; @index = split(/\n/, substr($buf, 45, $index_len)); @poslen = split(/\n/, substr($buf, 45+$index_len, $poslen_len)); $pos0 = 45+$index_len+$poslen_len; open RAW, ">$ARGV[0].raw"; for($revision=0; $revision<$nRevisions; $revision++) { @section_nr = split(/ /, $index[$revision]); $txt = ""; foreach $section_nr (@section_nr) { ($pos, $len) = split(/ /, $poslen[$section_nr]); $txt .= substr($buf, $pos0+$pos, $len); } print RAW $txt; } close RAW;