bbwr_file_compression.pl 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 14 June 2023
  4. # Edit: 19 March 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using Binary Burrows-Wheeler Transform (BWT) + Binary Variable Run-Length Encoding.
  7. # References:
  8. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  9. # https://youtube.com/watch?v=cvoZbBZ3M2A
  10. #
  11. # Data Compression (Summer 2023) - Lecture 5 - Basic Techniques
  12. # https://youtube.com/watch?v=TdFWb8mL5Gk
  13. use 5.036;
  14. use Getopt::Std qw(getopts);
  15. use File::Basename qw(basename);
  16. use List::Util qw(max uniq);
  17. use constant {
  18. PKGNAME => 'BBWR',
  19. VERSION => '0.02',
  20. FORMAT => 'bbwr',
  21. CHUNK_SIZE => 1 << 13, # larger values == better compression
  22. LOOKAHEAD_LEN => 128,
  23. };
  24. # Container signature
  25. use constant SIGNATURE => uc(FORMAT) . chr(2);
  26. sub usage {
  27. my ($code) = @_;
  28. print <<"EOH";
  29. usage: $0 [options] [input file] [output file]
  30. options:
  31. -e : extract
  32. -i <filename> : input filename
  33. -o <filename> : output filename
  34. -r : rewrite output
  35. -v : version number
  36. -h : this message
  37. examples:
  38. $0 document.txt
  39. $0 document.txt archive.${\FORMAT}
  40. $0 archive.${\FORMAT} document.txt
  41. $0 -e -i archive.${\FORMAT} -o document.txt
  42. EOH
  43. exit($code // 0);
  44. }
  45. sub version {
  46. printf("%s %s\n", PKGNAME, VERSION);
  47. exit;
  48. }
  49. sub valid_archive {
  50. my ($fh) = @_;
  51. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  52. $sig eq SIGNATURE || return;
  53. }
  54. return 1;
  55. }
  56. sub main {
  57. my %opt;
  58. getopts('ei:o:vhr', \%opt);
  59. $opt{h} && usage(0);
  60. $opt{v} && version();
  61. my ($input, $output) = @ARGV;
  62. $input //= $opt{i} // usage(2);
  63. $output //= $opt{o};
  64. my $ext = qr{\.${\FORMAT}\z}io;
  65. if ($opt{e} || $input =~ $ext) {
  66. if (not defined $output) {
  67. ($output = basename($input)) =~ s{$ext}{}
  68. || die "$0: no output file specified!\n";
  69. }
  70. if (not $opt{r} and -e $output) {
  71. print "'$output' already exists! -- Replace? [y/N] ";
  72. <STDIN> =~ /^y/i || exit 17;
  73. }
  74. decompress_file($input, $output)
  75. || die "$0: error: decompression failed!\n";
  76. }
  77. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  78. $output //= basename($input) . '.' . FORMAT;
  79. compress_file($input, $output)
  80. || die "$0: error: compression failed!\n";
  81. }
  82. else {
  83. warn "$0: don't know what to do...\n";
  84. usage(1);
  85. }
  86. }
  87. sub read_bit ($fh, $bitstring) {
  88. if (($$bitstring // '') eq '') {
  89. $$bitstring = unpack('b*', getc($fh) // return undef);
  90. }
  91. chop($$bitstring);
  92. }
  93. sub read_bits ($fh, $bits_len) {
  94. my $data = '';
  95. read($fh, $data, $bits_len >> 3);
  96. $data = unpack('B*', $data);
  97. while (length($data) < $bits_len) {
  98. $data .= unpack('B*', getc($fh) // return undef);
  99. }
  100. if (length($data) > $bits_len) {
  101. $data = substr($data, 0, $bits_len);
  102. }
  103. return $data;
  104. }
  105. sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
  106. #<<<
  107. [
  108. map { $_->[1] } sort {
  109. ($a->[0] cmp $b->[0])
  110. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  111. }
  112. map {
  113. my $t = substr($s, $_, LOOKAHEAD_LEN);
  114. if (length($t) < LOOKAHEAD_LEN) {
  115. $t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
  116. }
  117. [$t, $_]
  118. } 0 .. length($s) - 1
  119. ];
  120. #>>>
  121. }
  122. sub bwt_encode ($s) {
  123. my $bwt = bwt_balanced($s);
  124. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  125. my $idx = 0;
  126. foreach my $i (@$bwt) {
  127. $i || last;
  128. ++$idx;
  129. }
  130. return ($ret, $idx);
  131. }
  132. sub bwt_decode ($bwt, $idx) { # fast inversion
  133. my @tail = split(//, $bwt);
  134. my @head = sort @tail;
  135. my %indices;
  136. foreach my $i (0 .. $#tail) {
  137. push @{$indices{$tail[$i]}}, $i;
  138. }
  139. my @table;
  140. foreach my $v (@head) {
  141. push @table, shift(@{$indices{$v}});
  142. }
  143. my $dec = '';
  144. my $i = $idx;
  145. for (1 .. scalar(@head)) {
  146. $dec .= $head[$i];
  147. $i = $table[$i];
  148. }
  149. return $dec;
  150. }
  151. sub run_length ($arr) {
  152. @$arr || return [];
  153. my @result = [$arr->[0], 1];
  154. my $prev_value = $arr->[0];
  155. foreach my $i (1 .. $#{$arr}) {
  156. my $curr_value = $arr->[$i];
  157. if ($curr_value eq $prev_value) {
  158. ++$result[-1][1];
  159. }
  160. else {
  161. push(@result, [$curr_value, 1]);
  162. }
  163. $prev_value = $curr_value;
  164. }
  165. return \@result;
  166. }
  167. sub binary_vrl_encoding ($str) {
  168. my @bits = split(//, $str);
  169. my $bitstring = $bits[0];
  170. foreach my $rle (@{run_length(\@bits)}) {
  171. my ($c, $v) = @$rle;
  172. if ($v == 1) {
  173. $bitstring .= '0';
  174. }
  175. else {
  176. my $t = sprintf('%b', $v - 1);
  177. $bitstring .= join('', '1' x length($t), '0', substr($t, 1));
  178. }
  179. }
  180. return $bitstring;
  181. }
  182. sub binary_vrl_decoding ($bitstring) {
  183. my $decoded = '';
  184. my $bit = substr($bitstring, 0, 1, '');
  185. while ($bitstring ne '') {
  186. $decoded .= $bit;
  187. my $bl = 0;
  188. while (substr($bitstring, 0, 1, '') eq '1') {
  189. ++$bl;
  190. }
  191. if ($bl > 0) {
  192. $decoded .= $bit x oct('0b1' . join('', map { substr($bitstring, 0, 1, '') } 1 .. $bl - 1));
  193. }
  194. $bit = ($bit eq '1' ? '0' : '1');
  195. }
  196. return $decoded;
  197. }
  198. sub compression ($chunk, $out_fh) {
  199. my $bits = unpack('B*', $chunk);
  200. my $vrle1 = binary_vrl_encoding($bits);
  201. if (length($vrle1) < length($bits)) {
  202. printf "Doing early VLR, saving %s bits\n", length($bits) - length($vrle1);
  203. print $out_fh chr(1);
  204. }
  205. else {
  206. print $out_fh chr(0);
  207. $vrle1 = $bits;
  208. }
  209. my ($bwt, $idx) = bwt_encode($vrle1);
  210. my $vrle2 = binary_vrl_encoding($bwt);
  211. say "BWT index: $idx";
  212. print $out_fh pack('N', $idx);
  213. print $out_fh pack('N', length($vrle2));
  214. print $out_fh pack('B*', $vrle2);
  215. }
  216. sub decompression ($fh, $out_fh) {
  217. my $compressed_byte = ord(getc($fh) // die "error");
  218. my $idx = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  219. my $bits_len = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  220. say "BWT index = $idx";
  221. my $bwt = binary_vrl_decoding(read_bits($fh, $bits_len));
  222. my $data = bwt_decode($bwt, $idx);
  223. if ($compressed_byte == 1) {
  224. $data = binary_vrl_decoding($data);
  225. }
  226. print $out_fh pack('B*', $data);
  227. }
  228. # Compress file
  229. sub compress_file ($input, $output) {
  230. open my $fh, '<:raw', $input
  231. or die "Can't open file <<$input>> for reading: $!";
  232. my $header = SIGNATURE;
  233. # Open the output file for writing
  234. open my $out_fh, '>:raw', $output
  235. or die "Can't open file <<$output>> for write: $!";
  236. # Print the header
  237. print $out_fh $header;
  238. # Compress data
  239. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  240. compression($chunk, $out_fh);
  241. }
  242. # Close the file
  243. close $out_fh;
  244. }
  245. # Decompress file
  246. sub decompress_file ($input, $output) {
  247. # Open and validate the input file
  248. open my $fh, '<:raw', $input
  249. or die "Can't open file <<$input>> for reading: $!";
  250. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  251. # Open the output file
  252. open my $out_fh, '>:raw', $output
  253. or die "Can't open file <<$output>> for writing: $!";
  254. while (!eof($fh)) {
  255. decompression($fh, $out_fh);
  256. }
  257. # Close the file
  258. close $fh;
  259. close $out_fh;
  260. }
  261. main();
  262. exit(0);