lzbwd_file_compression.pl 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 07 September 2023
  4. # https://github.com/trizen
  5. # Compress/decompress files using LZ77 compression + DEFLATE integers encoding + Burrows-Wheeler Transform (BWT) + Huffman coding.
  6. # References:
  7. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  8. # https://youtube.com/watch?v=cvoZbBZ3M2A
  9. #
  10. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  11. # https://youtube.com/watch?v=SJPvNi4HrWQ
  12. use 5.036;
  13. use Getopt::Std qw(getopts);
  14. use File::Basename qw(basename);
  15. use List::Util qw(max uniq);
  16. use constant {
  17. PKGNAME => 'LZBWD',
  18. VERSION => '0.01',
  19. FORMAT => 'lzbwd',
  20. COMPRESSED_BYTE => chr(1),
  21. UNCOMPRESSED_BYTE => chr(0),
  22. CHUNK_SIZE => 1 << 16, # higher value = better compression
  23. LOOKAHEAD_LEN => 128,
  24. RANDOM_DATA_THRESHOLD => 1, # in ratio
  25. MAX_INT => oct('0b' . ('1' x 32)),
  26. };
  27. # Container signature
  28. use constant SIGNATURE => uc(FORMAT) . chr(1);
  29. # [distance value, offset bits]
  30. my @DISTANCE_SYMBOLS = (map { [$_, 0] } 0 .. 4);
  31. until ($DISTANCE_SYMBOLS[-1][0] > MAX_INT) {
  32. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1];
  33. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]];
  34. }
  35. sub usage {
  36. my ($code) = @_;
  37. print <<"EOH";
  38. usage: $0 [options] [input file] [output file]
  39. options:
  40. -e : extract
  41. -i <filename> : input filename
  42. -o <filename> : output filename
  43. -r : rewrite output
  44. -v : version number
  45. -h : this message
  46. examples:
  47. $0 document.txt
  48. $0 document.txt archive.${\FORMAT}
  49. $0 archive.${\FORMAT} document.txt
  50. $0 -e -i archive.${\FORMAT} -o document.txt
  51. EOH
  52. exit($code // 0);
  53. }
  54. sub version {
  55. printf("%s %s\n", PKGNAME, VERSION);
  56. exit;
  57. }
  58. sub valid_archive {
  59. my ($fh) = @_;
  60. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  61. $sig eq SIGNATURE || return;
  62. }
  63. return 1;
  64. }
  65. sub main {
  66. my %opt;
  67. getopts('ei:o:vhr', \%opt);
  68. $opt{h} && usage(0);
  69. $opt{v} && version();
  70. my ($input, $output) = @ARGV;
  71. $input //= $opt{i} // usage(2);
  72. $output //= $opt{o};
  73. my $ext = qr{\.${\FORMAT}\z}io;
  74. if ($opt{e} || $input =~ $ext) {
  75. if (not defined $output) {
  76. ($output = basename($input)) =~ s{$ext}{}
  77. || die "$0: no output file specified!\n";
  78. }
  79. if (not $opt{r} and -e $output) {
  80. print "'$output' already exists! -- Replace? [y/N] ";
  81. <STDIN> =~ /^y/i || exit 17;
  82. }
  83. decompress_file($input, $output)
  84. || die "$0: error: decompression failed!\n";
  85. }
  86. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  87. $output //= basename($input) . '.' . FORMAT;
  88. compress_file($input, $output)
  89. || die "$0: error: compression failed!\n";
  90. }
  91. else {
  92. warn "$0: don't know what to do...\n";
  93. usage(1);
  94. }
  95. }
  96. sub lz77_compression ($str, $uncompressed, $indices, $lengths) {
  97. my $la = 0;
  98. my $prefix = '';
  99. my @chars = split(//, $str);
  100. my $end = $#chars;
  101. while ($la <= $end) {
  102. my $n = 1;
  103. my $p = length($prefix);
  104. my $tmp;
  105. my $token = $chars[$la];
  106. while ( $n < 255
  107. and $la + $n <= $end
  108. and ($tmp = rindex($prefix, $token, $p)) >= 0) {
  109. $p = $tmp;
  110. $token .= $chars[$la + $n];
  111. ++$n;
  112. }
  113. --$n;
  114. push @$indices, $la - $p;
  115. push @$lengths, $n;
  116. push @$uncompressed, ord($chars[$la + $n]);
  117. $la += $n + 1;
  118. $prefix .= $token;
  119. }
  120. return;
  121. }
  122. sub lz77_decompression ($uncompressed, $indices, $lengths) {
  123. my $chunk = '';
  124. my $offset = 0;
  125. foreach my $i (0 .. $#{$uncompressed}) {
  126. $chunk .= substr($chunk, $offset - $indices->[$i], $lengths->[$i]) . $uncompressed->[$i];
  127. $offset += $lengths->[$i] + 1;
  128. }
  129. return $chunk;
  130. }
  131. sub read_bit ($fh, $bitstring) {
  132. if (($$bitstring // '') eq '') {
  133. $$bitstring = unpack('b*', getc($fh) // return undef);
  134. }
  135. chop($$bitstring);
  136. }
  137. sub read_bits ($fh, $bits_len) {
  138. my $data = '';
  139. read($fh, $data, $bits_len >> 3);
  140. $data = unpack('B*', $data);
  141. while (length($data) < $bits_len) {
  142. $data .= unpack('B*', getc($fh) // return undef);
  143. }
  144. if (length($data) > $bits_len) {
  145. $data = substr($data, 0, $bits_len);
  146. }
  147. return $data;
  148. }
  149. sub delta_encode ($integers, $double = 0) {
  150. my @deltas;
  151. my $prev = 0;
  152. unshift(@$integers, scalar(@$integers));
  153. while (@$integers) {
  154. my $curr = shift(@$integers);
  155. push @deltas, $curr - $prev;
  156. $prev = $curr;
  157. }
  158. my $bitstring = '';
  159. foreach my $d (@deltas) {
  160. if ($d == 0) {
  161. $bitstring .= '0';
  162. }
  163. elsif ($double) {
  164. my $t = sprintf('%b', abs($d) + 1);
  165. my $l = sprintf('%b', length($t));
  166. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  167. }
  168. else {
  169. my $t = sprintf('%b', abs($d));
  170. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  171. }
  172. }
  173. pack('B*', $bitstring);
  174. }
  175. sub delta_decode ($fh, $double = 0) {
  176. my @deltas;
  177. my $buffer = '';
  178. my $len = 0;
  179. for (my $k = 0 ; $k <= $len ; ++$k) {
  180. my $bit = read_bit($fh, \$buffer);
  181. if ($bit eq '0') {
  182. push @deltas, 0;
  183. }
  184. elsif ($double) {
  185. my $bit = read_bit($fh, \$buffer);
  186. my $bl = 0;
  187. ++$bl while (read_bit($fh, \$buffer) eq '1');
  188. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  189. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  190. push @deltas, ($bit eq '1' ? 1 : -1) * ($int - 1);
  191. }
  192. else {
  193. my $bit = read_bit($fh, \$buffer);
  194. my $n = 0;
  195. ++$n while (read_bit($fh, \$buffer) eq '1');
  196. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  197. push @deltas, ($bit eq '1' ? $d : -$d);
  198. }
  199. if ($k == 0) {
  200. $len = pop(@deltas);
  201. }
  202. }
  203. my @acc;
  204. my $prev = $len;
  205. foreach my $d (@deltas) {
  206. $prev += $d;
  207. push @acc, $prev;
  208. }
  209. return \@acc;
  210. }
  211. sub encode_integers ($integers) {
  212. my @symbols;
  213. my $offset_bits = '';
  214. foreach my $dist (@$integers) {
  215. foreach my $i (0 .. $#DISTANCE_SYMBOLS) {
  216. if ($DISTANCE_SYMBOLS[$i][0] > $dist) {
  217. push @symbols, $i - 1;
  218. if ($DISTANCE_SYMBOLS[$i - 1][1] > 0) {
  219. $offset_bits .= sprintf('%0*b', $DISTANCE_SYMBOLS[$i - 1][1], $dist - $DISTANCE_SYMBOLS[$i - 1][0]);
  220. }
  221. last;
  222. }
  223. }
  224. }
  225. return (pack('C*', @symbols), pack('B*', $offset_bits));
  226. }
  227. sub decode_integers ($symbols, $fh) {
  228. my $bits_len = 0;
  229. foreach my $i (@$symbols) {
  230. $bits_len += $DISTANCE_SYMBOLS[$i][1];
  231. }
  232. my $bits = read_bits($fh, $bits_len);
  233. my @distances;
  234. foreach my $i (@$symbols) {
  235. push @distances, $DISTANCE_SYMBOLS[$i][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS[$i][1], ''));
  236. }
  237. return \@distances;
  238. }
  239. # produce encode and decode dictionary from a tree
  240. sub walk ($node, $code, $h, $rev_h) {
  241. my $c = $node->[0] // return ($h, $rev_h);
  242. if (ref $c) { walk($c->[$_], $code . $_, $h, $rev_h) for ('0', '1') }
  243. else { $h->{$c} = $code; $rev_h->{$code} = $c }
  244. return ($h, $rev_h);
  245. }
  246. # make a tree, and return resulting dictionaries
  247. sub mktree_from_freq ($freq) {
  248. my @nodes = map { [$_, $freq->{$_}] } sort { $a <=> $b } keys %$freq;
  249. do { # poor man's priority queue
  250. @nodes = sort { $a->[1] <=> $b->[1] } @nodes;
  251. my ($x, $y) = splice(@nodes, 0, 2);
  252. if (defined($x)) {
  253. if (defined($y)) {
  254. push @nodes, [[$x, $y], $x->[1] + $y->[1]];
  255. }
  256. else {
  257. push @nodes, [[$x], $x->[1]];
  258. }
  259. }
  260. } while (@nodes > 1);
  261. walk($nodes[0], '', {}, {});
  262. }
  263. sub huffman_encode ($bytes, $dict) {
  264. join('', @{$dict}{@$bytes});
  265. }
  266. sub huffman_decode ($bits, $hash) {
  267. local $" = '|';
  268. [split(' ', $bits =~ s/(@{[sort { length($a) <=> length($b) } keys %{$hash}]})/$hash->{$1} /gr)]; # very fast
  269. }
  270. sub create_huffman_entry ($bytes, $out_fh) {
  271. my %freq;
  272. ++$freq{$_} for @$bytes;
  273. my ($h, $rev_h) = mktree_from_freq(\%freq);
  274. my $enc = huffman_encode($bytes, $h);
  275. my $max_symbol = max(keys %freq) // 0;
  276. say "Max symbol: $max_symbol\n";
  277. my @freqs;
  278. foreach my $i (0 .. $max_symbol) {
  279. push @freqs, $freq{$i} // 0;
  280. }
  281. print $out_fh delta_encode(\@freqs);
  282. print $out_fh pack("N", length($enc));
  283. print $out_fh pack("B*", $enc);
  284. }
  285. sub decode_huffman_entry ($fh) {
  286. my @freqs = @{delta_decode($fh)};
  287. my %freq;
  288. foreach my $i (0 .. $#freqs) {
  289. if ($freqs[$i]) {
  290. $freq{$i} = $freqs[$i];
  291. }
  292. }
  293. my (undef, $rev_dict) = mktree_from_freq(\%freq);
  294. my $enc_len = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  295. say "Encoded length: $enc_len\n";
  296. if ($enc_len > 0) {
  297. return huffman_decode(read_bits($fh, $enc_len), $rev_dict);
  298. }
  299. return [];
  300. }
  301. sub mtf_encode ($bytes, $alphabet = [0 .. 255]) {
  302. my @C;
  303. my @table;
  304. @table[@$alphabet] = (0 .. $#{$alphabet});
  305. foreach my $c (@$bytes) {
  306. push @C, (my $index = $table[$c]);
  307. unshift(@$alphabet, splice(@$alphabet, $index, 1));
  308. @table[@{$alphabet}[0 .. $index]] = (0 .. $index);
  309. }
  310. return \@C;
  311. }
  312. sub mtf_decode ($encoded, $alphabet = [0 .. 255]) {
  313. my @S;
  314. foreach my $p (@$encoded) {
  315. push @S, $alphabet->[$p];
  316. unshift(@$alphabet, splice(@$alphabet, $p, 1));
  317. }
  318. return \@S;
  319. }
  320. sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
  321. #<<<
  322. [
  323. map { $_->[1] } sort {
  324. ($a->[0] cmp $b->[0])
  325. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  326. }
  327. map {
  328. my $t = substr($s, $_, LOOKAHEAD_LEN);
  329. if (length($t) < LOOKAHEAD_LEN) {
  330. $t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
  331. }
  332. [$t, $_]
  333. } 0 .. length($s) - 1
  334. ];
  335. #>>>
  336. }
  337. sub bwt_encode ($s) {
  338. my $bwt = bwt_balanced($s);
  339. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  340. my $idx = 0;
  341. foreach my $i (@$bwt) {
  342. $i || last;
  343. ++$idx;
  344. }
  345. return ($ret, $idx);
  346. }
  347. sub bwt_decode ($bwt, $idx) { # fast inversion
  348. my @tail = split(//, $bwt);
  349. my @head = sort @tail;
  350. my %indices;
  351. foreach my $i (0 .. $#tail) {
  352. push @{$indices{$tail[$i]}}, $i;
  353. }
  354. my @table;
  355. foreach my $v (@head) {
  356. push @table, shift(@{$indices{$v}});
  357. }
  358. my $dec = '';
  359. my $i = $idx;
  360. for (1 .. scalar(@head)) {
  361. $dec .= $head[$i];
  362. $i = $table[$i];
  363. }
  364. return $dec;
  365. }
  366. sub rle4_encode ($bytes) { # RLE1
  367. my @rle;
  368. my $end = $#{$bytes};
  369. my $prev = -1;
  370. my $run = 0;
  371. for (my $i = 0 ; $i <= $end ; ++$i) {
  372. if ($bytes->[$i] == $prev) {
  373. ++$run;
  374. }
  375. else {
  376. $run = 1;
  377. }
  378. push @rle, $bytes->[$i];
  379. $prev = $bytes->[$i];
  380. if ($run >= 4) {
  381. $run = 0;
  382. $i += 1;
  383. while ($run < 255 and $i <= $end and $bytes->[$i] == $prev) {
  384. ++$run;
  385. ++$i;
  386. }
  387. push @rle, $run;
  388. $run = 1;
  389. if ($i <= $end) {
  390. $prev = $bytes->[$i];
  391. push @rle, $bytes->[$i];
  392. }
  393. }
  394. }
  395. return \@rle;
  396. }
  397. sub rle4_decode ($bytes) { # RLE1
  398. my @dec = $bytes->[0];
  399. my $end = $#{$bytes};
  400. my $prev = $bytes->[0];
  401. my $run = 1;
  402. for (my $i = 1 ; $i <= $end ; ++$i) {
  403. if ($bytes->[$i] == $prev) {
  404. ++$run;
  405. }
  406. else {
  407. $run = 1;
  408. }
  409. push @dec, $bytes->[$i];
  410. $prev = $bytes->[$i];
  411. if ($run >= 4) {
  412. if (++$i <= $end) {
  413. $run = $bytes->[$i];
  414. push @dec, (($prev) x $run);
  415. }
  416. $run = 0;
  417. }
  418. }
  419. return \@dec;
  420. }
  421. sub rle_encode ($bytes) { # RLE2
  422. my @rle;
  423. my $end = $#{$bytes};
  424. for (my $i = 0 ; $i <= $end ; ++$i) {
  425. my $run = 0;
  426. while ($i <= $end and $bytes->[$i] == 0) {
  427. ++$run;
  428. ++$i;
  429. }
  430. if ($run >= 1) {
  431. my $t = sprintf('%b', $run + 1);
  432. push @rle, split(//, substr($t, 1));
  433. }
  434. if ($i <= $end) {
  435. push @rle, $bytes->[$i] + 1;
  436. }
  437. }
  438. return \@rle;
  439. }
  440. sub rle_decode ($rle) { # RLE2
  441. my @dec;
  442. my $end = $#{$rle};
  443. for (my $i = 0 ; $i <= $end ; ++$i) {
  444. my $k = $rle->[$i];
  445. if ($k == 0 or $k == 1) {
  446. my $run = 1;
  447. while (($i <= $end) and ($k == 0 or $k == 1)) {
  448. ($run <<= 1) |= $k;
  449. $k = $rle->[++$i];
  450. }
  451. push @dec, (0) x ($run - 1);
  452. }
  453. if ($i <= $end) {
  454. push @dec, $k - 1;
  455. }
  456. }
  457. return \@dec;
  458. }
  459. sub encode_alphabet ($alphabet) {
  460. my %table;
  461. @table{@$alphabet} = ();
  462. my $populated = 0;
  463. my @marked;
  464. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  465. my $enc = 0;
  466. foreach my $j (0 .. 31) {
  467. if (exists($table{$i + $j})) {
  468. $enc |= 1 << $j;
  469. }
  470. }
  471. if ($enc == 0) {
  472. $populated <<= 1;
  473. }
  474. else {
  475. ($populated <<= 1) |= 1;
  476. push @marked, $enc;
  477. }
  478. }
  479. my $delta = delta_encode([@marked], 1);
  480. say "Populated : ", sprintf('%08b', $populated);
  481. say "Marked : @marked";
  482. say "Delta len : ", length($delta);
  483. my $encoded = '';
  484. $encoded .= chr($populated);
  485. $encoded .= $delta;
  486. return $encoded;
  487. }
  488. sub decode_alphabet ($fh) {
  489. my @populated = split(//, sprintf('%08b', ord(getc($fh))));
  490. my $marked = delta_decode($fh, 1);
  491. my @alphabet;
  492. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  493. if (shift(@populated)) {
  494. my $m = shift(@$marked);
  495. foreach my $j (0 .. 31) {
  496. if ($m & 1) {
  497. push @alphabet, $i + $j;
  498. }
  499. $m >>= 1;
  500. }
  501. }
  502. }
  503. return \@alphabet;
  504. }
  505. sub bz2_compression ($chunk, $out_fh) {
  506. my $rle1 = rle4_encode([unpack('C*', $chunk)]);
  507. my ($bwt, $idx) = bwt_encode(pack('C*', @$rle1));
  508. say "BWT index = $idx";
  509. my @bytes = unpack('C*', $bwt);
  510. my @alphabet = sort { $a <=> $b } uniq(@bytes);
  511. my $alphabet_enc = encode_alphabet(\@alphabet);
  512. my $mtf = mtf_encode(\@bytes, [@alphabet]);
  513. my $rle = rle_encode($mtf);
  514. print $out_fh pack('N', $idx);
  515. print $out_fh $alphabet_enc;
  516. create_huffman_entry($rle, $out_fh);
  517. }
  518. sub bz2_decompression ($fh, $out_fh) {
  519. my $idx = unpack('N', join('', map { getc($fh) // return undef } 1 .. 4));
  520. my $alphabet = decode_alphabet($fh);
  521. say "BWT index = $idx";
  522. say "Alphabet size: ", scalar(@$alphabet);
  523. my $rle = decode_huffman_entry($fh);
  524. my $mtf = rle_decode($rle);
  525. my $bwt = mtf_decode($mtf, $alphabet);
  526. my $rle4 = bwt_decode(pack('C*', @$bwt), $idx);
  527. my $data = rle4_decode([unpack('C*', $rle4)]);
  528. print $out_fh pack('C*', @$data);
  529. }
  530. # Compress file
  531. sub compress_file ($input, $output) {
  532. open my $fh, '<:raw', $input
  533. or die "Can't open file <<$input>> for reading: $!";
  534. my $header = SIGNATURE;
  535. # Open the output file for writing
  536. open my $out_fh, '>:raw', $output
  537. or die "Can't open file <<$output>> for write: $!";
  538. # Print the header
  539. print $out_fh $header;
  540. my $lengths = '';
  541. my $uncompressed = '';
  542. my @sizes;
  543. my @indices_block;
  544. open my $uc_fh, '>:raw', \$uncompressed;
  545. open my $len_fh, '>:raw', \$lengths;
  546. my $create_bz2_block = sub {
  547. scalar(@sizes) > 0 or return;
  548. print $out_fh COMPRESSED_BYTE;
  549. print $out_fh delta_encode(\@sizes, 1);
  550. my ($symbols, $offset_bits) = encode_integers(\@indices_block);
  551. bz2_compression($uncompressed, $out_fh);
  552. bz2_compression($lengths, $out_fh);
  553. bz2_compression($symbols, $out_fh);
  554. bz2_compression($offset_bits, $out_fh);
  555. @sizes = ();
  556. @indices_block = ();
  557. open $uc_fh, '>:raw', \$uncompressed;
  558. open $len_fh, '>:raw', \$lengths;
  559. };
  560. # Compress data
  561. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  562. my (@uncompressed, @indices, @lengths);
  563. lz77_compression($chunk, \@uncompressed, \@indices, \@lengths);
  564. my $est_ratio = length($chunk) / (4 * scalar(@uncompressed));
  565. say "Est. ratio: ", $est_ratio, " (", scalar(@uncompressed), " uncompressed bytes)";
  566. if ($est_ratio > RANDOM_DATA_THRESHOLD) {
  567. push @sizes, scalar(@uncompressed);
  568. print $uc_fh pack('C*', @uncompressed);
  569. print $len_fh pack('C*', @lengths);
  570. push @indices_block, @indices;
  571. }
  572. else {
  573. say "Random data detected...";
  574. $create_bz2_block->();
  575. print $out_fh UNCOMPRESSED_BYTE;
  576. create_huffman_entry([unpack 'C*', $chunk], $out_fh);
  577. }
  578. if (length($uncompressed) >= CHUNK_SIZE) {
  579. $create_bz2_block->();
  580. }
  581. }
  582. $create_bz2_block->();
  583. close $out_fh;
  584. }
  585. # Decompress file
  586. sub decompress_file ($input, $output) {
  587. # Open and validate the input file
  588. open my $fh, '<:raw', $input
  589. or die "Can't open file <<$input>> for reading: $!";
  590. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  591. # Open the output file
  592. open my $out_fh, '>:raw', $output
  593. or die "Can't open file <<$output>> for writing: $!";
  594. while (!eof($fh)) {
  595. my $compression_byte = getc($fh) // die "decompression error";
  596. if ($compression_byte eq UNCOMPRESSED_BYTE) {
  597. say "Decoding random data...";
  598. print $out_fh pack('C*', @{decode_huffman_entry($fh)});
  599. next;
  600. }
  601. elsif ($compression_byte ne COMPRESSED_BYTE) {
  602. die "decompression error";
  603. }
  604. my @sizes = @{delta_decode($fh, 1)};
  605. my $lengths = '';
  606. my $uncompressed = '';
  607. my $symbols = '';
  608. my $offset_bits = '';
  609. open my $uc_fh, '>:raw', \$uncompressed;
  610. open my $len_fh, '>:raw', \$lengths;
  611. open my $sym_fh, '+>:raw', \$symbols;
  612. open my $offbits_fh, '+>:raw', \$offset_bits;
  613. bz2_decompression($fh, $uc_fh); # uncompressed
  614. bz2_decompression($fh, $len_fh); # lengths
  615. bz2_decompression($fh, $sym_fh); # symbols
  616. bz2_decompression($fh, $offbits_fh); # offset bits
  617. seek($offbits_fh, 0, 0);
  618. my @indices = @{decode_integers([unpack('C*', $symbols)], $offbits_fh)};
  619. my @uncompressed = split(//, $uncompressed);
  620. my @lengths = unpack('C*', $lengths);
  621. while (@uncompressed) {
  622. my $size = shift(@sizes) // die "decompression error";
  623. my @uncompressed_chunk = splice(@uncompressed, 0, $size);
  624. my @lengths_chunk = splice(@lengths, 0, $size);
  625. my @indices_chunk = splice(@indices, 0, $size);
  626. scalar(@uncompressed_chunk) == $size or die "decompression error";
  627. scalar(@lengths_chunk) == $size or die "decompression error";
  628. scalar(@indices_chunk) == $size or die "decompression error";
  629. print $out_fh lz77_decompression(\@uncompressed_chunk, \@indices_chunk, \@lengths_chunk);
  630. }
  631. }
  632. close $fh;
  633. close $out_fh;
  634. }
  635. main();
  636. exit(0);