bwlza2_file_compression.pl 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 15 June 2023
  4. # Edit: 19 March 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using Burrows-Wheeler Transform (BWT) + LZ77 compression (LZHD variant) + Arithmetic Coding (in fixed bits).
  7. # Encoding the distances using a DEFLATE-like approach.
  8. # References:
  9. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  10. # https://youtube.com/watch?v=SJPvNi4HrWQ
  11. #
  12. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  13. # https://youtube.com/watch?v=cvoZbBZ3M2A
  14. #
  15. # Basic arithmetic coder in C++
  16. # https://github.com/billbird/arith32
  17. use 5.036;
  18. use Getopt::Std qw(getopts);
  19. use File::Basename qw(basename);
  20. use List::Util qw(max uniq sum);
  21. use constant {
  22. PKGNAME => 'BWLZA2',
  23. VERSION => '0.01',
  24. FORMAT => 'bwlza2',
  25. COMPRESSED_BYTE => chr(1),
  26. UNCOMPRESSED_BYTE => chr(0),
  27. CHUNK_SIZE => 1 << 17, # higher value = better compression
  28. LOOKAHEAD_LEN => 128,
  29. RANDOM_DATA_THRESHOLD => 1, # in ratio
  30. };
  31. # Arithmetic Coding settings
  32. use constant BITS => 32;
  33. use constant MAX => oct('0b' . ('1' x BITS));
  34. # Container signature
  35. use constant SIGNATURE => uc(FORMAT) . chr(1);
  36. # [distance value, offset bits]
  37. my @DISTANCE_SYMBOLS = map { [$_, 0] } (0 .. 4);
  38. until ($DISTANCE_SYMBOLS[-1][0] > CHUNK_SIZE) {
  39. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1];
  40. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]];
  41. }
  42. my @DISTANCE_INDICES;
  43. foreach my $i (0 .. $#DISTANCE_SYMBOLS) {
  44. my ($min, $bits) = @{$DISTANCE_SYMBOLS[$i]};
  45. foreach my $k ($min .. $min + (1 << $bits) - 1) {
  46. last if ($k > CHUNK_SIZE);
  47. $DISTANCE_INDICES[$k] = $i;
  48. }
  49. }
  50. sub usage {
  51. my ($code) = @_;
  52. print <<"EOH";
  53. usage: $0 [options] [input file] [output file]
  54. options:
  55. -e : extract
  56. -i <filename> : input filename
  57. -o <filename> : output filename
  58. -r : rewrite output
  59. -v : version number
  60. -h : this message
  61. examples:
  62. $0 document.txt
  63. $0 document.txt archive.${\FORMAT}
  64. $0 archive.${\FORMAT} document.txt
  65. $0 -e -i archive.${\FORMAT} -o document.txt
  66. EOH
  67. exit($code // 0);
  68. }
  69. sub version {
  70. printf("%s %s\n", PKGNAME, VERSION);
  71. exit;
  72. }
  73. sub valid_archive {
  74. my ($fh) = @_;
  75. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  76. $sig eq SIGNATURE || return;
  77. }
  78. return 1;
  79. }
  80. sub main {
  81. my %opt;
  82. getopts('ei:o:vhr', \%opt);
  83. $opt{h} && usage(0);
  84. $opt{v} && version();
  85. my ($input, $output) = @ARGV;
  86. $input //= $opt{i} // usage(2);
  87. $output //= $opt{o};
  88. my $ext = qr{\.${\FORMAT}\z}io;
  89. if ($opt{e} || $input =~ $ext) {
  90. if (not defined $output) {
  91. ($output = basename($input)) =~ s{$ext}{}
  92. || die "$0: no output file specified!\n";
  93. }
  94. if (not $opt{r} and -e $output) {
  95. print "'$output' already exists! -- Replace? [y/N] ";
  96. <STDIN> =~ /^y/i || exit 17;
  97. }
  98. decompress_file($input, $output)
  99. || die "$0: error: decompression failed!\n";
  100. }
  101. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  102. $output //= basename($input) . '.' . FORMAT;
  103. compress_file($input, $output)
  104. || die "$0: error: compression failed!\n";
  105. }
  106. else {
  107. warn "$0: don't know what to do...\n";
  108. usage(1);
  109. }
  110. }
  111. sub lz77_compression ($str, $uncompressed, $indices, $lengths) {
  112. my $la = 0;
  113. my $prefix = '';
  114. my @chars = split(//, $str);
  115. my $end = $#chars;
  116. while ($la <= $end) {
  117. my $n = 1;
  118. my $p = length($prefix);
  119. my $tmp;
  120. my $token = $chars[$la];
  121. while ( $n < 255
  122. and $la + $n <= $end
  123. and ($tmp = rindex($prefix, $token, $p)) >= 0) {
  124. $p = $tmp;
  125. $token .= $chars[$la + $n];
  126. ++$n;
  127. }
  128. --$n;
  129. push @$indices, $la - $p;
  130. push @$lengths, $n;
  131. push @$uncompressed, ord($chars[$la + $n]);
  132. $la += $n + 1;
  133. $prefix .= $token;
  134. }
  135. return;
  136. }
  137. sub lz77_decompression ($uncompressed, $indices, $lengths) {
  138. my $chunk = '';
  139. my $offset = 0;
  140. foreach my $i (0 .. $#{$uncompressed}) {
  141. $chunk .= substr($chunk, $offset - $indices->[$i], $lengths->[$i]) . chr($uncompressed->[$i]);
  142. $offset += $lengths->[$i] + 1;
  143. }
  144. return $chunk;
  145. }
  146. sub read_bit ($fh, $bitstring) {
  147. if (($$bitstring // '') eq '') {
  148. $$bitstring = unpack('b*', getc($fh) // return undef);
  149. }
  150. chop($$bitstring);
  151. }
  152. sub read_bits ($fh, $bits_len) {
  153. my $data = '';
  154. read($fh, $data, $bits_len >> 3);
  155. $data = unpack('B*', $data);
  156. while (length($data) < $bits_len) {
  157. $data .= unpack('B*', getc($fh) // return undef);
  158. }
  159. if (length($data) > $bits_len) {
  160. $data = substr($data, 0, $bits_len);
  161. }
  162. return $data;
  163. }
  164. sub delta_encode ($integers, $double = 0) {
  165. my @deltas;
  166. my $prev = 0;
  167. unshift(@$integers, scalar(@$integers));
  168. while (@$integers) {
  169. my $curr = shift(@$integers);
  170. push @deltas, $curr - $prev;
  171. $prev = $curr;
  172. }
  173. my $bitstring = '';
  174. foreach my $d (@deltas) {
  175. if ($d == 0) {
  176. $bitstring .= '0';
  177. }
  178. elsif ($double) {
  179. my $t = sprintf('%b', abs($d) + 1);
  180. my $l = sprintf('%b', length($t));
  181. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  182. }
  183. else {
  184. my $t = sprintf('%b', abs($d));
  185. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  186. }
  187. }
  188. pack('B*', $bitstring);
  189. }
  190. sub delta_decode ($fh, $double = 0) {
  191. my @deltas;
  192. my $buffer = '';
  193. my $len = 0;
  194. for (my $k = 0 ; $k <= $len ; ++$k) {
  195. my $bit = read_bit($fh, \$buffer);
  196. if ($bit eq '0') {
  197. push @deltas, 0;
  198. }
  199. elsif ($double) {
  200. my $bit = read_bit($fh, \$buffer);
  201. my $bl = 0;
  202. ++$bl while (read_bit($fh, \$buffer) eq '1');
  203. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  204. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  205. push @deltas, ($bit eq '1' ? 1 : -1) * ($int - 1);
  206. }
  207. else {
  208. my $bit = read_bit($fh, \$buffer);
  209. my $n = 0;
  210. ++$n while (read_bit($fh, \$buffer) eq '1');
  211. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  212. push @deltas, ($bit eq '1' ? $d : -$d);
  213. }
  214. if ($k == 0) {
  215. $len = pop(@deltas);
  216. }
  217. }
  218. my @acc;
  219. my $prev = $len;
  220. foreach my $d (@deltas) {
  221. $prev += $d;
  222. push @acc, $prev;
  223. }
  224. return \@acc;
  225. }
  226. sub create_cfreq ($freq) {
  227. my @cf;
  228. my $T = 0;
  229. foreach my $i (sort { $a <=> $b } keys %$freq) {
  230. $freq->{$i} // next;
  231. $cf[$i] = $T;
  232. $T += $freq->{$i};
  233. $cf[$i + 1] = $T;
  234. }
  235. return (\@cf, $T);
  236. }
  237. sub ac_encode ($bytes_arr) {
  238. my $enc = '';
  239. my $EOF_SYMBOL = (max(@$bytes_arr) // 0) + 1;
  240. my @bytes = (@$bytes_arr, $EOF_SYMBOL);
  241. my %freq;
  242. ++$freq{$_} for @bytes;
  243. my ($cf, $T) = create_cfreq(\%freq);
  244. if ($T > MAX) {
  245. die "Too few bits: $T > ${\MAX}";
  246. }
  247. my $low = 0;
  248. my $high = MAX;
  249. my $uf_count = 0;
  250. foreach my $c (@bytes) {
  251. my $w = $high - $low + 1;
  252. $high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX;
  253. $low = ($low + int(($w * $cf->[$c]) / $T)) & MAX;
  254. if ($high > MAX) {
  255. die "high > MAX: $high > ${\MAX}";
  256. }
  257. if ($low >= $high) { die "$low >= $high" }
  258. while (1) {
  259. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  260. my $bit = $high >> (BITS - 1);
  261. $enc .= $bit;
  262. if ($uf_count > 0) {
  263. $enc .= join('', 1 - $bit) x $uf_count;
  264. $uf_count = 0;
  265. }
  266. $low <<= 1;
  267. ($high <<= 1) |= 1;
  268. }
  269. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  270. ($high <<= 1) |= (1 << (BITS - 1));
  271. $high |= 1;
  272. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  273. ++$uf_count;
  274. }
  275. else {
  276. last;
  277. }
  278. $low &= MAX;
  279. $high &= MAX;
  280. }
  281. }
  282. $enc .= '0';
  283. $enc .= '1';
  284. while (length($enc) % 8 != 0) {
  285. $enc .= '1';
  286. }
  287. return ($enc, \%freq);
  288. }
  289. sub ac_decode ($fh, $freq) {
  290. my ($cf, $T) = create_cfreq($freq);
  291. my @dec;
  292. my $low = 0;
  293. my $high = MAX;
  294. my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS);
  295. my @table;
  296. foreach my $i (sort { $a <=> $b } keys %$freq) {
  297. foreach my $j ($cf->[$i] .. $cf->[$i + 1] - 1) {
  298. $table[$j] = $i;
  299. }
  300. }
  301. my $EOF_SYMBOL = max(keys %$freq) // 0;
  302. while (1) {
  303. my $w = $high - $low + 1;
  304. my $ss = int((($T * ($enc - $low + 1)) - 1) / $w);
  305. my $i = $table[$ss] // last;
  306. last if ($i == $EOF_SYMBOL);
  307. push @dec, $i;
  308. $high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX;
  309. $low = ($low + int(($w * $cf->[$i]) / $T)) & MAX;
  310. if ($high > MAX) {
  311. die "error";
  312. }
  313. if ($low >= $high) { die "$low >= $high" }
  314. while (1) {
  315. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  316. ($high <<= 1) |= 1;
  317. $low <<= 1;
  318. ($enc <<= 1) |= (getc($fh) // 1);
  319. }
  320. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  321. ($high <<= 1) |= (1 << (BITS - 1));
  322. $high |= 1;
  323. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  324. $enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1);
  325. }
  326. else {
  327. last;
  328. }
  329. $low &= MAX;
  330. $high &= MAX;
  331. $enc &= MAX;
  332. }
  333. }
  334. return \@dec;
  335. }
  336. sub create_ac_entry ($bytes, $out_fh) {
  337. my ($enc, $freq) = ac_encode($bytes);
  338. my $max_symbol = max(keys %$freq) // 0;
  339. my @freqs;
  340. foreach my $k (0 .. $max_symbol) {
  341. push @freqs, $freq->{$k} // 0;
  342. }
  343. push @freqs, length($enc) >> 3;
  344. say "Max symbol: $max_symbol";
  345. print $out_fh delta_encode(\@freqs);
  346. print $out_fh pack("B*", $enc);
  347. }
  348. sub decode_ac_entry ($fh) {
  349. my @freqs = @{delta_decode($fh)};
  350. my $bits_len = pop(@freqs);
  351. my %freq;
  352. foreach my $i (0 .. $#freqs) {
  353. if ($freqs[$i]) {
  354. $freq{$i} = $freqs[$i];
  355. }
  356. }
  357. say "Encoded length: $bits_len";
  358. my $bits = read_bits($fh, $bits_len << 3);
  359. if ($bits_len > 0) {
  360. open my $bits_fh, '<:raw', \$bits;
  361. return ac_decode($bits_fh, \%freq);
  362. }
  363. return [];
  364. }
  365. sub encode_distances ($distances, $out_fh) {
  366. my @symbols;
  367. my $offset_bits = '';
  368. foreach my $dist (@$distances) {
  369. my $i = $DISTANCE_INDICES[$dist];
  370. my ($min, $bits) = @{$DISTANCE_SYMBOLS[$i]};
  371. push @symbols, $i;
  372. if ($bits > 0) {
  373. $offset_bits .= sprintf('%0*b', $bits, $dist - $min);
  374. }
  375. }
  376. create_ac_entry(\@symbols, $out_fh);
  377. print $out_fh pack('B*', $offset_bits);
  378. }
  379. sub decode_distances ($fh) {
  380. my $symbols = decode_ac_entry($fh);
  381. my $bits_len = 0;
  382. foreach my $i (@$symbols) {
  383. $bits_len += $DISTANCE_SYMBOLS[$i][1];
  384. }
  385. my $bits = read_bits($fh, $bits_len);
  386. my @distances;
  387. foreach my $i (@$symbols) {
  388. push @distances, $DISTANCE_SYMBOLS[$i][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS[$i][1], ''));
  389. }
  390. return \@distances;
  391. }
  392. sub mtf_encode ($bytes, $alphabet = [0 .. 255]) {
  393. my @C;
  394. my @table;
  395. @table[@$alphabet] = (0 .. $#{$alphabet});
  396. foreach my $c (@$bytes) {
  397. push @C, (my $index = $table[$c]);
  398. unshift(@$alphabet, splice(@$alphabet, $index, 1));
  399. @table[@{$alphabet}[0 .. $index]] = (0 .. $index);
  400. }
  401. return \@C;
  402. }
  403. sub mtf_decode ($encoded, $alphabet = [0 .. 255]) {
  404. my @S;
  405. foreach my $p (@$encoded) {
  406. push @S, $alphabet->[$p];
  407. unshift(@$alphabet, splice(@$alphabet, $p, 1));
  408. }
  409. return \@S;
  410. }
  411. sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
  412. #<<<
  413. [
  414. map { $_->[1] } sort {
  415. ($a->[0] cmp $b->[0])
  416. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  417. }
  418. map {
  419. my $t = substr($s, $_, LOOKAHEAD_LEN);
  420. if (length($t) < LOOKAHEAD_LEN) {
  421. $t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
  422. }
  423. [$t, $_]
  424. } 0 .. length($s) - 1
  425. ];
  426. #>>>
  427. }
  428. sub bwt_encode ($s) {
  429. my $bwt = bwt_balanced($s);
  430. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  431. my $idx = 0;
  432. foreach my $i (@$bwt) {
  433. $i || last;
  434. ++$idx;
  435. }
  436. return ($ret, $idx);
  437. }
  438. sub bwt_decode ($bwt, $idx) { # fast inversion
  439. my @tail = split(//, $bwt);
  440. my @head = sort @tail;
  441. my %indices;
  442. foreach my $i (0 .. $#tail) {
  443. push @{$indices{$tail[$i]}}, $i;
  444. }
  445. my @table;
  446. foreach my $v (@head) {
  447. push @table, shift(@{$indices{$v}});
  448. }
  449. my $dec = '';
  450. my $i = $idx;
  451. for (1 .. scalar(@head)) {
  452. $dec .= $head[$i];
  453. $i = $table[$i];
  454. }
  455. return $dec;
  456. }
  457. sub rle4_encode ($bytes) { # RLE1
  458. my @rle;
  459. my $end = $#{$bytes};
  460. my $prev = -1;
  461. my $run = 0;
  462. for (my $i = 0 ; $i <= $end ; ++$i) {
  463. if ($bytes->[$i] == $prev) {
  464. ++$run;
  465. }
  466. else {
  467. $run = 1;
  468. }
  469. push @rle, $bytes->[$i];
  470. $prev = $bytes->[$i];
  471. if ($run >= 4) {
  472. $run = 0;
  473. $i += 1;
  474. while ($run < 254 and $i <= $end and $bytes->[$i] == $prev) {
  475. ++$run;
  476. ++$i;
  477. }
  478. push @rle, $run;
  479. $run = 1;
  480. if ($i <= $end) {
  481. $prev = $bytes->[$i];
  482. push @rle, $bytes->[$i];
  483. }
  484. }
  485. }
  486. return \@rle;
  487. }
  488. sub rle4_decode ($bytes) { # RLE1
  489. my @dec = $bytes->[0];
  490. my $end = $#{$bytes};
  491. my $prev = $bytes->[0];
  492. my $run = 1;
  493. for (my $i = 1 ; $i <= $end ; ++$i) {
  494. if ($bytes->[$i] == $prev) {
  495. ++$run;
  496. }
  497. else {
  498. $run = 1;
  499. }
  500. push @dec, $bytes->[$i];
  501. $prev = $bytes->[$i];
  502. if ($run >= 4) {
  503. if (++$i <= $end) {
  504. $run = $bytes->[$i];
  505. push @dec, (($prev) x $run);
  506. }
  507. $run = 0;
  508. }
  509. }
  510. return \@dec;
  511. }
  512. sub rle_encode ($bytes) { # RLE2
  513. my @rle;
  514. my $end = $#{$bytes};
  515. for (my $i = 0 ; $i <= $end ; ++$i) {
  516. my $run = 0;
  517. while ($i <= $end and $bytes->[$i] == 0) {
  518. ++$run;
  519. ++$i;
  520. }
  521. if ($run >= 1) {
  522. my $t = sprintf('%b', $run + 1);
  523. push @rle, split(//, substr($t, 1));
  524. }
  525. if ($i <= $end) {
  526. push @rle, $bytes->[$i] + 1;
  527. }
  528. }
  529. return \@rle;
  530. }
  531. sub rle_decode ($rle) { # RLE2
  532. my @dec;
  533. my $end = $#{$rle};
  534. for (my $i = 0 ; $i <= $end ; ++$i) {
  535. my $k = $rle->[$i];
  536. if ($k == 0 or $k == 1) {
  537. my $run = 1;
  538. while (($i <= $end) and ($k == 0 or $k == 1)) {
  539. ($run <<= 1) |= $k;
  540. $k = $rle->[++$i];
  541. }
  542. push @dec, (0) x ($run - 1);
  543. }
  544. if ($i <= $end) {
  545. push @dec, $k - 1;
  546. }
  547. }
  548. return \@dec;
  549. }
  550. sub encode_alphabet ($alphabet) {
  551. my %table;
  552. @table{@$alphabet} = ();
  553. my $populated = 0;
  554. my @marked;
  555. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  556. my $enc = 0;
  557. foreach my $j (0 .. 31) {
  558. if (exists($table{$i + $j})) {
  559. $enc |= 1 << $j;
  560. }
  561. }
  562. if ($enc == 0) {
  563. $populated <<= 1;
  564. }
  565. else {
  566. ($populated <<= 1) |= 1;
  567. push @marked, $enc;
  568. }
  569. }
  570. my $delta = delta_encode([@marked], 1);
  571. say "Populated : ", sprintf('%08b', $populated);
  572. say "Marked : @marked";
  573. say "Delta len : ", length($delta);
  574. my $encoded = '';
  575. $encoded .= chr($populated);
  576. $encoded .= $delta;
  577. return $encoded;
  578. }
  579. sub decode_alphabet ($fh) {
  580. my @populated = split(//, sprintf('%08b', ord(getc($fh) // die "error")));
  581. my $marked = delta_decode($fh, 1);
  582. my @alphabet;
  583. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  584. if (shift(@populated)) {
  585. my $m = shift(@$marked);
  586. foreach my $j (0 .. 31) {
  587. if ($m & 1) {
  588. push @alphabet, $i + $j;
  589. }
  590. $m >>= 1;
  591. }
  592. }
  593. }
  594. return \@alphabet;
  595. }
  596. sub lzhd_compression ($chunk, $out_fh) {
  597. my (@uncompressed, @indices, @lengths);
  598. lz77_compression($chunk, \@uncompressed, \@indices, \@lengths);
  599. my $est_ratio = length($chunk) / (4 * scalar(@uncompressed));
  600. say(scalar(@uncompressed), ' -> ', $est_ratio);
  601. if ($est_ratio > RANDOM_DATA_THRESHOLD) {
  602. print $out_fh COMPRESSED_BYTE;
  603. create_ac_entry(\@uncompressed, $out_fh);
  604. create_ac_entry(\@lengths, $out_fh);
  605. encode_distances(\@indices, $out_fh);
  606. }
  607. else {
  608. print $out_fh UNCOMPRESSED_BYTE;
  609. create_ac_entry([unpack('C*', $chunk)], $out_fh);
  610. }
  611. }
  612. sub lzhd_decompression ($fh) {
  613. my $compression_byte = getc($fh) // die "decompression error";
  614. if ($compression_byte eq COMPRESSED_BYTE) {
  615. my $uncompressed = decode_ac_entry($fh);
  616. my $lengths = decode_ac_entry($fh);
  617. my $indices = decode_distances($fh);
  618. return lz77_decompression($uncompressed, $indices, $lengths);
  619. }
  620. elsif ($compression_byte eq UNCOMPRESSED_BYTE) {
  621. return pack('C*', @{decode_ac_entry($fh)});
  622. }
  623. else {
  624. die "Invalid compression...";
  625. }
  626. }
  627. sub compression ($chunk, $out_fh) {
  628. my @chunk_bytes = unpack('C*', $chunk);
  629. my $data = pack('C*', @{rle4_encode(\@chunk_bytes)});
  630. my ($bwt, $idx) = bwt_encode($data);
  631. my @bytes = unpack('C*', $bwt);
  632. my @alphabet = sort { $a <=> $b } uniq(@bytes);
  633. my $enc_bytes = mtf_encode(\@bytes, [@alphabet]);
  634. if (max(@$enc_bytes) < 255) {
  635. print $out_fh chr(1);
  636. $enc_bytes = rle_encode($enc_bytes);
  637. }
  638. else {
  639. print $out_fh chr(0);
  640. $enc_bytes = rle4_encode($enc_bytes);
  641. }
  642. print $out_fh pack('N', $idx);
  643. print $out_fh encode_alphabet(\@alphabet);
  644. lzhd_compression(pack('C*', @$enc_bytes), $out_fh);
  645. }
  646. sub decompression ($fh, $out_fh) {
  647. my $rle_encoded = ord(getc($fh) // die "error");
  648. my $idx = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  649. my $alphabet = decode_alphabet($fh);
  650. my $bytes = [unpack('C*', lzhd_decompression($fh))];
  651. if ($rle_encoded) {
  652. $bytes = rle_decode($bytes);
  653. }
  654. else {
  655. $bytes = rle4_decode($bytes);
  656. }
  657. $bytes = mtf_decode($bytes, [@$alphabet]);
  658. print $out_fh pack('C*', @{rle4_decode([unpack('C*', bwt_decode(pack('C*', @$bytes), $idx))])});
  659. }
  660. # Compress file
  661. sub compress_file ($input, $output) {
  662. open my $fh, '<:raw', $input
  663. or die "Can't open file <<$input>> for reading: $!";
  664. my $header = SIGNATURE;
  665. # Open the output file for writing
  666. open my $out_fh, '>:raw', $output
  667. or die "Can't open file <<$output>> for write: $!";
  668. # Print the header
  669. print $out_fh $header;
  670. # Compress data
  671. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  672. compression($chunk, $out_fh);
  673. }
  674. # Close the output file
  675. close $out_fh;
  676. }
  677. # Decompress file
  678. sub decompress_file ($input, $output) {
  679. # Open and validate the input file
  680. open my $fh, '<:raw', $input
  681. or die "Can't open file <<$input>> for reading: $!";
  682. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  683. # Open the output file
  684. open my $out_fh, '>:raw', $output
  685. or die "Can't open file <<$output>> for writing: $!";
  686. while (!eof($fh)) {
  687. decompression($fh, $out_fh);
  688. }
  689. # Close the output file
  690. close $out_fh;
  691. }
  692. main();
  693. exit(0);