bwlzad_file_compression.pl 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 15 June 2023
  4. # Edit: 07 March 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using Burrows-Wheeler Transform (BWT) + Move-to-front transform (MTF) + LZ77 compression (LZSS) + Adaptive Arithmetic Coding (in fixed bits).
  7. # Encoding the literals and the pointers using a DEFLATE-like approach.
  8. # References:
  9. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  10. # https://youtube.com/watch?v=SJPvNi4HrWQ
  11. #
  12. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  13. # https://youtube.com/watch?v=cvoZbBZ3M2A
  14. #
  15. # Basic arithmetic coder in C++
  16. # https://github.com/billbird/arith32
  17. use 5.036;
  18. use Getopt::Std qw(getopts);
  19. use File::Basename qw(basename);
  20. use List::Util qw(max uniq sum);
  21. use POSIX qw(ceil log2);
  22. use constant {
  23. PKGNAME => 'BWLZAD',
  24. VERSION => '0.01',
  25. FORMAT => 'bwlzad',
  26. CHUNK_SIZE => 1 << 17, # higher value = better compression
  27. LOOKAHEAD_LEN => 128,
  28. };
  29. # Arithmetic Coding settings
  30. use constant BITS => 32;
  31. use constant MAX => oct('0b' . ('1' x BITS));
  32. use constant INITIAL_FREQ => 1;
  33. # Container signature
  34. use constant SIGNATURE => uc(FORMAT) . chr(1);
  35. # [distance value, offset bits]
  36. my @DISTANCE_SYMBOLS = map { [$_, 0] } (0 .. 4);
  37. until ($DISTANCE_SYMBOLS[-1][0] > CHUNK_SIZE) {
  38. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1];
  39. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]];
  40. }
  41. # [length, offset bits]
  42. my @LENGTH_SYMBOLS = ((map { [$_, 0] } (3 .. 10)));
  43. {
  44. my $delta = 1;
  45. until ($LENGTH_SYMBOLS[-1][0] > 163) {
  46. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1] + 1];
  47. $delta *= 2;
  48. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  49. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  50. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  51. }
  52. push @LENGTH_SYMBOLS, [258, 0];
  53. }
  54. my @DISTANCE_INDICES;
  55. foreach my $i (0 .. $#DISTANCE_SYMBOLS) {
  56. my ($min, $bits) = @{$DISTANCE_SYMBOLS[$i]};
  57. foreach my $k ($min .. $min + (1 << $bits) - 1) {
  58. last if ($k > CHUNK_SIZE);
  59. $DISTANCE_INDICES[$k] = $i;
  60. }
  61. }
  62. my @LENGTH_INDICES;
  63. foreach my $i (0 .. $#LENGTH_SYMBOLS) {
  64. my ($min, $bits) = @{$LENGTH_SYMBOLS[$i]};
  65. foreach my $k ($min .. $min + (1 << $bits) - 1) {
  66. $LENGTH_INDICES[$k] = $i;
  67. }
  68. }
  69. sub usage {
  70. my ($code) = @_;
  71. print <<"EOH";
  72. usage: $0 [options] [input file] [output file]
  73. options:
  74. -e : extract
  75. -i <filename> : input filename
  76. -o <filename> : output filename
  77. -r : rewrite output
  78. -v : version number
  79. -h : this message
  80. examples:
  81. $0 document.txt
  82. $0 document.txt archive.${\FORMAT}
  83. $0 archive.${\FORMAT} document.txt
  84. $0 -e -i archive.${\FORMAT} -o document.txt
  85. EOH
  86. exit($code // 0);
  87. }
  88. sub version {
  89. printf("%s %s\n", PKGNAME, VERSION);
  90. exit;
  91. }
  92. sub valid_archive {
  93. my ($fh) = @_;
  94. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  95. $sig eq SIGNATURE || return;
  96. }
  97. return 1;
  98. }
  99. sub main {
  100. my %opt;
  101. getopts('ei:o:vhr', \%opt);
  102. $opt{h} && usage(0);
  103. $opt{v} && version();
  104. my ($input, $output) = @ARGV;
  105. $input //= $opt{i} // usage(2);
  106. $output //= $opt{o};
  107. my $ext = qr{\.${\FORMAT}\z}io;
  108. if ($opt{e} || $input =~ $ext) {
  109. if (not defined $output) {
  110. ($output = basename($input)) =~ s{$ext}{}
  111. || die "$0: no output file specified!\n";
  112. }
  113. if (not $opt{r} and -e $output) {
  114. print "'$output' already exists! -- Replace? [y/N] ";
  115. <STDIN> =~ /^y/i || exit 17;
  116. }
  117. decompress_file($input, $output)
  118. || die "$0: error: decompression failed!\n";
  119. }
  120. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  121. $output //= basename($input) . '.' . FORMAT;
  122. compress_file($input, $output)
  123. || die "$0: error: compression failed!\n";
  124. }
  125. else {
  126. warn "$0: don't know what to do...\n";
  127. usage(1);
  128. }
  129. }
  130. sub lz77_compression ($str, $uncompressed, $indices, $lengths) {
  131. my $la = 0;
  132. my $prefix = '';
  133. my @chars = split(//, $str);
  134. my $end = $#chars;
  135. my $min_len = $LENGTH_SYMBOLS[0][0];
  136. my $max_len = $LENGTH_SYMBOLS[-1][0];
  137. my %literal_freq;
  138. my %distance_freq;
  139. my $literal_count = 0;
  140. my $distance_count = 0;
  141. while ($la <= $end) {
  142. my $n = 1;
  143. my $p = length($prefix);
  144. my $tmp;
  145. my $token = $chars[$la];
  146. while ( $n <= $max_len
  147. and $la + $n <= $end
  148. and ($tmp = rindex($prefix, $token, $p)) >= 0) {
  149. $p = $tmp;
  150. $token .= $chars[$la + $n];
  151. ++$n;
  152. }
  153. my $enc_bits_len = 0;
  154. my $literal_bits_len = 0;
  155. if ($n > $min_len) {
  156. my $dist = $DISTANCE_SYMBOLS[$DISTANCE_INDICES[$la - $p]];
  157. $enc_bits_len += $dist->[1] + ceil(log2((1 + $distance_count) / (1 + ($distance_freq{$dist->[0]} // 0))));
  158. my $len_idx = $LENGTH_INDICES[$n - 1];
  159. my $len = $LENGTH_SYMBOLS[$len_idx];
  160. $enc_bits_len += $len->[1] + ceil(log2((1 + $literal_count) / (1 + ($literal_freq{$len_idx + 256} // 0))));
  161. my %freq;
  162. foreach my $c (unpack('C*', substr($prefix, $p, $n - 1) . $chars[$la + $n - 1])) {
  163. ++$freq{$c};
  164. $literal_bits_len += ceil(log2(($n + $literal_count) / ($freq{$c} + ($literal_freq{$c} // 0))));
  165. }
  166. }
  167. if ($n > $min_len and $enc_bits_len < $literal_bits_len) {
  168. push @$lengths, $n - 1;
  169. push @$indices, $la - $p;
  170. push @$uncompressed, undef;
  171. my $dist_idx = $DISTANCE_INDICES[$la - $p];
  172. my $dist = $DISTANCE_SYMBOLS[$dist_idx];
  173. ++$distance_count;
  174. ++$distance_freq{$dist->[0]};
  175. ++$literal_count;
  176. ++$literal_freq{$LENGTH_INDICES[$n - 1] + 256};
  177. $la += $n - 1;
  178. $prefix .= substr($token, 0, -1);
  179. }
  180. else {
  181. my @bytes = unpack('C*', substr($prefix, $p, $n - 1) . $chars[$la + $n - 1]);
  182. push @$uncompressed, @bytes;
  183. push @$lengths, (0) x scalar(@bytes);
  184. push @$indices, (0) x scalar(@bytes);
  185. ++$literal_freq{$_} for @bytes;
  186. $literal_count += $n;
  187. $la += $n;
  188. $prefix .= $token;
  189. }
  190. }
  191. return;
  192. }
  193. sub lz77_decompression ($literals, $distances, $lengths) {
  194. my $chunk = '';
  195. my $offset = 0;
  196. foreach my $i (0 .. $#$literals) {
  197. if ($lengths->[$i] != 0) {
  198. $chunk .= substr($chunk, $offset - $distances->[$i], $lengths->[$i]);
  199. $offset += $lengths->[$i];
  200. }
  201. else {
  202. $chunk .= chr($literals->[$i]);
  203. $offset += 1;
  204. }
  205. }
  206. return $chunk;
  207. }
  208. sub read_bit ($fh, $bitstring) {
  209. if (($$bitstring // '') eq '') {
  210. $$bitstring = unpack('b*', getc($fh) // return undef);
  211. }
  212. chop($$bitstring);
  213. }
  214. sub read_bits ($fh, $bits_len) {
  215. my $data = '';
  216. read($fh, $data, $bits_len >> 3);
  217. $data = unpack('B*', $data);
  218. while (length($data) < $bits_len) {
  219. $data .= unpack('B*', getc($fh) // return undef);
  220. }
  221. if (length($data) > $bits_len) {
  222. $data = substr($data, 0, $bits_len);
  223. }
  224. return $data;
  225. }
  226. sub delta_encode ($integers, $double = 0) {
  227. my @deltas;
  228. my $prev = 0;
  229. unshift(@$integers, scalar(@$integers));
  230. while (@$integers) {
  231. my $curr = shift(@$integers);
  232. push @deltas, $curr - $prev;
  233. $prev = $curr;
  234. }
  235. my $bitstring = '';
  236. foreach my $d (@deltas) {
  237. if ($d == 0) {
  238. $bitstring .= '0';
  239. }
  240. elsif ($double) {
  241. my $t = sprintf('%b', abs($d) + 1);
  242. my $l = sprintf('%b', length($t));
  243. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  244. }
  245. else {
  246. my $t = sprintf('%b', abs($d));
  247. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  248. }
  249. }
  250. pack('B*', $bitstring);
  251. }
  252. sub delta_decode ($fh, $double = 0) {
  253. my @deltas;
  254. my $buffer = '';
  255. my $len = 0;
  256. for (my $k = 0 ; $k <= $len ; ++$k) {
  257. my $bit = read_bit($fh, \$buffer);
  258. if ($bit eq '0') {
  259. push @deltas, 0;
  260. }
  261. elsif ($double) {
  262. my $bit = read_bit($fh, \$buffer);
  263. my $bl = 0;
  264. ++$bl while (read_bit($fh, \$buffer) eq '1');
  265. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  266. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  267. push @deltas, ($bit eq '1' ? 1 : -1) * ($int - 1);
  268. }
  269. else {
  270. my $bit = read_bit($fh, \$buffer);
  271. my $n = 0;
  272. ++$n while (read_bit($fh, \$buffer) eq '1');
  273. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  274. push @deltas, ($bit eq '1' ? $d : -$d);
  275. }
  276. if ($k == 0) {
  277. $len = pop(@deltas);
  278. }
  279. }
  280. my @acc;
  281. my $prev = $len;
  282. foreach my $d (@deltas) {
  283. $prev += $d;
  284. push @acc, $prev;
  285. }
  286. return \@acc;
  287. }
  288. sub create_cfreq ($freq_value, $max_symbol) {
  289. my $T = 0;
  290. my (@cf, @freq);
  291. foreach my $i (0 .. $max_symbol) {
  292. $freq[$i] = $freq_value;
  293. $cf[$i] = $T;
  294. $T += $freq_value;
  295. $cf[$i + 1] = $T;
  296. }
  297. return (\@freq, \@cf, $T);
  298. }
  299. sub increment_freq ($c, $max_symbol, $freq, $cf) {
  300. ++$freq->[$c];
  301. my $T = $cf->[$c];
  302. foreach my $i ($c .. $max_symbol) {
  303. $cf->[$i] = $T;
  304. $T += $freq->[$i];
  305. $cf->[$i + 1] = $T;
  306. }
  307. return $T;
  308. }
  309. sub ac_encode ($bytes_arr) {
  310. my $enc = '';
  311. my @bytes = (@$bytes_arr, (max(@$bytes_arr) // 0) + 1);
  312. my $max_symbol = max(@bytes) // 0;
  313. my ($freq, $cf, $T) = create_cfreq(INITIAL_FREQ, $max_symbol);
  314. if ($T > MAX) {
  315. die "Too few bits: $T > ${\MAX}";
  316. }
  317. my $low = 0;
  318. my $high = MAX;
  319. my $uf_count = 0;
  320. foreach my $c (@bytes) {
  321. my $w = $high - $low + 1;
  322. $high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX;
  323. $low = ($low + int(($w * $cf->[$c]) / $T)) & MAX;
  324. $T = increment_freq($c, $max_symbol, $freq, $cf);
  325. if ($high > MAX) {
  326. die "high > MAX: $high > ${\MAX}";
  327. }
  328. if ($low >= $high) { die "$low >= $high" }
  329. while (1) {
  330. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  331. my $bit = $high >> (BITS - 1);
  332. $enc .= $bit;
  333. if ($uf_count > 0) {
  334. $enc .= join('', 1 - $bit) x $uf_count;
  335. $uf_count = 0;
  336. }
  337. $low <<= 1;
  338. ($high <<= 1) |= 1;
  339. }
  340. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  341. ($high <<= 1) |= (1 << (BITS - 1));
  342. $high |= 1;
  343. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  344. ++$uf_count;
  345. }
  346. else {
  347. last;
  348. }
  349. $low &= MAX;
  350. $high &= MAX;
  351. }
  352. }
  353. $enc .= '0';
  354. $enc .= '1';
  355. while (length($enc) % 8 != 0) {
  356. $enc .= '1';
  357. }
  358. return ($enc, $max_symbol);
  359. }
  360. sub ac_decode ($fh, $max_symbol) {
  361. my ($freq, $cf, $T) = create_cfreq(INITIAL_FREQ, $max_symbol);
  362. my @dec;
  363. my $low = 0;
  364. my $high = MAX;
  365. my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS);
  366. while (1) {
  367. my $w = ($high + 1) - $low;
  368. my $ss = int((($T * ($enc - $low + 1)) - 1) / $w);
  369. my $i = 0;
  370. foreach my $j (0 .. $max_symbol) {
  371. if ($cf->[$j] <= $ss and $ss < $cf->[$j + 1]) {
  372. $i = $j;
  373. last;
  374. }
  375. }
  376. last if ($i == $max_symbol);
  377. push @dec, $i;
  378. $high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX;
  379. $low = ($low + int(($w * $cf->[$i]) / $T)) & MAX;
  380. $T = increment_freq($i, $max_symbol, $freq, $cf);
  381. if ($high > MAX) {
  382. die "high > MAX: ($high > ${\MAX})";
  383. }
  384. if ($low >= $high) { die "$low >= $high" }
  385. while (1) {
  386. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  387. ($high <<= 1) |= 1;
  388. $low <<= 1;
  389. ($enc <<= 1) |= (getc($fh) // 1);
  390. }
  391. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  392. ($high <<= 1) |= (1 << (BITS - 1));
  393. $high |= 1;
  394. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  395. $enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1);
  396. }
  397. else {
  398. last;
  399. }
  400. $low &= MAX;
  401. $high &= MAX;
  402. $enc &= MAX;
  403. }
  404. }
  405. return \@dec;
  406. }
  407. sub create_ac_entry ($bytes, $out_fh) {
  408. my ($enc, $max_symbol) = ac_encode($bytes);
  409. print $out_fh delta_encode([$max_symbol, length($enc)], 1);
  410. print $out_fh pack("B*", $enc);
  411. }
  412. sub decode_ac_entry ($fh) {
  413. my ($max_symbol, $enc_len) = @{delta_decode($fh, 1)};
  414. say "Encoded length: $enc_len";
  415. if ($enc_len > 0) {
  416. my $bits = read_bits($fh, $enc_len);
  417. open my $bits_fh, '<:raw', \$bits;
  418. return ac_decode($bits_fh, $max_symbol);
  419. }
  420. return [];
  421. }
  422. sub deflate_encode ($literals, $distances, $lengths, $out_fh) {
  423. my @len_symbols;
  424. my @dist_symbols;
  425. my $offset_bits = '';
  426. foreach my $j (0 .. $#{$literals}) {
  427. if ($lengths->[$j] == 0) {
  428. push @len_symbols, $literals->[$j];
  429. next;
  430. }
  431. my $len = $lengths->[$j];
  432. my $dist = $distances->[$j];
  433. {
  434. my $len_idx = $LENGTH_INDICES[$len];
  435. my ($min, $bits) = @{$LENGTH_SYMBOLS[$len_idx]};
  436. push @len_symbols, $len_idx + 256;
  437. if ($bits > 0) {
  438. $offset_bits .= sprintf('%0*b', $bits, $len - $min);
  439. }
  440. }
  441. {
  442. my $dist_idx = $DISTANCE_INDICES[$dist];
  443. my ($min, $bits) = @{$DISTANCE_SYMBOLS[$dist_idx]};
  444. push @dist_symbols, $dist_idx;
  445. if ($bits > 0) {
  446. $offset_bits .= sprintf('%0*b', $bits, $dist - $min);
  447. }
  448. }
  449. }
  450. create_ac_entry(\@len_symbols, $out_fh);
  451. create_ac_entry(\@dist_symbols, $out_fh);
  452. print $out_fh pack('B*', $offset_bits);
  453. }
  454. sub deflate_decode ($fh) {
  455. my $len_symbols = decode_ac_entry($fh);
  456. my $dist_symbols = decode_ac_entry($fh);
  457. my $bits_len = 0;
  458. foreach my $i (@$dist_symbols) {
  459. $bits_len += $DISTANCE_SYMBOLS[$i][1];
  460. }
  461. foreach my $i (@$len_symbols) {
  462. if ($i >= 256) {
  463. $bits_len += $LENGTH_SYMBOLS[$i - 256][1];
  464. }
  465. }
  466. my $bits = read_bits($fh, $bits_len);
  467. my @literals;
  468. my @lengths;
  469. my @distances;
  470. my $j = 0;
  471. foreach my $i (@$len_symbols) {
  472. if ($i >= 256) {
  473. my $dist = $dist_symbols->[$j++];
  474. push @literals, undef;
  475. push @lengths, $LENGTH_SYMBOLS[$i - 256][0] + oct('0b' . substr($bits, 0, $LENGTH_SYMBOLS[$i - 256][1], ''));
  476. push @distances, $DISTANCE_SYMBOLS[$dist][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS[$dist][1], ''));
  477. }
  478. else {
  479. push @literals, $i;
  480. push @lengths, 0;
  481. push @distances, 0;
  482. }
  483. }
  484. return (\@literals, \@distances, \@lengths);
  485. }
  486. sub mtf_encode ($bytes, $alphabet = [0 .. 255]) {
  487. my @C;
  488. my @table;
  489. @table[@$alphabet] = (0 .. $#{$alphabet});
  490. foreach my $c (@$bytes) {
  491. push @C, (my $index = $table[$c]);
  492. unshift(@$alphabet, splice(@$alphabet, $index, 1));
  493. @table[@{$alphabet}[0 .. $index]] = (0 .. $index);
  494. }
  495. return \@C;
  496. }
  497. sub mtf_decode ($encoded, $alphabet = [0 .. 255]) {
  498. my @S;
  499. foreach my $p (@$encoded) {
  500. push @S, $alphabet->[$p];
  501. unshift(@$alphabet, splice(@$alphabet, $p, 1));
  502. }
  503. return \@S;
  504. }
  505. sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
  506. #<<<
  507. [
  508. map { $_->[1] } sort {
  509. ($a->[0] cmp $b->[0])
  510. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  511. }
  512. map {
  513. my $t = substr($s, $_, LOOKAHEAD_LEN);
  514. if (length($t) < LOOKAHEAD_LEN) {
  515. $t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
  516. }
  517. [$t, $_]
  518. } 0 .. length($s) - 1
  519. ];
  520. #>>>
  521. }
  522. sub bwt_encode ($s) {
  523. my $bwt = bwt_balanced($s);
  524. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  525. my $idx = 0;
  526. foreach my $i (@$bwt) {
  527. $i || last;
  528. ++$idx;
  529. }
  530. return ($ret, $idx);
  531. }
  532. sub bwt_decode ($bwt, $idx) { # fast inversion
  533. my @tail = split(//, $bwt);
  534. my @head = sort @tail;
  535. my %indices;
  536. foreach my $i (0 .. $#tail) {
  537. push @{$indices{$tail[$i]}}, $i;
  538. }
  539. my @table;
  540. foreach my $v (@head) {
  541. push @table, shift(@{$indices{$v}});
  542. }
  543. my $dec = '';
  544. my $i = $idx;
  545. for (1 .. scalar(@head)) {
  546. $dec .= $head[$i];
  547. $i = $table[$i];
  548. }
  549. return $dec;
  550. }
  551. sub rle4_encode ($bytes) { # RLE1
  552. my @rle;
  553. my $end = $#{$bytes};
  554. my $prev = -1;
  555. my $run = 0;
  556. for (my $i = 0 ; $i <= $end ; ++$i) {
  557. if ($bytes->[$i] == $prev) {
  558. ++$run;
  559. }
  560. else {
  561. $run = 1;
  562. }
  563. push @rle, $bytes->[$i];
  564. $prev = $bytes->[$i];
  565. if ($run >= 4) {
  566. $run = 0;
  567. $i += 1;
  568. while ($run < 254 and $i <= $end and $bytes->[$i] == $prev) {
  569. ++$run;
  570. ++$i;
  571. }
  572. push @rle, $run;
  573. $run = 1;
  574. if ($i <= $end) {
  575. $prev = $bytes->[$i];
  576. push @rle, $bytes->[$i];
  577. }
  578. }
  579. }
  580. return \@rle;
  581. }
  582. sub rle4_decode ($bytes) { # RLE1
  583. my @dec = $bytes->[0];
  584. my $end = $#{$bytes};
  585. my $prev = $bytes->[0];
  586. my $run = 1;
  587. for (my $i = 1 ; $i <= $end ; ++$i) {
  588. if ($bytes->[$i] == $prev) {
  589. ++$run;
  590. }
  591. else {
  592. $run = 1;
  593. }
  594. push @dec, $bytes->[$i];
  595. $prev = $bytes->[$i];
  596. if ($run >= 4) {
  597. if (++$i <= $end) {
  598. $run = $bytes->[$i];
  599. push @dec, (($prev) x $run);
  600. }
  601. $run = 0;
  602. }
  603. }
  604. return \@dec;
  605. }
  606. sub rle_encode ($bytes) { # RLE2
  607. my @rle;
  608. my $end = $#{$bytes};
  609. for (my $i = 0 ; $i <= $end ; ++$i) {
  610. my $run = 0;
  611. while ($i <= $end and $bytes->[$i] == 0) {
  612. ++$run;
  613. ++$i;
  614. }
  615. if ($run >= 1) {
  616. my $t = sprintf('%b', $run + 1);
  617. push @rle, split(//, substr($t, 1));
  618. }
  619. if ($i <= $end) {
  620. push @rle, $bytes->[$i] + 1;
  621. }
  622. }
  623. return \@rle;
  624. }
  625. sub rle_decode ($rle) { # RLE2
  626. my @dec;
  627. my $end = $#{$rle};
  628. for (my $i = 0 ; $i <= $end ; ++$i) {
  629. my $k = $rle->[$i];
  630. if ($k == 0 or $k == 1) {
  631. my $run = 1;
  632. while (($i <= $end) and ($k == 0 or $k == 1)) {
  633. ($run <<= 1) |= $k;
  634. $k = $rle->[++$i];
  635. }
  636. push @dec, (0) x ($run - 1);
  637. }
  638. if ($i <= $end) {
  639. push @dec, $k - 1;
  640. }
  641. }
  642. return \@dec;
  643. }
  644. sub encode_alphabet ($alphabet) {
  645. my %table;
  646. @table{@$alphabet} = ();
  647. my $populated = 0;
  648. my @marked;
  649. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  650. my $enc = 0;
  651. foreach my $j (0 .. 31) {
  652. if (exists($table{$i + $j})) {
  653. $enc |= 1 << $j;
  654. }
  655. }
  656. if ($enc == 0) {
  657. $populated <<= 1;
  658. }
  659. else {
  660. ($populated <<= 1) |= 1;
  661. push @marked, $enc;
  662. }
  663. }
  664. my $delta = delta_encode([@marked], 1);
  665. say "Populated : ", sprintf('%08b', $populated);
  666. say "Marked : @marked";
  667. say "Delta len : ", length($delta);
  668. my $encoded = '';
  669. $encoded .= chr($populated);
  670. $encoded .= $delta;
  671. return $encoded;
  672. }
  673. sub decode_alphabet ($fh) {
  674. my @populated = split(//, sprintf('%08b', ord(getc($fh) // die "error")));
  675. my $marked = delta_decode($fh, 1);
  676. my @alphabet;
  677. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  678. if (shift(@populated)) {
  679. my $m = shift(@$marked);
  680. foreach my $j (0 .. 31) {
  681. if ($m & 1) {
  682. push @alphabet, $i + $j;
  683. }
  684. $m >>= 1;
  685. }
  686. }
  687. }
  688. return \@alphabet;
  689. }
  690. sub lzss_compression ($chunk, $out_fh) {
  691. my (@uncompressed, @indices, @lengths);
  692. lz77_compression($chunk, \@uncompressed, \@indices, \@lengths);
  693. my $est_ratio = length($chunk) / (scalar(@uncompressed) + scalar(@lengths) + 2 * scalar(@indices));
  694. say "\nEst. ratio: ", $est_ratio, " (", scalar(@uncompressed), " uncompressed bytes)";
  695. deflate_encode(\@uncompressed, \@indices, \@lengths, $out_fh);
  696. }
  697. sub lzss_decompression ($fh) {
  698. my ($uncompressed, $indices, $lengths) = deflate_decode($fh);
  699. lz77_decompression($uncompressed, $indices, $lengths);
  700. }
  701. sub compression ($chunk, $out_fh) {
  702. my @chunk_bytes = unpack('C*', $chunk);
  703. my $data = pack('C*', @{rle4_encode(\@chunk_bytes)});
  704. my ($bwt, $idx) = bwt_encode($data);
  705. my @bytes = unpack('C*', $bwt);
  706. my @alphabet = sort { $a <=> $b } uniq(@bytes);
  707. my $enc_bytes = mtf_encode(\@bytes, [@alphabet]);
  708. if (max(@$enc_bytes) < 255) {
  709. print $out_fh chr(1);
  710. $enc_bytes = rle_encode($enc_bytes);
  711. }
  712. else {
  713. print $out_fh chr(0);
  714. $enc_bytes = rle4_encode($enc_bytes);
  715. }
  716. print $out_fh pack('N', $idx);
  717. print $out_fh encode_alphabet(\@alphabet);
  718. lzss_compression(pack('C*', @$enc_bytes), $out_fh);
  719. }
  720. sub decompression ($fh, $out_fh) {
  721. my $rle_encoded = ord(getc($fh) // die "error");
  722. my $idx = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  723. my $alphabet = decode_alphabet($fh);
  724. my $bytes = [unpack('C*', lzss_decompression($fh))];
  725. if ($rle_encoded) {
  726. $bytes = rle_decode($bytes);
  727. }
  728. else {
  729. $bytes = rle4_decode($bytes);
  730. }
  731. $bytes = mtf_decode($bytes, [@$alphabet]);
  732. print $out_fh pack('C*', @{rle4_decode([unpack('C*', bwt_decode(pack('C*', @$bytes), $idx))])});
  733. }
  734. # Compress file
  735. sub compress_file ($input, $output) {
  736. open my $fh, '<:raw', $input
  737. or die "Can't open file <<$input>> for reading: $!";
  738. my $header = SIGNATURE;
  739. # Open the output file for writing
  740. open my $out_fh, '>:raw', $output
  741. or die "Can't open file <<$output>> for write: $!";
  742. # Print the header
  743. print $out_fh $header;
  744. # Compress data
  745. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  746. compression($chunk, $out_fh);
  747. }
  748. # Close the output file
  749. close $out_fh;
  750. }
  751. # Decompress file
  752. sub decompress_file ($input, $output) {
  753. # Open and validate the input file
  754. open my $fh, '<:raw', $input
  755. or die "Can't open file <<$input>> for reading: $!";
  756. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  757. # Open the output file
  758. open my $out_fh, '>:raw', $output
  759. or die "Can't open file <<$output>> for writing: $!";
  760. while (!eof($fh)) {
  761. decompression($fh, $out_fh);
  762. }
  763. # Close the output file
  764. close $out_fh;
  765. }
  766. main();
  767. exit(0);