bwlza_file_compression.pl 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 15 June 2023
  4. # Edit: 06 February 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using Burrows-Wheeler Transform (BWT) + Move-to-front transform (MTF) + LZ77 compression (LZSS) + Arithmetic Coding (in fixed bits).
  7. # Encoding the literals and the pointers using a DEFLATE-like approach.
  8. # References:
  9. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  10. # https://youtube.com/watch?v=SJPvNi4HrWQ
  11. #
  12. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  13. # https://youtube.com/watch?v=cvoZbBZ3M2A
  14. #
  15. # Basic arithmetic coder in C++
  16. # https://github.com/billbird/arith32
  17. use 5.036;
  18. use Getopt::Std qw(getopts);
  19. use File::Basename qw(basename);
  20. use List::Util qw(max uniq sum);
  21. use POSIX qw(ceil log2);
  22. use constant {
  23. PKGNAME => 'BWLZA',
  24. VERSION => '0.03',
  25. FORMAT => 'bwlza',
  26. CHUNK_SIZE => 1 << 17, # higher value = better compression
  27. LOOKAHEAD_LEN => 128,
  28. };
  29. # Arithmetic Coding settings
  30. use constant BITS => 32;
  31. use constant MAX => oct('0b' . ('1' x BITS));
  32. # Container signature
  33. use constant SIGNATURE => uc(FORMAT) . chr(3);
  34. # [distance value, offset bits]
  35. my @DISTANCE_SYMBOLS = map { [$_, 0] } (0 .. 4);
  36. until ($DISTANCE_SYMBOLS[-1][0] > CHUNK_SIZE) {
  37. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1];
  38. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]];
  39. }
  40. # [length, offset bits]
  41. my @LENGTH_SYMBOLS = ((map { [$_, 0] } (3 .. 10)));
  42. {
  43. my $delta = 1;
  44. until ($LENGTH_SYMBOLS[-1][0] > 163) {
  45. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1] + 1];
  46. $delta *= 2;
  47. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  48. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  49. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  50. }
  51. push @LENGTH_SYMBOLS, [258, 0];
  52. }
  53. my @DISTANCE_INDICES;
  54. foreach my $i (0 .. $#DISTANCE_SYMBOLS) {
  55. my ($min, $bits) = @{$DISTANCE_SYMBOLS[$i]};
  56. foreach my $k ($min .. $min + (1 << $bits) - 1) {
  57. last if ($k > CHUNK_SIZE);
  58. $DISTANCE_INDICES[$k] = $i;
  59. }
  60. }
  61. my @LENGTH_INDICES;
  62. foreach my $i (0 .. $#LENGTH_SYMBOLS) {
  63. my ($min, $bits) = @{$LENGTH_SYMBOLS[$i]};
  64. foreach my $k ($min .. $min + (1 << $bits) - 1) {
  65. $LENGTH_INDICES[$k] = $i;
  66. }
  67. }
  68. sub usage {
  69. my ($code) = @_;
  70. print <<"EOH";
  71. usage: $0 [options] [input file] [output file]
  72. options:
  73. -e : extract
  74. -i <filename> : input filename
  75. -o <filename> : output filename
  76. -r : rewrite output
  77. -v : version number
  78. -h : this message
  79. examples:
  80. $0 document.txt
  81. $0 document.txt archive.${\FORMAT}
  82. $0 archive.${\FORMAT} document.txt
  83. $0 -e -i archive.${\FORMAT} -o document.txt
  84. EOH
  85. exit($code // 0);
  86. }
  87. sub version {
  88. printf("%s %s\n", PKGNAME, VERSION);
  89. exit;
  90. }
  91. sub valid_archive {
  92. my ($fh) = @_;
  93. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  94. $sig eq SIGNATURE || return;
  95. }
  96. return 1;
  97. }
  98. sub main {
  99. my %opt;
  100. getopts('ei:o:vhr', \%opt);
  101. $opt{h} && usage(0);
  102. $opt{v} && version();
  103. my ($input, $output) = @ARGV;
  104. $input //= $opt{i} // usage(2);
  105. $output //= $opt{o};
  106. my $ext = qr{\.${\FORMAT}\z}io;
  107. if ($opt{e} || $input =~ $ext) {
  108. if (not defined $output) {
  109. ($output = basename($input)) =~ s{$ext}{}
  110. || die "$0: no output file specified!\n";
  111. }
  112. if (not $opt{r} and -e $output) {
  113. print "'$output' already exists! -- Replace? [y/N] ";
  114. <STDIN> =~ /^y/i || exit 17;
  115. }
  116. decompress_file($input, $output)
  117. || die "$0: error: decompression failed!\n";
  118. }
  119. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  120. $output //= basename($input) . '.' . FORMAT;
  121. compress_file($input, $output)
  122. || die "$0: error: compression failed!\n";
  123. }
  124. else {
  125. warn "$0: don't know what to do...\n";
  126. usage(1);
  127. }
  128. }
  129. sub lz77_compression ($str, $uncompressed, $indices, $lengths) {
  130. my $la = 0;
  131. my $prefix = '';
  132. my @chars = split(//, $str);
  133. my $end = $#chars;
  134. my $min_len = $LENGTH_SYMBOLS[0][0];
  135. my $max_len = $LENGTH_SYMBOLS[-1][0];
  136. my %literal_freq;
  137. my %distance_freq;
  138. my $literal_count = 0;
  139. my $distance_count = 0;
  140. while ($la <= $end) {
  141. my $n = 1;
  142. my $p = length($prefix);
  143. my $tmp;
  144. my $token = $chars[$la];
  145. while ( $n <= $max_len
  146. and $la + $n <= $end
  147. and ($tmp = rindex($prefix, $token, $p)) >= 0) {
  148. $p = $tmp;
  149. $token .= $chars[$la + $n];
  150. ++$n;
  151. }
  152. my $enc_bits_len = 0;
  153. my $literal_bits_len = 0;
  154. if ($n > $min_len) {
  155. my $dist = $DISTANCE_SYMBOLS[$DISTANCE_INDICES[$la - $p]];
  156. $enc_bits_len += $dist->[1] + ceil(log2((1 + $distance_count) / (1 + ($distance_freq{$dist->[0]} // 0))));
  157. my $len_idx = $LENGTH_INDICES[$n - 1];
  158. my $len = $LENGTH_SYMBOLS[$len_idx];
  159. $enc_bits_len += $len->[1] + ceil(log2((1 + $literal_count) / (1 + ($literal_freq{$len_idx + 256} // 0))));
  160. my %freq;
  161. foreach my $c (unpack('C*', substr($prefix, $p, $n - 1) . $chars[$la + $n - 1])) {
  162. ++$freq{$c};
  163. $literal_bits_len += ceil(log2(($n + $literal_count) / ($freq{$c} + ($literal_freq{$c} // 0))));
  164. }
  165. }
  166. if ($n > $min_len and $enc_bits_len < $literal_bits_len) {
  167. push @$lengths, $n - 1;
  168. push @$indices, $la - $p;
  169. push @$uncompressed, undef;
  170. my $dist_idx = $DISTANCE_INDICES[$la - $p];
  171. my $dist = $DISTANCE_SYMBOLS[$dist_idx];
  172. ++$distance_count;
  173. ++$distance_freq{$dist->[0]};
  174. ++$literal_count;
  175. ++$literal_freq{$LENGTH_INDICES[$n - 1] + 256};
  176. $la += $n - 1;
  177. $prefix .= substr($token, 0, -1);
  178. }
  179. else {
  180. my @bytes = unpack('C*', substr($prefix, $p, $n - 1) . $chars[$la + $n - 1]);
  181. push @$uncompressed, @bytes;
  182. push @$lengths, (0) x scalar(@bytes);
  183. push @$indices, (0) x scalar(@bytes);
  184. ++$literal_freq{$_} for @bytes;
  185. $literal_count += $n;
  186. $la += $n;
  187. $prefix .= $token;
  188. }
  189. }
  190. return;
  191. }
  192. sub lz77_decompression ($literals, $distances, $lengths) {
  193. my $chunk = '';
  194. my $offset = 0;
  195. foreach my $i (0 .. $#$literals) {
  196. if ($lengths->[$i] != 0) {
  197. $chunk .= substr($chunk, $offset - $distances->[$i], $lengths->[$i]);
  198. $offset += $lengths->[$i];
  199. }
  200. else {
  201. $chunk .= chr($literals->[$i]);
  202. $offset += 1;
  203. }
  204. }
  205. return $chunk;
  206. }
  207. sub read_bit ($fh, $bitstring) {
  208. if (($$bitstring // '') eq '') {
  209. $$bitstring = unpack('b*', getc($fh) // return undef);
  210. }
  211. chop($$bitstring);
  212. }
  213. sub read_bits ($fh, $bits_len) {
  214. my $data = '';
  215. read($fh, $data, $bits_len >> 3);
  216. $data = unpack('B*', $data);
  217. while (length($data) < $bits_len) {
  218. $data .= unpack('B*', getc($fh) // return undef);
  219. }
  220. if (length($data) > $bits_len) {
  221. $data = substr($data, 0, $bits_len);
  222. }
  223. return $data;
  224. }
  225. sub delta_encode ($integers, $double = 0) {
  226. my @deltas;
  227. my $prev = 0;
  228. unshift(@$integers, scalar(@$integers));
  229. while (@$integers) {
  230. my $curr = shift(@$integers);
  231. push @deltas, $curr - $prev;
  232. $prev = $curr;
  233. }
  234. my $bitstring = '';
  235. foreach my $d (@deltas) {
  236. if ($d == 0) {
  237. $bitstring .= '0';
  238. }
  239. elsif ($double) {
  240. my $t = sprintf('%b', abs($d) + 1);
  241. my $l = sprintf('%b', length($t));
  242. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  243. }
  244. else {
  245. my $t = sprintf('%b', abs($d));
  246. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  247. }
  248. }
  249. pack('B*', $bitstring);
  250. }
  251. sub delta_decode ($fh, $double = 0) {
  252. my @deltas;
  253. my $buffer = '';
  254. my $len = 0;
  255. for (my $k = 0 ; $k <= $len ; ++$k) {
  256. my $bit = read_bit($fh, \$buffer);
  257. if ($bit eq '0') {
  258. push @deltas, 0;
  259. }
  260. elsif ($double) {
  261. my $bit = read_bit($fh, \$buffer);
  262. my $bl = 0;
  263. ++$bl while (read_bit($fh, \$buffer) eq '1');
  264. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  265. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  266. push @deltas, ($bit eq '1' ? 1 : -1) * ($int - 1);
  267. }
  268. else {
  269. my $bit = read_bit($fh, \$buffer);
  270. my $n = 0;
  271. ++$n while (read_bit($fh, \$buffer) eq '1');
  272. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  273. push @deltas, ($bit eq '1' ? $d : -$d);
  274. }
  275. if ($k == 0) {
  276. $len = pop(@deltas);
  277. }
  278. }
  279. my @acc;
  280. my $prev = $len;
  281. foreach my $d (@deltas) {
  282. $prev += $d;
  283. push @acc, $prev;
  284. }
  285. return \@acc;
  286. }
  287. sub create_cfreq ($freq) {
  288. my @cf;
  289. my $T = 0;
  290. foreach my $i (sort { $a <=> $b } keys %$freq) {
  291. $freq->{$i} // next;
  292. $cf[$i] = $T;
  293. $T += $freq->{$i};
  294. $cf[$i + 1] = $T;
  295. }
  296. return (\@cf, $T);
  297. }
  298. sub ac_encode ($bytes_arr) {
  299. my $enc = '';
  300. my $EOF_SYMBOL = (max(@$bytes_arr) // 0) + 1;
  301. my @bytes = (@$bytes_arr, $EOF_SYMBOL);
  302. my %freq;
  303. ++$freq{$_} for @bytes;
  304. my ($cf, $T) = create_cfreq(\%freq);
  305. if ($T > MAX) {
  306. die "Too few bits: $T > ${\MAX}";
  307. }
  308. my $low = 0;
  309. my $high = MAX;
  310. my $uf_count = 0;
  311. foreach my $c (@bytes) {
  312. my $w = $high - $low + 1;
  313. $high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX;
  314. $low = ($low + int(($w * $cf->[$c]) / $T)) & MAX;
  315. if ($high > MAX) {
  316. die "high > MAX: $high > ${\MAX}";
  317. }
  318. if ($low >= $high) { die "$low >= $high" }
  319. while (1) {
  320. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  321. my $bit = $high >> (BITS - 1);
  322. $enc .= $bit;
  323. if ($uf_count > 0) {
  324. $enc .= join('', 1 - $bit) x $uf_count;
  325. $uf_count = 0;
  326. }
  327. $low <<= 1;
  328. ($high <<= 1) |= 1;
  329. }
  330. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  331. ($high <<= 1) |= (1 << (BITS - 1));
  332. $high |= 1;
  333. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  334. ++$uf_count;
  335. }
  336. else {
  337. last;
  338. }
  339. $low &= MAX;
  340. $high &= MAX;
  341. }
  342. }
  343. $enc .= '0';
  344. $enc .= '1';
  345. while (length($enc) % 8 != 0) {
  346. $enc .= '1';
  347. }
  348. return ($enc, \%freq);
  349. }
  350. sub ac_decode ($fh, $freq) {
  351. my ($cf, $T) = create_cfreq($freq);
  352. my @dec;
  353. my $low = 0;
  354. my $high = MAX;
  355. my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS);
  356. my @table;
  357. foreach my $i (sort { $a <=> $b } keys %$freq) {
  358. foreach my $j ($cf->[$i] .. $cf->[$i + 1] - 1) {
  359. $table[$j] = $i;
  360. }
  361. }
  362. my $EOF_SYMBOL = max(keys %$freq) // 0;
  363. while (1) {
  364. my $w = $high - $low + 1;
  365. my $ss = int((($T * ($enc - $low + 1)) - 1) / $w);
  366. my $i = $table[$ss] // last;
  367. last if ($i == $EOF_SYMBOL);
  368. push @dec, $i;
  369. $high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX;
  370. $low = ($low + int(($w * $cf->[$i]) / $T)) & MAX;
  371. if ($high > MAX) {
  372. die "error";
  373. }
  374. if ($low >= $high) { die "$low >= $high" }
  375. while (1) {
  376. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  377. ($high <<= 1) |= 1;
  378. $low <<= 1;
  379. ($enc <<= 1) |= (getc($fh) // 1);
  380. }
  381. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  382. ($high <<= 1) |= (1 << (BITS - 1));
  383. $high |= 1;
  384. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  385. $enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1);
  386. }
  387. else {
  388. last;
  389. }
  390. $low &= MAX;
  391. $high &= MAX;
  392. $enc &= MAX;
  393. }
  394. }
  395. return \@dec;
  396. }
  397. sub create_ac_entry ($bytes, $out_fh) {
  398. my ($enc, $freq) = ac_encode($bytes);
  399. my $max_symbol = max(keys %$freq) // 0;
  400. my @freqs;
  401. foreach my $k (0 .. $max_symbol) {
  402. push @freqs, $freq->{$k} // 0;
  403. }
  404. push @freqs, length($enc) >> 3;
  405. say "Max symbol: $max_symbol";
  406. print $out_fh delta_encode(\@freqs);
  407. print $out_fh pack("B*", $enc);
  408. }
  409. sub decode_ac_entry ($fh) {
  410. my @freqs = @{delta_decode($fh)};
  411. my $bits_len = pop(@freqs);
  412. my %freq;
  413. foreach my $i (0 .. $#freqs) {
  414. if ($freqs[$i]) {
  415. $freq{$i} = $freqs[$i];
  416. }
  417. }
  418. say "Encoded length: $bits_len";
  419. my $bits = read_bits($fh, $bits_len << 3);
  420. if ($bits_len > 0) {
  421. open my $bits_fh, '<:raw', \$bits;
  422. return ac_decode($bits_fh, \%freq);
  423. }
  424. return [];
  425. }
  426. sub deflate_encode ($literals, $distances, $lengths, $out_fh) {
  427. my @len_symbols;
  428. my @dist_symbols;
  429. my $offset_bits = '';
  430. foreach my $j (0 .. $#{$literals}) {
  431. if ($lengths->[$j] == 0) {
  432. push @len_symbols, $literals->[$j];
  433. next;
  434. }
  435. my $len = $lengths->[$j];
  436. my $dist = $distances->[$j];
  437. {
  438. my $len_idx = $LENGTH_INDICES[$len];
  439. my ($min, $bits) = @{$LENGTH_SYMBOLS[$len_idx]};
  440. push @len_symbols, $len_idx + 256;
  441. if ($bits > 0) {
  442. $offset_bits .= sprintf('%0*b', $bits, $len - $min);
  443. }
  444. }
  445. {
  446. my $dist_idx = $DISTANCE_INDICES[$dist];
  447. my ($min, $bits) = @{$DISTANCE_SYMBOLS[$dist_idx]};
  448. push @dist_symbols, $dist_idx;
  449. if ($bits > 0) {
  450. $offset_bits .= sprintf('%0*b', $bits, $dist - $min);
  451. }
  452. }
  453. }
  454. create_ac_entry(\@len_symbols, $out_fh);
  455. create_ac_entry(\@dist_symbols, $out_fh);
  456. print $out_fh pack('B*', $offset_bits);
  457. }
  458. sub deflate_decode ($fh) {
  459. my $len_symbols = decode_ac_entry($fh);
  460. my $dist_symbols = decode_ac_entry($fh);
  461. my $bits_len = 0;
  462. foreach my $i (@$dist_symbols) {
  463. $bits_len += $DISTANCE_SYMBOLS[$i][1];
  464. }
  465. foreach my $i (@$len_symbols) {
  466. if ($i >= 256) {
  467. $bits_len += $LENGTH_SYMBOLS[$i - 256][1];
  468. }
  469. }
  470. my $bits = read_bits($fh, $bits_len);
  471. my @literals;
  472. my @lengths;
  473. my @distances;
  474. my $j = 0;
  475. foreach my $i (@$len_symbols) {
  476. if ($i >= 256) {
  477. my $dist = $dist_symbols->[$j++];
  478. push @literals, undef;
  479. push @lengths, $LENGTH_SYMBOLS[$i - 256][0] + oct('0b' . substr($bits, 0, $LENGTH_SYMBOLS[$i - 256][1], ''));
  480. push @distances, $DISTANCE_SYMBOLS[$dist][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS[$dist][1], ''));
  481. }
  482. else {
  483. push @literals, $i;
  484. push @lengths, 0;
  485. push @distances, 0;
  486. }
  487. }
  488. return (\@literals, \@distances, \@lengths);
  489. }
  490. sub mtf_encode ($bytes, $alphabet = [0 .. 255]) {
  491. my @C;
  492. my @table;
  493. @table[@$alphabet] = (0 .. $#{$alphabet});
  494. foreach my $c (@$bytes) {
  495. push @C, (my $index = $table[$c]);
  496. unshift(@$alphabet, splice(@$alphabet, $index, 1));
  497. @table[@{$alphabet}[0 .. $index]] = (0 .. $index);
  498. }
  499. return \@C;
  500. }
  501. sub mtf_decode ($encoded, $alphabet = [0 .. 255]) {
  502. my @S;
  503. foreach my $p (@$encoded) {
  504. push @S, $alphabet->[$p];
  505. unshift(@$alphabet, splice(@$alphabet, $p, 1));
  506. }
  507. return \@S;
  508. }
  509. sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
  510. #<<<
  511. [
  512. map { $_->[1] } sort {
  513. ($a->[0] cmp $b->[0])
  514. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  515. }
  516. map {
  517. my $t = substr($s, $_, LOOKAHEAD_LEN);
  518. if (length($t) < LOOKAHEAD_LEN) {
  519. $t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
  520. }
  521. [$t, $_]
  522. } 0 .. length($s) - 1
  523. ];
  524. #>>>
  525. }
  526. sub bwt_encode ($s) {
  527. my $bwt = bwt_balanced($s);
  528. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  529. my $idx = 0;
  530. foreach my $i (@$bwt) {
  531. $i || last;
  532. ++$idx;
  533. }
  534. return ($ret, $idx);
  535. }
  536. sub bwt_decode ($bwt, $idx) { # fast inversion
  537. my @tail = split(//, $bwt);
  538. my @head = sort @tail;
  539. my %indices;
  540. foreach my $i (0 .. $#tail) {
  541. push @{$indices{$tail[$i]}}, $i;
  542. }
  543. my @table;
  544. foreach my $v (@head) {
  545. push @table, shift(@{$indices{$v}});
  546. }
  547. my $dec = '';
  548. my $i = $idx;
  549. for (1 .. scalar(@head)) {
  550. $dec .= $head[$i];
  551. $i = $table[$i];
  552. }
  553. return $dec;
  554. }
  555. sub rle4_encode ($bytes) { # RLE1
  556. my @rle;
  557. my $end = $#{$bytes};
  558. my $prev = -1;
  559. my $run = 0;
  560. for (my $i = 0 ; $i <= $end ; ++$i) {
  561. if ($bytes->[$i] == $prev) {
  562. ++$run;
  563. }
  564. else {
  565. $run = 1;
  566. }
  567. push @rle, $bytes->[$i];
  568. $prev = $bytes->[$i];
  569. if ($run >= 4) {
  570. $run = 0;
  571. $i += 1;
  572. while ($run < 254 and $i <= $end and $bytes->[$i] == $prev) {
  573. ++$run;
  574. ++$i;
  575. }
  576. push @rle, $run;
  577. $run = 1;
  578. if ($i <= $end) {
  579. $prev = $bytes->[$i];
  580. push @rle, $bytes->[$i];
  581. }
  582. }
  583. }
  584. return \@rle;
  585. }
  586. sub rle4_decode ($bytes) { # RLE1
  587. my @dec = $bytes->[0];
  588. my $end = $#{$bytes};
  589. my $prev = $bytes->[0];
  590. my $run = 1;
  591. for (my $i = 1 ; $i <= $end ; ++$i) {
  592. if ($bytes->[$i] == $prev) {
  593. ++$run;
  594. }
  595. else {
  596. $run = 1;
  597. }
  598. push @dec, $bytes->[$i];
  599. $prev = $bytes->[$i];
  600. if ($run >= 4) {
  601. if (++$i <= $end) {
  602. $run = $bytes->[$i];
  603. push @dec, (($prev) x $run);
  604. }
  605. $run = 0;
  606. }
  607. }
  608. return \@dec;
  609. }
  610. sub rle_encode ($bytes) { # RLE2
  611. my @rle;
  612. my $end = $#{$bytes};
  613. for (my $i = 0 ; $i <= $end ; ++$i) {
  614. my $run = 0;
  615. while ($i <= $end and $bytes->[$i] == 0) {
  616. ++$run;
  617. ++$i;
  618. }
  619. if ($run >= 1) {
  620. my $t = sprintf('%b', $run + 1);
  621. push @rle, split(//, substr($t, 1));
  622. }
  623. if ($i <= $end) {
  624. push @rle, $bytes->[$i] + 1;
  625. }
  626. }
  627. return \@rle;
  628. }
  629. sub rle_decode ($rle) { # RLE2
  630. my @dec;
  631. my $end = $#{$rle};
  632. for (my $i = 0 ; $i <= $end ; ++$i) {
  633. my $k = $rle->[$i];
  634. if ($k == 0 or $k == 1) {
  635. my $run = 1;
  636. while (($i <= $end) and ($k == 0 or $k == 1)) {
  637. ($run <<= 1) |= $k;
  638. $k = $rle->[++$i];
  639. }
  640. push @dec, (0) x ($run - 1);
  641. }
  642. if ($i <= $end) {
  643. push @dec, $k - 1;
  644. }
  645. }
  646. return \@dec;
  647. }
  648. sub encode_alphabet ($alphabet) {
  649. my %table;
  650. @table{@$alphabet} = ();
  651. my $populated = 0;
  652. my @marked;
  653. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  654. my $enc = 0;
  655. foreach my $j (0 .. 31) {
  656. if (exists($table{$i + $j})) {
  657. $enc |= 1 << $j;
  658. }
  659. }
  660. if ($enc == 0) {
  661. $populated <<= 1;
  662. }
  663. else {
  664. ($populated <<= 1) |= 1;
  665. push @marked, $enc;
  666. }
  667. }
  668. my $delta = delta_encode([@marked], 1);
  669. say "Populated : ", sprintf('%08b', $populated);
  670. say "Marked : @marked";
  671. say "Delta len : ", length($delta);
  672. my $encoded = '';
  673. $encoded .= chr($populated);
  674. $encoded .= $delta;
  675. return $encoded;
  676. }
  677. sub decode_alphabet ($fh) {
  678. my @populated = split(//, sprintf('%08b', ord(getc($fh) // die "error")));
  679. my $marked = delta_decode($fh, 1);
  680. my @alphabet;
  681. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  682. if (shift(@populated)) {
  683. my $m = shift(@$marked);
  684. foreach my $j (0 .. 31) {
  685. if ($m & 1) {
  686. push @alphabet, $i + $j;
  687. }
  688. $m >>= 1;
  689. }
  690. }
  691. }
  692. return \@alphabet;
  693. }
  694. sub lzss_compression ($data, $out_fh) {
  695. my (@uncompressed, @indices, @lengths);
  696. lz77_compression($data, \@uncompressed, \@indices, \@lengths);
  697. my $est_ratio = length($data) / (scalar(@uncompressed) + scalar(@lengths) + 2 * scalar(@indices));
  698. say "\nEst. ratio: ", $est_ratio, " (", scalar(@uncompressed), " uncompressed bytes)";
  699. deflate_encode(\@uncompressed, \@indices, \@lengths, $out_fh);
  700. }
  701. sub lzss_decompression ($fh) {
  702. my ($uncompressed, $indices, $lengths) = deflate_decode($fh);
  703. lz77_decompression($uncompressed, $indices, $lengths);
  704. }
  705. sub compression ($chunk, $out_fh) {
  706. my @chunk_bytes = unpack('C*', $chunk);
  707. my $data = pack('C*', @{rle4_encode(\@chunk_bytes)});
  708. my ($bwt, $idx) = bwt_encode($data);
  709. my @bytes = unpack('C*', $bwt);
  710. my @alphabet = sort { $a <=> $b } uniq(@bytes);
  711. my $enc_bytes = mtf_encode(\@bytes, [@alphabet]);
  712. if (max(@$enc_bytes) < 255) {
  713. print $out_fh chr(1);
  714. $enc_bytes = rle_encode($enc_bytes);
  715. }
  716. else {
  717. print $out_fh chr(0);
  718. $enc_bytes = rle4_encode($enc_bytes);
  719. }
  720. print $out_fh pack('N', $idx);
  721. print $out_fh encode_alphabet(\@alphabet);
  722. lzss_compression(pack('C*', @$enc_bytes), $out_fh);
  723. }
  724. sub decompression ($fh, $out_fh) {
  725. my $rle_encoded = ord(getc($fh) // die "error");
  726. my $idx = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  727. my $alphabet = decode_alphabet($fh);
  728. my $dec = lzss_decompression($fh);
  729. my $bytes = [unpack('C*', $dec)];
  730. if ($rle_encoded) {
  731. $bytes = rle_decode($bytes);
  732. }
  733. else {
  734. $bytes = rle4_decode($bytes);
  735. }
  736. $bytes = mtf_decode($bytes, [@$alphabet]);
  737. print $out_fh pack('C*', @{rle4_decode([unpack('C*', bwt_decode(pack('C*', @$bytes), $idx))])});
  738. }
  739. # Compress file
  740. sub compress_file ($input, $output) {
  741. open my $fh, '<:raw', $input
  742. or die "Can't open file <<$input>> for reading: $!";
  743. my $header = SIGNATURE;
  744. # Open the output file for writing
  745. open my $out_fh, '>:raw', $output
  746. or die "Can't open file <<$output>> for write: $!";
  747. # Print the header
  748. print $out_fh $header;
  749. # Compress data
  750. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  751. compression($chunk, $out_fh);
  752. }
  753. # Close the output file
  754. close $out_fh;
  755. }
  756. # Decompress file
  757. sub decompress_file ($input, $output) {
  758. # Open and validate the input file
  759. open my $fh, '<:raw', $input
  760. or die "Can't open file <<$input>> for reading: $!";
  761. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  762. # Open the output file
  763. open my $out_fh, '>:raw', $output
  764. or die "Can't open file <<$output>> for writing: $!";
  765. while (!eof($fh)) {
  766. decompression($fh, $out_fh);
  767. }
  768. # Close the output file
  769. close $out_fh;
  770. }
  771. main();
  772. exit(0);