mrh_file_compression.pl 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 14 June 2023
  4. # Edit: 15 August 2023
  5. # https://github.com/trizen
  6. # Compress/decompress files using Move-to-Front Transform + Run-length encoding + Huffman coding.
  7. use 5.036;
  8. use Getopt::Std qw(getopts);
  9. use File::Basename qw(basename);
  10. use List::Util qw(max uniq);
  11. use constant {
  12. PKGNAME => 'MRH',
  13. VERSION => '0.03',
  14. FORMAT => 'mrh',
  15. CHUNK_SIZE => 1 << 16,
  16. };
  17. # Container signature
  18. use constant SIGNATURE => uc(FORMAT) . chr(3);
  19. sub usage {
  20. my ($code) = @_;
  21. print <<"EOH";
  22. usage: $0 [options] [input file] [output file]
  23. options:
  24. -e : extract
  25. -i <filename> : input filename
  26. -o <filename> : output filename
  27. -r : rewrite output
  28. -v : version number
  29. -h : this message
  30. examples:
  31. $0 document.txt
  32. $0 document.txt archive.${\FORMAT}
  33. $0 archive.${\FORMAT} document.txt
  34. $0 -e -i archive.${\FORMAT} -o document.txt
  35. EOH
  36. exit($code // 0);
  37. }
  38. sub version {
  39. printf("%s %s\n", PKGNAME, VERSION);
  40. exit;
  41. }
  42. sub valid_archive {
  43. my ($fh) = @_;
  44. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  45. $sig eq SIGNATURE || return;
  46. }
  47. return 1;
  48. }
  49. sub main {
  50. my %opt;
  51. getopts('ei:o:vhr', \%opt);
  52. $opt{h} && usage(0);
  53. $opt{v} && version();
  54. my ($input, $output) = @ARGV;
  55. $input //= $opt{i} // usage(2);
  56. $output //= $opt{o};
  57. my $ext = qr{\.${\FORMAT}\z}io;
  58. if ($opt{e} || $input =~ $ext) {
  59. if (not defined $output) {
  60. ($output = basename($input)) =~ s{$ext}{}
  61. || die "$0: no output file specified!\n";
  62. }
  63. if (not $opt{r} and -e $output) {
  64. print "'$output' already exists! -- Replace? [y/N] ";
  65. <STDIN> =~ /^y/i || exit 17;
  66. }
  67. decompress_file($input, $output)
  68. || die "$0: error: decompression failed!\n";
  69. }
  70. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  71. $output //= basename($input) . '.' . FORMAT;
  72. compress_file($input, $output)
  73. || die "$0: error: compression failed!\n";
  74. }
  75. else {
  76. warn "$0: don't know what to do...\n";
  77. usage(1);
  78. }
  79. }
  80. sub mtf_encode ($bytes, $alphabet = [0 .. 255]) {
  81. my @C;
  82. my @table;
  83. @table[@$alphabet] = (0 .. $#{$alphabet});
  84. foreach my $c (@$bytes) {
  85. push @C, (my $index = $table[$c]);
  86. unshift(@$alphabet, splice(@$alphabet, $index, 1));
  87. @table[@{$alphabet}[0 .. $index]] = (0 .. $index);
  88. }
  89. return \@C;
  90. }
  91. sub mtf_decode ($encoded, $alphabet = [0 .. 255]) {
  92. my @S;
  93. foreach my $p (@$encoded) {
  94. push @S, $alphabet->[$p];
  95. unshift(@$alphabet, splice(@$alphabet, $p, 1));
  96. }
  97. return \@S;
  98. }
  99. sub read_bit ($fh, $bitstring) {
  100. if (($$bitstring // '') eq '') {
  101. $$bitstring = unpack('b*', getc($fh) // return undef);
  102. }
  103. chop($$bitstring);
  104. }
  105. sub read_bits ($fh, $bits_len) {
  106. my $data = '';
  107. read($fh, $data, $bits_len >> 3);
  108. $data = unpack('B*', $data);
  109. while (length($data) < $bits_len) {
  110. $data .= unpack('B*', getc($fh) // return undef);
  111. }
  112. if (length($data) > $bits_len) {
  113. $data = substr($data, 0, $bits_len);
  114. }
  115. return $data;
  116. }
  117. sub delta_encode ($integers, $double = 0) {
  118. my @deltas;
  119. my $prev = 0;
  120. unshift(@$integers, scalar(@$integers));
  121. while (@$integers) {
  122. my $curr = shift(@$integers);
  123. push @deltas, $curr - $prev;
  124. $prev = $curr;
  125. }
  126. my $bitstring = '';
  127. foreach my $d (@deltas) {
  128. if ($d == 0) {
  129. $bitstring .= '0';
  130. }
  131. elsif ($double) {
  132. my $t = sprintf('%b', abs($d) + 1);
  133. my $l = sprintf('%b', length($t));
  134. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  135. }
  136. else {
  137. my $t = sprintf('%b', abs($d));
  138. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  139. }
  140. }
  141. pack('B*', $bitstring);
  142. }
  143. sub delta_decode ($fh, $double = 0) {
  144. my @deltas;
  145. my $buffer = '';
  146. my $len = 0;
  147. for (my $k = 0 ; $k <= $len ; ++$k) {
  148. my $bit = read_bit($fh, \$buffer);
  149. if ($bit eq '0') {
  150. push @deltas, 0;
  151. }
  152. elsif ($double) {
  153. my $bit = read_bit($fh, \$buffer);
  154. my $bl = 0;
  155. ++$bl while (read_bit($fh, \$buffer) eq '1');
  156. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  157. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  158. push @deltas, ($bit eq '1' ? 1 : -1) * ($int - 1);
  159. }
  160. else {
  161. my $bit = read_bit($fh, \$buffer);
  162. my $n = 0;
  163. ++$n while (read_bit($fh, \$buffer) eq '1');
  164. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  165. push @deltas, ($bit eq '1' ? $d : -$d);
  166. }
  167. if ($k == 0) {
  168. $len = pop(@deltas);
  169. }
  170. }
  171. my @acc;
  172. my $prev = $len;
  173. foreach my $d (@deltas) {
  174. $prev += $d;
  175. push @acc, $prev;
  176. }
  177. return \@acc;
  178. }
  179. # produce encode and decode dictionary from a tree
  180. sub walk ($node, $code, $h, $rev_h) {
  181. my $c = $node->[0] // return ($h, $rev_h);
  182. if (ref $c) { walk($c->[$_], $code . $_, $h, $rev_h) for ('0', '1') }
  183. else { $h->{$c} = $code; $rev_h->{$code} = $c }
  184. return ($h, $rev_h);
  185. }
  186. # make a tree, and return resulting dictionaries
  187. sub mktree_from_freq ($freq) {
  188. my @nodes = map { [$_, $freq->{$_}] } sort { $a <=> $b } keys %$freq;
  189. do { # poor man's priority queue
  190. @nodes = sort { $a->[1] <=> $b->[1] } @nodes;
  191. my ($x, $y) = splice(@nodes, 0, 2);
  192. if (defined($x)) {
  193. if (defined($y)) {
  194. push @nodes, [[$x, $y], $x->[1] + $y->[1]];
  195. }
  196. else {
  197. push @nodes, [[$x], $x->[1]];
  198. }
  199. }
  200. } while (@nodes > 1);
  201. walk($nodes[0], '', {}, {});
  202. }
  203. sub huffman_encode ($bytes, $dict) {
  204. join('', @{$dict}{@$bytes});
  205. }
  206. sub huffman_decode ($bits, $hash) {
  207. local $" = '|';
  208. [split(' ', $bits =~ s/(@{[sort { length($a) <=> length($b) } keys %{$hash}]})/$hash->{$1} /gr)]; # very fast
  209. }
  210. sub create_huffman_entry ($bytes, $out_fh) {
  211. my %freq;
  212. ++$freq{$_} for @$bytes;
  213. my ($h, $rev_h) = mktree_from_freq(\%freq);
  214. my $enc = huffman_encode($bytes, $h);
  215. my $max_symbol = max(keys %freq) // 0;
  216. say "Max symbol: $max_symbol\n";
  217. my @freqs;
  218. foreach my $i (0 .. $max_symbol) {
  219. push @freqs, $freq{$i} // 0;
  220. }
  221. print $out_fh delta_encode(\@freqs);
  222. print $out_fh pack("N", length($enc));
  223. print $out_fh pack("B*", $enc);
  224. }
  225. sub decode_huffman_entry ($fh) {
  226. my @freqs = @{delta_decode($fh)};
  227. my %freq;
  228. foreach my $i (0 .. $#freqs) {
  229. if ($freqs[$i]) {
  230. $freq{$i} = $freqs[$i];
  231. }
  232. }
  233. my (undef, $rev_dict) = mktree_from_freq(\%freq);
  234. my $enc_len = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  235. say "Encoded length: $enc_len\n";
  236. if ($enc_len > 0) {
  237. return huffman_decode(read_bits($fh, $enc_len), $rev_dict);
  238. }
  239. return [];
  240. }
  241. sub rle4_encode ($bytes) { # RLE1
  242. my @rle;
  243. my $end = $#{$bytes};
  244. my $prev = -1;
  245. my $run = 0;
  246. for (my $i = 0 ; $i <= $end ; ++$i) {
  247. if ($bytes->[$i] == $prev) {
  248. ++$run;
  249. }
  250. else {
  251. $run = 1;
  252. }
  253. push @rle, $bytes->[$i];
  254. $prev = $bytes->[$i];
  255. if ($run >= 4) {
  256. $run = 0;
  257. $i += 1;
  258. while ($run < 254 and $i <= $end and $bytes->[$i] == $prev) {
  259. ++$run;
  260. ++$i;
  261. }
  262. push @rle, $run;
  263. $run = 1;
  264. if ($i <= $end) {
  265. $prev = $bytes->[$i];
  266. push @rle, $bytes->[$i];
  267. }
  268. }
  269. }
  270. return \@rle;
  271. }
  272. sub rle4_decode ($bytes) { # RLE1
  273. my @dec = $bytes->[0];
  274. my $end = $#{$bytes};
  275. my $prev = $bytes->[0];
  276. my $run = 1;
  277. for (my $i = 1 ; $i <= $end ; ++$i) {
  278. if ($bytes->[$i] == $prev) {
  279. ++$run;
  280. }
  281. else {
  282. $run = 1;
  283. }
  284. push @dec, $bytes->[$i];
  285. $prev = $bytes->[$i];
  286. if ($run >= 4) {
  287. if (++$i <= $end) {
  288. $run = $bytes->[$i];
  289. push @dec, (($prev) x $run);
  290. }
  291. $run = 0;
  292. }
  293. }
  294. return \@dec;
  295. }
  296. sub rle_encode ($bytes) { # RLE2
  297. my @rle;
  298. my $end = $#{$bytes};
  299. for (my $i = 0 ; $i <= $end ; ++$i) {
  300. my $run = 0;
  301. while ($i <= $end and $bytes->[$i] == 0) {
  302. ++$run;
  303. ++$i;
  304. }
  305. if ($run >= 1) {
  306. my $t = sprintf('%b', $run + 1);
  307. push @rle, split(//, substr($t, 1));
  308. }
  309. if ($i <= $end) {
  310. push @rle, $bytes->[$i] + 1;
  311. }
  312. }
  313. return \@rle;
  314. }
  315. sub rle_decode ($rle) { # RLE2
  316. my @dec;
  317. my $end = $#{$rle};
  318. for (my $i = 0 ; $i <= $end ; ++$i) {
  319. my $k = $rle->[$i];
  320. if ($k == 0 or $k == 1) {
  321. my $run = 1;
  322. while (($i <= $end) and ($k == 0 or $k == 1)) {
  323. ($run <<= 1) |= $k;
  324. $k = $rle->[++$i];
  325. }
  326. push @dec, (0) x ($run - 1);
  327. }
  328. if ($i <= $end) {
  329. push @dec, $k - 1;
  330. }
  331. }
  332. return \@dec;
  333. }
  334. sub encode_alphabet ($alphabet) {
  335. my %table;
  336. @table{@$alphabet} = ();
  337. my $populated = 0;
  338. my @marked;
  339. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  340. my $enc = 0;
  341. foreach my $j (0 .. 31) {
  342. if (exists($table{$i + $j})) {
  343. $enc |= 1 << $j;
  344. }
  345. }
  346. if ($enc == 0) {
  347. $populated <<= 1;
  348. }
  349. else {
  350. ($populated <<= 1) |= 1;
  351. push @marked, $enc;
  352. }
  353. }
  354. my $delta = delta_encode([@marked], 1);
  355. say "Populated : ", sprintf('%08b', $populated);
  356. say "Marked : @marked";
  357. say "Delta len : ", length($delta);
  358. my $encoded = '';
  359. $encoded .= chr($populated);
  360. $encoded .= $delta;
  361. return $encoded;
  362. }
  363. sub decode_alphabet ($fh) {
  364. my @populated = split(//, sprintf('%08b', ord(getc($fh) // die "error")));
  365. my $marked = delta_decode($fh, 1);
  366. my @alphabet;
  367. for (my $i = 0 ; $i <= 255 ; $i += 32) {
  368. if (shift(@populated)) {
  369. my $m = shift(@$marked);
  370. foreach my $j (0 .. 31) {
  371. if ($m & 1) {
  372. push @alphabet, $i + $j;
  373. }
  374. $m >>= 1;
  375. }
  376. }
  377. }
  378. return \@alphabet;
  379. }
  380. sub compression ($chunk, $out_fh) {
  381. my $bytes = [unpack('C*', $chunk)];
  382. my @alphabet = sort { $a <=> $b } uniq(@$bytes);
  383. my $alphabet_enc = encode_alphabet(\@alphabet);
  384. $bytes = mtf_encode($bytes, [@alphabet]);
  385. $bytes = rle_encode($bytes);
  386. $bytes = rle4_encode($bytes);
  387. print $out_fh $alphabet_enc;
  388. create_huffman_entry($bytes, $out_fh);
  389. }
  390. sub decompression ($fh, $out_fh) {
  391. my $alphabet = decode_alphabet($fh);
  392. say "Alphabet size: ", scalar(@$alphabet);
  393. my $bytes = decode_huffman_entry($fh);
  394. $bytes = rle4_decode($bytes);
  395. $bytes = rle_decode($bytes);
  396. $bytes = mtf_decode($bytes, [@$alphabet]);
  397. print $out_fh pack('C*', @$bytes);
  398. }
  399. # Compress file
  400. sub compress_file ($input, $output) {
  401. open my $fh, '<:raw', $input
  402. or die "Can't open file <<$input>> for reading: $!";
  403. my $header = SIGNATURE;
  404. # Open the output file for writing
  405. open my $out_fh, '>:raw', $output
  406. or die "Can't open file <<$output>> for write: $!";
  407. # Print the header
  408. print $out_fh $header;
  409. # Compress data
  410. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  411. compression($chunk, $out_fh);
  412. }
  413. # Close the file
  414. close $out_fh;
  415. }
  416. # Decompress file
  417. sub decompress_file ($input, $output) {
  418. # Open and validate the input file
  419. open my $fh, '<:raw', $input
  420. or die "Can't open file <<$input>> for reading: $!";
  421. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  422. # Open the output file
  423. open my $out_fh, '>:raw', $output
  424. or die "Can't open file <<$output>> for writing: $!";
  425. while (!eof($fh)) {
  426. decompression($fh, $out_fh);
  427. }
  428. # Close the file
  429. close $fh;
  430. close $out_fh;
  431. }
  432. main();
  433. exit(0);