lzac_file_compression.pl 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 15 December 2022
  4. # Edit: 11 April 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using LZ77 compression + Arithmetic Coding (in fixed bits).
  7. # Encoding the distances/indices using a DEFLATE-like approach.
  8. # References:
  9. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  10. # https://youtube.com/watch?v=SJPvNi4HrWQ
  11. #
  12. # Basic arithmetic coder in C++
  13. # https://github.com/billbird/arith32
  14. use 5.036;
  15. use Getopt::Std qw(getopts);
  16. use File::Basename qw(basename);
  17. use Compression::Util qw(:all);
  18. use constant {
  19. PKGNAME => 'LZAC',
  20. VERSION => '0.02',
  21. FORMAT => 'lzac',
  22. COMPRESSED_BYTE => chr(1),
  23. UNCOMPRESSED_BYTE => chr(0),
  24. CHUNK_SIZE => 1 << 16, # higher value = better compression
  25. RANDOM_DATA_THRESHOLD => 1, # in ratio
  26. };
  27. # Container signature
  28. use constant SIGNATURE => uc(FORMAT) . chr(2);
  29. sub usage {
  30. my ($code) = @_;
  31. print <<"EOH";
  32. usage: $0 [options] [input file] [output file]
  33. options:
  34. -e : extract
  35. -i <filename> : input filename
  36. -o <filename> : output filename
  37. -r : rewrite output
  38. -v : version number
  39. -h : this message
  40. examples:
  41. $0 document.txt
  42. $0 document.txt archive.${\FORMAT}
  43. $0 archive.${\FORMAT} document.txt
  44. $0 -e -i archive.${\FORMAT} -o document.txt
  45. EOH
  46. exit($code // 0);
  47. }
  48. sub version {
  49. printf("%s %s\n", PKGNAME, VERSION);
  50. exit;
  51. }
  52. sub valid_archive {
  53. my ($fh) = @_;
  54. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  55. $sig eq SIGNATURE || return;
  56. }
  57. return 1;
  58. }
  59. sub main {
  60. my %opt;
  61. getopts('ei:o:vhr', \%opt);
  62. $opt{h} && usage(0);
  63. $opt{v} && version();
  64. my ($input, $output) = @ARGV;
  65. $input //= $opt{i} // usage(2);
  66. $output //= $opt{o};
  67. my $ext = qr{\.${\FORMAT}\z}io;
  68. if ($opt{e} || $input =~ $ext) {
  69. if (not defined $output) {
  70. ($output = basename($input)) =~ s{$ext}{}
  71. || die "$0: no output file specified!\n";
  72. }
  73. if (not $opt{r} and -e $output) {
  74. print "'$output' already exists! -- Replace? [y/N] ";
  75. <STDIN> =~ /^y/i || exit 17;
  76. }
  77. decompress_file($input, $output)
  78. || die "$0: error: decompression failed!\n";
  79. }
  80. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  81. $output //= basename($input) . '.' . FORMAT;
  82. compress_file($input, $output)
  83. || die "$0: error: compression failed!\n";
  84. }
  85. else {
  86. warn "$0: don't know what to do...\n";
  87. usage(1);
  88. }
  89. }
  90. # Compress file
  91. sub compress_file ($input, $output) {
  92. open my $fh, '<:raw', $input
  93. or die "Can't open file <<$input>> for reading: $!";
  94. my $header = SIGNATURE;
  95. # Open the output file for writing
  96. open my $out_fh, '>:raw', $output
  97. or die "Can't open file <<$output>> for write: $!";
  98. # Print the header
  99. print $out_fh $header;
  100. # Compress data
  101. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  102. my ($uncompressed, $distances, $lengths) = lz77_encode($chunk);
  103. my $est_ratio = length($chunk) / (4 * scalar(@$uncompressed));
  104. say(scalar(@$uncompressed), ' -> ', $est_ratio);
  105. if ($est_ratio > RANDOM_DATA_THRESHOLD) {
  106. print $out_fh COMPRESSED_BYTE;
  107. print $out_fh create_ac_entry($uncompressed);
  108. print $out_fh create_ac_entry($lengths);
  109. print $out_fh obh_encode($distances, \&create_ac_entry);
  110. }
  111. else {
  112. print $out_fh UNCOMPRESSED_BYTE;
  113. print $out_fh create_ac_entry([unpack('C*', $chunk)]);
  114. }
  115. }
  116. # Close the file
  117. close $out_fh;
  118. }
  119. # Decompress file
  120. sub decompress_file ($input, $output) {
  121. # Open and validate the input file
  122. open my $fh, '<:raw', $input
  123. or die "Can't open file <<$input>> for reading: $!";
  124. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  125. # Open the output file
  126. open my $out_fh, '>:raw', $output
  127. or die "Can't open file <<$output>> for writing: $!";
  128. while (!eof($fh)) {
  129. my $compression_byte = getc($fh) // die "decompression error";
  130. if ($compression_byte eq COMPRESSED_BYTE) {
  131. my $uncompressed = decode_ac_entry($fh);
  132. my $lengths = decode_ac_entry($fh);
  133. my $distances = obh_decode($fh, \&decode_ac_entry);
  134. print $out_fh lz77_decode($uncompressed, $distances, $lengths);
  135. }
  136. elsif ($compression_byte eq UNCOMPRESSED_BYTE) {
  137. print $out_fh pack('C*', @{decode_ac_entry($fh)});
  138. }
  139. else {
  140. die "Invalid compression...";
  141. }
  142. }
  143. # Close the file
  144. close $fh;
  145. close $out_fh;
  146. }
  147. main();
  148. exit(0);