gzip_block_type_2_simple.pl 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 13 January 2024
  4. # Edit: 09 April 2024
  5. # https://github.com/trizen
  6. # Create a valid Gzip container, using DEFLATE's Block Type 2: LZSS + dynamic prefix codes.
  7. # Reference:
  8. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  9. # https://youtube.com/watch?v=SJPvNi4HrWQ
  10. use 5.036;
  11. use Digest::CRC qw();
  12. use File::Basename qw(basename);
  13. use Compression::Util qw(:all);
  14. use List::Util qw(uniq);
  15. use constant {
  16. WINDOW_SIZE => 32_768, # 2^15
  17. };
  18. my $MAGIC = pack('C*', 0x1f, 0x8b); # magic MIME type
  19. my $CM = chr(0x08); # 0x08 = DEFLATE
  20. my $FLAGS = chr(0x00); # flags
  21. my $MTIME = pack('C*', (0x00) x 4); # modification time
  22. my $XFLAGS = chr(0x00); # extra flags
  23. my $OS = chr(0x03); # 0x03 = Unix
  24. my $input = $ARGV[0] // die "usage: $0 [input] [output.gz]\n";
  25. my $output = $ARGV[1] // (basename($input) . '.gz');
  26. open my $in_fh, '<:raw', $input
  27. or die "Can't open file <<$input>> for reading: $!";
  28. open my $out_fh, '>:raw', $output
  29. or die "Can't open file <<$output>> for writing: $!";
  30. print $out_fh $MAGIC, $CM, $FLAGS, $MTIME, $XFLAGS, $OS;
  31. my $total_length = 0;
  32. my $crc32 = Digest::CRC->new(type => "crc32");
  33. my $bitstring = '';
  34. my $block_type = '01'; # 00 = store; 10 = LZSS + Fixed codes; 01 = LZSS + Dynamic codes
  35. my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
  36. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables(WINDOW_SIZE);
  37. if (eof($in_fh)) { # empty file
  38. $bitstring = '1' . '10' . '0000000';
  39. }
  40. while (read($in_fh, (my $chunk), WINDOW_SIZE)) {
  41. my $chunk_len = length($chunk);
  42. my $is_last = eof($in_fh) ? '1' : '0';
  43. my $block_header = join('', $is_last, $block_type);
  44. my ($literals, $distances, $lengths) = lzss_encode($chunk);
  45. my @len_symbols;
  46. my @dist_symbols;
  47. my $offset_bits = '';
  48. foreach my $k (0 .. $#$literals) {
  49. if ($lengths->[$k] == 0) {
  50. push @len_symbols, $literals->[$k];
  51. next;
  52. }
  53. my $len = $lengths->[$k];
  54. my $dist = $distances->[$k];
  55. {
  56. my $len_idx = $LENGTH_INDICES->[$len];
  57. my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]};
  58. push @len_symbols, [$len_idx + 256 - 1, $bits];
  59. $offset_bits .= int2bits_lsb($len - $min, $bits) if ($bits > 0);
  60. }
  61. {
  62. my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  63. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]};
  64. push @dist_symbols, [$dist_idx - 1, $bits];
  65. $offset_bits .= int2bits_lsb($dist - $min, $bits) if ($bits > 0);
  66. }
  67. }
  68. push @len_symbols, 256; # end-of-block marker
  69. my ($dict) = huffman_from_symbols([map { ref($_) eq 'ARRAY' ? $_->[0] : $_ } @len_symbols]);
  70. my ($dist_dict) = huffman_from_symbols([map { $_->[0] } @dist_symbols]);
  71. my @LL_code_lengths;
  72. foreach my $symbol (0 .. 285) {
  73. if (exists($dict->{$symbol})) {
  74. push @LL_code_lengths, length($dict->{$symbol});
  75. }
  76. else {
  77. push @LL_code_lengths, 0;
  78. }
  79. }
  80. while (scalar(@LL_code_lengths) > 1 and $LL_code_lengths[-1] == 0) {
  81. pop @LL_code_lengths;
  82. }
  83. my @distance_code_lengths;
  84. foreach my $symbol (0 .. 29) {
  85. if (exists($dist_dict->{$symbol})) {
  86. push @distance_code_lengths, length($dist_dict->{$symbol});
  87. }
  88. else {
  89. push @distance_code_lengths, 0;
  90. }
  91. }
  92. while (scalar(@distance_code_lengths) > 1 and $distance_code_lengths[-1] == 0) {
  93. pop @distance_code_lengths;
  94. }
  95. my @CL_code = uniq(@LL_code_lengths, @distance_code_lengths);
  96. my ($cl_dict) = huffman_from_symbols(\@CL_code);
  97. my @CL_code_lenghts;
  98. foreach my $symbol (0 .. 18) {
  99. if (exists($cl_dict->{$symbol})) {
  100. push @CL_code_lenghts, length($cl_dict->{$symbol});
  101. }
  102. else {
  103. push @CL_code_lenghts, 0;
  104. }
  105. }
  106. # Put the CL codes in the required order
  107. @CL_code_lenghts = @CL_code_lenghts[@CL_order];
  108. while (scalar(@CL_code_lenghts) > 4 and $CL_code_lenghts[-1] == 0) {
  109. pop @CL_code_lenghts;
  110. }
  111. my $CL_code_lengths_bitstring = join('', map { int2bits_lsb($_, 3) } @CL_code_lenghts);
  112. my $LL_code_lengths_bitstring = join('', @{$cl_dict}{@LL_code_lengths});
  113. my $distance_code_lengths_bitstring = join('', @{$cl_dict}{@distance_code_lengths});
  114. # (5 bits) HLIT = (number of LL code entries present) - 257
  115. my $HLIT = scalar(@LL_code_lengths) - 257;
  116. # (5 bits) HDIST = (number of distance code entries present) - 1
  117. my $HDIST = scalar(@distance_code_lengths) - 1;
  118. # (4 bits) HCLEN = (number of CL code entries present) - 4
  119. my $HCLEN = scalar(@CL_code_lenghts) - 4;
  120. $block_header .= int2bits_lsb($HLIT, 5);
  121. $block_header .= int2bits_lsb($HDIST, 5);
  122. $block_header .= int2bits_lsb($HCLEN, 4);
  123. $block_header .= $CL_code_lengths_bitstring;
  124. $block_header .= $LL_code_lengths_bitstring;
  125. $block_header .= $distance_code_lengths_bitstring;
  126. $bitstring .= $block_header;
  127. foreach my $symbol (@len_symbols) {
  128. if (ref($symbol) eq 'ARRAY') {
  129. my ($len, $len_offset) = @$symbol;
  130. $bitstring .= $dict->{$len};
  131. $bitstring .= substr($offset_bits, 0, $len_offset, '') if ($len_offset > 0);
  132. my ($dist, $dist_offset) = @{shift(@dist_symbols)};
  133. $bitstring .= $dist_dict->{$dist};
  134. $bitstring .= substr($offset_bits, 0, $dist_offset, '') if ($dist_offset > 0);
  135. }
  136. else {
  137. $bitstring .= $dict->{$symbol};
  138. }
  139. }
  140. print $out_fh pack('b*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), ''));
  141. $crc32->add($chunk);
  142. $total_length += $chunk_len;
  143. }
  144. if ($bitstring ne '') {
  145. print $out_fh pack('b*', $bitstring);
  146. }
  147. print $out_fh pack('b*', int2bits_lsb($crc32->digest, 32));
  148. print $out_fh pack('b*', int2bits_lsb($total_length, 32));
  149. close $in_fh;
  150. close $out_fh;