compress.pl 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 05 May 2023
  4. # https://github.com/trizen
  5. # A basic implementation of the UNIX `compress` tool, creating a .Z compressed file, using LZW compression.
  6. # This implementation reads from STDIN and outputs to STDOUT:
  7. # perl compress.pl < input.txt > output.Z
  8. # Reference:
  9. # Data Compression (Summer 2023) - Lecture 4 - The Unix 'compress' Program
  10. # https://youtube.com/watch?v=1cJL9Va80Pk
  11. # See also:
  12. # https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  13. use 5.036;
  14. use constant {
  15. BUFFER_SIZE => 8 * 512, # must be a multiple of 8
  16. MAGIC_SIGNATURE => "\x1f\x9d\x90",
  17. };
  18. sub compress ($in_fh, $out_fh) {
  19. binmode($in_fh, ':raw');
  20. binmode($out_fh, ':raw');
  21. print {$out_fh} MAGIC_SIGNATURE;
  22. my $dict_size = 256;
  23. my %dictionary = (map { (chr($_), $_) } 0 .. $dict_size - 1);
  24. ++$dict_size; # 256 is the 'RESET' marker
  25. my $num_bits = 9;
  26. my $max_bits = 16;
  27. my $max_bits_size = (1 << $num_bits);
  28. my $max_dict_size = (1 << $max_bits);
  29. my $bitstream = '';
  30. my $bitstream_size = 0;
  31. my sub output_index ($symbol) {
  32. $bitstream .= reverse(sprintf('%0*b', $num_bits, $dictionary{$symbol}));
  33. $bitstream_size += $num_bits;
  34. if ($bitstream_size % BUFFER_SIZE == 0) {
  35. print {$out_fh} pack("b*", $bitstream);
  36. $bitstream = '';
  37. $bitstream_size = 0;
  38. }
  39. }
  40. my $w = '';
  41. while (defined(my $c = getc($in_fh))) {
  42. my $wc = $w . $c;
  43. if (exists($dictionary{$wc})) {
  44. $w = $wc;
  45. }
  46. else {
  47. output_index($w);
  48. if ($dict_size < $max_dict_size) {
  49. $dictionary{$wc} = $dict_size++;
  50. if ($dict_size > $max_bits_size) {
  51. ++$num_bits;
  52. $max_bits_size <<= 1;
  53. }
  54. }
  55. $w = $c;
  56. }
  57. }
  58. if ($w ne '') {
  59. output_index($w);
  60. }
  61. if ($bitstream ne '') {
  62. print {$out_fh} pack('b*', $bitstream);
  63. }
  64. return 1;
  65. }
  66. compress(\*STDIN, \*STDOUT);