lzss_encoding_hash_table.pl 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 02 May 2024
  4. # https://github.com/trizen
  5. # Implementation of LZSS encoding, using an hash table.
  6. use 5.036;
  7. sub lzss_encode ($str) {
  8. my $la = 0;
  9. my @chars = split(//, $str);
  10. my $end = $#chars;
  11. my $min_len = 4; # minimum match length
  12. my $max_len = 255; # maximum match length
  13. my $max_dist = (1 << 16) - 1; # maximum match distance
  14. my $max_chain_len = 16; # how many recent positions to keep track of
  15. my (@literals, @distances, @lengths, %table);
  16. while ($la <= $end) {
  17. my $best_n = 1;
  18. my $best_p = $la;
  19. my $lookahead = substr($str, $la, $min_len);
  20. if (exists($table{$lookahead})) {
  21. foreach my $p (@{$table{$lookahead}}) {
  22. if ($la - $p > $max_dist) {
  23. last;
  24. }
  25. my $n = $min_len;
  26. while ($n <= $max_len and $la + $n <= $end and $chars[$la + $n - 1] eq $chars[$p + $n - 1]) {
  27. ++$n;
  28. }
  29. if ($n > $best_n) {
  30. $best_p = $p;
  31. $best_n = $n;
  32. }
  33. }
  34. my $matched = substr($str, $la, $best_n);
  35. foreach my $i (0 .. length($matched) - $min_len) {
  36. my $key = substr($matched, $i, $min_len);
  37. unshift @{$table{$key}}, $la + $i;
  38. if (scalar(@{$table{$key}}) > $max_chain_len) {
  39. pop @{$table{$key}};
  40. }
  41. }
  42. }
  43. else {
  44. $table{$lookahead} = [$la];
  45. }
  46. if ($best_n > $min_len) {
  47. push @lengths, $best_n - 1;
  48. push @distances, $la - $best_p;
  49. push @literals, undef;
  50. $la += $best_n - 1;
  51. }
  52. else {
  53. push @lengths, (0) x $best_n;
  54. push @distances, (0) x $best_n;
  55. push @literals, @chars[$best_p .. $best_p + $best_n - 1];
  56. $la += $best_n;
  57. }
  58. }
  59. return (\@literals, \@distances, \@lengths);
  60. }
  61. sub lzss_decode ($literals, $distances, $lengths) {
  62. my @data;
  63. my $data_len = 0;
  64. foreach my $i (0 .. $#$lengths) {
  65. if ($lengths->[$i] == 0) {
  66. push @data, $literals->[$i];
  67. $data_len += 1;
  68. next;
  69. }
  70. my $length = $lengths->[$i];
  71. my $dist = $distances->[$i];
  72. foreach my $j (1 .. $length) {
  73. push @data, $data[$data_len + $j - $dist - 1];
  74. }
  75. $data_len += $length;
  76. }
  77. return join('', @data);
  78. }
  79. my $string = "abbaabbaabaabaaaa";
  80. my ($literals, $distances, $lengths) = lzss_encode($string);
  81. my $decoded = lzss_decode($literals, $distances, $lengths);
  82. $string eq $decoded or die "error: <<$string>> != <<$decoded>>";
  83. foreach my $i (0 .. $#$literals) {
  84. if ($lengths->[$i] == 0) {
  85. say $literals->[$i];
  86. }
  87. else {
  88. say "[$distances->[$i], $lengths->[$i]]";
  89. }
  90. }
  91. foreach my $file (__FILE__, $^X) { # several tests
  92. my $string = do {
  93. open my $fh, '<:raw', $file or die "error for <<$file>>: $!";
  94. local $/;
  95. <$fh>;
  96. };
  97. my ($literals, $distances, $lengths) = lzss_encode($string);
  98. my $decoded = lzss_decode($literals, $distances, $lengths);
  99. say "Ratio: ", scalar(@$literals) / scalar(grep { defined($_) } @$literals);
  100. $string eq $decoded or die "error: <<$string>> != <<$decoded>>";
  101. }
  102. __END__
  103. a
  104. b
  105. b
  106. a
  107. [4, 6]
  108. [3, 5]
  109. a
  110. a
  111. Ratio: 1.35733333333333
  112. Ratio: 1.44651830581479