lzss_encoding_symbolic.pl 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 02 May 2024
  4. # https://github.com/trizen
  5. # Symbolic implementation of LZSS encoding, using an hash table.
  6. use 5.036;
  7. sub lzss_encode_symbolic ($symbols) {
  8. my $la = 0;
  9. my $end = $#$symbols;
  10. my $min_len = 4; # minimum match length
  11. my $max_len = 255; # maximum match length
  12. my $max_chain_len = 16; # how many recent positions to keep track of
  13. my (@literals, @distances, @lengths, %table);
  14. while ($la <= $end) {
  15. my $best_n = 1;
  16. my $best_p = $la;
  17. my @lookahead_symbols;
  18. if ($la + $min_len - 1 <= $end) {
  19. push @lookahead_symbols, @{$symbols}[$la .. $la + $min_len - 1];
  20. }
  21. else {
  22. for (my $j = 0 ; ($j < $min_len and $la + $j <= $end) ; ++$j) {
  23. push @lookahead_symbols, $symbols->[$la + $j];
  24. }
  25. }
  26. my $lookahead = join(' ', @lookahead_symbols);
  27. if (exists($table{$lookahead})) {
  28. foreach my $p (@{$table{$lookahead}}) {
  29. my $n = $min_len;
  30. while ($n <= $max_len and $la + $n <= $end and $symbols->[$la + $n - 1] == $symbols->[$p + $n - 1]) {
  31. ++$n;
  32. }
  33. if ($n > $best_n) {
  34. $best_p = $p;
  35. $best_n = $n;
  36. }
  37. }
  38. my @matched = @{$symbols}[$la .. $la + $best_n - 1];
  39. foreach my $i (0 .. scalar(@matched) - $min_len) {
  40. my $key = join(' ', @matched[$i .. $i + $min_len - 1]);
  41. unshift @{$table{$key}}, $la + $i;
  42. if (scalar(@{$table{$key}}) > $max_chain_len) {
  43. pop @{$table{$key}};
  44. }
  45. }
  46. }
  47. else {
  48. $table{$lookahead} = [$la];
  49. }
  50. if ($best_n > $min_len) {
  51. push @lengths, $best_n - 1;
  52. push @distances, $la - $best_p;
  53. push @literals, undef;
  54. $la += $best_n - 1;
  55. }
  56. else {
  57. push @lengths, (0) x $best_n;
  58. push @distances, (0) x $best_n;
  59. push @literals, @$symbols[$best_p .. $best_p + $best_n - 1];
  60. $la += $best_n;
  61. }
  62. }
  63. return (\@literals, \@distances, \@lengths);
  64. }
  65. sub lzss_decode_symbolic ($literals, $distances, $lengths) {
  66. my @data;
  67. my $data_len = 0;
  68. foreach my $i (0 .. $#$lengths) {
  69. if ($lengths->[$i] == 0) {
  70. push @data, $literals->[$i];
  71. $data_len += 1;
  72. next;
  73. }
  74. my $length = $lengths->[$i];
  75. my $dist = $distances->[$i];
  76. foreach my $j (1 .. $length) {
  77. push @data, $data[$data_len + $j - $dist - 1];
  78. }
  79. $data_len += $length;
  80. }
  81. return \@data;
  82. }
  83. my $string = "abbaabbaabaabaaaa";
  84. my ($literals, $distances, $lengths) = lzss_encode_symbolic([unpack('C*', $string)]);
  85. my $decoded = lzss_decode_symbolic($literals, $distances, $lengths);
  86. $string eq pack('C*', @$decoded) or die "error: <<$string>> != <<@$decoded>>";
  87. foreach my $i (0 .. $#$literals) {
  88. if ($lengths->[$i] == 0) {
  89. say $literals->[$i];
  90. }
  91. else {
  92. say "[$distances->[$i], $lengths->[$i]]";
  93. }
  94. }
  95. foreach my $file (__FILE__, $^X) { # several tests
  96. my $string = do {
  97. open my $fh, '<:raw', $file or die "error for <<$file>>: $!";
  98. local $/;
  99. <$fh>;
  100. };
  101. my ($literals, $distances, $lengths) = lzss_encode_symbolic([unpack('C*', $string)]);
  102. my $decoded = lzss_decode_symbolic($literals, $distances, $lengths);
  103. say "Ratio: ", scalar(@$literals) / scalar(grep { defined($_) } @$literals);
  104. $string eq pack('C*', @$decoded) or die "error: <<$string>> != <<@$decoded>>";
  105. }
  106. __END__
  107. 97
  108. 98
  109. 98
  110. 97
  111. [4, 6]
  112. [3, 5]
  113. 97
  114. 97
  115. Ratio: 1.38851802403204
  116. Ratio: 1.44651830581479