123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- #!/usr/bin/perl
- # Author: Trizen
- # Date: 02 May 2024
- # https://github.com/trizen
- # Implementation of LZSS encoding, using an hash table.
- use 5.036;
- sub lzss_encode ($str) {
- my $la = 0;
- my @chars = split(//, $str);
- my $end = $#chars;
- my $min_len = 4; # minimum match length
- my $max_len = 255; # maximum match length
- my $max_dist = (1 << 16) - 1; # maximum match distance
- my $max_chain_len = 16; # how many recent positions to keep track of
- my (@literals, @distances, @lengths, %table);
- while ($la <= $end) {
- my $best_n = 1;
- my $best_p = $la;
- my $lookahead = substr($str, $la, $min_len);
- if (exists($table{$lookahead})) {
- foreach my $p (@{$table{$lookahead}}) {
- if ($la - $p > $max_dist) {
- last;
- }
- my $n = $min_len;
- while ($n <= $max_len and $la + $n <= $end and $chars[$la + $n - 1] eq $chars[$p + $n - 1]) {
- ++$n;
- }
- if ($n > $best_n) {
- $best_p = $p;
- $best_n = $n;
- }
- }
- my $matched = substr($str, $la, $best_n);
- foreach my $i (0 .. length($matched) - $min_len) {
- my $key = substr($matched, $i, $min_len);
- unshift @{$table{$key}}, $la + $i;
- if (scalar(@{$table{$key}}) > $max_chain_len) {
- pop @{$table{$key}};
- }
- }
- }
- else {
- $table{$lookahead} = [$la];
- }
- if ($best_n > $min_len) {
- push @lengths, $best_n - 1;
- push @distances, $la - $best_p;
- push @literals, undef;
- $la += $best_n - 1;
- }
- else {
- push @lengths, (0) x $best_n;
- push @distances, (0) x $best_n;
- push @literals, @chars[$best_p .. $best_p + $best_n - 1];
- $la += $best_n;
- }
- }
- return (\@literals, \@distances, \@lengths);
- }
- sub lzss_decode ($literals, $distances, $lengths) {
- my @data;
- my $data_len = 0;
- foreach my $i (0 .. $#$lengths) {
- if ($lengths->[$i] == 0) {
- push @data, $literals->[$i];
- $data_len += 1;
- next;
- }
- my $length = $lengths->[$i];
- my $dist = $distances->[$i];
- foreach my $j (1 .. $length) {
- push @data, $data[$data_len + $j - $dist - 1];
- }
- $data_len += $length;
- }
- return join('', @data);
- }
- my $string = "abbaabbaabaabaaaa";
- my ($literals, $distances, $lengths) = lzss_encode($string);
- my $decoded = lzss_decode($literals, $distances, $lengths);
- $string eq $decoded or die "error: <<$string>> != <<$decoded>>";
- foreach my $i (0 .. $#$literals) {
- if ($lengths->[$i] == 0) {
- say $literals->[$i];
- }
- else {
- say "[$distances->[$i], $lengths->[$i]]";
- }
- }
- foreach my $file (__FILE__, $^X) { # several tests
- my $string = do {
- open my $fh, '<:raw', $file or die "error for <<$file>>: $!";
- local $/;
- <$fh>;
- };
- my ($literals, $distances, $lengths) = lzss_encode($string);
- my $decoded = lzss_decode($literals, $distances, $lengths);
- say "Ratio: ", scalar(@$literals) / scalar(grep { defined($_) } @$literals);
- $string eq $decoded or die "error: <<$string>> != <<$decoded>>";
- }
- __END__
- a
- b
- b
- a
- [4, 6]
- [3, 5]
- a
- a
- Ratio: 1.35733333333333
- Ratio: 1.44651830581479
|