length_encoder.pl 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. #!?usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 04 May 2015
  5. # Website: https://github.com/trizen
  6. # A very basic length encoder
  7. use 5.010;
  8. use strict;
  9. use warnings;
  10. use Data::Dump qw(pp);
  11. # produce encode and decode dictionary from a tree
  12. sub walk {
  13. my ($node, $code, $h) = @_;
  14. my $c = $node->[0];
  15. if (ref $c) { walk($c->[$_], $code . $_, $h) for 0, 1 }
  16. else { $h->{$c} = $code }
  17. $h;
  18. }
  19. # make a tree, and return resulting dictionaries
  20. sub mktree {
  21. my %freq = @_;
  22. my @nodes = map { [$_, $freq{$_}] } keys %freq;
  23. if (@nodes == 1) {
  24. return {$nodes[0][0] => '0'};
  25. }
  26. do { # poor man's priority queue
  27. @nodes = sort { $a->[1] <=> $b->[1] } @nodes;
  28. my ($x, $y) = splice(@nodes, 0, 2);
  29. push @nodes, [[$x, $y], $x->[1] + $y->[1]];
  30. } while (@nodes > 1);
  31. walk($nodes[0], '', {}, {});
  32. }
  33. sub length_encoder {
  34. my ($str) = @_;
  35. my %table;
  36. my @chars = split(//, $str);
  37. my $lim = $#chars;
  38. my %t;
  39. for (my $i = 0 ; $i < $lim ; $i++) {
  40. for (my $j = $i + 1 ; $j <= $lim ; $j++) {
  41. last if $j + ($j - $i) + 1 > $lim;
  42. my $key = join('', @chars[$i .. $j]);
  43. if (join('', @chars[$j + 1 .. $j + ($j - $i) + 1]) eq $key) {
  44. if (not exists $t{$key}) {
  45. if (exists $t{substr($key, 0, -1)}) {
  46. last;
  47. }
  48. $t{$key} = length($key);
  49. }
  50. else {
  51. $t{$key}++;
  52. }
  53. }
  54. }
  55. }
  56. my ($dict) = keys(%t) ? mktree(%t) : {};
  57. my @sorted_tokens =
  58. sort { length($dict->{$a}) <=> length($dict->{$b}) or $t{$b} <=> $t{$a} or $a cmp $b } keys %t;
  59. say "Weights: ", pp(\%t);
  60. say "Sorted: @sorted_tokens";
  61. say "Bits: ", pp($dict);
  62. my $regex = do {
  63. my @tries = map { "(?<token>\Q$_\E)(?<rest>(?:\Q$_\E)*+)" } @sorted_tokens;
  64. local $" = '|';
  65. @sorted_tokens ? qr/^(?:@tries|(?<token>.))/s : qr/^(?<token>.)/s;
  66. };
  67. my $enc = '';
  68. while ($str =~ s/$regex//) {
  69. my $m = $+{token};
  70. my $r = $+{rest};
  71. if (defined $r) {
  72. $enc .= ("[$dict->{$m}x" . (1 + length($r) / length($m)) . "]");
  73. }
  74. else {
  75. $enc .= $m;
  76. }
  77. }
  78. return $enc;
  79. }
  80. foreach my $str (
  81. qw(
  82. ABABABAB
  83. ABABABABAAAAAAAAAAAAAFFFFFFFFFFFFFFFFFFFDDDDDDDDDDDDDDDDDDDDJKLABABVADSABABAB
  84. DABDDB DABDDBBDDBA ABBDDD ABRACADABRA TOBEORNOTTOBEORTOBEORNOT
  85. )
  86. ) {
  87. say "Encoding: $str";
  88. say "Encoded: ", length_encoder($str);
  89. say "-" x 80;
  90. }