arithmetic_coding.pl 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. #!/usr/bin/perl
  2. #
  3. ## The arithmetic coding algorithm.
  4. #
  5. # See: http://en.wikipedia.org/wiki/Arithmetic_coding#Arithmetic_coding_as_a_generalized_change_of_radix
  6. use 5.010;
  7. use strict;
  8. use warnings;
  9. use lib qw(../lib);
  10. use Math::AnyNum qw(ipow ipow10 ilog10 idiv);
  11. sub asciibet {
  12. map { chr } 0 .. 255;
  13. }
  14. sub cumulative_freq {
  15. my ($freq) = @_;
  16. my %cf;
  17. my $total = Math::AnyNum->new(0);
  18. foreach my $c (asciibet()) {
  19. if (exists $freq->{$c}) {
  20. $cf{$c} = $total;
  21. $total += $freq->{$c};
  22. }
  23. }
  24. return %cf;
  25. }
  26. sub arithmethic_coding {
  27. my ($str) = @_;
  28. my @chars = split(//, $str);
  29. # The frequency characters
  30. my %freq;
  31. $freq{$_}++ for @chars;
  32. # The cumulative frequency table
  33. my %cf = cumulative_freq(\%freq);
  34. # Base
  35. my $base = Math::AnyNum->new(scalar @chars);
  36. # Lower bound
  37. my $L = Math::AnyNum->new(0);
  38. # Product of all frequencies
  39. my $pf = Math::AnyNum->new(1);
  40. # Each term is multiplied by the product of the
  41. # frequencies of all previously occurring symbols
  42. foreach my $c (@chars) {
  43. $L *= $base;
  44. $L += $cf{$c} * $pf;
  45. $pf *= $freq{$c};
  46. }
  47. # Upper bound
  48. my $U = $L + $pf;
  49. #~ say $L;
  50. #~ say $U;
  51. my $pow = ilog10($pf);
  52. my $enc = idiv($U - 1, ipow10($pow));
  53. return ($enc, $pow, \%freq);
  54. }
  55. sub arithmethic_decoding {
  56. my ($enc, $pow, $freq) = @_;
  57. # Multiply enc by 10^pow
  58. $enc *= ipow10($pow);
  59. my $base = Math::AnyNum->new(0);
  60. $base += $_ for values %{$freq};
  61. # Create the cumulative frequency table
  62. my %cf = cumulative_freq($freq);
  63. # Create the dictionary
  64. my %dict;
  65. while (my ($k, $v) = each %cf) {
  66. $dict{$v} = $k;
  67. }
  68. # Fill the gaps in the dictionary
  69. my $lchar;
  70. foreach my $i (0 .. $base - 1) {
  71. if (exists $dict{$i}) {
  72. $lchar = $dict{$i};
  73. }
  74. elsif (defined $lchar) {
  75. $dict{$i} = $lchar;
  76. }
  77. }
  78. # Decode the input number
  79. my $decoded = '';
  80. for (my $pow = ipow($base, $base - 1) ; $pow > 0 ; $pow = idiv($pow, $base)) {
  81. my $div = idiv($enc, $pow);
  82. my $c = $dict{$div};
  83. my $fv = $freq->{$c};
  84. my $cv = $cf{$c};
  85. my $rem = idiv($enc - $pow * $cv, $fv);
  86. #~ say "$enc / $base^$pow = $div ($c)";
  87. #~ say "($enc - $base^$pow * $cv) / $fv = $rem\n";
  88. $enc = $rem;
  89. $decoded .= $c;
  90. }
  91. # Return the decoded output
  92. return $decoded;
  93. }
  94. #
  95. ## Run some tests
  96. #
  97. foreach my $str (
  98. qw(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT),
  99. 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '
  100. . 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '
  101. . '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '
  102. . 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
  103. ) {
  104. my ($enc, $pow, $freq) = arithmethic_coding($str);
  105. my $dec = arithmethic_decoding($enc, $pow, $freq);
  106. say "Encoded: $enc";
  107. say "Decoded: $dec";
  108. if ($str ne $dec) {
  109. die "\tHowever that is incorrect!";
  110. }
  111. say "-" x 80;
  112. }
  113. open my $fh, '<', __FILE__;
  114. my $content = do { local $/; <$fh> };
  115. my ($enc, $pow, $freq) = arithmethic_coding($content);
  116. my $dec = arithmethic_decoding($enc, $pow, $freq);
  117. if ($dec ne $content) {
  118. die "Failed to encode and decode the __FILE__ correctly.";
  119. }
  120. say "Done!";