binradix_arithmetic_coding_anynum.pl 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # Date: 07 May 2015
  4. # https://github.com/trizen
  5. # The arithmetic coding algorithm (radix+binary).
  6. # See also:
  7. # https://en.wikipedia.org/wiki/Arithmetic_coding#Arithmetic_coding_as_a_generalized_change_of_radix
  8. use 5.010;
  9. use strict;
  10. use warnings;
  11. use Math::AnyNum qw(ipow ipow10 idiv);
  12. sub cumulative_freq {
  13. my ($freq) = @_;
  14. my %cf;
  15. my $total = Math::AnyNum->new(0);
  16. foreach my $c (sort keys %$freq) {
  17. $cf{$c} = $total;
  18. $total += $freq->{$c};
  19. }
  20. return %cf;
  21. }
  22. sub arithmethic_coding {
  23. my ($str) = @_;
  24. my @chars = split(//, $str);
  25. # The frequency characters
  26. my %freq;
  27. $freq{$_}++ for @chars;
  28. # The cumulative frequency table
  29. my %cf = cumulative_freq(\%freq);
  30. # Limit and base
  31. my $base = scalar @chars;
  32. # Lower bound
  33. my $L = Math::AnyNum->new(0);
  34. # Product of all frequencies
  35. my $pf = Math::AnyNum->new(1);
  36. # Each term is multiplied by the product of the
  37. # frequencies of all previously occurring symbols
  38. for my $c (@chars) {
  39. $L *= $base;
  40. $L += $cf{$c} * $pf;
  41. $pf *= $freq{$c};
  42. }
  43. # Upper bound
  44. my $U = $L + $pf;
  45. my $len = $L->length;
  46. $L = Math::AnyNum->new("$L / " . ipow10($len));
  47. $U = Math::AnyNum->new("$U / " . ipow10($len));
  48. my $t = Math::AnyNum->new(1);
  49. my $n = Math::AnyNum->new(0);
  50. my $bin = '';
  51. while ($n < $L || $n >= $U) {
  52. my $m = 1 / ($t <<= 1);
  53. if ($n + $m < $U) {
  54. $n += $m;
  55. $bin .= '1';
  56. }
  57. else {
  58. $bin .= '0';
  59. }
  60. }
  61. #~ say $L;
  62. #~ say $U;
  63. return ($bin, $len, \%freq);
  64. }
  65. sub arithmethic_decoding {
  66. my ($enc, $pow, $freq) = @_;
  67. my $t = Math::AnyNum->new(1);
  68. my $line = Math::AnyNum->new(0);
  69. my @bin = split(//, $enc);
  70. foreach my $i (0 .. $#bin) {
  71. $line += $bin[$i] / ($t <<= 1);
  72. }
  73. $enc = $line * ipow10($pow);
  74. my $base = Math::AnyNum->new(0);
  75. $base += $_ for values %{$freq};
  76. # Create the cumulative frequency table
  77. my %cf = cumulative_freq($freq);
  78. # Create the dictionary
  79. my %dict;
  80. while (my ($k, $v) = each %cf) {
  81. $dict{$v} = $k;
  82. }
  83. # Fill the gaps in the dictionary
  84. my $lchar;
  85. foreach my $i (0 .. $base - 1) {
  86. if (exists $dict{$i}) {
  87. $lchar = $dict{$i};
  88. }
  89. elsif (defined $lchar) {
  90. $dict{$i} = $lchar;
  91. }
  92. }
  93. # Decode the input number
  94. my $decoded = '';
  95. for (my $i = $base - 1 ; $i >= 0 ; $i--) {
  96. my $pow = ipow($base, $i);
  97. my $div = idiv($enc, $pow);
  98. my $c = $dict{$div};
  99. my $fv = $freq->{$c};
  100. my $cv = $cf{$c};
  101. my $rem = ($enc - $pow * $cv) / $fv;
  102. #~ say "$enc / $base^$i = $div ($c)";
  103. #~ say "($enc - $base^$i * $cv) / $fv = $rem\n";
  104. $enc = $rem;
  105. $decoded .= $c;
  106. }
  107. # Return the decoded output
  108. return $decoded;
  109. }
  110. #
  111. ## Run some tests
  112. #
  113. foreach my $str (
  114. qw(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT),
  115. 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '
  116. . 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '
  117. . '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '
  118. . 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
  119. ) {
  120. my ($enc, $pow, $freq) = arithmethic_coding($str);
  121. my $dec = arithmethic_decoding($enc, $pow, $freq);
  122. say "Encoded: $enc";
  123. say "Decoded: $dec";
  124. if ($str ne $dec) {
  125. die "\tHowever that is incorrect!";
  126. }
  127. say "-" x 80;
  128. }