arithmetic_coding_anynum.pl 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # Date: 01 May 2015
  4. # https://github.com/trizen
  5. # The arithmetic coding algorithm, as_a_generalized_change_of_radix.
  6. # See also:
  7. # https://en.wikipedia.org/wiki/Arithmetic_coding#Arithmetic_coding_as_a_generalized_change_of_radix
  8. use 5.010;
  9. use strict;
  10. use warnings;
  11. use Math::AnyNum qw(ipow ilog idiv);
  12. sub asciibet {
  13. map { chr } 0 .. 255;
  14. }
  15. sub cumulative_freq {
  16. my ($freq) = @_;
  17. my %cf;
  18. my $total = Math::AnyNum->new(0);
  19. foreach my $c (asciibet()) {
  20. if (exists $freq->{$c}) {
  21. $cf{$c} = $total;
  22. $total += $freq->{$c};
  23. }
  24. }
  25. return %cf;
  26. }
  27. sub arithmethic_coding {
  28. my ($str, $radix) = @_;
  29. my @chars = split(//, $str);
  30. # The frequency characters
  31. my %freq;
  32. $freq{$_}++ for @chars;
  33. # The cumulative frequency table
  34. my %cf = cumulative_freq(\%freq);
  35. # Base
  36. my $base = Math::AnyNum->new(scalar @chars);
  37. # Lower bound
  38. my $L = Math::AnyNum->new(0);
  39. # Product of all frequencies
  40. my $pf = Math::AnyNum->new(1);
  41. # Each term is multiplied by the product of the
  42. # frequencies of all previously occurring symbols
  43. foreach my $c (@chars) {
  44. $L *= $base;
  45. $L += $cf{$c} * $pf;
  46. $pf *= $freq{$c};
  47. }
  48. # Upper bound
  49. my $U = $L + $pf;
  50. #~ say $L;
  51. #~ say $U;
  52. my $pow = ilog($pf, $radix);
  53. my $enc = idiv($U - 1, ipow($radix, $pow));
  54. return ($enc, $pow, \%freq);
  55. }
  56. sub arithmethic_decoding {
  57. my ($enc, $radix, $pow, $freq) = @_;
  58. # Multiply enc by 10^pow
  59. $enc *= ipow($radix, $pow);
  60. my $base = Math::AnyNum->new(0);
  61. $base += $_ for values %{$freq};
  62. # Create the cumulative frequency table
  63. my %cf = cumulative_freq($freq);
  64. # Create the dictionary
  65. my %dict;
  66. while (my ($k, $v) = each %cf) {
  67. $dict{$v} = $k;
  68. }
  69. # Fill the gaps in the dictionary
  70. my $lchar;
  71. foreach my $i (0 .. $base - 1) {
  72. if (exists $dict{$i}) {
  73. $lchar = $dict{$i};
  74. }
  75. elsif (defined $lchar) {
  76. $dict{$i} = $lchar;
  77. }
  78. }
  79. # Decode the input number
  80. my $decoded = '';
  81. for (my $pow = ipow($base, $base - 1) ; $pow > 0 ; $pow = idiv($pow, $base)) {
  82. my $div = idiv($enc, $pow);
  83. my $c = $dict{$div};
  84. my $fv = $freq->{$c};
  85. my $cv = $cf{$c};
  86. my $rem = idiv($enc - $pow * $cv, $fv);
  87. #~ say "$enc / $base^$pow = $div ($c)";
  88. #~ say "($enc - $base^$pow * $cv) / $fv = $rem\n";
  89. $enc = $rem;
  90. $decoded .= $c;
  91. }
  92. # Return the decoded output
  93. return $decoded;
  94. }
  95. #
  96. ## Run some tests
  97. #
  98. my $radix = 10; # can be any integer >= 2
  99. foreach my $str (
  100. qw(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT 吹吹打打),
  101. 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '
  102. . 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '
  103. . '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '
  104. . 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
  105. ) {
  106. my ($enc, $pow, $freq) = arithmethic_coding($str, $radix);
  107. my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
  108. say "Encoded: $enc";
  109. say "Decoded: $dec";
  110. if ($str ne $dec) {
  111. die "\tHowever that is incorrect!";
  112. }
  113. say "-" x 80;
  114. }
  115. open my $fh, '<', __FILE__;
  116. my $content = do { local $/; <$fh> };
  117. my ($enc, $pow, $freq) = arithmethic_coding($content, $radix);
  118. my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
  119. if ($dec ne $content) {
  120. die "Failed to encode and decode the __FILE__ correctly.";
  121. }