arithmetic_coding.pl 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # Date: 01 May 2015
  4. # https://github.com/trizen
  5. # The arithmetic coding algorithm, as_a_generalized_change_of_radix.
  6. # See also:
  7. # https://en.wikipedia.org/wiki/Arithmetic_coding#Arithmetic_coding_as_a_generalized_change_of_radix
  8. use 5.010;
  9. use strict;
  10. use warnings;
  11. use Math::BigInt (try => 'GMP');
  12. sub asciibet {
  13. map { chr } 0 .. 255;
  14. }
  15. sub cumulative_freq {
  16. my ($freq) = @_;
  17. my %cf;
  18. my $total = Math::BigInt->new(0);
  19. foreach my $c (asciibet()) {
  20. if (exists $freq->{$c}) {
  21. $cf{$c} = $total;
  22. $total += $freq->{$c};
  23. }
  24. }
  25. return %cf;
  26. }
  27. sub arithmethic_coding {
  28. my ($str, $radix) = @_;
  29. my @chars = split(//, $str);
  30. # The frequency characters
  31. my %freq;
  32. $freq{$_}++ for @chars;
  33. # The cumulative frequency table
  34. my %cf = cumulative_freq(\%freq);
  35. # Limit and base
  36. my $base = scalar @chars;
  37. # Lower bound
  38. my $L = Math::BigInt->new(0);
  39. # Product of all frequencies
  40. my $pf = Math::BigInt->new(1);
  41. # Each term is multiplied by the product of the
  42. # frequencies of all previously occurring symbols
  43. foreach my $c (@chars) {
  44. $L->bmuladd($base, $cf{$c} * $pf);
  45. $pf->bmul($freq{$c});
  46. }
  47. # Upper bound
  48. my $U = $L + $pf;
  49. #~ say $L;
  50. #~ say $U;
  51. my $pow = Math::BigInt->new($pf)->blog($radix);
  52. my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
  53. return ($enc, $pow, \%freq);
  54. }
  55. sub arithmethic_decoding {
  56. my ($enc, $radix, $pow, $freq) = @_;
  57. # Multiply enc by 10^pow
  58. $enc *= $radix**$pow;
  59. my $base = Math::BigInt->new(0);
  60. $base += $_ for values %{$freq};
  61. # Create the cumulative frequency table
  62. my %cf = cumulative_freq($freq);
  63. # Create the dictionary
  64. my %dict;
  65. while (my ($k, $v) = each %cf) {
  66. $dict{$v} = $k;
  67. }
  68. # Fill the gaps in the dictionary
  69. my $lchar;
  70. foreach my $i (0 .. $base - 1) {
  71. if (exists $dict{$i}) {
  72. $lchar = $dict{$i};
  73. }
  74. elsif (defined $lchar) {
  75. $dict{$i} = $lchar;
  76. }
  77. }
  78. # Decode the input number
  79. my $decoded = '';
  80. for (my $i = $base - 1 ; $i >= 0 ; $i--) {
  81. my $pow = $base**$i;
  82. my $div = ($enc / $pow);
  83. my $c = $dict{$div};
  84. my $fv = $freq->{$c};
  85. my $cv = $cf{$c};
  86. my $rem = ($enc - $pow * $cv) / $fv;
  87. #~ say "$enc / $base^$i = $div ($c)";
  88. #~ say "($enc - $base^$i * $cv) / $fv = $rem\n";
  89. $enc = $rem;
  90. $decoded .= $c;
  91. }
  92. # Return the decoded output
  93. return $decoded;
  94. }
  95. #
  96. ## Run some tests
  97. #
  98. my $radix = 10; # can be any integer >= 2
  99. foreach my $str (
  100. qw(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT 吹吹打打),
  101. 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '
  102. . 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '
  103. . '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '
  104. . 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
  105. ) {
  106. my ($enc, $pow, $freq) = arithmethic_coding($str, $radix);
  107. my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
  108. say "Encoded: $enc";
  109. say "Decoded: $dec";
  110. if ($str ne $dec) {
  111. die "\tHowever that is incorrect!";
  112. }
  113. say "-" x 80;
  114. }