binary_arithmetic_coding.pl 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # Date: 07 May 2015
  4. # https://github.com/trizen
  5. # The binary arithmetic coding algorithm.
  6. # See also:
  7. # https://en.wikipedia.org/wiki/Arithmetic_coding
  8. use 5.010;
  9. use strict;
  10. use warnings;
  11. use Math::BigInt (try => 'GMP');
  12. use Math::BigRat (try => 'GMP');
  13. sub asciibet {
  14. map { chr } 0 .. 255;
  15. }
  16. sub cumulative_freq {
  17. my ($freq, $sum) = @_;
  18. my %cf;
  19. my $total = 0;
  20. foreach my $c (asciibet()) {
  21. if (exists $freq->{$c}) {
  22. $cf{$c} = $total;
  23. $total += $freq->{$c};
  24. }
  25. }
  26. return %cf;
  27. }
  28. sub mass_function {
  29. my ($freq, $sum) = @_;
  30. my %p;
  31. $p{$_} = Math::BigRat->new($freq->{$_}) / $sum for keys %{$freq};
  32. return %p;
  33. }
  34. sub arithmethic_coding {
  35. my ($str) = @_;
  36. my @chars = split(//, $str);
  37. my %freq;
  38. $freq{$_}++ for @chars;
  39. my $len = @chars;
  40. my %p = mass_function(\%freq, $len);
  41. my %cf = cumulative_freq(\%p, $len);
  42. my $pf = Math::BigRat->new(1);
  43. my $L = Math::BigRat->new(0);
  44. foreach my $c (@chars) {
  45. $L->badd($pf * $cf{$c});
  46. $pf->bmul($p{$c});
  47. }
  48. my $U = $L + $pf;
  49. my $big_two = Math::BigInt->new(2);
  50. my $two_pow = Math::BigInt->new(1);
  51. my $n = Math::BigRat->new(0);
  52. my $bin = '';
  53. for (my $i = Math::BigInt->new(1) ; ($n < $L || $n >= $U) ; $i->binc) {
  54. my $m = Math::BigRat->new(1)->bdiv($two_pow->bmul($big_two));
  55. if ($n + $m < $U) {
  56. $n += $m;
  57. $bin .= '1';
  58. }
  59. else {
  60. $bin .= '0';
  61. }
  62. }
  63. return ($bin, $len, \%freq);
  64. }
  65. sub arithmethic_decoding {
  66. my ($enc, $len, $freq) = @_;
  67. my $two_pow = Math::BigInt->new(1);
  68. my $big_two = Math::BigInt->new(2);
  69. my $line = Math::BigRat->new(0);
  70. my @bin = split(//, $enc);
  71. foreach my $i (0 .. $#bin) {
  72. $line->badd(scalar Math::BigRat->new($bin[$i])->bdiv($two_pow->bmul($big_two)));
  73. }
  74. my %p = mass_function($freq, $len);
  75. my %cf = cumulative_freq(\%p, $len);
  76. my %df;
  77. foreach my $k (keys %p) {
  78. $df{$k} = $cf{$k} + $p{$k};
  79. }
  80. my $L = 0;
  81. my $U = 1;
  82. my $decoded = '';
  83. my @chars = sort { $p{$a} <=> $p{$b} or $a cmp $b } keys %p;
  84. my $i = 0;
  85. while (1) {
  86. foreach my $c (@chars) {
  87. my $w = $U - $L;
  88. my $low = $L + $w * $cf{$c};
  89. my $high = $L + $w * $df{$c};
  90. if ($low <= $line and $line < $high) {
  91. ($L, $U) = ($low, $high);
  92. $decoded .= $c;
  93. if (++$i == $len) {
  94. return $decoded;
  95. }
  96. }
  97. }
  98. }
  99. }
  100. #
  101. ## Run some tests
  102. #
  103. foreach my $str (
  104. 'this is a message for you to encode and to decode correctly!',
  105. join('', 'a' .. 'z', 0 .. 9, 'A' .. 'Z', 0 .. 9),
  106. qw(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT),
  107. 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '
  108. . 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '
  109. . '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '
  110. . 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
  111. ) {
  112. my ($enc, $len, $freq) = arithmethic_coding($str);
  113. my $dec = arithmethic_decoding($enc, $len, $freq);
  114. say "Encoded: $enc";
  115. say "Decoded: $dec";
  116. if ($str ne $dec) {
  117. die "\tHowever that is incorrect!";
  118. }
  119. say "-" x 80;
  120. }