burrows-wheeler_file_transform.pl 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 14 June 2023
  4. # Edit: 17 September 2023
  5. # https://github.com/trizen
  6. # Apply the Burrows–Wheeler transform on a file.
  7. # https://rosettacode.org/wiki/Burrows–Wheeler_transform
  8. # References:
  9. # Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
  10. # https://youtube.com/watch?v=rQ7wwh4HRZM
  11. #
  12. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  13. # https://youtube.com/watch?v=cvoZbBZ3M2A
  14. use 5.036;
  15. use Getopt::Std qw(getopts);
  16. use File::Basename qw(basename);
  17. use constant {
  18. LOOKAHEAD_LEN => 128, # lower values are usually faster
  19. };
  20. sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
  21. #<<<
  22. [
  23. map { $_->[1] } sort {
  24. ($a->[0] cmp $b->[0])
  25. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  26. }
  27. map {
  28. my $t = substr($s, $_, LOOKAHEAD_LEN);
  29. if (length($t) < LOOKAHEAD_LEN) {
  30. $t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
  31. }
  32. [$t, $_]
  33. } 0 .. length($s) - 1
  34. ];
  35. #>>>
  36. }
  37. sub bwt_encode ($s) {
  38. my $bwt = bwt_balanced($s);
  39. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  40. my $idx = 0;
  41. foreach my $i (@$bwt) {
  42. $i || last;
  43. ++$idx;
  44. }
  45. return ($ret, $idx);
  46. }
  47. sub bwt_decode ($bwt, $idx) { # fast inversion: O(n * log(n))
  48. my @tail = split(//, $bwt);
  49. my @head = sort @tail;
  50. if ($idx > $#head) {
  51. die "Invalid bwt-index: $idx (must be <= $#head)\n";
  52. }
  53. my %indices;
  54. foreach my $i (0 .. $#tail) {
  55. push @{$indices{$tail[$i]}}, $i;
  56. }
  57. my @table;
  58. foreach my $v (@head) {
  59. push @table, shift(@{$indices{$v}});
  60. }
  61. my $dec = '';
  62. my $i = $idx;
  63. for (1 .. scalar(@head)) {
  64. $dec .= $head[$i];
  65. $i = $table[$i];
  66. }
  67. return $dec;
  68. }
  69. getopts('dh', \my %opts);
  70. if ($opts{h} or !@ARGV) {
  71. die "usage: $0 [-d] [input file] [output file]\n";
  72. }
  73. my $input_file = $ARGV[0];
  74. my $output_file = $ARGV[1] // (basename($input_file) . ($opts{d} ? '.dec' : '.bw'));
  75. my $content = do {
  76. open my $fh, '<:raw', $input_file
  77. or die "Can't open file <<$input_file>> for reading: $!";
  78. local $/;
  79. <$fh>;
  80. };
  81. if ($opts{d}) { # decode mode
  82. my $idx = unpack('N', substr($content, 0, 4, ''));
  83. my $dec = bwt_decode($content, $idx);
  84. open my $out_fh, '>:raw', $output_file
  85. or die "Can't open file <<$output_file>> for writing: $!";
  86. print $out_fh $dec;
  87. }
  88. else {
  89. my ($bwt, $idx) = bwt_encode($content);
  90. open my $out_fh, '>:raw', $output_file
  91. or die "Can't open file <<$output_file>> for writing: $!";
  92. print $out_fh pack('N', $idx);
  93. print $out_fh $bwt;
  94. }