dup_subtr_finder.pl 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 11 December 2013
  5. # https://trizenx.blogspot.com
  6. # Find the longest duplicated sub-strings inside a string/file (based on a given minimum length).
  7. use 5.010;
  8. use strict;
  9. use autodie;
  10. use warnings;
  11. use List::Util qw(first);
  12. use Data::Dump qw(pp);
  13. use Getopt::Std qw(getopts);
  14. sub find_substrings (&@) {
  15. my ($code, $str, $min) = @_;
  16. my @substrings;
  17. my $len = length($str);
  18. my $max = int($len / 2);
  19. my @pos;
  20. for (my $i = $max ; $i >= $min ; $i--) {
  21. for (my $j = 0 ; $j <= $len - $i * 2 ; $j++) {
  22. #die $i if $i > ($len - ($j + $i)); # not gonna happen
  23. #say "=>> ", substr($str, $j, $i);
  24. if (defined(my $arr = first { $j >= $_->[0] && $j <= $_->[1] } @pos)) {
  25. $j = $arr->[1];
  26. next;
  27. }
  28. if ((my $pos = index($str, substr($str, $j, $i), $j + $i)) != -1) {
  29. $code->({pos => [$j, $pos], len => $i, substr => substr($str, $j, $i)});
  30. push @pos, [$j, $j + $i]; # don't match again in substr
  31. #push @pos, [$pos, $pos + $i]; # don't match again in dup-substr
  32. $j += $i;
  33. }
  34. }
  35. }
  36. =old
  37. for (my $j = 0 ; $j <= $len ; $j++) {
  38. for (my $i = $len - $j > $max ? $max : $len - $j ; $i >= $min ; $i--) {
  39. next if $i > ($len - ($j + $i));
  40. if ((my $pos = index($str, substr($str, $j, $i), $j + $i)) != -1) {
  41. $code->({pos => [$j, $pos], len => $i, substr => substr($str, $j, $i)});
  42. $j += $i;
  43. last;
  44. }
  45. }
  46. }
  47. =cut
  48. return @substrings;
  49. }
  50. #
  51. ## MAIN
  52. #
  53. sub usage {
  54. print <<"USAGE";
  55. usage: $0 [options] [input-file]
  56. options:
  57. -m <int> : the minimum sub-string length
  58. example: perl $0 -m 50 file.txt
  59. USAGE
  60. exit 1;
  61. }
  62. my %opt;
  63. getopts('m:', \%opt);
  64. my $file = @ARGV && (-f $ARGV[0]) ? shift() : usage();
  65. my $minLen = $opt{m} || (-s $file) / 10;
  66. # Dearly spider
  67. find_substrings { say pp(shift) } (
  68. do {
  69. local $/;
  70. open my $fh, '<', $file;
  71. <$fh>;
  72. },
  73. $minLen
  74. );