human-finder-visual.pl 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 20 April 2014
  5. # Website: https://github.com/trizen
  6. # A smart human-like substring finder
  7. # Steps:
  8. # 1. loop from i=2 and count up to int(sqrt(len(text)))
  9. # 2. loop from pos=(i-2)*len(substr)*2 and add int(len(text)/i) to pos while pos <= len(text)
  10. # 3. jump to position pos and scan back and forward and stop if the string is found somewhere nearby
  11. # 4. loop #2 end
  12. # 5. loop #1 end
  13. # 6. return -1
  14. use 5.010;
  15. use strict;
  16. use warnings;
  17. use Term::ANSIColor;
  18. my $TOTAL = 0; # count performance
  19. sub DEBUG () { 1 } # verbose mode
  20. sub random_find {
  21. my ($text, $substr) = @_;
  22. my $tlen = length($text);
  23. my $slen = length($substr);
  24. my $tmax = $tlen - $slen;
  25. my $smax = int($slen / 2); # this value influences the performance
  26. my $counter = 0;
  27. my $locate = sub {
  28. my ($pos, $guess) = @_;
  29. for my $i (0 .. $smax) {
  30. ++$counter if DEBUG; # measure performance
  31. if ( $pos + $i <= $tmax
  32. and substr($guess, $i) eq substr($substr, 0, $slen - $i)
  33. and substr($text, $pos + $i, $slen) eq $substr) {
  34. printf("RIGHT (i: %d; counter: %d):\n> %*s\n> %s\n", $i, $counter, $i + $slen, $substr, $guess) if DEBUG;
  35. $TOTAL += $counter if DEBUG;
  36. return $pos + $i;
  37. }
  38. elsif ( $pos - $i >= 0
  39. and substr($substr, $i) eq substr($guess, 0, $slen - $i)
  40. and substr($text, $pos - $i, $slen) eq $substr) {
  41. printf("LEFT (i: %d; counter: %d):\n> %s\n> %*s\n", $i, $counter, $substr, $i + $slen, $guess) if DEBUG;
  42. $TOTAL += $counter if DEBUG;
  43. return $pos - $i;
  44. }
  45. }
  46. return;
  47. };
  48. my %seen;
  49. foreach my $i (1 .. int(sqrt($tlen))) {
  50. #my $delta = int($tlen / $i)-$slen;
  51. #my $delta = int(($tlen - $slen));
  52. my $delta = int($tlen/$i);
  53. #say $delta;
  54. #for (my $pos = ($i - 1) * $slen ; $pos <= $tmax ; $pos += $delta) {
  55. for (my $pos = int($tlen/$i)-$slen ; $pos <= int(sqrt($tlen))*$i; $pos += $delta) {
  56. #next if $seen{$pos}++;
  57. #$pos -= $slen;
  58. #$delta -= $slen;
  59. say "POS: $pos" if DEBUG;
  60. if ($pos + $slen <= $tlen) {
  61. system 'clear';
  62. say substr($text, 0, $pos), color('bold red'), substr($text, $pos, $slen), color('reset'), substr($text, $pos+$slen);
  63. if (defined(my $i = $locate->($pos, substr($text, $pos, $slen)))) {
  64. say "** FORWARD MATCH!" if DEBUG;
  65. return $i;
  66. }
  67. sleep 1;
  68. }
  69. else {
  70. die "ERROR!";
  71. }
  72. =cut
  73. if ($pos >= $slen) {
  74. system 'clear';
  75. say substr($text, 0, $pos-$slen), color('bold red'), substr($text, $pos-$slen, $slen), color('reset'), substr($text, $pos);
  76. if (defined(my $i = $locate->($pos - $slen, substr($text, $pos - $slen, $slen)))) {
  77. say "** BACKWARD MATCH!" if DEBUG;
  78. return $i;
  79. }
  80. sleep 2;
  81. }
  82. =cut
  83. }
  84. }
  85. return -1;
  86. }
  87. my $text = join('', <DATA>);
  88. my $split = 30;
  89. random_find($text, q{the blue arcs to});
  90. say "TOTAL: ", $TOTAL if DEBUG;
  91. __END__
  92. The data structure has one node for every prefix of every
  93. string in the dictionary. So if (bca) is in the dictionar
  94. then there will be nodes for (bca), (bc), (b), and (). If
  95. is in the dictionary then it is blue node. Otherwise it i
  96. There is a black directed "child" arc from each node to a
  97. is found by appending one character. So there is a black
  98. There is a blue directed "suffix" arc from each node to t
  99. possible strict suffix of it in the graph. For example, f
  100. are (aa) and (a) and (). The longest of these that exists
  101. graph is (a). So there is a blue arc from (caa) to (a). T
  102. a green "dictionary suffix" arc from each node to the nex
  103. in the dictionary that can be reached by following blue a
  104. example, there is a green arc from (bca) to (a) because (
  105. node in the dictionary (i.e. a blue node) that is reached
  106. the blue arcs to (ca) and then on to (a).