jaro-winkler_distance.pl 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 17 October 2015
  5. # Website: https://github.com/trizen
  6. # Implementation of the Jaro-Winkler distance algorithm
  7. # See: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
  8. use 5.010;
  9. use strict;
  10. use warnings;
  11. use List::Util qw(min max);
  12. sub jaro {
  13. my ($s, $t) = @_;
  14. my $s_len = length($s);
  15. my $t_len = length($t);
  16. return 1 if ($s_len == 0 and $t_len == 0);
  17. my $match_distance = int(max($s_len, $t_len) / 2) - 1;
  18. my @s_matches;
  19. my @t_matches;
  20. my @s = split(//, $s);
  21. my @t = split(//, $t);
  22. my $matches = 0;
  23. foreach my $i (0 .. $s_len - 1) {
  24. my $start = max(0, $i - $match_distance);
  25. my $end = min($i + $match_distance + 1, $t_len);
  26. foreach my $j ($start .. $end - 1) {
  27. $t_matches[$j] and next;
  28. $s[$i] eq $t[$j] or next;
  29. $s_matches[$i] = 1;
  30. $t_matches[$j] = 1;
  31. $matches++;
  32. last;
  33. }
  34. }
  35. return 0 if $matches == 0;
  36. my $k = 0;
  37. my $trans = 0;
  38. foreach my $i (0 .. $s_len - 1) {
  39. $s_matches[$i] or next;
  40. until ($t_matches[$k]) { ++$k }
  41. $s[$i] eq $t[$k] or ++$trans;
  42. ++$k;
  43. }
  44. #<<<
  45. (($matches / $s_len) + ($matches / $t_len)
  46. + (($matches - $trans / 2) / $matches)) / 3;
  47. #>>>
  48. }
  49. sub jaro_winkler {
  50. my ($s, $t) = @_;
  51. my $distance = jaro($s, $t);
  52. my $prefix = 0;
  53. foreach my $i (0 .. min(3, length($s), length($t))) {
  54. substr($s, $i, 1) eq substr($t, $i, 1) ? ++$prefix : last;
  55. }
  56. $distance + $prefix * 0.1 * (1 - $distance);
  57. }
  58. printf("%f\n", jaro_winkler("MARTHA", "MARHTA"));
  59. printf("%f\n", jaro_winkler("DWAYNE", "DUANE"));
  60. printf("%f\n", jaro_winkler("DIXON", "DICKSONX"));
  61. printf("%f\n", jaro_winkler("ROSETTACODE", "ROSETTASTONE"));