  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 13 January 2016
  4. # https://github.com/trizen
  5. # Finds files which have almost the same content, using the Levenshtein distance.
  6. #
  7. ## WARNING! For strict duplicates, use the 'fdf' script:
  8. # https://github.com/trizen/perl-scripts/blob/master/Finders/fdf
  9. #
  10. use 5.010;
  11. use strict;
  12. use warnings;
  13. use Fcntl qw(O_RDONLY);
  14. use File::Find qw(find);
  15. use Getopt::Long qw(GetOptions);
  16. use Text::LevenshteinXS qw(distance);
  17. use Number::Bytes::Human qw(parse_bytes);
# Command-line option defaults (overridable via GetOptions below).
my $unique = 0;            # when true, a file is placed in at most one duplicate group (via splice)
my $threshold = 70;        # similarity threshold, in percent, used by look_similar()
my $max_size = '100KB';    # files larger than this are skipped; parsed into bytes by parse_bytes()
  21. sub help {
  22. my ($code) = @_;
  23. print <<"HELP";
  24. usage: $0 [options] [/dir/a] [/dir/b] [...]
  25. options:
  26. -s --size=s : maximum file size (default: $max_size)
  27. -u --unique! : don't include a file in more groups (default: false)
  28. -t --threshold=f : threshold percentage (default: $threshold)
  29. Example:
  30. perl $0 ~/Documents
  31. HELP
  32. exit($code // 0);
  33. }
  34. GetOptions(
  35. 's|size=s' => \$max_size,
  36. 'u|unique!' => \$unique,
  37. 't|threshold=f' => \$threshold,
  38. 'h|help' => \&help,
  39. )
  40. or die("Error in command line arguments");
  41. @ARGV || help();
  42. $max_size = parse_bytes($max_size);
  43. sub look_similar {
  44. my ($f1, $f2) = @_;
  45. sysopen my $fh1, $f1, O_RDONLY or return;
  46. sysopen my $fh2, $f2, O_RDONLY or return;
  47. my $s1 = (-s $f1) || (-s $fh1);
  48. my $s2 = (-s $f2) || (-s $fh2);
  49. my ($min, $max) = $s1 < $s2 ? ($s1, $s2) : ($s2, $s1);
  50. my $diff = int($max * (100 - $threshold) / 100);
  51. ($max - $min) > $diff and return;
  52. sysread($fh1, (my $c1), $s1) || return;
  53. sysread($fh2, (my $c2), $s2) || return;
  54. distance($c1, $c2) <= $diff;
  55. }
  56. sub find_similar_files (&@) {
  57. my $code = shift;
  58. my %files;
  59. find {
  60. no_chdir => 1,
  61. wanted => sub {
  62. lstat;
  63. (-f _) && (not -l _) && do {
  64. my $size = -s _;
  65. if ($size <= $max_size) {
  66. # TODO: better grouping
  67. push @{$files{int log $size}}, $File::Find::name;
  68. }
  69. };
  70. }
  71. } => @_;
  72. foreach my $key (sort { $a <=> $b } keys %files) {
  73. next if $#{$files{$key}} < 1;
  74. my @files = @{$files{$key}};
  75. my %dups;
  76. foreach my $i (0 .. $#files - 1) {
  77. for (my $j = $i + 1 ; $j <= $#files ; $j++) {
  78. if (look_similar($files[$i], $files[$j])) {
  79. push @{$dups{$files[$i]}},
  80. (
  81. $unique
  82. ? splice(@files, $j--, 1)
  83. : $files[$j]
  84. );
  85. }
  86. }
  87. }
  88. while (my ($fparent, $fdups) = each %dups) {
  89. $code->(sort $fparent, @{$fdups});
  90. }
  91. }
  92. return 1;
  93. }
  94. {
  95. local $, = "\n";
  96. find_similar_files {
  97. say @_, "-" x 80 if @_;
  98. }
  99. @ARGV;
  100. }