find_similar_filenames_unidec.pl 2.5 KB

#!/usr/bin/perl

# Daniel "Trizen" Șuteu
# License: GPLv3
# Date: 22 June 2012
# https://github.com/trizen

# Find files which have exactly or *ALMOST*
# exactly the same name in a given path.

# Improved version here:
# https://github.com/trizen/perl-scripts/blob/master/Finders/fsfn.pl
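# For instance (made-up filenames): "Über_song.mp3" and "uber_song (1).ogg"
# would be reported together -- each basename is UTF-8 decoded, transliterated
# to ASCII with Text::Unidecode, stripped of its extension and lowercased, and
# "uber_song" / "uber_song (1)" then share a long enough common substring.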
use 5.014;
use strict;
use warnings;

use File::Find qw(find);
use Getopt::Std qw(getopts);
use Text::Unidecode qw(unidecode);
my @dirs = grep { -d } @ARGV;

die <<"HELP" if !@dirs;
usage: $0 [options] /my/path [...]

Options:
    -f  : keep only the first file
    -l  : keep only the last file
HELP

my %opts;
if (@ARGV) {
    getopts("fl", \%opts);
}
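# compare_strings() returns 0 when two (already normalized) names are
# considered "the same": either they are equal, or a contiguous chunk of the
# shorter name, at least half as long as the longer name, occurs somewhere in
# the longer name. E.g. "holiday_photo" vs. "holiday_photo_copy": half of 18
# is 9, and the 9-character chunk "holiday_p" occurs in both. Any non-zero
# return value means the names are treated as different.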
sub compare_strings ($$) {
    my ($name1, $name2) = @_;

    return 0 if $name1 eq $name2;

    if (length($name1) > length($name2)) {
        ($name2, $name1) = ($name1, $name2);
    }

    my $len1 = length($name1);
    my $len2 = length($name2);

    my $min = int(0.5 + $len2 / 2);
    return -1 if $min > $len1;

    my $diff = $len1 - $min;
    foreach my $i (0 .. $diff) {
        foreach my $j ($i .. $diff) {
            if (index($name2, substr($name1, $i, $min + $j - $i)) != -1) {
                return 0;
            }
        }
    }

    return 1;
}
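# find_duplicated_files() takes a callback and a list of directories. It walks
# the directories with File::Find, collecting every regular (non-symlink) file
# together with a normalized basename (UTF-8 decoded, transliterated to ASCII
# with unidecode(), extension stripped, lowercased). Files whose normalized
# names compare as similar (compare_strings() == 0) are clustered, and the
# callback is invoked once per cluster with the sorted list of full paths.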
sub find_duplicated_files (&@) {
    my $code = shift;

    my %files;
    find {
        wanted => sub {
            lstat;
            -f _ && (not -l _) && push @{$files{"key"}},    # to group files by size, change the "key" to '-s _' (unquoted)
              {
                name      => do { utf8::decode($_); lc(unidecode($_) =~ s{\.\w+\z}{}r) },
                real_name => $File::Find::name,
              };
        }
    } => @_;

    foreach my $files (values %files) {

        next if $#{$files} < 1;

        my %dups;
        foreach my $i (0 .. $#{$files} - 1) {
            for (my $j = $i + 1 ; $j <= $#{$files} ; $j++) {
                if (compare_strings($files->[$i]{name}, $files->[$j]{name}) == 0) {
                    push @{$dups{$files->[$i]{real_name}}}, ${splice @{$files}, $j--, 1}{real_name};
                }
            }
        }

        while (my ($fparent, $fdups) = each %dups) {
            $code->(sort $fparent, @{$fdups});
        }
    }

    return 1;
}
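# Print each cluster of similar filenames, one path per line (via $,), followed
# by a line of 80 dashes. With -f, every file except the first of the sorted
# cluster is deleted; with -l, every file except the last.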
{
    local $, = "\n";
    find_duplicated_files {

        say @_, "-" x 80 if @_;

        foreach my $i (
                         $opts{f} ? (1 .. $#_)
                       : $opts{l} ? (0 .. $#_ - 1)
                       :            ()
          ) {
            unlink $_[$i] or warn "[error]: Can't delete: $!\n";
        }
    }
    @dirs;
}
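
# Example invocation (the directory below is only a placeholder). Options must
# precede the paths, since getopts() only parses leading switches:
#
#   perl find_similar_filenames_unidec.pl /my/music
#   perl find_similar_filenames_unidec.pl -f /my/music   # keep only the first file of each cluster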