word_suffix_top.pl 795 B

12345678910111213141516171819202122232425262728293031323334353637383940
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 05 April 2015
  5. # https://github.com/trizen
  6. # Word suffix top
  7. use 5.014;
  8. use autodie;
  9. use warnings;
  10. use Text::Unidecode qw(unidecode);
  11. my %top;
  12. my $file = shift() // die "usage: $0 file [suffix len]\n";
  13. my $i = shift() // 3;
  14. my $total = 0;
  15. {
  16. open my $fh, '<:utf8', $file;
  17. while (<$fh>) {
  18. s/[_\W]+\z//;
  19. if (/(\w{$i})\z/) {
  20. ++$top{lc(unidecode($1))};
  21. ++$total;
  22. }
  23. }
  24. close $fh;
  25. }
  26. my $lonely = 0;
  27. foreach my $key (sort { $top{$b} <=> $top{$a} or $a cmp $b } keys %top) {
  28. printf("%s%10s%10.02f%%\n", $key, $top{$key}, $top{$key} / $total * 100);
  29. ++$lonely if ($top{$key} == 1);
  30. }
  31. printf "\n** Unique suffixes: %.02f%%\n", $lonely / $total * 100;