reptop.pl 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 29 November 2011
  5. # Edit: 03 November 2012
  6. # https://github.com/trizen
  7. # Find how many times each word exists in a file.
  8. use 5.010;
  9. use strict;
  10. use warnings;
  11. use open IO => ':utf8', ':std';
  12. use Getopt::Long qw(GetOptions :config no_ignore_case);
  13. my $word; # count for a particular word
  14. my $regex; # split by regex
  15. my $lowercase; # lowercase words
  16. my $top = 0; # top of repeated words
  17. my $length = 1; # mimimum length of a word
  18. sub usage {
  19. print <<"HELP";
  20. usage: $0: [options] <file>
  21. \nOptions:
  22. -B : deactivate word match boundary (default: on)
  23. -L : lowercase every word (default: off)
  24. -w=s : show how many times a word repeats in the list
  25. -t=i : show a top list of 'i' words (default: $top)
  26. -l=i : minimum length of a valid word (default: $length)
  27. -r=s : split by a regular expression (default: \\W+)\n
  28. HELP
  29. exit 0;
  30. }
  31. usage() unless @ARGV;
  32. my $no_boundary;
  33. GetOptions(
  34. 'word|w=s' => \$word,
  35. 'top|t=i' => \$top,
  36. 'regex|r=s' => \$regex,
  37. 'no-boundary|B' => \$no_boundary,
  38. 'L|lowercase!' => \$lowercase,
  39. 'length|l=i' => \$length,
  40. 'help|h|usage' => \&usage,
  41. );
  42. my $boundary = $no_boundary ? '' : '\\b';
  43. $regex = defined $regex ? qr/$regex/ : qr/\W+/;
  44. foreach my $file (grep { -f } @ARGV) {
  45. my $file_content;
  46. open my $fh, '<:encoding(UTF-8)', $file or die "Unable to open file '$file': $!\n";
  47. read $fh, $file_content, -s $file;
  48. close $fh;
  49. if ($lowercase) {
  50. $file_content = lc $file_content;
  51. }
  52. study $file_content;
  53. if (defined($word)) {
  54. my $i = 0;
  55. ++$i while $file_content =~ /$boundary\Q$word\E$boundary/go;
  56. printf "Word '%s' repeats %d time%s in the list.\n", $word, $i, ($i == 1 ? '' : 's');
  57. next;
  58. }
  59. my %uniq;
  60. @uniq{split($regex, $file_content)} = ();
  61. my @out;
  62. foreach my $word (keys %uniq) {
  63. next unless length $word >= $length;
  64. my $i = 0;
  65. ++$i while $file_content =~ /$boundary\Q$word\E$boundary/g;
  66. push @out, [$i, $word];
  67. }
  68. my $i = 0;
  69. my @sorted = sort { $b->[0] <=> $a->[0] } @out;
  70. my $max = length $sorted[0][0];
  71. print "> $file\n";
  72. foreach my $out (@sorted) {
  73. printf "%*s -> %s\n", $max, $out->[0], $out->[1];
  74. last if $top and ++$i == $top;
  75. }
  76. }