12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- #!/usr/bin/perl
- # Daniel "Trizen" Șuteu
- # License: GPLv3
- # Date: 29 November 2011
- # Edit: 03 November 2012
- # https://github.com/trizen
- # Find how many times each word exists in a file.
- use 5.010;
- use strict;
- use warnings;
- use open IO => ':utf8', ':std';
- use Getopt::Long qw(GetOptions :config no_ignore_case);
- my $word; # count for a particular word
- my $regex; # split by regex
- my $lowercase; # lowercase words
- my $top = 0; # top of repeated words
- my $length = 1; # mimimum length of a word
- sub usage {
- print <<"HELP";
- usage: $0: [options] <file>
- \nOptions:
- -B : deactivate word match boundary (default: on)
- -L : lowercase every word (default: off)
- -w=s : show how many times a word repeats in the list
- -t=i : show a top list of 'i' words (default: $top)
- -l=i : minimum length of a valid word (default: $length)
- -r=s : split by a regular expression (default: \\W+)\n
- HELP
- exit 0;
- }
- usage() unless @ARGV;
- my $no_boundary;
- GetOptions(
- 'word|w=s' => \$word,
- 'top|t=i' => \$top,
- 'regex|r=s' => \$regex,
- 'no-boundary|B' => \$no_boundary,
- 'L|lowercase!' => \$lowercase,
- 'length|l=i' => \$length,
- 'help|h|usage' => \&usage,
- );
- my $boundary = $no_boundary ? '' : '\\b';
- $regex = defined $regex ? qr/$regex/ : qr/\W+/;
- foreach my $file (grep { -f } @ARGV) {
- my $file_content;
- open my $fh, '<:encoding(UTF-8)', $file or die "Unable to open file '$file': $!\n";
- read $fh, $file_content, -s $file;
- close $fh;
- if ($lowercase) {
- $file_content = lc $file_content;
- }
- study $file_content;
- if (defined($word)) {
- my $i = 0;
- ++$i while $file_content =~ /$boundary\Q$word\E$boundary/go;
- printf "Word '%s' repeats %d time%s in the list.\n", $word, $i, ($i == 1 ? '' : 's');
- next;
- }
- my %uniq;
- @uniq{split($regex, $file_content)} = ();
- my @out;
- foreach my $word (keys %uniq) {
- next unless length $word >= $length;
- my $i = 0;
- ++$i while $file_content =~ /$boundary\Q$word\E$boundary/g;
- push @out, [$i, $word];
- }
- my $i = 0;
- my @sorted = sort { $b->[0] <=> $a->[0] } @out;
- my $max = length $sorted[0][0];
- print "> $file\n";
- foreach my $out (@sorted) {
- printf "%*s -> %s\n", $max, $out->[0], $out->[1];
- last if $top and ++$i == $top;
- }
- }
|