large_file_search.pl 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 28 July 2014
  4. # https://github.com/trizen
  5. # Search for a list of keywords inside a very large file
  6. use 5.010;
  7. use strict;
  8. use autodie;
  9. use warnings;
  10. use Fcntl qw(SEEK_CUR);
  11. use List::Util qw(max);
  12. use Term::ANSIColor qw(colored);
  13. use Getopt::Long qw(GetOptions);
  14. # Input file for search
  15. my $file = __FILE__;
  16. # Print before and after characters
  17. my $before = 5;
  18. my $after = 5;
  19. # Buffer size
  20. my $buffer = 1024**2; # 1 MB
  21. sub usage {
  22. my ($code) = @_;
  23. print <<"USAGE";
  24. usage: $0 [options] [keywords]
  25. options:
  26. --file=s : a very large file
  27. --buffer=i : buffer size (default: $buffer bytes)
  28. --before=i : display this many characters before match (default: $before)
  29. --after=i : display this many characters after match (default: $after)
  30. --help : print this message and exit
  31. example:
  32. $0 --file=document.txt "Foo Bar"
  33. USAGE
  34. exit($code // 0);
  35. }
  36. GetOptions(
  37. 'buffer=i' => \$buffer,
  38. 'file=s' => \$file,
  39. 'before=i' => \$before,
  40. 'after=i' => \$after,
  41. 'help|h' => sub { usage(0) },
  42. );
  43. @ARGV || usage(1);
  44. my @keys = @ARGV;
  45. my $max = max(map { length } @keys);
  46. if ($buffer <= $max) {
  47. die "The buffer value can't be <= than the length of the longest keyword!\n";
  48. }
  49. sysopen(my $fh, $file, 0);
  50. while ((my $size = sysread($fh, (my $chunk), $buffer)) > 0) {
  51. # Search for a given keyword
  52. foreach my $keyword (@keys) {
  53. my $idx = -1;
  54. while (($idx = index($chunk, $keyword, $idx + 1)) != -1) {
  55. # Take the sub-string
  56. my $len = length($keyword);
  57. my $bar = '-' x (40 - $len / 2);
  58. my $from = $idx > $before ? $idx - $before : 0;
  59. my $sstr = substr($chunk, $from, $len + $after + $before);
  60. # Split the sub-string
  61. my $pos = index($sstr, $keyword);
  62. my $bef = substr($sstr, 0, $pos);
  63. my $cur = substr($sstr, $pos, $len);
  64. my $aft = substr($sstr, $pos + $len);
  65. # Highlight and print the results
  66. say $bar, $keyword, $bar, '-' x ($len % 2);
  67. say $bef, colored($cur, 'red'), $aft;
  68. say '-' x 80;
  69. { # Unpack and print the results as character-values
  70. local $, = ' ';
  71. say unpack('C*', $bef), colored(join($,, unpack('C*', $cur)), 'red'), unpack('C*', $aft);
  72. }
  73. say '-' x 80;
  74. }
  75. }
  76. # Rewind back a little bit because we
  77. # might be in the middle of a keyword
  78. if ($size == $buffer) {
  79. sysseek($fh, sysseek($fh, 0, SEEK_CUR) - $max, 0);
  80. }
  81. }
  82. close($fh);