html2text.pl 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 08 January 2022
  4. # https://github.com/trizen
  5. # Convert HTML to text (UTF-8), given an HTML file, a URL, or HTML piped in on STDIN.
  6. # Dependencies:
  7. # perl-html-tree
  8. # perl-html-formatter
  9. # perl-libwww (optional: when given URLs)
  10. # perl-lwp-protocol-https (optional: when given https:// URLs)
  11. # See also:
  12. # https://github.com/grobian/html2text
  13. use 5.020;
  14. use strict;
  15. use warnings;
  16. use experimental qw(signatures);
  17. use HTML::TreeBuilder 5 qw(-weak);
  18. use HTML::FormatText qw();
  19. use Getopt::Long qw(GetOptions);
  # Decode STDIN and encode STDOUT as UTF-8. The strict ":encoding(UTF-8)"
  # layer is used instead of ":utf8" because ":utf8" only marks the data as
  # UTF-8 without validating it, which lets malformed input through unchecked.
  binmode(STDIN,  ':encoding(UTF-8)');
  binmode(STDOUT, ':encoding(UTF-8)');
  # Fetch raw HTML from one of three kinds of source:
  #   * a GLOB reference (e.g. \*STDIN)        - the rest of the handle is slurped;
  #   * a string starting with http(s)://      - downloaded with LWP::UserAgent;
  #   * anything else                          - taken as a file name, read as UTF-8.
  #
  # Returns the HTML as a string. When the HTTP request is not successful, a
  # warning with the response status line is emitted and nothing is returned
  # (undef in scalar context). Dies when the file cannot be opened or closed.
  sub extract_html ($source) {

      # An already-open handle: read everything that is left on it.
      if (ref($source) eq 'GLOB') {
          local $/;    # slurp mode
          return scalar <$source>;
      }

      # URL schemes are case-insensitive, so "HTTPS://..." must not be
      # mistaken for a file name.
      if ($source =~ m{\Ahttps?://}i) {

          # Loaded lazily: these modules are only needed when a URL is given.
          require LWP::UserAgent;
          require HTTP::Message;

          my $lwp = LWP::UserAgent->new(
              env_proxy  => 1,
              timeout    => 15,
              agent      => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
              cookie_jar => {},

              # NOTE(review): host name verification is switched off, which leaves
              # https:// downloads open to man-in-the-middle tampering. Kept as-is
              # so that sites with broken certificates keep working; consider
              # enabling it.
              ssl_opts => {verify_hostname => 0},
          );

          # Computed once: the list of content encodings this installation can decode.
          state $accepted_encodings = HTTP::Message::decodable();
          $lwp->default_header('Accept-Encoding' => $accepted_encodings);

          my $resp = $lwp->get($source);

          if (not $resp->is_success) {
              warn "error: unable to fetch <<$source>>: " . $resp->status_line . "\n";
              return;
          }

          return scalar $resp->decoded_content;
      }

      # Plain file. ":encoding(UTF-8)" validates the input, unlike ":utf8".
      open my $fh, '<:encoding(UTF-8)', $source
        or die "Can't open file <<$source>> for reading: $!";

      my $html = do {
          local $/;    # slurp mode
          <$fh>;
      };

      close $fh
        or die "Can't close file <<$source>>: $!";

      return $html;
  }
  # Render an HTML string as text.
  #   $html      - the HTML markup to parse
  #   $formatter - object whose format() method is given the parsed tree
  # Returns the value produced by $formatter->format().
  sub html2text ($html, $formatter) {
      my $root = HTML::TreeBuilder->new();

      # Hand over the whole document at once, then signal end of input.
      $root->parse($html);
      $root->eof();

      $root->elementify();    # just for safety

      return scalar $formatter->format($root);
  }
  # Default margins; GetOptions may overwrite them, and help() shows them.
  my $left_margin  = 0;
  my $right_margin = 80;

  # Print the usage text to STDOUT, then terminate the program.
  #   $exit_code - process exit status (default: 0)
  sub help ($exit_code = 0) {
      my @usage_lines = (
          "usage: $0 [options] [URL or HTML file]",
          "-lm --left=i : the column of the left margin. (default: $left_margin)",
          "-rm --right=i : the column of the right margin. (default: $right_margin)",
      );

      print join('', map { "$_\n" } @usage_lines);
      exit($exit_code);
  }
  # Parse the command line. On an invalid option, report it and show the
  # usage text with exit status 1.
  GetOptions(
      "lm|left=i"  => \$left_margin,
      "rm|right=i" => \$right_margin,
      "h|help"     => sub { help(0) },
  )
    or do {
      warn("Error in command line arguments\n");
      help(1);
    };

  my $stdin_on_tty = -t STDIN;

  # Choose the input. An explicit URL / file argument always wins; STDIN is
  # only used when no argument is given and STDIN is not a terminal (i.e. data
  # is piped or redirected in). Testing the terminal first would silently
  # ignore the argument whenever the script is started with a non-interactive
  # STDIN, e.g. from cron or from another program.
  my $source;
  if (@ARGV) {
      $source = $ARGV[0];
  }
  elsif (not $stdin_on_tty) {
      $source = \*STDIN;
  }
  else {
      warn "\nerror: no URL or HTML file provided!\n\n";
      help(2);
  }

  my $formatter = HTML::FormatText->new(
      leftmargin  => $left_margin,
      rightmargin => $right_margin,
  );

  # extract_html() yields undef when the content could not be fetched.
  my $html = extract_html($source);
  defined($html) or die "error: unable to extract HTML content";

  my $text = html2text($html, $formatter);
  defined($text) or die "error: unable to extract text";

  print $text;