rosettacode_to_markdown.pl 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 24 April 2015
  5. # Edit: 09 December 2023
  6. # Website: https://github.com/trizen
  7. # Extract markdown code from each task for a given programming language.
  8. use utf8;
  9. use 5.020;
  10. use strict;
  11. use autodie;
  12. use warnings;
  13. use experimental qw(signatures);
  14. use Text::Tabs qw(expand);
  15. use Encode qw(decode_utf8);
  16. use Getopt::Long qw(GetOptions);
  17. use File::Path qw(make_path);
  18. use LWP::UserAgent::Cached qw();
  19. use URI::Escape qw(uri_unescape uri_escape);
  20. use HTML::Entities qw(decode_entities);
  21. use File::Spec::Functions qw(catfile catdir);
  22. binmode(STDOUT, ':utf8');
  23. binmode(STDERR, ':utf8');
  24. sub escape_markdown ($t) {
  25. $t =~ s{([*_`])}{\\$1}g;
  26. return $t;
  27. }
  28. sub escape_lang ($s) {
  29. $s =~ s/\s/_/gr; # replace whitespace with underscores
  30. }
  31. sub _ulist ($s) {
  32. $s =~ s{<li>(.*?)</li>}{* $1\n}gsr;
  33. }
  34. sub _olist ($s) {
  35. my $i = 1;
  36. $s =~ s{<li>(.*?)</li>}{$i++ . '. ' . "$1\n"}egsr;
  37. }
  38. sub tags_to_markdown ($t, $escape = 0) {
  39. my $out = '';
  40. until ($t =~ /\G\z/gc) {
  41. if ($t =~ m{\G<br\h*/\h*>}gc) {
  42. $out .= "\n";
  43. }
  44. elsif ($t =~ m{\G<b>(.*?)</b>}gcs) {
  45. $out .= "**" . tags_to_markdown($1, 1) . "**";
  46. }
  47. elsif ($t =~ m{\G<i>(.*?)</i>}gcs) {
  48. $out .= "*" . tags_to_markdown($1, 1) . "*";
  49. }
  50. elsif ($t =~ m{\G<code>(.*?)</code>}gcs) {
  51. $out .= "`" . decode_entities($1) . "`";
  52. }
  53. elsif ($t =~ m{\G<tt>(.*?)</tt>}gcs) {
  54. $out .= "`" . decode_entities($1) . "`";
  55. }
  56. elsif ($t =~ m{\G<a\b.*? href="(.*?)".*?>(.*?)</a>}gcs) {
  57. my ($url, $label) = ($1, $2);
  58. if ($url =~ m{^/}) {
  59. $url = 'https://rosettacode.org' . $url;
  60. }
  61. $label = tags_to_markdown($label);
  62. $out .= "[$label]($url)";
  63. }
  64. elsif ($t =~ m{\G(<img\b.*? src="/mw/.*?".*?/>)}gc) {
  65. my $html = $1;
  66. $html =~ s{ src="\K/mw/}{https://rosettacode.org/mw/};
  67. $html =~ s{ srcset=".*?"}{};
  68. $out .= $html;
  69. }
  70. elsif ($t =~ m{\G<span><span class="mwe-math-mathml-inline mwe-math-mathml-a11y"}gc) {
  71. $t =~ m{\G.*?</span>}gsc;
  72. if ($t =~ m{\G<meta class="mwe-math-fallback-image-inline".*? url\(&#39;(/mw/index\.php\?(?:.*?))&#39;\).*?/></span>}gc) {
  73. $out .= '![image](https://rosettacode.org' . decode_entities($1) . ')';
  74. }
  75. else {
  76. warn "[!] Failed to parse math meta class!\n";
  77. }
  78. }
  79. elsif ($t =~ m{\G<ul>(.*?)</ul>}gcs) {
  80. $out .= _ulist(tags_to_markdown($1, 1));
  81. }
  82. elsif ($t =~ m{\G<ol>(.*?)</ol>}gcs) {
  83. $out .= _olist(tags_to_markdown($1, 1));
  84. }
  85. elsif ($t =~ /\G([^<]+)/gc) {
  86. $out .= $escape ? escape_markdown($1) : $1;
  87. }
  88. elsif ($t =~ /\G(.)/gcs) {
  89. $out .= $escape ? escape_markdown($1) : $1;
  90. }
  91. }
  92. return $out;
  93. }
  94. sub strip_tags ($s) {
  95. $s =~ s/<.*?>//gsr; # remove HTML tags
  96. }
  97. sub strip_space ($s) {
  98. unpack('A*', $s =~ s/^\s+//r); # remove leading and trailing whitespace
  99. }
  100. sub extract_tasks ($content, $lang) {
  101. my $i = index($content, qq{<h2>Pages in category "$lang"</h2>});
  102. if ($i == -1) {
  103. warn "[!] Can't find any tasks for language: <$lang>!\n";
  104. return;
  105. }
  106. my $tasks_content = substr($content, $i);
  107. my @tasks;
  108. while ($tasks_content =~ m{<a href="/wiki/(.+?)" title=".+?">(.+?)</a></li>}g) {
  109. my ($task, $label) = ($1, $2);
  110. last if $task eq 'Special:Categories';
  111. push @tasks,
  112. {
  113. name => decode_utf8(uri_unescape($task)),
  114. title => $label,
  115. };
  116. }
  117. return \@tasks;
  118. }
  119. sub extract_all_tasks ($main_url, $path_url, $lang) {
  120. my $lwp_uc = LWP::UserAgent->new(
  121. show_progress => 1,
  122. agent => '',
  123. timeout => 60,
  124. );
  125. my $tasks_url = $main_url . $path_url;
  126. my $resp = $lwp_uc->get($tasks_url);
  127. $resp->is_success || die $resp->status_line;
  128. my $content = $resp->decoded_content;
  129. my $tasks = extract_tasks($content, $lang);
  130. my @all_tasks = @$tasks;
  131. if ($content =~ m{<a href="([^"]+)" title="[^"]+">next page</a>}) {
  132. push @all_tasks, __SUB__->($main_url, $1, $lang);
  133. }
  134. return @all_tasks;
  135. }
  136. sub extract_lang ($content, $lang, $lang_alias = $lang) {
  137. my $header = sub {
  138. qq{<span class="mw-headline" id="$_[0]">};
  139. };
  140. my $i = index($content, $header->($lang));
  141. # Try with the language escaped
  142. if ($i == -1) {
  143. $i = index($content, $header->(escape_lang($lang)));
  144. }
  145. # Try with the language alias
  146. if ($i == -1) {
  147. $i = index($content, $header->($lang_alias));
  148. }
  149. # Try with the language alias escaped
  150. if ($i == -1) {
  151. $i = index($content, $header->(escape_lang($lang_alias)));
  152. }
  153. # Give up
  154. if ($i == -1) {
  155. warn "[!] Can't find language: <$lang>\n";
  156. return;
  157. }
  158. my $j = index($content, '<h2>', $i);
  159. if ($j == -1) {
  160. $j = index($content, '<div class="printfooter">', $i);
  161. }
  162. if ($j == -1) {
  163. state $x = 0;
  164. if (++$x <= 3) {
  165. warn "[!] Position `j` will point at the end of the page...\n";
  166. }
  167. $j = length($content);
  168. }
  169. $i = index($content, '</h2>', $i);
  170. if ($i == -1) {
  171. warn "[!] Can't find the end of the header!\n";
  172. return;
  173. }
  174. $i += 5; # past the end of the header
  175. my $part = strip_space(substr($content, $i, $j - $i));
  176. # remove <script> tags
  177. $part =~ s{<script\b.+?</script>}{}gsi;
  178. # replace [email protected] with 'email@example.net'
  179. $part =~ s{<a class="__cf_email__".+?</a>}{email\@example.net}gsi;
  180. my @data;
  181. until ($part =~ /\G\z/gc) {
  182. if ($part =~ m{\G<pre class="(.+?) highlighted_source">(.+)</pre>}gc) { # old way
  183. push @data,
  184. {
  185. code => {
  186. lang => $1,
  187. data => $2,
  188. }
  189. };
  190. }
  191. elsif ($part =~ m{\G<div class="[^"]*mw-highlight-lang-(\S+)[^"]*" dir="ltr"><pre>(.*?)</pre>}sgc) { # new way
  192. push @data,
  193. {
  194. code => {
  195. lang => $1,
  196. data => $2,
  197. }
  198. };
  199. }
  200. elsif ($part =~ m{\G<h([1-4])>(.*?)</h[1-4]>}sgc) {
  201. push @data,
  202. {
  203. header => {
  204. n => $1,
  205. data => $2,
  206. }
  207. };
  208. }
  209. elsif ($part =~ m{\G<p>(.*?)</p>}sgc) {
  210. push @data,
  211. {
  212. text => {
  213. tag => 'p',
  214. data => $1,
  215. },
  216. };
  217. }
  218. elsif ($part =~ m{\G<pre\b[^>]*>(.*?)</pre>}sgc) {
  219. push @data,
  220. {
  221. text => {
  222. tag => 'pre',
  223. data => $1,
  224. }
  225. };
  226. }
  227. elsif ($part =~ m{\G(.)}sgc) {
  228. @data && exists($data[-1]{unknown})
  229. ? ($data[-1]{unknown}{data} .= $1)
  230. : (push @data, {unknown => {data => $1}});
  231. }
  232. }
  233. return \@data;
  234. }
  235. sub to_html ($lang_data) {
  236. my $text = '';
  237. foreach my $item (@{$lang_data}) {
  238. if (exists $item->{text}) {
  239. $text .= qq{<$item->{text}{tag}>$item->{text}{data}</$item->{text}{tag}>};
  240. }
  241. elsif (exists $item->{code}) {
  242. $text .= qq{<pre class="lang $item->{code}{lang}">$item->{code}{data}</pre>};
  243. }
  244. }
  245. return $text;
  246. }
  247. sub to_markdown ($lang_data) {
  248. my $text = '';
  249. my $has_output = 1;
  250. foreach my $item (@{$lang_data}) {
  251. if (exists $item->{header}) {
  252. my $n = $item->{header}{n};
  253. my $data = $item->{header}{data};
  254. my $t = strip_tags(tags_to_markdown(strip_space($data), 1));
  255. $t =~ s/\[\[edit\].*//s;
  256. $text .= "\n\n" . ('#' x $n) . ' ' . $t . "\n\n";
  257. }
  258. elsif (exists $item->{text}) {
  259. my $data = $item->{text}{data};
  260. my $tag = $item->{text}{tag};
  261. if ($tag eq 'p') {
  262. my $t = tags_to_markdown(strip_space($data), 1);
  263. $text .= "\n\n" . $t . "\n\n";
  264. $has_output = 1;
  265. }
  266. elsif ($tag eq 'pre') {
  267. my $t = decode_entities($data);
  268. $t =~ s/^(?:\R)+//;
  269. $t =~ s/(?:\R)+\z//;
  270. $t = join("\n", expand(split(/\R/, $t)));
  271. $text .= "\n#### Output:" if !$has_output;
  272. $text .= "\n```\n$t\n```\n";
  273. }
  274. }
  275. elsif (exists $item->{code}) {
  276. my $code = decode_entities(strip_tags(tags_to_markdown($item->{code}{data})));
  277. my $lang = $item->{code}{lang};
  278. $code =~ s/\[(\w+)\]\(https?:.*?\)/$1/g;
  279. $code =~ s{(?:\R)+\z}{};
  280. $text .= "```$lang\n$code\n```\n";
  281. $has_output = 0;
  282. }
  283. }
  284. return strip_space($text);
  285. }
  286. sub write_to_file ($base_dir, $name, $markdown, $overwrite = 0) {
  287. # Remove parenthesis
  288. $name =~ tr/()//d;
  289. # Substitute bad characters
  290. #$name =~ tr{-A-Za-z0-9[]'*_/À-ÿ}{_}c;
  291. $name =~ s{[^\pL\pN\[\]'*/\-]+}{ }g;
  292. # Replace multiple spaces with a single underscore
  293. $name = join('_', split(' ', $name));
  294. my $char = uc(substr($name, 0, 1));
  295. my $dir = catdir($base_dir, $char);
  296. # Remove directory paths from name (if any)
  297. if ($name =~ s{^(.*)/}{}) {
  298. my $dirname = $1;
  299. $dir = catdir($dir, map { $_ eq 'Sorting_Algorithms' ? 'Sorting_algorithms' : $_ } split(/\//, $dirname));
  300. }
  301. # Create directory if it doesn't exists
  302. if (not -d $dir) {
  303. make_path($dir) or do {
  304. warn "[!] Can't create path `$dir`: $!\n";
  305. return;
  306. };
  307. }
  308. my $file = catfile($dir, "$name.md");
  309. if (not $overwrite) {
  310. return 1 if -e $file; # Don't overwrite existent files
  311. }
  312. say "** Creating file: $file";
  313. open(my $fh, '>:encoding(UTF-8)', $file) or do {
  314. warn "[!] Can't create file `$file`: $!";
  315. return;
  316. };
  317. print {$fh} $markdown;
  318. close $fh;
  319. }
  320. #
  321. ## MAIN
  322. #
  323. my $cache_dir = 'cache';
  324. my $lang = 'Sidef';
  325. my $lang_alias = undef;
  326. my $overwrite = 0;
  327. my $base_dir = 'programming_tasks';
  328. my $main_url = 'https://rosettacode.org';
  329. sub usage {
  330. print <<"EOT";
  331. usage: $0 [options]
  332. options:
  333. --lang=s : the programming language name (default: $lang)
  334. --base-dir=s : where to save the files (default: $base_dir)
  335. --overwrite! : overwrite existent files (default: $overwrite)
  336. --cache-dir=s : cache directory (default: $cache_dir)
  337. --main-url=s : main URL (default: $main_url)
  338. --help : print this message and exit
  339. example:
  340. $0 --lang=Perl --base-dir=perl_tasks
  341. EOT
  342. exit;
  343. }
  344. GetOptions(
  345. 'cache-dir=s' => \$cache_dir,
  346. 'L|language=s' => \$lang,
  347. 'base-dir=s' => \$base_dir,
  348. 'main-url=s' => \$main_url,
  349. 'overwrite!' => \$overwrite,
  350. 'help' => \&usage,
  351. )
  352. or die "[!] Error in command line arguments!";
  353. if (not -d $cache_dir) {
  354. mkdir($cache_dir);
  355. }
  356. my $lwp = LWP::UserAgent::Cached->new(
  357. timeout => 60,
  358. show_progress => 1,
  359. agent => '',
  360. cache_dir => $cache_dir,
  361. nocache_if => sub {
  362. my ($response) = @_;
  363. my $code = $response->code;
  364. return 1 if ($code >= 300); # do not cache any bad response
  365. return 1 if ($code == 401); # don't cache an unauthorized response
  366. return 1 if ($response->request->method ne 'GET'); # cache only GET requests
  367. return;
  368. },
  369. );
  370. {
  371. my $accepted_encodings = HTTP::Message::decodable();
  372. $lwp->default_header('Accept-Encoding' => $accepted_encodings);
  373. require LWP::ConnCache;
  374. my $cache = LWP::ConnCache->new;
  375. $cache->total_capacity(undef); # no limit
  376. $lwp->conn_cache($cache);
  377. }
  378. my @tasks = extract_all_tasks($main_url, '/wiki/' . escape_lang($lang), $lang);
  379. sub my_uri_escape ($path) {
  380. $path =~ s/([?'+])/uri_escape($1)/egr;
  381. }
  382. foreach my $task (@tasks) {
  383. my $name = $task->{name};
  384. my $title = $task->{title};
  385. my $url = "$main_url/wiki/" . my_uri_escape($name);
  386. my $resp = $lwp->get($url);
  387. if ($resp->is_success) {
  388. my $content = $resp->decoded_content;
  389. my $lang_data = extract_lang($content, $lang, $lang_alias) // do { $lwp->uncache; next };
  390. my $header = "[1]: $url\n\n" . "# [$title][1]\n\n";
  391. my $markdown = $header . to_markdown($lang_data) . "\n";
  392. write_to_file($base_dir, $name, $markdown, $overwrite);
  393. }
  394. else {
  395. warn "[" . $resp->status_line . "] Can't fetch: $url\n";
  396. }
  397. }