undo-local-names.pl 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. #! /usr/bin/perl -w
  2. # Copyright (C) 2015 Alex Schroeder <alex@gnu.org>
  3. #
  4. # This program is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation; either version 3 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. use utf8;
  17. use strict;
  18. use warnings;
  19. undef $/; # slurp
  20. my %index = ();
  21. my $verbose = '';
  22. my $LinkPattern = '(\p{Uppercase}+\p{Lowercase}+\p{Uppercase}\p{Alphabetic}*)';
  23. my $FreeLinkPattern = "([-,.()'%&?;<> _1-9A-Za-z\x{0080}-\x{fffd}]|[-,.()'%&?;<> _0-9A-Za-z\x{0080}-\x{fffd}][-,.()'%&?;<> _0-9A-Za-z\x{0080}-\x{fffd}]+)";
  24. my $UrlProtocols = 'http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|prospero|telnet|gopher|irc|feed';
  25. my $UrlChars = '[-a-zA-Z0-9/@=+$_~*.,;:?!\'"()&#%]'; # see RFC 2396
  26. my $FullUrlPattern="((?:$UrlProtocols):$UrlChars+)"; # when used in square brackets
  27. # either a single letter, or a string that begins with a single letter and ends with a non-space
  28. my $words = '([A-Za-z\x{0080}-\x{fffd}](?:[-%.,:;\'"!?0-9 A-Za-z\x{0080}-\x{fffd}]*?[-%.,:;\'"!?0-9A-Za-z\x{0080}-\x{fffd}])?)';
  29. # zero-width assertion to prevent km/h from counting
  30. my $nowordstart = '(?:(?<=[^-0-9A-Za-z\x{0080}-\x{fffd}])|^)';
  31. # zero-width look-ahead assertion to prevent km/h from counting
  32. my $nowordend = '(?=[^-0-9A-Za-z\x{0080}-\x{fffd}]|$)';
  33. my $IrcNickRegexp = qr{[]a-zA-Z^[;\\`_{}|][]^[;\\`_{}|a-zA-Z0-9-]*};
  34. sub FreeToNormal { # trim all spaces and convert them to underlines
  35. my $id = shift;
  36. return '' unless $id;
  37. $id =~ s/ /_/g;
  38. $id =~ s/__+/_/g;
  39. $id =~ s/^_//;
  40. $id =~ s/_$//;
  41. return $id;
  42. }
  43. sub parse_local_names {
  44. my $filename = shift;
  45. print "Reading $filename\n";
  46. open(my $fh, '<:utf8', $filename) or die "Cannot read $filename: $!";
  47. my $data = <$fh>;
  48. close($fh);
  49. print "Parsing $filename\n";
  50. my %names = ();
  51. while ($data =~ m/\[$FullUrlPattern\s+([^\]]+?)\]/g) {
  52. my ($page, $url) = ($2, $1);
  53. my $id = FreeToNormal($page);
  54. $names{$id} = $url;
  55. }
  56. return \%names;
  57. }
  58. sub write_file {
  59. my ($file, $data) = @_;
  60. return unless $data;
  61. open(my $fh, '>:utf8', $file) or die "Cannot write $file: $!";
  62. print $fh $data;
  63. close($fh);
  64. }
  65. sub replacement {
  66. my ($names, $id, $pos, @no_go) = @_;
  67. while (@no_go) {
  68. my $first = shift @no_go;
  69. print "Is $pos between " . $first->[0] . " and " . $first->[1] . "?\n" if $verbose;
  70. return $id if $pos >= $first->[0] and $pos <= $first->[1];
  71. }
  72. return "[[$id]]" if exists $index{$id}; # local page exists
  73. return $id unless $names->{$id};
  74. return '[' . $names->{$id} . ' ' . $id . ']';
  75. }
  76. sub translate_file {
  77. my ($names, $data) = @_;
  78. my @no_go = ();
  79. while ($data =~ /( <nowiki>.*?<\/nowiki>
  80. | <code>.*?<\/code>
  81. | ^ <pre> (.*\n)+ <\/pre>
  82. | ^ {{{ (.*\n)+ }}}
  83. | ${nowordstart} \* ${words} \* ${nowordend}
  84. | ${nowordstart} \/ ${words} \/ ${nowordend}
  85. | ${nowordstart} \_ ${words} \_ ${nowordend}
  86. | ${nowordstart} \! ${words} \! ${nowordend}
  87. | \[\[ $FreeLinkPattern .*? \]\]
  88. | \[ $FullUrlPattern \s+ [^\]]+? \]
  89. | ^( \h+.+\n )+
  90. | ^(?: \[? \d\d?:\d\d (?:am|pm)? \]? )? \s* < $IrcNickRegexp > )/gmx) {
  91. push @no_go, [pos($data) - length $1, pos($data)];
  92. print "no go from " . $no_go[-1]->[0] . ".." . $no_go[-1]->[1] . " for $1\n" if $verbose;
  93. }
  94. my $subs = $data =~ s/(?<![:![])\b$LinkPattern(?![:])/replacement($names, $1, pos($data), @no_go)/ge;
  95. return $data if $subs;
  96. }
  97. sub read_file {
  98. my $file = shift;
  99. open(my $fh, '<:utf8', $file) or die "Cannot read $file: $!";
  100. my $data = <$fh>;
  101. close($fh);
  102. return $data;
  103. }
  104. sub main {
  105. my ($dir, $local_names) = @_;
  106. mkdir($dir . '-new') or die "Cannot create $dir-new: $!";
  107. my $names = parse_local_names("$dir/$local_names");
  108. print "Indexing files\n";
  109. foreach my $file (glob("$dir/.* $dir/*")) {
  110. next unless $file =~ /$dir\/(.+)/;
  111. my $id = $1;
  112. next if $id eq ".";
  113. next if $id eq "..";
  114. next if $id eq "$local_names";
  115. $index{$id} = 1;
  116. }
  117. print "Converting files\n";
  118. foreach my $id (sort keys %index) {
  119. # this is where you debug a particular page
  120. # $verbose = $id eq '2014-12-18_Emacs_Wiki_Migration';
  121. write_file("$dir-new/$id", translate_file($names, read_file("$dir/$id")));
  122. }
  123. }
  124. use Getopt::Long;
  125. my $names = 'LocalNames';
  126. my $dir = 'raw';
  127. my $help = '';
  128. GetOptions ("names=s" => \$names,
  129. "dir=s" => \$dir,
  130. "help" => \$help);
  131. if ($help) {
  132. print qq{
  133. Usage: $0 [--dir=DIR] [--names=LocalNames]
  134. You need to use the raw.pl script to create a directory full of raw
  135. wiki text files.
  136. --dir=DIR is where the raw wiki text files are. Default: raw. The
  137. converted files will be stored in DIR-new, ie. in raw-new by
  138. default.
  139. --names=LocalNames is the page name with all the local names on
  140. it. Default: LocalNames
  141. Example: $0 --dir=~/alexschroeder/raw --names=Names
  142. }
  143. } else {
  144. main ($dir, $names);
  145. }