dicelister.pl 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. #!/usr/bin/env perl
  2. package dicelister;
  3. use strict;
  4. use warnings;
  5. use autodie;
  6. # UTF-8 support
  7. use utf8;
  8. # To use decode_utf8
  9. require Encode;
  10. # Allow wide character outputs without warning
  11. binmode STDOUT, ":utf8";
  12. # To enable the use of unicode throughout perl functions
  13. use feature 'unicode_strings';
  14. # Anything that opens a filehandle is to assume that that stream is
  15. # encoded in UTF‑8 unless you tell it otherwise
  16. use open qw( :encoding(UTF-8) :std );
  17. # To convert base of a number
  18. use Math::Base::Convert;
  19. # To handle CLI parameters
  20. use Getopt::Long;
  21. # For array_contains subroutine
  22. use List::Util qw(any);
  23. # For fetching remote URL
  24. use HTTP::Tiny;
  25. use File::Basename;
  26. use Cwd 'abs_path';
  27. our $dir = abs_path(dirname($0));
  28. our $data_dir = "${dir}/data";
  29. # To extract HTML
  30. use Mojo::DOM;
  31. # For SHA1 hash
  32. use Digest::SHA qw(sha1_hex);
  33. # Mode
  34. # 1 = Extract mode
  35. # 2 = Index mode
  36. our $mode = 1;
  37. ## Default values
  38. # Name used mainly for config and output file names
  39. our $config_name = 'default';
  40. # To hold filename of rough list text file
  41. our $rough_list_file;
  42. # To hold filename of indexed list text file
  43. our $indexed_list_file;
  44. # Minimum length of word allowed to be in list
  45. our $minimum_word_length = 3;
  46. # Maximum length of word allowed to be in list
  47. our $maximum_word_length = 14;
  48. # Excluded strings regex
  49. our $excluded_strings_re;
  50. # Minimum number of dots the dice can get
  51. our $dice_face_min = 1;
  52. # Maximum number of dots the dice can get
  53. our $dice_face_max = 6;
  54. # Diceware index length
  55. our $index_length = 5;
  56. # Cache dir for config
  57. our $cache_dir;
  58. our @unique_words;
  59. our @data_sources = (
  60. );
  61. our @exclude_strings = (
  62. '.',
  63. ',',
  64. '!',
  65. '?',
  66. ';',
  67. ':',
  68. '(',
  69. ')',
  70. '[',
  71. ']',
  72. '{',
  73. '}',
  74. '<',
  75. '>',
  76. '/',
  77. '\\',
  78. '+',
  79. '-',
  80. '_',
  81. '"',
  82. "'",
  83. '‘',
  84. '’',
  85. '″',
  86. '°',
  87. '•',
  88. '→',
  89. '←',
  90. '–',
  91. '#',
  92. '&',
  93. '$',
  94. '£',
  95. '%',
  96. '®',
  97. '©',
  98. );
  99. sub help_text {
  100. print("usage: dicelister.pl [-h] [-c CONFIG_FILE] [-e] [-i]
  101. Generates a Diceware wordlist based on config.
  102. optional arguments:
  103. -h, --help show this help message and exit
  104. -c CONFIG_FILE, --config CONFIG_FILE
  105. config file (without extension)
  106. -e, --extract
  107. set extract mode
  108. -i, --index
  109. set index mode
  110. ");
  111. exit;
  112. }
  113. # Process CLI parameters and update config values as necessary
  114. GetOptions ("c|config=s" => \$config_name,
  115. "e|extract" => sub { $mode = 1 },
  116. "i|index" => sub { $mode = 2 },
  117. "h|help" => \&help_text)
  118. or die("Error in command line arguments. Please review and try again.\n");
  119. require "${dir}/config/${config_name}.pl" or die("Error: ${dir}/config/${config_name}.pl is not found");
  120. # Hook: Runs before rough file preparation process is started
  121. if (eval "defined(&after_config_import)") {
  122. after_config_import();
  123. }
  124. # Variable values based on config name
  125. $rough_list_file = "${config_name}.rough.txt";
  126. $indexed_list_file = "${config_name}.wordlist.txt";
  127. $cache_dir = "${data_dir}/${config_name}/_cache";
  128. unless ( -d "${data_dir}/${config_name}" ) {
  129. mkdir "${data_dir}/${config_name}";
  130. }
  131. if ( $mode == 1 ) {
  132. # Hook: Runs before rough file preparation process is started
  133. if (eval "defined(&before_rough_file_process)") {
  134. before_rough_file_process();
  135. }
  136. ## Prepare for regex ##
  137. # Declare the subroutine if it hasn't been overriden
  138. unless (eval "defined(&escape_regex)") {
  139. # Escapes characters in regex pattern.
  140. # Does not handle characters inside character classes separately.
  141. # Ref: https://stackoverflow.com/a/400316
  142. sub escape_regex {
  143. my $pattern = shift;
  144. if ( defined $pattern ) {
  145. $pattern =~ s/(\.|\^|\$|\*|\+|\?|\(|\)|\[|\{|\\|\|)/\\$1/ig;
  146. return $pattern;
  147. }
  148. }
  149. }
  150. escape_regex();
  151. # Prepare the regex string for removal of excluded strings
  152. unless (eval "defined(&prepare_exclude_regex)") {
  153. sub prepare_exclude_regex {
  154. foreach ( @exclude_strings ) {
  155. $_ = escape_regex($_);
  156. }
  157. $excluded_strings_re = join('|', @exclude_strings);
  158. }
  159. }
  160. prepare_exclude_regex();
  161. ## Get source content ##
  162. unless (eval "defined(&check_sources)") {
  163. sub check_sources {
  164. if ( scalar @data_sources < 1 ) {
  165. die("No sources are defined. Please add some sources in \@data_sources array on your config file.");
  166. }
  167. }
  168. }
  169. check_sources();
  170. # Get source content text
  171. our $text = '';
  172. unless (eval "defined(&get_source_file)") {
  173. sub get_source_file {
  174. my $filepath = shift;
  175. if ( defined $filepath ) {
  176. open( my $file, "<", $filepath ) or die("File ${filepath} not found");
  177. my $file_content = do { local $/; <$file> };
  178. return "$file_content";
  179. }
  180. }
  181. }
  182. # Requires: HTTP::Tiny
  183. unless (eval "defined(&get_source_http)") {
  184. sub get_source_http {
  185. my $url = shift;
  186. my $html;
  187. # Write to cache
  188. unless ( -d $cache_dir ) {
  189. mkdir $cache_dir;
  190. }
  191. # substr call is to strip an extra \n at the end
  192. my $cache_file = substr join('', $cache_dir, '/', sha1_hex($url)), 0, -1;
  193. # If cache file is not present write HTML to cache
  194. unless ( -e $cache_file ) {
  195. my $response = HTTP::Tiny->new->get($url);
  196. if ( $response->{success} ) {
  197. open(my $CWF, '>', $cache_file) or print "Can't write cache file. $!\n";
  198. print $CWF $response->{content};
  199. close($CWF);
  200. $html = $response->{content};
  201. } else {
  202. print "Failed to fetch ${url} ...\n";
  203. }
  204. # If cache is present, use it
  205. } else {
  206. print "Found cache... using it instead of fetching...\n";
  207. open my $CRF, '<', $cache_file or die "Can't open cache file $!";
  208. $html = do { local $/; <$CRF> };
  209. }
  210. # Process
  211. my $dom = Mojo::DOM->new( $html );
  212. my $html_text = $dom->find('p')->map('text')->join("\n");
  213. return Encode::decode_utf8($html_text);
  214. }
  215. }
  216. # Get source content for each data source
  217. unless (eval "defined(&get_source_contents)") {
  218. sub get_source_contents {
  219. foreach (@data_sources) {
  220. # Local file
  221. if ( $_ =~ /^file\:\/\/(.*)/i ) {
  222. print "Processing $1 ...\n";
  223. my $file_content = get_source_file("$1");
  224. $text = "$text $file_content";
  225. # HTTP(S) URL
  226. } elsif ( $_ =~ /^https*\:\/\//i ) {
  227. print "Processing $_ ...\n";
  228. $text = "$text " . get_source_http("$_");
  229. # String
  230. } else {
  231. print join("", "Processing string '", (substr $_, 0, 20) ,"...' ...\n");
  232. $text = "$text $_";
  233. }
  234. }
  235. }
  236. }
  237. get_source_contents();
  238. # Replace all excluded strings to spaces so that it's easier to process
  239. $text =~ s/$excluded_strings_re/ /g;
  240. ## Process words ##
  241. our @words = split(' ', $text);
  242. unless (eval "defined(&is_numeric)") {
  243. sub is_numeric {
  244. my $val = shift;
  245. if ( defined $val ) {
  246. return $val =~ /^\d+$/ ? 1 : 0;
  247. } else {
  248. warn "No argument given to is_numeric!";
  249. }
  250. }
  251. }
  252. # Requires: use List::Util qw(any);
  253. unless (eval "defined(&array_contains)") {
  254. sub array_contains {
  255. my $value = shift;
  256. # https://stackoverflow.com/a/16690762
  257. return any { $_ eq $value } @_;
  258. }
  259. }
  260. # Process word if it should be added to unique words list.
  261. unless (eval "defined(&check_word)") {
  262. sub check_word {
  263. my $word = shift;
  264. my $word_length = length($word);
  265. if ( not array_contains($word, @unique_words)
  266. and $word_length >= $minimum_word_length
  267. and $word_length <= $maximum_word_length
  268. and not is_numeric($word)
  269. ) {
  270. return 1;
  271. }
  272. return undef;
  273. }
  274. }
  275. # Prepare unique word list
  276. unless (eval "defined(&prepare_unique_words)") {
  277. sub prepare_unique_words {
  278. # Prepare unique words array
  279. foreach (@words) {
  280. my $word = lc("$_");
  281. if ( check_word($word) ) {
  282. push @unique_words, $word;
  283. }
  284. }
  285. @unique_words = sort @unique_words;
  286. my $words_listed = scalar @unique_words;
  287. my $max_words_needed = $dice_face_max ** $index_length;
  288. if ( scalar @unique_words < $max_words_needed ) {
  289. print "WARNING!! There are only ${words_listed} words extracted from the sources in the config file. But there should be a total of ${max_words_needed} words or more. It may cause malfunction in the passphrase generator.\nPlease add more data_sources in config or add more words manually in the rough list.\n";
  290. }
  291. }
  292. }
  293. prepare_unique_words();
  294. # Hook: Runs before rough file is written
  295. if (eval "defined(&before_rough_file_write)") {
  296. before_rough_file_write();
  297. }
  298. unless (eval "defined(&write_rough_list)") {
  299. sub write_rough_list {
  300. if (-e -f "${data_dir}/${rough_list_file}") {
  301. unlink("${data_dir}/${rough_list_file}") or die "Can't unlink ${data_dir}/${rough_list_file}: $!";
  302. }
  303. open( my $RF, ">", "${data_dir}/${rough_list_file}") or die "Cannot open ${data_dir}/${rough_list_file} for write";
  304. foreach (@unique_words) {
  305. print $RF "$_\n";
  306. }
  307. close $RF;
  308. print "${data_dir}/${rough_list_file} has been generated. Please edit the file if you need to and run the same command with -i to create the wordlist.\n";
  309. }
  310. }
  311. write_rough_list();
  312. # Hook: Runs after rough file is written
  313. if (eval "defined(&after_rough_file_write)") {
  314. after_rough_file_write();
  315. }
  316. } elsif ( $mode == 2 ) {
  317. # Hook: Runs before indexing file process starts
  318. if (eval "defined(&before_indexed_file_process)") {
  319. before_indexed_file_process();
  320. }
  321. # To aid in getting index number
  322. my $base_enc = [$dice_face_min..$dice_face_max];
  323. my $index_conv = new Math::Base::Convert('10', $base_enc);
  324. # Returns Diceware index number for an $nth item.
  325. # Params:
  326. # 1: index - has to be 0-based (starts from 0)
  327. unless (eval "defined(&get_diceware_index_num)") {
  328. sub get_diceware_index_num {
  329. my $n = shift;
  330. my $ind = eval { $index_conv->cnv($n) };
  331. my $ind_length = length($ind);
  332. # Fill up empty spaces with 1s (or $dice_face_min)
  333. if ( $index_length > $ind_length ) {
  334. $ind = ( $dice_face_min x ($index_length - $ind_length) ) . $ind;
  335. } else {
  336. $ind = ( $dice_face_min x ($ind_length - $index_length) ) . $ind;
  337. }
  338. return $ind;
  339. }
  340. }
  341. # Read rough file
  342. my @file_content;
  343. unless (eval "defined(&read_rough_file_for_indexing)") {
  344. sub read_rough_file_for_indexing {
  345. open my $WLR, "${data_dir}/${rough_list_file}" or die "Could not open ${data_dir}/${rough_list_file}: $!";
  346. while( my $line = <$WLR>) {
  347. push @file_content, $line;
  348. }
  349. close $WLR;
  350. }
  351. }
  352. read_rough_file_for_indexing();
  353. # Sort before putting into wordlist file
  354. unless (eval "defined(&sort_rough_file_for_indexing)") {
  355. sub sort_rough_file_for_indexing {
  356. @file_content = sort @file_content;
  357. }
  358. }
  359. sort_rough_file_for_indexing();
  360. # Hook: Runs before indexed file is written
  361. if (eval "defined(&before_indexed_file_write)") {
  362. before_indexed_file_write();
  363. }
  364. # Write wordlist file
  365. unless (eval "defined(&write_rough_file_for_indexing)") {
  366. sub write_rough_file_for_indexing {
  367. open( my $WLW, ">", "${data_dir}/${indexed_list_file}") or die "Cannot open ${data_dir}/${indexed_list_file} for write";
  368. my $line_index = 0;
  369. my $max_index = $dice_face_max x $index_length;
  370. my $idx;
  371. foreach (@file_content) {
  372. # Add the index number and put in the file
  373. $idx = get_diceware_index_num($line_index);
  374. print $WLW join('', $idx, "\t$_");
  375. if ( $idx == $max_index ) {
  376. print "Reached maximum possible dice index number: ${max_index}.\nWill ignore rest of the words.\nLast word was: $_\n";
  377. last;
  378. }
  379. $line_index++;
  380. }
  381. if ( $idx != $max_index ) {
  382. print "WARNING!! The word count haven't reached the maximum possible dice index number ${max_index} but only at ${idx}.\nThis may result in malfunction in the passphrase generator and may not function as expected.\nPlease add more data_sources in config to increase word count.\n";
  383. }
  384. print "${data_dir}/${indexed_list_file} file has been created.\n";
  385. close $WLW;
  386. }
  387. }
  388. write_rough_file_for_indexing();
  389. # Hook: Runs after indexed file is written
  390. if (eval "defined(&after_indexed_file_write)") {
  391. after_indexed_file_write();
  392. }
  393. }