123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445 |
- #!/usr/bin/env perl
- package dicelister;
- use strict;
- use warnings;
- use autodie;
- # UTF-8 support
- use utf8;
- # To use decode_utf8
- require Encode;
- # Allow wide character outputs without warning
- binmode STDOUT, ":utf8";
- # To enable the use of unicode throughout perl functions
- use feature 'unicode_strings';
- # Anything that opens a filehandle is to assume that that stream is
- # encoded in UTF‑8 unless you tell it otherwise
- use open qw( :encoding(UTF-8) :std );
- # To convert base of a number
- use Math::Base::Convert;
- # To handle CLI parameters
- use Getopt::Long;
- # For array_contains subroutine
- use List::Util qw(any);
- # For fetching remote URL
- use HTTP::Tiny;
- use File::Basename;
- use Cwd 'abs_path';
- our $dir = abs_path(dirname($0));
- our $data_dir = "${dir}/data";
- # To extract HTML
- use Mojo::DOM;
- # For SHA1 hash
- use Digest::SHA qw(sha1_hex);
- # Mode
- # 1 = Extract mode
- # 2 = Index mode
- our $mode = 1;
- ## Default values
- # Name used mainly for config and output file names
- our $config_name = 'default';
- # To hold filename of rough list text file
- our $rough_list_file;
- # To hold filename of indexed list text file
- our $indexed_list_file;
- # Minimum length of word allowed to be in list
- our $minimum_word_length = 3;
- # Maximum length of word allowed to be in list
- our $maximum_word_length = 14;
- # Excluded strings regex
- our $excluded_strings_re;
- # Minimum number of dots the dice can get
- our $dice_face_min = 1;
- # Maximum number of dots the dice can get
- our $dice_face_max = 6;
- # Diceware index length
- our $index_length = 5;
- # Cache dir for config
- our $cache_dir;
- our @unique_words;
- our @data_sources = (
- );
- our @exclude_strings = (
- '.',
- ',',
- '!',
- '?',
- ';',
- ':',
- '(',
- ')',
- '[',
- ']',
- '{',
- '}',
- '<',
- '>',
- '/',
- '\\',
- '+',
- '-',
- '_',
- '"',
- "'",
- '‘',
- '’',
- '″',
- '°',
- '•',
- '→',
- '←',
- '–',
- '#',
- '&',
- '$',
- '£',
- '%',
- '®',
- '©',
- );
- sub help_text {
- print("usage: dicelister.pl [-h] [-c CONFIG_FILE] [-e] [-i]
- Generates a Diceware wordlist based on config.
- optional arguments:
- -h, --help show this help message and exit
- -c CONFIG_FILE, --config CONFIG_FILE
- config file (without extension)
- -e, --extract
- set extract mode
- -i, --index
- set index mode
- ");
- exit;
- }
- # Process CLI parameters and update config values as necessary
- GetOptions ("c|config=s" => \$config_name,
- "e|extract" => sub { $mode = 1 },
- "i|index" => sub { $mode = 2 },
- "h|help" => \&help_text)
- or die("Error in command line arguments. Please review and try again.\n");
- require "${dir}/config/${config_name}.pl" or die("Error: ${dir}/config/${config_name}.pl is not found");
- # Hook: Runs before rough file preparation process is started
- if (eval "defined(&after_config_import)") {
- after_config_import();
- }
- # Variable values based on config name
- $rough_list_file = "${config_name}.rough.txt";
- $indexed_list_file = "${config_name}.wordlist.txt";
- $cache_dir = "${data_dir}/${config_name}/_cache";
- unless ( -d "${data_dir}/${config_name}" ) {
- mkdir "${data_dir}/${config_name}";
- }
- if ( $mode == 1 ) {
- # Hook: Runs before rough file preparation process is started
- if (eval "defined(&before_rough_file_process)") {
- before_rough_file_process();
- }
- ## Prepare for regex ##
- # Declare the subroutine if it hasn't been overriden
- unless (eval "defined(&escape_regex)") {
- # Escapes characters in regex pattern.
- # Does not handle characters inside character classes separately.
- # Ref: https://stackoverflow.com/a/400316
- sub escape_regex {
- my $pattern = shift;
- if ( defined $pattern ) {
- $pattern =~ s/(\.|\^|\$|\*|\+|\?|\(|\)|\[|\{|\\|\|)/\\$1/ig;
- return $pattern;
- }
- }
- }
- escape_regex();
- # Prepare the regex string for removal of excluded strings
- unless (eval "defined(&prepare_exclude_regex)") {
- sub prepare_exclude_regex {
- foreach ( @exclude_strings ) {
- $_ = escape_regex($_);
- }
- $excluded_strings_re = join('|', @exclude_strings);
- }
- }
- prepare_exclude_regex();
- ## Get source content ##
- unless (eval "defined(&check_sources)") {
- sub check_sources {
- if ( scalar @data_sources < 1 ) {
- die("No sources are defined. Please add some sources in \@data_sources array on your config file.");
- }
- }
- }
- check_sources();
- # Get source content text
- our $text = '';
- unless (eval "defined(&get_source_file)") {
- sub get_source_file {
- my $filepath = shift;
- if ( defined $filepath ) {
- open( my $file, "<", $filepath ) or die("File ${filepath} not found");
- my $file_content = do { local $/; <$file> };
- return "$file_content";
- }
- }
- }
- # Requires: HTTP::Tiny
- unless (eval "defined(&get_source_http)") {
- sub get_source_http {
- my $url = shift;
- my $html;
- # Write to cache
- unless ( -d $cache_dir ) {
- mkdir $cache_dir;
- }
- # substr call is to strip an extra \n at the end
- my $cache_file = substr join('', $cache_dir, '/', sha1_hex($url)), 0, -1;
- # If cache file is not present write HTML to cache
- unless ( -e $cache_file ) {
- my $response = HTTP::Tiny->new->get($url);
- if ( $response->{success} ) {
- open(my $CWF, '>', $cache_file) or print "Can't write cache file. $!\n";
- print $CWF $response->{content};
- close($CWF);
- $html = $response->{content};
- } else {
- print "Failed to fetch ${url} ...\n";
- }
- # If cache is present, use it
- } else {
- print "Found cache... using it instead of fetching...\n";
- open my $CRF, '<', $cache_file or die "Can't open cache file $!";
- $html = do { local $/; <$CRF> };
- }
- # Process
- my $dom = Mojo::DOM->new( $html );
- my $html_text = $dom->find('p')->map('text')->join("\n");
- return Encode::decode_utf8($html_text);
- }
- }
- # Get source content for each data source
- unless (eval "defined(&get_source_contents)") {
- sub get_source_contents {
- foreach (@data_sources) {
- # Local file
- if ( $_ =~ /^file\:\/\/(.*)/i ) {
- print "Processing $1 ...\n";
- my $file_content = get_source_file("$1");
- $text = "$text $file_content";
- # HTTP(S) URL
- } elsif ( $_ =~ /^https*\:\/\//i ) {
- print "Processing $_ ...\n";
- $text = "$text " . get_source_http("$_");
- # String
- } else {
- print join("", "Processing string '", (substr $_, 0, 20) ,"...' ...\n");
- $text = "$text $_";
- }
- }
- }
- }
- get_source_contents();
- # Replace all excluded strings to spaces so that it's easier to process
- $text =~ s/$excluded_strings_re/ /g;
- ## Process words ##
- our @words = split(' ', $text);
- unless (eval "defined(&is_numeric)") {
- sub is_numeric {
- my $val = shift;
- if ( defined $val ) {
- return $val =~ /^\d+$/ ? 1 : 0;
- } else {
- warn "No argument given to is_numeric!";
- }
- }
- }
- # Requires: use List::Util qw(any);
- unless (eval "defined(&array_contains)") {
- sub array_contains {
- my $value = shift;
- # https://stackoverflow.com/a/16690762
- return any { $_ eq $value } @_;
- }
- }
- # Process word if it should be added to unique words list.
- unless (eval "defined(&check_word)") {
- sub check_word {
- my $word = shift;
- my $word_length = length($word);
- if ( not array_contains($word, @unique_words)
- and $word_length >= $minimum_word_length
- and $word_length <= $maximum_word_length
- and not is_numeric($word)
- ) {
- return 1;
- }
- return undef;
- }
- }
- # Prepare unique word list
- unless (eval "defined(&prepare_unique_words)") {
- sub prepare_unique_words {
- # Prepare unique words array
- foreach (@words) {
- my $word = lc("$_");
- if ( check_word($word) ) {
- push @unique_words, $word;
- }
- }
- @unique_words = sort @unique_words;
- my $words_listed = scalar @unique_words;
- my $max_words_needed = $dice_face_max ** $index_length;
- if ( scalar @unique_words < $max_words_needed ) {
- print "WARNING!! There are only ${words_listed} words extracted from the sources in the config file. But there should be a total of ${max_words_needed} words or more. It may cause malfunction in the passphrase generator.\nPlease add more data_sources in config or add more words manually in the rough list.\n";
- }
- }
- }
- prepare_unique_words();
- # Hook: Runs before rough file is written
- if (eval "defined(&before_rough_file_write)") {
- before_rough_file_write();
- }
- unless (eval "defined(&write_rough_list)") {
- sub write_rough_list {
- if (-e -f "${data_dir}/${rough_list_file}") {
- unlink("${data_dir}/${rough_list_file}") or die "Can't unlink ${data_dir}/${rough_list_file}: $!";
- }
- open( my $RF, ">", "${data_dir}/${rough_list_file}") or die "Cannot open ${data_dir}/${rough_list_file} for write";
- foreach (@unique_words) {
- print $RF "$_\n";
- }
- close $RF;
- print "${data_dir}/${rough_list_file} has been generated. Please edit the file if you need to and run the same command with -i to create the wordlist.\n";
- }
- }
- write_rough_list();
- # Hook: Runs after rough file is written
- if (eval "defined(&after_rough_file_write)") {
- after_rough_file_write();
- }
- } elsif ( $mode == 2 ) {
- # Hook: Runs before indexing file process starts
- if (eval "defined(&before_indexed_file_process)") {
- before_indexed_file_process();
- }
- # To aid in getting index number
- my $base_enc = [$dice_face_min..$dice_face_max];
- my $index_conv = new Math::Base::Convert('10', $base_enc);
- # Returns Diceware index number for an $nth item.
- # Params:
- # 1: index - has to be 0-based (starts from 0)
- unless (eval "defined(&get_diceware_index_num)") {
- sub get_diceware_index_num {
- my $n = shift;
- my $ind = eval { $index_conv->cnv($n) };
- my $ind_length = length($ind);
- # Fill up empty spaces with 1s (or $dice_face_min)
- if ( $index_length > $ind_length ) {
- $ind = ( $dice_face_min x ($index_length - $ind_length) ) . $ind;
- } else {
- $ind = ( $dice_face_min x ($ind_length - $index_length) ) . $ind;
- }
- return $ind;
- }
- }
- # Read rough file
- my @file_content;
- unless (eval "defined(&read_rough_file_for_indexing)") {
- sub read_rough_file_for_indexing {
- open my $WLR, "${data_dir}/${rough_list_file}" or die "Could not open ${data_dir}/${rough_list_file}: $!";
- while( my $line = <$WLR>) {
- push @file_content, $line;
- }
- close $WLR;
- }
- }
- read_rough_file_for_indexing();
- # Sort before putting into wordlist file
- unless (eval "defined(&sort_rough_file_for_indexing)") {
- sub sort_rough_file_for_indexing {
- @file_content = sort @file_content;
- }
- }
- sort_rough_file_for_indexing();
- # Hook: Runs before indexed file is written
- if (eval "defined(&before_indexed_file_write)") {
- before_indexed_file_write();
- }
- # Write wordlist file
- unless (eval "defined(&write_rough_file_for_indexing)") {
- sub write_rough_file_for_indexing {
- open( my $WLW, ">", "${data_dir}/${indexed_list_file}") or die "Cannot open ${data_dir}/${indexed_list_file} for write";
- my $line_index = 0;
- my $max_index = $dice_face_max x $index_length;
- my $idx;
- foreach (@file_content) {
- # Add the index number and put in the file
- $idx = get_diceware_index_num($line_index);
- print $WLW join('', $idx, "\t$_");
- if ( $idx == $max_index ) {
- print "Reached maximum possible dice index number: ${max_index}.\nWill ignore rest of the words.\nLast word was: $_\n";
- last;
- }
- $line_index++;
- }
- if ( $idx != $max_index ) {
- print "WARNING!! The word count haven't reached the maximum possible dice index number ${max_index} but only at ${idx}.\nThis may result in malfunction in the passphrase generator and may not function as expected.\nPlease add more data_sources in config to increase word count.\n";
- }
- print "${data_dir}/${indexed_list_file} file has been created.\n";
- close $WLW;
- }
- }
- write_rough_file_for_indexing();
- # Hook: Runs after indexed file is written
- if (eval "defined(&after_indexed_file_write)") {
- after_indexed_file_write();
- }
- }
|