12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180 |
- #!/usr/bin/perl
- # Author: Trizen
- # Date: 08 January 2022
- # Edit: 05 February 2022
- # https://github.com/trizen
- # A private search engine, with its own crawler running over Tor (respecting robots.txt).
- # Using some HTML and CSS code from the searX project (++):
- # https://github.com/searx/searx
- # To crawl a website, pass it as an argument to this script.
- # By default, depth = 0. Use --depth=i to increase the crawling depth.
- # Example:
- # perl search.fcgi --depth=i [URL]
- # Other script options:
- # --recrawl : activate recrawl mode
- # --fix-index : fix the index in case it gets messed up (slow operation)
- # --sanitize-index : sanitize the index and show some stats
- # Limitations:
- # - the search engine cannot be used while the crawler is being used
- # - the crawler cannot be used while the search engine is being used
- # Useful videos on this topic:
- #
- # The Inverted Index Stanford NLP Professor Dan Jurafsky & Chris Manning
- # https://yewtu.be/watch?v=bnP6TsqyF30
- # Query Processing with the Inverted Index Stanford NLP Dan Jurafsky & Chris Manning
- # https://yewtu.be/watch?v=B-e297yK50U
- # Phrase Queries and Positional Indexes Stanford NLP Professor Dan Jurafsky & Chris Manning
- # https://yewtu.be/watch?v=PkjuJZSrudE
- use utf8;
- use 5.036;
- no warnings qw(once);
- #use autodie;
- #use experimental qw(signatures);
- use CGI::Fast;
- use CGI qw/:standard *table -utf8/;
- #use CGI::Carp qw(fatalsToBrowser);
- #use IO::Compress::Zstd qw(zstd);
- #use IO::Uncompress::UnZstd qw(unzstd);
- #use URI::Escape qw(uri_escape_utf8);
- use Text::Unidecode qw(unidecode);
- use Text::ParseWords qw(quotewords);
- use HTML::Entities qw(encode_entities);
- use Time::HiRes qw(gettimeofday tv_interval);
- use ntheory qw(forcomb binomial);
- use List::Util qw(uniq max);
- use JSON::XS qw(decode_json encode_json);
- use Encode qw(decode_utf8 encode_utf8);
# Compile-time configuration switches for both the crawler and the search UI.
use constant {

    # Cache HTML content (using CHI and WWW::Mechanize::Cached)
    CACHE => 0,

    # Use Tor proxy for crawling (127.0.0.1:9050)
    USE_TOR => 0,

    # Compress the values of the content database with Zstandard.
    # When enabled, the content database will be ~3x smaller.
    USE_ZSTD => 1,

    # xxHash seed (don't change it -- the page IDs already stored in the
    # databases were derived with this seed)
    XXHASH_SEED => 42,

    # Minimum and maximum number of characters for words stored in the index.
    WORD_MIN_LEN => 3,
    WORD_MAX_LEN => 45,

    # Maximum number of top best search results to return.
    MAX_SEARCH_RESULTS => 200,

    # Show the description of each website in search results (if available).
    # When disabled, a snippet of the content will be shown instead.
    SHOW_DESCRIPTION => 1,

    # Respect the rules from robots.txt
    RESPECT_ROBOT_RULES => 1,

    # Include only the results that fully match the given query
    EXACT_MATCH => 0,

    # Include all the results that include all the words from the given query, but not necessarily consecutive
    FAST_MATCH => 1,

    # Highlight all words from the query in search results (will produce longer descriptions)
    HIGHLIGHT_ALL_KEYWORDS => 1,

    # Rank the results based on content of the pages (better ranking, but it's much slower)
    RANK_ON_CONTENT => 1,

    # Rank the results based on boundary matches (with \b)
    RANK_ON_BOUNDARY_MATCH => 0,

    # Rank the results based on non-boundary matches (without \b)
    RANK_ON_NON_BOUNDARY_MATCH => 1,

    # Maximum number of iterations to spend during the ranking process.
    MAX_RANK_ITERATIONS => 10_000,

    # Make sure the SSL certificate is valid.
    SSL_VERIFY_HOSTNAME => 0,

    # Extract the date of the article and display it in search results (slow)
    EXTRACT_DATE => 0,

    # On "403 Forbidden" or "429 Too Many Requests" status, try to crawl the Web Archive version.
    CRAWL_ARCHIVE_FORBIDDEN => 1,

    # Word popularity limit (ignore words with popularity larger than this)
    MAX_WORD_POPULARITY => 10_000,
};

# List of tracking query parameters to remove from URLs
my @tracking_parameters = qw(
  ac itc
  yclid fbclid gclsrc
  utm_source utm_medium utm_term
  utm_content utm_campaign utm_referrer
  mtm_kwd mtm_campaign mtm_medium
  __hssc __hstc __s _hsenc _openstat dclid fb_ref gclid
  hsCtaTracking igshid mc_eid mkt_tok ml_subscriber ml_subscriber_hash
  msclkid oly_anon_id oly_enc_id rb_clickid s_cid vero_conv vero_id wickedid
);

binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');
binmode(STDERR, ':utf8');

# Load the Zstandard modules only when compression is enabled.
if (USE_ZSTD) {
    require IO::Compress::Zstd;
    require IO::Uncompress::UnZstd;
}

# Privacy-friendly front-ends used by sanitize_url() to rewrite popular hostnames.
# NOTE(review): the `reddit` entry is currently unused (its rewrite rule is
# commented out in sanitize_url).
my %hostname_alternatives = (
    youtube => 'yewtu.be',
    reddit  => 'teddit.net',
    medium  => 'scribe.rip',
    twitter => 'nitter.net',
    odysee  => 'lbry.projectsegfau.lt',
);

my $cookie_file     = 'cookies.txt';                # optional Netscape-format cookie jar
my $crawled_db_file = "content_berkeley_deep.db";   # page ID -> JSON content entry
my $index_db_file   = "index_berkeley_deep.db";     # word -> space-separated page IDs

use DB_File;

# Read-only for searching; created/read-write when crawling (@ARGV non-empty).
my $DB_OPTIONS = O_RDONLY;

if (@ARGV) {
    $DB_OPTIONS = O_CREAT | O_RDWR;
}

my $content_db = tie(my %CONTENT_DB, 'DB_File', $crawled_db_file, $DB_OPTIONS, 0666, $DB_HASH)
  or die "Can't create/access database <<$crawled_db_file>>: $!";

my $index_db = tie(my %WORDS_INDEX, 'DB_File', $index_db_file, $DB_OPTIONS, 0666, $DB_HASH)
  or die "Can't create/access database <<$index_db_file>>: $!";

# Flush both databases to disk before exiting on SIGINT (Ctrl-C during a crawl).
local $SIG{INT} = sub {
    $index_db->sync;
    $content_db->sync;

    #untie %CONTENT_DB;
    #untie %WORDS_INDEX;
    exit;
};
my ($mech, $lwp, $robot_rules);

# Crawler-only setup (search mode never makes network requests):
# builds the Mechanize and LWP user agents, proxy, headers and cookie jar.
if (@ARGV) {

    my %mech_options = (
        timeout       => 20,
        autocheck     => 0,    # don't die on HTTP errors; status is checked manually
        show_progress => 1,
        stack_depth   => 10,
        cookie_jar    => {},
        ssl_opts      => {verify_hostname => SSL_VERIFY_HOSTNAME, Timeout => 20},
        agent         => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
    );

    if (CACHE) {
        require File::Basename;
        require File::Spec::Functions;

        require CHI;
        require WWW::Mechanize::Cached;

        # Cache fetched pages on disk, in a 'cache' directory next to this script.
        my $cache = CHI->new(
            driver   => 'BerkeleyDB',
            root_dir => File::Spec::Functions::catdir(File::Basename::dirname(File::Spec::Functions::rel2abs($0)), 'cache')
        );

        $mech = WWW::Mechanize::Cached->new(%mech_options, cache => $cache);
    }
    else {
        require WWW::Mechanize;
        $mech = WWW::Mechanize->new(%mech_options);
    }

    # Plain LWP agent, used for robots.txt fetches (lwp_get).
    # NOTE(review): LWP::UserAgent is only loaded implicitly via WWW::Mechanize,
    # and %mech_options contains Mechanize-specific keys (autocheck,
    # show_progress, stack_depth) -- confirm LWP::UserAgent->new tolerates them.
    $lwp = LWP::UserAgent->new(%mech_options);

    if (USE_TOR) {    # set Tor proxy
        $mech->proxy(['http', 'https'], "socks://127.0.0.1:9050");
        $lwp->proxy(['http', 'https'], "socks://127.0.0.1:9050");
    }

    require WWW::RobotRules;
    $robot_rules = WWW::RobotRules->new($mech->agent);

    state $accepted_encodings = HTTP::Message::decodable();

    # Browser-like default headers, applied to both agents.
    my %default_headers = (
        'Accept-Encoding'           => $accepted_encodings,
        'Accept'                    => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language'           => 'en-US,en;q=0.5',
        'Connection'                => 'keep-alive',
        'Upgrade-Insecure-Requests' => '1',
    );

    foreach my $key (sort keys %default_headers) {
        $mech->default_header($key, $default_headers{$key});
        $lwp->default_header($key, $default_headers{$key});
    }

    # Reuse TCP connections between requests.
    require LWP::ConnCache;
    my $cache = LWP::ConnCache->new;
    $cache->total_capacity(undef);    # no limit

    $mech->conn_cache($cache);
    $lwp->conn_cache($cache);

    # Support for cookies from file
    if (defined($cookie_file) and -f $cookie_file) {

        ## Netscape HTTP Cookies

        # Firefox extension:
        #   https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/

        # See also:
        #   https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl

        require HTTP::Cookies::Netscape;

        my $cookies = HTTP::Cookies::Netscape->new(
                                                   hide_cookie2 => 1,
                                                   autosave     => 1,
                                                   file         => $cookie_file,
                                                  );

        $cookies->load;
        $mech->cookie_jar($cookies);
    }
}
# Fetch $url with the plain LWP agent; return the decoded body,
# or undef on any HTTP failure.
sub lwp_get ($url) {
    my $response = $lwp->get($url);
    return $response->is_success ? $response->decoded_content : undef;
}
# Split $text into unique, casefolded words suitable for indexing:
# between WORD_MIN_LEN and WORD_MAX_LEN characters, containing at
# least one alphanumeric character.
sub extract_words ($text) {
    my @candidates = uniq(split(/[_\W]+/, CORE::fc($text)));
    return grep {
        my $len = length($_);
        $len >= WORD_MIN_LEN and $len <= WORD_MAX_LEN and /[[:alnum:]]/;
    } @candidates;
}
# Compress $data with Zstandard; dies with the library error on failure.
sub zstd_encode ($data) {
    my $compressed;
    IO::Compress::Zstd::zstd(\$data, \$compressed)
      or die "zstd failed: $IO::Compress::Zstd::ZstdError\n";
    return $compressed;
}
# Decompress Zstandard-compressed $zstd_data; dies with the library error on failure.
sub zstd_decode ($zstd_data) {
    my $plain;
    IO::Uncompress::UnZstd::unzstd(\$zstd_data, \$plain)
      or die "unzstd failed: $IO::Uncompress::UnZstd::UnZstdError\n";
    return $plain;
}
# Serialize a content entry (hashref) to JSON, Zstandard-compressed
# when USE_ZSTD is enabled.
sub encode_content_entry ($entry) {
    my $payload = encode_json($entry);
    return USE_ZSTD ? zstd_encode($payload) : $payload;
}
# Inverse of encode_content_entry(): decompress (when USE_ZSTD) and
# decode the JSON back into a hashref.
sub decode_content_entry ($entry) {
    my $json = USE_ZSTD ? zstd_decode($entry) : $entry;
    return decode_json($json);
}
# Serialize an index entry (space-separated list of page IDs),
# Zstandard-compressed when USE_ZSTD is enabled.
sub encode_index_entry ($entry) {
    return USE_ZSTD ? zstd_encode($entry) : $entry;
}
# Inverse of encode_index_entry(): return the plain space-separated
# page ID string.
sub decode_index_entry ($entry) {
    return USE_ZSTD ? zstd_decode($entry) : $entry;
}
# Pick a pseudo-random word from the index for the "Surprise me" feature:
# at least 5 characters long, sampled with ~10% probability, and with a
# moderate number of references (neither too obscure nor too common).
sub surprise_me {

    while (my ($word, $packed) = each %WORDS_INDEX) {
        next if length($word) < 5;
        next if rand() >= 0.1;

        my $refs  = decode_index_entry($packed);
        my $count = ($refs =~ tr/ //);    # space count approximates the reference count

        return $word if $count >= 10 and $count <= 1000;
    }

    return undef;
}
# Rewrite some well-known hostnames to privacy-friendly alternative
# front-ends, preserving the original protocol and path.
sub sanitize_url ($url) {

    # Split off the protocol so the host patterns below can anchor at the start.
    my $protocol = '';
    if ($url =~ m{^(https?://)(.+)}s) {
        ($protocol, $url) = ($1, $2);
    }

    # Normalize the URL
    ## $url = normalize_url($protocol . $url);

    # Ordered rewrite rules: pattern => key into %hostname_alternatives.
    # (A Reddit rule used to exist but doesn't work for comment pages.)
    my @rewrites = (

        # YouTube
        [qr{^(?:www\.)?youtube\.com(?=[/?])} => 'youtube'],
        [qr{^(?:www\.)?youtu\.be(?=[/?])}    => 'youtube'],

        # Twitter (profile pages and status pages only)
        [qr{^(?:www\.)?twitter\.com(?=/\w+\z)}       => 'twitter'],
        [qr{^(?:www\.)?twitter\.com(?=/\w+/status/)} => 'twitter'],

        # Medium
        [qr{^(?:www\.)?medium\.com(?=[/?])} => 'medium'],

        # Odysee / LBRY
        [qr{^(?:www\.)?odysee\.com(?=[/?])}    => 'odysee'],
        [qr{^(?:www\.)?open\.lbry\.com(?=[/?])} => 'odysee'],
        [qr{^(?:www\.)?lbry\.com(?=[/?])}       => 'odysee'],
        [qr{^(?:www\.)?lbry\.tv(?=[/?])}        => 'odysee'],
    );

    foreach my $rule (@rewrites) {
        my ($pattern, $alternative) = @$rule;
        $url =~ s{$pattern}{$hostname_alternatives{$alternative}};
    }

    return ($protocol . $url);
}
# Canonicalize $url for hashing and host extraction: strip tracking query
# parameters, apply a fixed sequence of URL::Normalize transformations,
# then drop the protocol prefix.
sub normalize_url ($url) {

    #$url =~ s/#.*//sr =~ s{^https?://(?:www\.)?}{}r =~ s{/+\z}{}r;

    require URL::Normalize;

    # Remove tracking query parameters first.
    my $normalizer = URL::Normalize->new(url => $url);
    $normalizer->remove_query_parameters(\@tracking_parameters);

    my $normalized_url = $normalizer->url;

    # Apply each normalization step on a fresh URL::Normalize object,
    # feeding the previous step's output into the next.
    foreach my $method (
        qw(
          remove_directory_index
          remove_fragment
          remove_fragments
          remove_duplicate_slashes
          remove_empty_query_parameters
          sort_query_parameters
          make_canonical
          remove_empty_query
        )
      ) {
        my $step = URL::Normalize->new(url => $normalized_url);
        $step->$method;
        $normalized_url = $step->url;
    }

    # Remove the protocol
    $normalized_url =~ s{^https?://}{};

    return $normalized_url;
}
# Index every word of $text under page ID $key by appending $key to each
# word's reference list. Fast path used during crawling: it does NOT
# deduplicate (see readd_to_database_index for the uniq'ing variant).
sub add_to_database_index ($text, $key) {

    foreach my $word (extract_words($text)) {

        if (exists $WORDS_INDEX{$word}) {
            my $refs = decode_index_entry($WORDS_INDEX{$word});

            # Delete the old record before rewriting it in full.
            delete $WORDS_INDEX{$word};
            $WORDS_INDEX{$word} = encode_index_entry("$refs $key");
        }
        else {
            $WORDS_INDEX{$word} = encode_index_entry($key);
        }
    }

    return 1;
}
# Like add_to_database_index(), but deduplicates the reference list, so
# re-indexing the same page is idempotent. Used by repair_index().
sub readd_to_database_index ($text, $key) {

    foreach my $word (extract_words($text)) {

        if (exists $WORDS_INDEX{$word}) {
            my @refs = split(' ', decode_index_entry($WORDS_INDEX{$word}));

            delete $WORDS_INDEX{$word};
            $WORDS_INDEX{$word} = encode_index_entry(join(' ', uniq(@refs, $key)));
        }
        else {
            $WORDS_INDEX{$word} = encode_index_entry($key);
        }
    }

    return 1;
}
# True when the last response fetched by $mech is indexable:
# HTML, or any text/* or message/* MIME type.
sub valid_content_type {
    return 1 if $mech->is_html();
    return lc($mech->content_type) =~ m{^(?:text/|message/)};
}
# Return the hostname part of $url, after sanitizing and normalizing it.
sub extract_hostname ($url) {
    my $clean = normalize_url(sanitize_url("$url"));
    $clean =~ s{/.*}{}s;    # drop everything after the host
    return $clean;
}
# Return 'https://' for https URLs; everything else is treated as 'http://'.
sub extract_protocol ($url) {
    return ("$url" =~ m{^https://}) ? 'https://' : 'http://';
}
# Recursively crawl $url and the links found on it, indexing each page.
# $seen_hostname records hostnames already visited in this run; the link
# loop at the bottom follows at most one new page per unseen hostname.
sub crawl ($url, $seen_hostname = {}) {

    # Must be http:// or https://
    $url =~ m{^https?://} or return;

    # Sanitize url
    $url = sanitize_url($url);

    # Check if we're allowed to crawl this URL
    if (RESPECT_ROBOT_RULES and not $robot_rules->allowed($url)) {
        warn "Not allowed to crawl: $url\n";
        return;
    }

    require Digest::xxHash;

    # Page ID: xxHash32 of the normalized URL, so equivalent URLs share an ID.
    my $id = Digest::xxHash::xxhash32_hex(encode_utf8(normalize_url($url)), XXHASH_SEED);

    # On recursive calls (%$seen_hostname non-empty), skip already-indexed pages.
    if (keys(%$seen_hostname) and exists($CONTENT_DB{$id})) {
        return 1;
    }

    # Cheap HEAD request first, to reject non-textual content before a full GET.
    my $resp = $mech->head($url);

    if ($resp->is_success) {
        valid_content_type() || return;
    }

    # Pick up any redirect the HEAD request followed.
    $url = $mech->uri;
    $url = sanitize_url("$url");

    $resp = $mech->get($url);

    # On HTTP 400+ errors, try again with WebArchive
    if (CRAWL_ARCHIVE_FORBIDDEN and $resp->code >= 400) {
        if ($url !~ m{^https://web\.archive\.org/}) {
            return crawl(join('', "https://web.archive.org/web/1990/", extract_protocol($url), normalize_url($url)),
                         $seen_hostname);
        }
    }

    $resp->is_success or return;

    if (not valid_content_type()) {
        $mech->invalidate_last_request() if CACHE;
        return;
    }

    # Pick up any redirect the GET request followed.
    $url = $mech->uri;
    $url = sanitize_url("$url");

    my $normalized_url = normalize_url($url);
    my $protocol       = extract_protocol($url);

    if (not exists $CONTENT_DB{$id}) {

        my %info;
        my $decoded_content = $resp->decoded_content() // $resp->content() // return;

        if ($mech->is_html) {

            # Load HTML::TreeBuilder lazily, once per process.
            if (not exists $INC{'HTML::TreeBuilder'}) {
                require HTML::TreeBuilder;
                HTML::TreeBuilder->VERSION(5);
                HTML::TreeBuilder->import('-weak');
            }

            my $tree = HTML::TreeBuilder->new();
            $tree->parse($decoded_content);
            $tree->eof();
            $tree->elementify();    # just for safety

            # Render the HTML tree to plain text for indexing.
            require HTML::FormatText;
            my $formatter = HTML::FormatText->new(leftmargin  => 0,
                                                  rightmargin => 1000);

            $info{content} = $formatter->format($tree);
        }
        else {
            $info{content} = $decoded_content;
        }

        $info{title} = $mech->title;

        # Convert Unicode to ASCII
        $info{content} = unidecode($info{content});

        if ($mech->is_html) {

            # Parse HTML header for extracting metadata (title, keywords, description).
            # NOTE(review): HTML::HeadParser is never require'd here -- presumably
            # loaded via LWP/WWW::Mechanize; confirm.
            my $html_head_parser = HTML::HeadParser->new;
            $html_head_parser->parse($decoded_content);

            $info{title} ||= $html_head_parser->header('Title');
            $info{keywords}    = $html_head_parser->header('X-Meta-Keywords');
            $info{description} = $html_head_parser->header('X-Meta-Description');
        }

        # Fall back to the URL itself when no title could be extracted.
        $info{title} ||= $normalized_url;

        $info{id}  = $id;
        $info{url} = $protocol . $normalized_url;

        warn "Adding: $info{title}\nURI: $info{url}\n";

        # Index the URL, the title and the page text under this page's ID.
        my $relevant_content = join(' ', unidecode($normalized_url), unidecode($info{title}), $info{content});

        add_to_database_index($relevant_content, $id);
        $CONTENT_DB{$id} = encode_content_entry(\%info);
    }

    if (RESPECT_ROBOT_RULES) {

        my $host = $normalized_url =~ s{/.*}{}sr;
        ## my $host = URI->new($url)->host;

        $seen_hostname->{$host} = 1;

        # Fetch and cache this host's robots.txt rules for subsequent calls.
        my $robots_url = $protocol . join('/', $host, 'robots.txt');
        my $robots_txt = lwp_get($robots_url);

        $robot_rules->parse($robots_url, $robots_txt) if defined($robots_txt);
    }

    # Follow every link that has visible text, one page per unseen hostname.
    my @links = $mech->find_all_links(text_regex => qr/./);

    foreach my $link (@links) {

        my $abs_url = join('', $link->url_abs);
        my $host    = extract_hostname($abs_url);

        next if $seen_hostname->{$host};

        crawl($abs_url, $seen_hostname);
        $seen_hostname->{$host} = 1;
    }

    return 1;
}
# Append a highlighted excerpt (prefix + <b>match</b> + suffix) to
# $value->{match}. ($i, $j) are the match's start/end byte offsets in
# $text, as produced by @-/@+ after a successful regex match.
sub add_match_text_to_value ($text, $value, $i, $j) {

    if (!HIGHLIGHT_ALL_KEYWORDS) {
        # Keep only the first excerpt per result.
        exists($value->{match}) and return 1;
    }

    my $prefix_len = 50;
    my $suffix_len = 200;

    my $match_content = substr($text, $i, $j - $i);

    # If the suffix would run past the end of the text, extend the prefix
    # by the shortfall so the excerpt keeps roughly the same total length.
    if ($j + $suffix_len > length($text)) {
        $prefix_len += $j + $suffix_len - length($text);
    }

    # Clamp the prefix at the start of the text.
    if ($i - $prefix_len < 0) {
        $prefix_len = $i;
    }

    my $prefix_content = substr($text, $i - $prefix_len, $prefix_len);
    my $suffix_content = substr($text, $j, $suffix_len);

    # Collapse whitespace and runs of 3+ identical non-word characters.
    foreach ($match_content, $prefix_content, $suffix_content) {
        s/\s+/ /g;
        s/(\W)\1{2,}/$1/g;
    }

    # HTML-escape each piece and bold the matched part; when highlighting
    # all keywords, separate consecutive excerpts with " [...] ".
    $value->{match} .=
        encode_entities($prefix_content) . '<b>'
      . encode_entities($match_content) . '</b>'
      . encode_entities($suffix_content)
      . (HIGHLIGHT_ALL_KEYWORDS ? ' [...] ' : '');

    return 1;
}
# Intersect a list of sets, given as an arrayref of arrayrefs.
# Returns the common elements as a list (in no particular order);
# returns the empty list when @$sets is empty.
sub set_intersection ($sets) {

    # Optimization: process the smallest sets first.
    my @pending = sort { scalar(@$a) <=> scalar(@$b) } @$sets;
    @pending or return;

    # Seed the running intersection with the smallest set.
    my %common;
    @common{@{shift(@pending)}} = ();

    # Narrow the running intersection against each remaining set.
    foreach my $set (@pending) {

        my %here;
        @here{@$set} = ();

        my %next;
        foreach my $element (keys %common) {
            undef $next{$element} if exists $here{$element};
        }

        %common = %next;
    }

    return keys %common;
}
# Run a query against the index. Returns a hashref with:
#   results => ranked list of matching content entries (hashrefs),
#   counts  => per-word reference counts,
#   words   => the query words found in the index,
#   count   => number of candidate matches before trimming/filtering.
sub search ($text) {

    $text = unidecode($text);

    my %seen;       # NOTE(review): appears unused -- kept as-is
    my %matches;    # page ID -> decoded content entry

    my @words       = extract_words($text);
    my @known_words = grep { exists($WORDS_INDEX{$_}) } @words;

    # One reference set (list of page IDs) per known query word.
    my @ref_sets;
    my %counts;

    foreach my $word (@known_words) {
        my @refs = split(' ', decode_index_entry($WORDS_INDEX{$word}));
        $counts{$word} = scalar(@refs);
        push @ref_sets, \@refs;
    }

    # Candidate pages are those referenced by EVERY known query word.
    foreach my $key (set_intersection(\@ref_sets)) {
        $matches{$key} = eval { decode_content_entry($CONTENT_DB{$key}) } // next;
    }

    # Build regex fragments from the raw query: quoted phrases stay together,
    # with '\W+' between their words; fragments shorter than 2 chars dropped.
    my @original_words = map {
        join('\W+', map { quotemeta($_) } split(' '))
    } grep { length($_) >= 2 } quotewords(qr/\s+/, 0, $text);

    # Fallback when quotewords produced nothing usable (e.g. unbalanced quotes).
    if (not @original_words) {
        @original_words = map { quotemeta($_) } grep { length($_) >= 2 } split(/\W+/, $text);
    }

    my $ranking_cost  = 0;
    my $matches_count = scalar(keys %matches);

    # Build ranking regexes from k-subsets of the query fragments, larger k
    # first; larger subsets score higher via the `factor` field below.
    my @regexes;

    for (my $k = scalar(@original_words) ; $k >= 1 ; --$k) {

        # FAST_MATCH: only single-word regexes are used for ranking.
        if (FAST_MATCH) {
            $k == 1 or next;
        }

        my $current_cost =
          ((RANK_ON_NON_BOUNDARY_MATCH ? 1 : 0) + (RANK_ON_BOUNDARY_MATCH ? 1 : 0)) * binomial(scalar(@original_words), $k);

        # Skip subset sizes that would blow the ranking iteration budget.
        if ($matches_count * ($ranking_cost + $current_cost) > max($matches_count, MAX_RANK_ITERATIONS)) {
            next;
        }

        $ranking_cost += $current_cost;

        #<<<
        forcomb {
            my @subset  = @original_words[@_];
            my $regex   = join('.{0,10}', @subset);
            my $b_regex = join('\b.{0,10}\b', @subset);

            #my $regex = join('\W*+', @subset);
            #my $b_regex = join('\b\W*+\b', @subset);

            push @regexes,
              scalar {
                (RANK_ON_NON_BOUNDARY_MATCH ? (re   => qr/$regex/si)       : ()),
                (RANK_ON_BOUNDARY_MATCH     ? (b_re => qr/\b$b_regex\b/si) : ()),
                factor => $k,
              };
        } scalar(@original_words), $k;
        #>>>

        EXACT_MATCH && last;
    }

    # Score every candidate page against every ranking regex.
    foreach my $key (keys %matches) {

        my $value = $matches{$key};
        $value->{score} = 0;

        if ($value->{url} !~ m{^https?://}) {
            $value->{url} = 'https://' . $value->{url};
        }

        my $content     = $value->{content} // '';
        my $title       = unidecode($value->{title} // '');
        my $description = unidecode($value->{description} // '');
        my $keywords    = unidecode($value->{keywords} // '');
        my $url         = unidecode($value->{url} // '');

        foreach my $regex (@regexes) {
            foreach my $re_type (qw(b_re re)) {

                my $re = $regex->{$re_type} // next;

                # Boundary matches count full weight; loose matches count half.
                my $factor = $regex->{factor} * ($re_type eq 'b_re' ? 1 : 0.5);

                if ($title =~ $re) {
                    $value->{score} += 2 * $factor;
                }

                if ($description =~ $re) {
                    ## $value->{score} += 1 * $factor;

                    # Capture a highlighted excerpt from the description.
                    if (SHOW_DESCRIPTION
                        and $re_type eq (RANK_ON_BOUNDARY_MATCH ? 'b_re' : 're')) {
                        add_match_text_to_value($description, $value, $-[0], $+[0]);
                    }
                }

                if (RANK_ON_CONTENT and $content =~ $re) {
                    $value->{score} += $factor;
                    if ($re_type eq (RANK_ON_BOUNDARY_MATCH ? 'b_re' : 're')) {
                        add_match_text_to_value($content, $value, $-[0], $+[0]);
                    }
                }

                if ($keywords =~ $re) {
                    $value->{score} += 2 * $factor;
                }

                if ($url =~ $re) {
                    $value->{score} += 4 * $factor;
                }
            }
        }

        ## delete $value->{content};
    }

    my %seen_url;
    my @sorted = sort { $b->{score} <=> $a->{score} } values %matches;

    my $results_count = scalar(@sorted);

    # Keep only the top best entries
    $#sorted = (MAX_SEARCH_RESULTS - 1) if (scalar(@sorted) > MAX_SEARCH_RESULTS);

    # Keep entries with score > 0
    @sorted = grep { $_->{score} > 0 } @sorted;

    # Prefer longer content for results with the same score
    @sorted = map { $_->[0] }
      sort { ($b->[1] <=> $a->[1]) || ($b->[2] <=> $a->[2]) }
      map { [$_, $_->{score}, length($_->{content})] } @sorted;

    # Fix some ArchWiki links
    foreach my $entry (@sorted) {
        $entry->{url} =~ s{^https://wiki\.archlinux\.org//}{https://wiki.archlinux.org/title/};
    }

    # Remove duplicated entries, comparing URLs stripped of protocol,
    # leading "www.", fragments and trailing slashes.
    @sorted = grep { !$seen_url{(($_->{url} =~ s{^https?://(?:www\.)?}{}r) =~ s{#.*}{}sr) =~ s{[/?]+\z}{}r}++ } @sorted;

    return {
            results => \@sorted,
            counts  => \%counts,
            words   => \@known_words,
            count   => $results_count,
           };
}
# Rebuild the words index from the content database by re-indexing every
# stored page (with deduplication). Very slow operation.
sub repair_index {

    while (my ($key, $packed) = each %CONTENT_DB) {

        # Skip entries that fail to decode instead of dying.
        my $info = eval { decode_content_entry($packed) } // next;

        readd_to_database_index(unidecode($info->{title}) . ' ' . $info->{content}, $info->{id});
    }

    return 1;
}
# Report index statistics and delete keys whose length falls outside
# [WORD_MIN_LEN, WORD_MAX_LEN]. Overly popular words are reported only.
sub sanitize_index {

    my @stale_keys;
    my $entry_count      = 0;
    my $single_ref_count = 0;

    while (my ($word, $packed) = each %WORDS_INDEX) {

        my $refs = decode_index_entry($packed);
        ++$entry_count;

        # Reference count = number of space-separated IDs in the record.
        my $popularity = 1 + ($refs =~ tr/ //);

        say "$popularity: $word" if $popularity > MAX_WORD_POPULARITY;
        ++$single_ref_count if $popularity == 1;

        if (length($word) < WORD_MIN_LEN or length($word) > WORD_MAX_LEN) {
            push @stale_keys, $word;
        }
    }

    say ":: The words index contains $entry_count entries.";
    say ":: The words index contains $single_ref_count entries with only one reference.";

    # Delete after iteration, so the `each` cursor is never invalidated.
    foreach my $word (@stale_keys) {
        delete $WORDS_INDEX{$word};
    }

    return 1;
}
# Command-line mode: handle maintenance flags, then crawl each URL in @ARGV.
# NOTE(review): the --depth option mentioned in the header comments is not
# handled here -- confirm whether it was dropped intentionally.
if (@ARGV) {

    require Getopt::Long;

    Getopt::Long::GetOptions(

        # Print index statistics and drop malformed keys.
        "sanitize-index" => sub {
            warn "Sanitizing index...\n";    # fixed typo: was "Sanitzing"
            sanitize_index();
            exit;
        },

        # Rebuild the index from the content database (slow).
        "fix-index|recover-index|repair-index" => sub {
            warn "Recovering index...\n";
            repair_index();
            exit;
        },
    );

    foreach my $url (@ARGV) {
        warn "Crawling: $url\n";
        crawl($url);

        # Flush after each root URL so progress survives interruptions.
        $index_db->sync;
        $content_db->sync;
    }

    #untie(%CONTENT_DB);
    #untie(%WORDS_INDEX);

    exit;
}
# FastCGI search front-end: one loop iteration per incoming request.
while (my $c = CGI::Fast->new) {

    my $query    = $c->param('q');           # search query
    my $id       = $c->param('text');        # page ID for the text-only cached view
    my $surprise = $c->param('surprise');    # "surprise me" mode: random index word

    # NOTE(review): an invalid $id makes $CONTENT_DB{$id} undef, which
    # decode_content_entry() would die on -- confirm ids are always valid here.
    my $info  = defined($id)   ? decode_content_entry($CONTENT_DB{$id})       : undef;
    my $title = defined($info) ? encode_utf8(encode_entities($info->{title})) : undef;

    # Emit the HTTP header (with hardening headers) and the HTML <head>.
    print header(
                 -charset                  => 'UTF-8',
                 'Referrer-Policy'         => 'no-referrer',
                 'X-Frame-Options'         => 'DENY',
                 'X-Xss-Protection'        => '1; mode=block',
                 'X-Content-Type-Options'  => 'nosniff',
                 'Content-Security-Policy' =>
                   q{default-src 'self'; frame-ancestors 'none'; form-action 'self'; base-uri 'self'; img-src 'self' data:;},
                ),
      start_html(
        -class => 'results_endpoint',
        -title => encode_utf8($query // $title // 'Surprise'),
        -meta  => {
                  'keywords' => 'dark search, search engine, private, secure',

                  #'viewport' => 'width=device-width, initial-scale=1.0',
                  'viewport' => 'width=device-width, initial-scale=1, maximum-scale=2.0, user-scalable=1',
                  'referrer' => 'no-referrer',
                 },
        -style => [
                   {
                    src => 'css/logicodev-dark.min.css',
                   },
                   {
                    src => 'css/bootstrap.min.css',
                   },
                   {
                    src => 'css/pre.css',
                   },

                   #~ {
                   #~ src => 'css/popup.css',
                   #~ },
                  ],
        -head => [
            Link(
                 {
                  -rel  => 'shortcut icon',
                  -type => 'image/png',
                  -href => 'img/favicon.png',
                 }
                ),

            # Advertise OpenSearch integration only when the descriptor exists.
            (-e "opensearch.xml")
            ? Link(
                   {
                    -rel   => 'search',
                    -type  => 'application/opensearchdescription+xml',
                    -title => 'Dark search',
                    -href  => 'opensearch.xml',
                   }
                  )
            : ()
                 ],
                );

    # Text-only cached view of a single page: show its stored content and stop.
    if (defined($id)) {
        say h4(
               {-class => "result_header"},
               a(
                 {
                  -href   => encode_utf8($info->{url}),
                  -target => "_blank",
                  -rel    => "noopener noreferrer",
                 },
                 b($title),
                )
              );

        print pre(encode_entities($info->{content}));
        print end_html();
        next;
    }

    # Navbar and search form (markup borrowed from the searX project).
    print <<"EOT";
<div class="searx-navbar"><span class="instance pull-left"><a href="/search">Home</a></span><span class="pull-right"><a href="$ENV{SCRIPT_NAME}?surprise=1">Surprise me</a></span></div>
<div class="container">
<form method="post" action="$ENV{SCRIPT_NAME}" id="search_form" role="search">
<div class="row">
<div class="col-xs-12 col-md-8">
<div class="input-group search-margin">
<input type="search" autofocus="" name="q" class="form-control autofocus" id="q" placeholder="${\encode_entities($query // '')}" aria-label="Search for..." autocomplete="off" value="" accesskey="s">
<span class="input-group-btn">
<button type="submit" class="btn btn-default" aria-label="Search"><span>Search</span></button>
</span>
</div>
</div>
<div class="col-xs-6 col-md-2 search-margin"><label class="visually-hidden" for="time-range">Time range</label></div>
<div class="col-xs-6 col-md-2 search-margin"><label class="visually-hidden" for="language">Language</label></div>
</div>
</form><!-- / #search_form_full -->
<div class="row">
<div class="col-sm-4 col-sm-push-8" id="sidebar_results">
</div><!-- /#sidebar_results -->
<div class="col-sm-8 col-sm-pull-4" id="main_results">
<h1 class="sr-only">Search results</h1>
EOT

    # "Surprise me": replace the query with a random word from the index.
    if ($surprise) {
        $query = surprise_me();
    }

    say q{<div class="result result-default">};

    # Time the search; skip searching entirely for blank queries.
    my $t0 = [gettimeofday];

    my @results;
    my $search_results = ((($query // '') =~ /\S/) ? search($query) : ());

    my $elapsed = tv_interval($t0, [gettimeofday]);

    if ($search_results) {

        @results = @{$search_results->{results}};
        my @words = @{$search_results->{words}};

        if (@words) {
            ## say p("Results found: ", b($search_results->{count}));
            say p("Term frequencies: " . join(", ", map { b($_) . ': ' . $search_results->{counts}{$_} } @words));
            say p(small(sprintf("Search took %.5f seconds", $elapsed)));
        }
    }

    # Render each ranked result.
    foreach my $result (@results) {

        my $url = $result->{url};

        if ($url !~ m{^https?://}) {
            $url = 'https://' . $url;
        }

        $url = sanitize_url($url);

        # Fall back to the URL when the title is missing or blank.
        my $title = $result->{title} // $url;

        if ($title !~ /\S/) {
            $title = $url;
        }

        # The title links to the local text-only cached view of the page.
        say h4(
            {-class => "result_header"},
            a(
                {
                 #-href => encode_utf8($url),
                 -href    => "$ENV{SCRIPT_NAME}?text=" . $result->{id},
                 -target  => "_blank",
                 -rel     => "noopener noreferrer",

                 #(defined($result->{description}) ? (-class => 'popup') : ()),
                },

                #(defined($result->{description}) ? small(span(encode_utf8(encode_entities($result->{description})))) : ()),
                #small(span($result->{content} =~ s/(\R)\1{2,}/$1/gr =~ s{\R}{<br/>}gr)),
                b(encode_utf8(encode_entities($title))),
             )
              );

        # Highlighted excerpt built by add_match_text_to_value() during ranking.
        say q{<p class="result-content">};
        say $result->{match};
        say q{</p>};

        say q{<div class="clearfix"></div>};
        say q{<div class="pull-right">};

        # Extract the date of the article (if any)
        if (EXTRACT_DATE) {
            require Date::Extract;
            my $date_extract = Date::Extract->new();
            if (my $dt = $date_extract->extract($result->{content})) {
                say small(scalar $dt->ymd);
                say q{<b> | </b>};
            }
        }

        # Web archive
        say small(
                  a(
                    {
                     -href   => encode_utf8('https://web.archive.org/web/' . $url),
                     -class  => 'text-info',
                     -target => '_blank',
                     -rel    => 'noopener noreferrer',
                    },
                    "cached",
                   ),
                 );

        say q{<b> | </b>};

        # Text only (cached version)
        say small(
                  a(
                    {
                     #-href => "$ENV{SCRIPT_NAME}?text=" . $result->{id},
                     -href   => encode_utf8($url),
                     -class  => 'text-info',
                     -target => '_blank',
                     -rel    => 'noopener noreferrer',
                    },
                    "text",
                   )
                 );

        say q{<b> | </b>};
        say small("rank: $result->{score}");

        say "</div>";    # end of 'pull-right' div
        say div({-class => "external-link"}, encode_utf8($url));
    }

    say "</div>";

    # Close the page layout.
    print <<'EOT';
<div class="clearfix"></div>
<div class="clearfix"></div>
</div><!-- /#main_results -->
</div>
</div>
EOT

    print end_html;
}
|