search_berkeley_deep.fcgi

#!/usr/bin/perl

# Author: Trizen
# Date: 08 January 2022
# Edit: 05 February 2022
# https://github.com/trizen

# A private search engine, with its own crawler running over Tor (respecting robots.txt).

# Using some HTML and CSS code from the searX project (++):
#   https://github.com/searx/searx

# To crawl a website, pass it as an argument to this script.
# By default, depth = 0. Use --depth=i to increase the crawling depth.

# Example:
#   perl search.fcgi --depth=i [URL]

# Other script options:
#   --recrawl        : activate recrawl mode
#   --fix-index      : fix the index in case it gets messed up (slow operation)
#   --sanitize-index : sanitize the index and show some stats

# Limitations:
#   - the search engine cannot be used while the crawler is running
#   - the crawler cannot be used while the search engine is running

# Useful videos on this topic:
#
#   "The Inverted Index" - Stanford NLP - Professor Dan Jurafsky & Chris Manning
#   https://yewtu.be/watch?v=bnP6TsqyF30
#
#   "Query Processing with the Inverted Index" - Stanford NLP - Dan Jurafsky & Chris Manning
#   https://yewtu.be/watch?v=B-e297yK50U
#
#   "Phrase Queries and Positional Indexes" - Stanford NLP - Professor Dan Jurafsky & Chris Manning
#   https://yewtu.be/watch?v=PkjuJZSrudE
use utf8;
use 5.036;

no warnings qw(once);

#use autodie;
#use experimental qw(signatures);

use CGI::Fast;
use CGI qw/:standard *table -utf8/;

#use CGI::Carp qw(fatalsToBrowser);
#use IO::Compress::Zstd qw(zstd);
#use IO::Uncompress::UnZstd qw(unzstd);
#use URI::Escape qw(uri_escape_utf8);

use Text::Unidecode qw(unidecode);
use Text::ParseWords qw(quotewords);
use HTML::Entities qw(encode_entities);
use Time::HiRes qw(gettimeofday tv_interval);

use ntheory qw(forcomb binomial);
use List::Util qw(uniq max);

use JSON::XS qw(decode_json encode_json);
use Encode qw(decode_utf8 encode_utf8);
use constant {

    # Cache HTML content (using CHI and WWW::Mechanize::Cached)
    CACHE => 0,

    # Use Tor proxy for crawling (127.0.0.1:9050)
    USE_TOR => 0,

    # Compress the values of the content database with Zstandard.
    # When enabled, the content database will be ~3x smaller.
    USE_ZSTD => 1,

    # xxHash seed (don't change it)
    XXHASH_SEED => 42,

    # Minimum and maximum number of characters for words stored in the index.
    WORD_MIN_LEN => 3,
    WORD_MAX_LEN => 45,

    # Maximum number of top search results to return.
    MAX_SEARCH_RESULTS => 200,

    # Show the description of each website in search results (if available).
    # When disabled, a snippet of the content will be shown instead.
    SHOW_DESCRIPTION => 1,

    # Respect the rules from robots.txt
    RESPECT_ROBOT_RULES => 1,

    # Include only the results that fully match the given query
    EXACT_MATCH => 0,

    # Include all results that contain every word from the query, not necessarily consecutively
    FAST_MATCH => 1,

    # Highlight all words from the query in search results (will produce longer descriptions)
    HIGHLIGHT_ALL_KEYWORDS => 1,

    # Rank the results based on the content of the pages (better ranking, but much slower)
    RANK_ON_CONTENT => 1,

    # Rank the results based on boundary matches (with \b)
    RANK_ON_BOUNDARY_MATCH => 0,

    # Rank the results based on non-boundary matches (without \b)
    RANK_ON_NON_BOUNDARY_MATCH => 1,

    # Maximum number of iterations to spend during the ranking process.
    MAX_RANK_ITERATIONS => 10_000,

    # Make sure the SSL certificate is valid.
    SSL_VERIFY_HOSTNAME => 0,

    # Extract the date of the article and display it in search results (slow)
    EXTRACT_DATE => 0,

    # On "403 Forbidden" or "429 Too Many Requests" status, try to crawl the Web Archive version.
    CRAWL_ARCHIVE_FORBIDDEN => 1,

    # Word popularity limit (ignore words with popularity larger than this)
    MAX_WORD_POPULARITY => 10_000,
};
# List of tracking query parameters to remove from URLs
my @tracking_parameters = qw(
    ac itc
    yclid fbclid gclsrc
    utm_source utm_medium utm_term
    utm_content utm_campaign utm_referrer
    mtm_kwd mtm_campaign mtm_medium
    __hssc __hstc __s _hsenc _openstat dclid fb_ref gclid
    hsCtaTracking igshid mc_eid mkt_tok ml_subscriber ml_subscriber_hash
    msclkid oly_anon_id oly_enc_id rb_clickid s_cid vero_conv vero_id wickedid
);
binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');
binmode(STDERR, ':utf8');

if (USE_ZSTD) {
    require IO::Compress::Zstd;
    require IO::Uncompress::UnZstd;
}

my %hostname_alternatives = (
    youtube => 'yewtu.be',
    reddit  => 'teddit.net',
    medium  => 'scribe.rip',
    twitter => 'nitter.net',
    odysee  => 'lbry.projectsegfau.lt',
);
my $cookie_file     = 'cookies.txt';
my $crawled_db_file = "content_berkeley_deep.db";
my $index_db_file   = "index_berkeley_deep.db";

use DB_File;    # provides the O_* flags and $DB_HASH used below

# Open the databases read-only when searching; read-write when crawling.
my $DB_OPTIONS = O_RDONLY;

if (@ARGV) {
    $DB_OPTIONS = O_CREAT | O_RDWR;
}

my $content_db = tie(my %CONTENT_DB, 'DB_File', $crawled_db_file, $DB_OPTIONS, 0666, $DB_HASH)
  or die "Can't create/access database <<$crawled_db_file>>: $!";

my $index_db = tie(my %WORDS_INDEX, 'DB_File', $index_db_file, $DB_OPTIONS, 0666, $DB_HASH)
  or die "Can't create/access database <<$index_db_file>>: $!";

local $SIG{INT} = sub {
    $index_db->sync;
    $content_db->sync;
    #untie %CONTENT_DB;
    #untie %WORDS_INDEX;
    exit;
};
my ($mech, $lwp, $robot_rules);

if (@ARGV) {

    my %mech_options = (
        timeout       => 20,
        autocheck     => 0,
        show_progress => 1,
        stack_depth   => 10,
        cookie_jar    => {},
        ssl_opts      => {verify_hostname => SSL_VERIFY_HOSTNAME, Timeout => 20},
        agent         => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
    );

    if (CACHE) {
        require File::Basename;
        require File::Spec::Functions;

        require CHI;
        require WWW::Mechanize::Cached;

        my $cache = CHI->new(
            driver   => 'BerkeleyDB',
            root_dir => File::Spec::Functions::catdir(File::Basename::dirname(File::Spec::Functions::rel2abs($0)), 'cache')
        );

        $mech = WWW::Mechanize::Cached->new(%mech_options, cache => $cache);
    }
    else {
        require WWW::Mechanize;
        $mech = WWW::Mechanize->new(%mech_options);
    }

    # LWP::UserAgent is already loaded as the parent class of WWW::Mechanize
    $lwp = LWP::UserAgent->new(%mech_options);

    if (USE_TOR) {    # set Tor proxy
        $mech->proxy(['http', 'https'], "socks://127.0.0.1:9050");
        $lwp->proxy(['http', 'https'], "socks://127.0.0.1:9050");
    }

    require WWW::RobotRules;
    $robot_rules = WWW::RobotRules->new($mech->agent);

    state $accepted_encodings = HTTP::Message::decodable();

    my %default_headers = (
        'Accept-Encoding'           => $accepted_encodings,
        'Accept'                    => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language'           => 'en-US,en;q=0.5',
        'Connection'                => 'keep-alive',
        'Upgrade-Insecure-Requests' => '1',
    );

    foreach my $key (sort keys %default_headers) {
        $mech->default_header($key, $default_headers{$key});
        $lwp->default_header($key, $default_headers{$key});
    }

    require LWP::ConnCache;
    my $cache = LWP::ConnCache->new;
    $cache->total_capacity(undef);    # no limit

    $mech->conn_cache($cache);
    $lwp->conn_cache($cache);

    # Support for cookies from file
    if (defined($cookie_file) and -f $cookie_file) {

        ## Netscape HTTP Cookies

        # Firefox extension:
        #   https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/

        # See also:
        #   https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl

        require HTTP::Cookies::Netscape;

        my $cookies = HTTP::Cookies::Netscape->new(
            hide_cookie2 => 1,
            autosave     => 1,
            file         => $cookie_file,
        );

        $cookies->load;
        $mech->cookie_jar($cookies);
    }
}
sub lwp_get ($url) {

    my $resp = $lwp->get($url);

    if ($resp->is_success) {
        return $resp->decoded_content;
    }

    return undef;
}

sub extract_words ($text) {
    grep { length($_) >= WORD_MIN_LEN and length($_) <= WORD_MAX_LEN and /[[:alnum:]]/ }
      uniq(split(/[_\W]+/, CORE::fc($text)));
}
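
# Example (illustrative): extract_words("Hello, World! Hello") returns
# ("hello", "world") -- the text is case-folded, split on non-word characters,
# deduplicated, and filtered by WORD_MIN_LEN/WORD_MAX_LEN.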
sub zstd_encode ($data) {
    IO::Compress::Zstd::zstd(\$data, \my $zstd_data)
      or die "zstd failed: $IO::Compress::Zstd::ZstdError\n";
    return $zstd_data;
}

sub zstd_decode ($zstd_data) {
    IO::Uncompress::UnZstd::unzstd(\$zstd_data, \my $decoded_data)
      or die "unzstd failed: $IO::Uncompress::UnZstd::UnZstdError\n";
    return $decoded_data;
}

sub encode_content_entry ($entry) {

    my $data = encode_json($entry);

    if (USE_ZSTD) {
        $data = zstd_encode($data);
    }

    return $data;
}

sub decode_content_entry ($entry) {

    my $data = $entry;

    if (USE_ZSTD) {
        $data = zstd_decode($data);
    }

    return decode_json($data);
}

sub encode_index_entry ($entry) {

    my $data = $entry;

    if (USE_ZSTD) {
        $data = zstd_encode($data);
    }

    return $data;
}

sub decode_index_entry ($entry) {

    my $data = $entry;

    if (USE_ZSTD) {
        $data = zstd_decode($data);
    }

    return $data;
}
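
# On-disk formats used by the codecs above:
#   content DB  : id   => JSON-encoded page entry (title, url, content, ...), optionally zstd-compressed
#   words index : word => space-separated list of page ids (a postings list), optionally zstd-compressed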
sub surprise_me {

    # Pick a pseudo-random, moderately popular word from the index
    # (used by the "Surprise me" link).
    while (my ($word, $value) = each %WORDS_INDEX) {
        if (length($word) >= 5 and rand() < 0.1) {
            my $entry     = decode_index_entry($value);
            my $ref_count = ($entry =~ tr/ //);    # space count ~= number of references
            if ($ref_count >= 10 and $ref_count <= 1000) {
                return $word;
            }
        }
    }

    return undef;
}
sub sanitize_url ($url) {

    # Replace some bad hostnames with better alternatives

    my $protocol = '';

    if ($url =~ m{^(https?://)(.+)}s) {
        $protocol = $1;
        $url      = $2;
    }

    # Normalize the URL
    ## $url = normalize_url($protocol . $url);

    # YouTube
    $url =~ s{^(?:www\.)?youtube\.com(?=[/?])}{$hostname_alternatives{youtube}};
    $url =~ s{^(?:www\.)?youtu\.be(?=[/?])}{$hostname_alternatives{youtube}};

    # Reddit (doesn't work for comments)
    ## $url =~ s{^(?:www\.)?reddit\.com(?=[/?])}{$hostname_alternatives{reddit}};

    # Twitter
    $url =~ s{^(?:www\.)?twitter\.com(?=/\w+\z)}{$hostname_alternatives{twitter}};
    $url =~ s{^(?:www\.)?twitter\.com(?=/\w+/status/)}{$hostname_alternatives{twitter}};

    # Medium
    $url =~ s{^(?:www\.)?medium\.com(?=[/?])}{$hostname_alternatives{medium}};

    # Odysee / LBRY
    $url =~ s{^(?:www\.)?odysee\.com(?=[/?])}{$hostname_alternatives{odysee}};
    $url =~ s{^(?:www\.)?open\.lbry\.com(?=[/?])}{$hostname_alternatives{odysee}};
    $url =~ s{^(?:www\.)?lbry\.com(?=[/?])}{$hostname_alternatives{odysee}};
    $url =~ s{^(?:www\.)?lbry\.tv(?=[/?])}{$hostname_alternatives{odysee}};

    return ($protocol . $url);
}
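
# Example (illustrative):
#   sanitize_url("https://www.youtube.com/watch?v=abc")   # => "https://yewtu.be/watch?v=abc"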
sub normalize_url ($url) {

    #$url =~ s/#.*//sr =~ s{^https?://(?:www\.)?}{}r =~ s{/+\z}{}r;

    require URL::Normalize;
    my $normalizer = URL::Normalize->new(url => $url);

    # Remove tracking query parameters
    $normalizer->remove_query_parameters(\@tracking_parameters);

    my $normalize = sub ($url, $method) {
        my $obj = URL::Normalize->new(url => $url);
        $obj->$method;
        $obj->url;
    };

    my $normalized_url = $normalizer->url;

    foreach my $method (
        qw(
        remove_directory_index
        remove_fragment
        remove_fragments
        remove_duplicate_slashes
        remove_empty_query_parameters
        sort_query_parameters
        make_canonical
        remove_empty_query
        )
      ) {
        $normalized_url = $normalize->($normalized_url, $method);
    }

    # Remove the protocol
    $normalized_url =~ s{^https?://}{};

    return $normalized_url;
}
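
# Example (illustrative):
#   normalize_url("https://example.com/a//b/?utm_source=x&z=1&a=2#frag")
# would drop the protocol, fragment and tracking parameter, collapse the
# duplicate slashes, and sort the query: "example.com/a/b/?a=2&z=1"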
sub add_to_database_index ($text, $key) {

    foreach my $word (extract_words($text)) {

        if (exists $WORDS_INDEX{$word}) {

            my $entry = decode_index_entry($WORDS_INDEX{$word});

            #<<<
            #~ if (($entry =~ tr/ //) >= MAX_WORD_POPULARITY) {
            #~     next;
            #~ }
            #>>>

            # Replace the old entry with the extended postings list
            delete $WORDS_INDEX{$word};
            $WORDS_INDEX{$word} = encode_index_entry($entry . ' ' . $key);
        }
        else {
            $WORDS_INDEX{$word} = encode_index_entry($key);
        }
    }

    return 1;
}

sub readd_to_database_index ($text, $key) {

    foreach my $word (extract_words($text)) {

        if (exists $WORDS_INDEX{$word}) {
            my $entry = decode_index_entry($WORDS_INDEX{$word});
            delete $WORDS_INDEX{$word};
            $WORDS_INDEX{$word} = encode_index_entry(join(' ', uniq(split(' ', $entry), $key)));
        }
        else {
            $WORDS_INDEX{$word} = encode_index_entry($key);
        }
    }

    return 1;
}
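
# add_to_database_index() blindly appends and can therefore duplicate an id in
# a postings list; readd_to_database_index() deduplicates via uniq(), which is
# why repair_index() below uses the latter.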
sub valid_content_type {
    $mech->is_html() or (lc($mech->content_type) =~ m{^(?:text/|message/)});
}

sub extract_hostname ($url) {
    normalize_url(sanitize_url("$url")) =~ s{/.*}{}sr;
}

sub extract_protocol ($url) {
    ("$url" =~ m{^https://}) ? 'https://' : 'http://';
}
sub crawl ($url, $seen_hostname = {}) {

    # Must be http:// or https://
    $url =~ m{^https?://} or return;

    # Sanitize url
    $url = sanitize_url($url);

    # Check if we're allowed to crawl this URL
    if (RESPECT_ROBOT_RULES and not $robot_rules->allowed($url)) {
        warn "Not allowed to crawl: $url\n";
        return;
    }

    require Digest::xxHash;
    my $id = Digest::xxHash::xxhash32_hex(encode_utf8(normalize_url($url)), XXHASH_SEED);

    if (keys(%$seen_hostname) and exists($CONTENT_DB{$id})) {
        return 1;
    }

    my $resp = $mech->head($url);

    if ($resp->is_success) {
        valid_content_type() || return;
    }

    $url  = $mech->uri;
    $url  = sanitize_url("$url");
    $resp = $mech->get($url);

    # On HTTP 400+ errors, try again with WebArchive
    if (CRAWL_ARCHIVE_FORBIDDEN and $resp->code >= 400) {
        if ($url !~ m{^https://web\.archive\.org/}) {
            return crawl(join('', "https://web.archive.org/web/1990/", extract_protocol($url), normalize_url($url)),
                         $seen_hostname);
        }
    }

    $resp->is_success or return;

    if (not valid_content_type()) {
        $mech->invalidate_last_request() if CACHE;
        return;
    }

    $url = $mech->uri;
    $url = sanitize_url("$url");

    my $normalized_url = normalize_url($url);
    my $protocol       = extract_protocol($url);

    if (not exists $CONTENT_DB{$id}) {

        my %info;
        my $decoded_content = $resp->decoded_content() // $resp->content() // return;

        if ($mech->is_html) {

            if (not exists $INC{'HTML::TreeBuilder'}) {
                require HTML::TreeBuilder;
                HTML::TreeBuilder->VERSION(5);
                HTML::TreeBuilder->import('-weak');
            }

            my $tree = HTML::TreeBuilder->new();
            $tree->parse($decoded_content);
            $tree->eof();
            $tree->elementify();    # just for safety

            require HTML::FormatText;
            my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 1000);
            $info{content} = $formatter->format($tree);
        }
        else {
            $info{content} = $decoded_content;
        }

        $info{title} = $mech->title;

        # Convert Unicode to ASCII
        $info{content} = unidecode($info{content});

        if ($mech->is_html) {

            # Parse HTML header for extracting metadata
            my $html_head_parser = HTML::HeadParser->new;
            $html_head_parser->parse($decoded_content);

            $info{title} ||= $html_head_parser->header('Title');
            $info{keywords}    = $html_head_parser->header('X-Meta-Keywords');
            $info{description} = $html_head_parser->header('X-Meta-Description');
        }

        $info{title} ||= $normalized_url;

        $info{id}  = $id;
        $info{url} = $protocol . $normalized_url;

        warn "Adding: $info{title}\nURI: $info{url}\n";

        my $relevant_content = join(' ', unidecode($normalized_url), unidecode($info{title}), $info{content});

        add_to_database_index($relevant_content, $id);
        $CONTENT_DB{$id} = encode_content_entry(\%info);
    }

    if (RESPECT_ROBOT_RULES) {

        my $host = $normalized_url =~ s{/.*}{}sr;
        ## my $host = URI->new($url)->host;

        $seen_hostname->{$host} = 1;

        my $robots_url = $protocol . join('/', $host, 'robots.txt');
        my $robots_txt = lwp_get($robots_url);

        $robot_rules->parse($robots_url, $robots_txt) if defined($robots_txt);
    }

    my @links = $mech->find_all_links(text_regex => qr/./);

    foreach my $link (@links) {
        my $abs_url = join('', $link->url_abs);
        my $host    = extract_hostname($abs_url);
        next if $seen_hostname->{$host};
        crawl($abs_url, $seen_hostname);
        $seen_hostname->{$host} = 1;
    }

    return 1;
}
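
# Note on crawl(): documents are keyed by the 32-bit xxHash of their normalized
# URL, so re-crawling an already-indexed page is a cheap no-op, and each
# hostname is followed at most once per run (tracked in %$seen_hostname).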
sub add_match_text_to_value ($text, $value, $i, $j) {

    if (!HIGHLIGHT_ALL_KEYWORDS) {
        exists($value->{match}) and return 1;
    }

    my $prefix_len = 50;
    my $suffix_len = 200;

    my $match_content = substr($text, $i, $j - $i);

    if ($j + $suffix_len > length($text)) {
        $prefix_len += $j + $suffix_len - length($text);
    }

    if ($i - $prefix_len < 0) {
        $prefix_len = $i;
    }

    my $prefix_content = substr($text, $i - $prefix_len, $prefix_len);
    my $suffix_content = substr($text, $j, $suffix_len);

    foreach ($match_content, $prefix_content, $suffix_content) {
        s/\s+/ /g;
        s/(\W)\1{2,}/$1/g;
    }

    $value->{match} .=
        encode_entities($prefix_content) . '<b>'
      . encode_entities($match_content) . '</b>'
      . encode_entities($suffix_content)
      . (HIGHLIGHT_ALL_KEYWORDS ? ' [...] ' : '');

    return 1;
}
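
# add_match_text_to_value() builds the highlighted snippet shown under a search
# result: given the match offsets ($i, $j), it keeps up to 50 characters of
# context before the match and 200 after, collapses whitespace and repeated
# punctuation, and wraps the match itself in <b>...</b> (joining multiple
# matches with " [...] " when HIGHLIGHT_ALL_KEYWORDS is enabled).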
sub set_intersection ($sets) {

    my @sets = @$sets;
    @sets || return;

    # Optimization: sort the sets by their number of elements
    @sets = sort { scalar(@$a) <=> scalar(@$b) } @sets;

    my $intersection = {};
    @{$intersection}{@{shift(@sets)}} = ();

    while (@sets) {

        my %curr;
        @curr{@{shift(@sets)}} = ();

        my %tmp;
        foreach my $key (keys %$intersection) {
            if (exists $curr{$key}) {
                undef $tmp{$key};
            }
        }

        $intersection = \%tmp;
    }

    return keys %$intersection;
}
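
# Example (illustrative): set_intersection([[1, 2, 3], [2, 3, 4], [3, 2]])
# returns (2, 3), in no particular order. Starting from the smallest set keeps
# the working hash as small as possible.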
sub search ($text) {

    $text = unidecode($text);

    my %seen;
    my %matches;

    my @words       = extract_words($text);
    my @known_words = grep { exists($WORDS_INDEX{$_}) } @words;

    my @ref_sets;
    my %counts;

    foreach my $word (@known_words) {
        my @refs = split(' ', decode_index_entry($WORDS_INDEX{$word}));
        $counts{$word} = scalar(@refs);
        push @ref_sets, \@refs;
    }

    foreach my $key (set_intersection(\@ref_sets)) {
        $matches{$key} = eval { decode_content_entry($CONTENT_DB{$key}) } // next;
    }

    my @original_words = map {
        join('\W+', map { quotemeta($_) } split(' '))
    } grep { length($_) >= 2 } quotewords(qr/\s+/, 0, $text);

    if (not @original_words) {
        @original_words = map { quotemeta($_) } grep { length($_) >= 2 } split(/\W+/, $text);
    }

    my $ranking_cost  = 0;
    my $matches_count = scalar(keys %matches);

    my @regexes;

    for (my $k = scalar(@original_words) ; $k >= 1 ; --$k) {

        if (FAST_MATCH) {
            $k == 1 or next;
        }

        # Stay within the ranking budget: skip subset sizes whose cumulative
        # regex-matching cost over all matches would exceed MAX_RANK_ITERATIONS.
        my $current_cost =
          ((RANK_ON_NON_BOUNDARY_MATCH ? 1 : 0) + (RANK_ON_BOUNDARY_MATCH ? 1 : 0)) * binomial(scalar(@original_words), $k);

        if ($matches_count * ($ranking_cost + $current_cost) > max($matches_count, MAX_RANK_ITERATIONS)) {
            next;
        }

        $ranking_cost += $current_cost;

        #<<<
        forcomb {
            my @subset  = @original_words[@_];
            my $regex   = join('.{0,10}', @subset);
            my $b_regex = join('\b.{0,10}\b', @subset);

            #my $regex   = join('\W*+', @subset);
            #my $b_regex = join('\b\W*+\b', @subset);

            push @regexes,
              scalar {
                (RANK_ON_NON_BOUNDARY_MATCH ? (re   => qr/$regex/si)       : ()),
                (RANK_ON_BOUNDARY_MATCH     ? (b_re => qr/\b$b_regex\b/si) : ()),
                factor => $k,
              };
        } scalar(@original_words), $k;
        #>>>

        EXACT_MATCH && last;
    }
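
    # Scoring weights applied below, per matching regex (factor = subset size,
    # halved for non-boundary matches):
    #   URL      -> +4 * factor
    #   title    -> +2 * factor
    #   keywords -> +2 * factor
    #   content  -> +1 * factor (only when RANK_ON_CONTENT is enabled)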
    foreach my $key (keys %matches) {

        my $value = $matches{$key};
        $value->{score} = 0;

        if ($value->{url} !~ m{^https?://}) {
            $value->{url} = 'https://' . $value->{url};
        }

        my $content     = $value->{content} // '';
        my $title       = unidecode($value->{title} // '');
        my $description = unidecode($value->{description} // '');
        my $keywords    = unidecode($value->{keywords} // '');
        my $url         = unidecode($value->{url} // '');

        foreach my $regex (@regexes) {
            foreach my $re_type (qw(b_re re)) {

                my $re     = $regex->{$re_type} // next;
                my $factor = $regex->{factor} * ($re_type eq 'b_re' ? 1 : 0.5);

                if ($title =~ $re) {
                    $value->{score} += 2 * $factor;
                }

                if ($description =~ $re) {
                    ## $value->{score} += 1 * $factor;
                    if (SHOW_DESCRIPTION
                        and $re_type eq (RANK_ON_BOUNDARY_MATCH ? 'b_re' : 're')) {
                        add_match_text_to_value($description, $value, $-[0], $+[0]);
                    }
                }

                if (RANK_ON_CONTENT and $content =~ $re) {
                    $value->{score} += $factor;
                    if ($re_type eq (RANK_ON_BOUNDARY_MATCH ? 'b_re' : 're')) {
                        add_match_text_to_value($content, $value, $-[0], $+[0]);
                    }
                }

                if ($keywords =~ $re) {
                    $value->{score} += 2 * $factor;
                }

                if ($url =~ $re) {
                    $value->{score} += 4 * $factor;
                }
            }
        }

        ## delete $value->{content};
    }

    my %seen_url;
    my @sorted = sort { $b->{score} <=> $a->{score} } values %matches;

    my $results_count = scalar(@sorted);

    # Keep only the top best entries
    $#sorted = (MAX_SEARCH_RESULTS - 1) if (scalar(@sorted) > MAX_SEARCH_RESULTS);

    # Keep entries with score > 0
    @sorted = grep { $_->{score} > 0 } @sorted;

    # Prefer longer content for results with the same score
    @sorted = map  { $_->[0] }
              sort { ($b->[1] <=> $a->[1]) || ($b->[2] <=> $a->[2]) }
              map  { [$_, $_->{score}, length($_->{content})] } @sorted;

    # Fix some ArchWiki links
    foreach my $entry (@sorted) {
        $entry->{url} =~ s{^https://wiki\.archlinux\.org//}{https://wiki.archlinux.org/title/};
    }

    # Remove duplicated entries
    @sorted = grep { !$seen_url{(($_->{url} =~ s{^https?://(?:www\.)?}{}r) =~ s{#.*}{}sr) =~ s{[/?]+\z}{}r}++ } @sorted;

    return {
        results => \@sorted,
        counts  => \%counts,
        words   => \@known_words,
        count   => $results_count,
    };
}
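
# Example (illustrative shape of the return value; the numbers are made up):
#   search("inverted index") => {
#       results => [...],                            # at most MAX_SEARCH_RESULTS entries, score > 0
#       counts  => {inverted => 12, index => 97},    # per-word posting-list sizes
#       words   => ["inverted", "index"],            # query words found in the index
#       count   => 8,                                # total matches before truncation
#   }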
sub repair_index {    # very slow operation

    while (my ($key, $value) = each %CONTENT_DB) {
        my $info = eval { decode_content_entry($value) } // next;
        readd_to_database_index(unidecode($info->{title}) . ' ' . $info->{content}, $info->{id});
    }

    return 1;
}

sub sanitize_index {

    my @for_delete_keys;

    my $index_len = 0;
    my $uniq_refs = 0;

    while (my ($key, $value) = each %WORDS_INDEX) {

        my $entry = decode_index_entry($value);
        ++$index_len;

        my $ref_count = 1 + ($entry =~ tr/ //);

        if ($ref_count > MAX_WORD_POPULARITY) {
            say "$ref_count: $key";
        }

        if ($ref_count == 1) {
            ++$uniq_refs;
        }

        if (length($key) < WORD_MIN_LEN or length($key) > WORD_MAX_LEN) {
            push @for_delete_keys, $key;
        }
    }

    say ":: The words index contains $index_len entries.";
    say ":: The words index contains $uniq_refs entries with only one reference.";

    foreach my $key (@for_delete_keys) {
        delete $WORDS_INDEX{$key};
    }

    return 1;
}
if (@ARGV) {

    require Getopt::Long;
    Getopt::Long::GetOptions(
        "sanitize-index" => sub {
            warn "Sanitizing index...\n";
            sanitize_index();
            exit;
        },
        "fix-index|recover-index|repair-index" => sub {
            warn "Recovering index...\n";
            repair_index();
            exit;
        },
    );

    foreach my $url (@ARGV) {
        warn "Crawling: $url\n";
        crawl($url);
        $index_db->sync;
        $content_db->sync;
    }

    #untie(%CONTENT_DB);
    #untie(%WORDS_INDEX);

    exit;
}
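
# Typical invocations (illustrative URL):
#
#   perl search_berkeley_deep.fcgi https://example.net    # crawl a website
#   perl search_berkeley_deep.fcgi --sanitize-index       # index stats + cleanup
#   perl search_berkeley_deep.fcgi --fix-index            # rebuild the index (slow)
#
# With no arguments, the script runs as a FastCGI search endpoint (below).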
while (my $c = CGI::Fast->new) {

    my $query    = $c->param('q');
    my $id       = $c->param('text');
    my $surprise = $c->param('surprise');

    my $info  = defined($id)   ? decode_content_entry($CONTENT_DB{$id})          : undef;
    my $title = defined($info) ? encode_utf8(encode_entities($info->{title})) : undef;

    print header(
        -charset                  => 'UTF-8',
        'Referrer-Policy'         => 'no-referrer',
        'X-Frame-Options'         => 'DENY',
        'X-Xss-Protection'        => '1; mode=block',
        'X-Content-Type-Options'  => 'nosniff',
        'Content-Security-Policy' =>
          q{default-src 'self'; frame-ancestors 'none'; form-action 'self'; base-uri 'self'; img-src 'self' data:;},
      ),
      start_html(
        -class => 'results_endpoint',
        -title => encode_utf8($query // $title // 'Surprise'),
        -meta  => {
            'keywords' => 'dark search, search engine, private, secure',

            #'viewport' => 'width=device-width, initial-scale=1.0',
            'viewport' => 'width=device-width, initial-scale=1, maximum-scale=2.0, user-scalable=1',
            'referrer' => 'no-referrer',
        },
        -style => [
            {src => 'css/logicodev-dark.min.css'},
            {src => 'css/bootstrap.min.css'},
            {src => 'css/pre.css'},

            #~ {src => 'css/popup.css'},
        ],
        -head => [
            Link(
                {
                 -rel  => 'shortcut icon',
                 -type => 'image/png',
                 -href => 'img/favicon.png',
                }
            ),
            (-e "opensearch.xml")
            ? Link(
                {
                 -rel   => 'search',
                 -type  => 'application/opensearchdescription+xml',
                 -title => 'Dark search',
                 -href  => 'opensearch.xml',
                }
              )
            : ()
        ],
      );

    # Cached text-only view of a crawled page
    if (defined($id)) {

        say h4(
            {-class => "result_header"},
            a(
                {
                 -href   => encode_utf8($info->{url}),
                 -target => "_blank",
                 -rel    => "noopener noreferrer",
                },
                b($title),
            )
        );

        print pre(encode_entities($info->{content}));
        print end_html();
        next;
    }
    print <<"EOT";
<div class="searx-navbar"><span class="instance pull-left"><a href="/search">Home</a></span><span class="pull-right"><a href="$ENV{SCRIPT_NAME}?surprise=1">Surprise me</a></span></div>
<div class="container">
<form method="post" action="$ENV{SCRIPT_NAME}" id="search_form" role="search">
    <div class="row">
        <div class="col-xs-12 col-md-8">
            <div class="input-group search-margin">
                <input type="search" autofocus="" name="q" class="form-control autofocus" id="q" placeholder="${\encode_entities($query // '')}" aria-label="Search for..." autocomplete="off" value="" accesskey="s">
                <span class="input-group-btn">
                    <button type="submit" class="btn btn-default" aria-label="Search"><span>Search</span></button>
                </span>
            </div>
        </div>
        <div class="col-xs-6 col-md-2 search-margin"><label class="visually-hidden" for="time-range">Time range</label></div>
        <div class="col-xs-6 col-md-2 search-margin"><label class="visually-hidden" for="language">Language</label></div>
    </div>
</form><!-- / #search_form_full -->
<div class="row">
    <div class="col-sm-4 col-sm-push-8" id="sidebar_results">
    </div><!-- /#sidebar_results -->
    <div class="col-sm-8 col-sm-pull-4" id="main_results">
        <h1 class="sr-only">Search results</h1>
EOT
    if ($surprise) {
        $query = surprise_me();
    }

    say q{<div class="result result-default">};

    my $t0 = [gettimeofday];

    my @results;
    my $search_results = ((($query // '') =~ /\S/) ? search($query) : ());

    my $elapsed = tv_interval($t0, [gettimeofday]);

    if ($search_results) {

        @results = @{$search_results->{results}};
        my @words = @{$search_results->{words}};

        if (@words) {
            ## say p("Results found: ", b($search_results->{count}));
            say p("Term frequencies: " . join(", ", map { b($_) . ': ' . $search_results->{counts}{$_} } @words));
            say p(small(sprintf("Search took %.5f seconds", $elapsed)));
        }
    }

    foreach my $result (@results) {

        my $url = $result->{url};

        if ($url !~ m{^https?://}) {
            $url = 'https://' . $url;
        }

        $url = sanitize_url($url);

        my $title = $result->{title} // $url;

        if ($title !~ /\S/) {
            $title = $url;
        }

        say h4(
            {-class => "result_header"},
            a(
                {
                 #-href   => encode_utf8($url),
                 -href   => "$ENV{SCRIPT_NAME}?text=" . $result->{id},
                 -target => "_blank",
                 -rel    => "noopener noreferrer",

                 #(defined($result->{description}) ? (-class => 'popup') : ()),
                },

                #(defined($result->{description}) ? small(span(encode_utf8(encode_entities($result->{description})))) : ()),
                #small(span($result->{content} =~ s/(\R)\1{2,}/$1/gr =~ s{\R}{<br/>}gr)),
                b(encode_utf8(encode_entities($title))),
            )
        );

        say q{<p class="result-content">};
        say $result->{match};
        say q{</p>};

        say q{<div class="clearfix"></div>};
        say q{<div class="pull-right">};

        # Extract the date of the article (if any)
        if (EXTRACT_DATE) {
            require Date::Extract;
            my $date_extract = Date::Extract->new();
            if (my $dt = $date_extract->extract($result->{content})) {
                say small(scalar $dt->ymd);
                say q{<b> | </b>};
            }
        }

        # Web archive
        say small(
            a(
                {
                 -href   => encode_utf8('https://web.archive.org/web/' . $url),
                 -class  => 'text-info',
                 -target => '_blank',
                 -rel    => 'noopener noreferrer',
                },
                "cached",
            ),
        );

        say q{<b> | </b>};

        # Link to the live page (the title above links to the cached text-only view)
        say small(
            a(
                {
                 #-href   => "$ENV{SCRIPT_NAME}?text=" . $result->{id},
                 -href   => encode_utf8($url),
                 -class  => 'text-info',
                 -target => '_blank',
                 -rel    => 'noopener noreferrer',
                },
                "text",
            )
        );

        say q{<b> | </b>};
        say small("rank: $result->{score}");

        say "</div>";    # end of 'pull-right' div
        say div({-class => "external-link"}, encode_utf8($url));
    }

    say "</div>";

    print <<'EOT';
<div class="clearfix"></div>
<div class="clearfix"></div>
</div><!-- /#main_results -->
</div>
</div>
EOT

    print end_html;
}