#!/usr/bin/env python3
# adnan360/code-backups
# Script to find out unique words in a given text.
# Modified to act as a Diceware list generator for Bengali/Bangla.

# Original script is from: https://pythonexamples.org/python-find-unique-words-in-text-file/
# But evolved a lot since then.

# License: MIT (Expat)

# Usage:
# - Keep your Bangla text in data.txt
# - Run this script:
#   $ python3 dicewaregen-bn.py
# - You'll get an output on terminal with unique words
# NOTE: This script tries its best, but some words from other languages
#   and words with unnecessary suffixes (like -ar, -e, -ay) may get
#   through. You may have to process them by hand.

import re

# Characters and fragments that are replaced by a space before the text
# is split into words.
# No escaping is needed here, because re.escape is applied when the
# pattern is built.
exclude_chars = [
		'.',
		',',
		'!',
		';',
		':',
		'(',
		')',
		'[',
		']',
		'/',
		'।',
		'-এর',
		'সমূহ',
	]


def build_exclude_regex(tokens):
	"""Return a regex alternation that matches any of the given tokens.

	Each token is passed through re.escape, so tokens are matched
	literally. An empty iterable yields an empty string.
	"""
	return '|'.join(re.escape(token) for token in tokens)


def extract_unique_words(text, tokens=None):
	"""Return the sorted list of unique candidate words found in text.

	Every occurrence of a token is replaced by a space, then the text is
	split on whitespace. A word is kept only if it is:
	- more than 3 characters long
	- non-English (its first character is above code point 128)
	- non-numeric (such as years, etc.)

	tokens defaults to the module-level exclude_chars list.
	"""
	if tokens is None:
		tokens = exclude_chars
	pattern = build_exclude_regex(tokens)
	# An empty pattern would match between every character and tear the
	# words apart, so skip the substitution when there is nothing to
	# exclude.
	if pattern:
		text = re.sub(pattern, ' ', text)

	# A set gives constant-time de-duplication; the result is sorted
	# afterwards, so the order of first appearance does not matter.
	found = set()
	for word in text.split():
		if len(word) > 3 \
		and ord(word[0]) > 128 \
		and not word.isnumeric():
			found.add(word)

	# Sort the list so that searching through the list is easier
	return sorted(found)


def main():
	"""Read data.txt and print its unique words, one per line."""
	# Decode explicitly as UTF-8 so that Bangla text is read correctly
	# regardless of the locale's default encoding; the with block closes
	# the file as soon as it has been read.
	with open('data.txt', 'r', encoding='utf-8') as input_file:
		text = input_file.read()

	print('\n'.join(extract_unique_words(text)))


if __name__ == '__main__':
	main()