1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- #!/usr/bin/env python3
- # Script to find out unique words in a given text.
- # Modified to act as a Diceware list generator for Bengali/Bangla.
- # Original script is from: https://pythonexamples.org/python-find-unique-words-in-text-file/
- # But evolved a lot since then.
- # License: MIT (Expat)
- # Usage:
- # - Keep your Bangla text in data.txt
- # - Run this script:
- # $ python3 dicewaregen-bn.py
- # - You'll get an output on terminal with unique words
- # NOTE: This script tries its best, but some words from other languages
- # and words with unnecessary suffixes (like -ar, -e, -ay) may get
- # through. You may have to process them by hand.
import re

# Substrings that are replaced by a space before the text is split into
# words: ASCII punctuation, the Bangla danda, and two Bangla word endings.
# No escaping is needed here, because re.escape is applied when the
# pattern is built.
exclude_chars = [
    '.',
    ',',
    '!',
    ';',
    ':',
    '(',
    ')',
    '[',
    ']',
    '/',
    '।',
    '-এর',
    'সমূহ',
]


def extract_unique_words(text, excluded=None):
    """Return the sorted list of unique candidate words found in *text*.

    Every occurrence of an entry of *excluded* (default: the module-level
    ``exclude_chars``) is replaced by a space, then the text is split on
    whitespace. A word is kept only when it is:

    - more than 3 characters long,
    - starting with a non-ASCII character (code point above 128), which
      filters out English words,
    - not purely numeric (years etc.), in any script.

    Args:
        text: The raw input text.
        excluded: Optional iterable of literal substrings to blank out.
            An empty iterable skips the substitution step.

    Returns:
        A list of distinct words in ascending code-point order.
    """
    if excluded is None:
        excluded = exclude_chars

    # Build one alternation of literal (escaped) substrings. The guard
    # avoids an empty pattern, which would match between every character.
    pattern = '|'.join(re.escape(item) for item in excluded)
    if pattern:
        text = re.sub(pattern, ' ', text)

    # A set makes de-duplication O(1) per word instead of scanning a list.
    kept = {
        word
        for word in text.split()
        if len(word) > 3
        and ord(word[0]) > 128
        and not word.isnumeric()
    }

    # Sort the list so that searching through the output is easier.
    return sorted(kept)


def main():
    """Read ``data.txt`` and print its unique words, one per line."""
    # Explicit UTF-8: the default locale encoding is not guaranteed to be
    # able to decode Bangla text. The context manager closes the file.
    with open('data.txt', 'r', encoding='utf-8') as input_file:
        text = input_file.read()

    print('\n'.join(extract_unique_words(text)))


if __name__ == '__main__':
    main()
|