dicewaregen-bn.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. #!/usr/bin/env python3
  2. # Script to find out unique words in a given text.
  3. # Modified to act as a Diceware list generator for Bengali/Bangla.
  4. # Original script is from: https://pythonexamples.org/python-find-unique-words-in-text-file/
  5. # But evolved a lot since then.
  6. # License: MIT (Expat)
  7. # Usage:
  8. # - Keep your Bangla text in data.txt
  9. # - Run this script:
  10. # $ python3 dicewaregen-bn.py
  11. # - You'll get an output on terminal with unique words
  12. # NOTE: This script tries its best but some words from other language
  13. # and words with unnecessary extension (liKe -ar, -e, -ay) may get
  14. # through. You may have to process them by hand.
  15. import re
  16. input_file = open('data.txt', 'r')
  17. text = input_file.read()
  18. # Replace special characters
  19. # No escaping is needed, because re.escape will be run later
  20. exclude_chars = [
  21. '.',
  22. ',',
  23. '!',
  24. ';',
  25. ':',
  26. '(',
  27. ')',
  28. '[',
  29. ']',
  30. '/',
  31. '।',
  32. '-এর',
  33. 'সমূহ',
  34. ]
  35. exclude_regex = ''
  36. for char in exclude_chars:
  37. if len(exclude_regex) > 0:
  38. exclude_regex += '|'
  39. exclude_regex += re.escape(char)
  40. text = re.sub(r'' + exclude_regex, ' ', text)
  41. # Divide the string into a list of words for easier processing
  42. words = text.split()
  43. # Prepare unique words list
  44. unique = []
  45. for word in words:
  46. if word not in unique:
  47. # Words should be:
  48. # - more than 3 chars long
  49. # - non-English
  50. # - non-numeric (such as years, etc.)
  51. if len(word) > 3 \
  52. and ord(word[0]) > 128 \
  53. and not word.isnumeric():
  54. unique.append(word)
  55. # Sort the list so that searching through the list is easier
  56. unique.sort()
  57. # print as list
  58. #print(unique)
  59. # or print words in each line
  60. print('\n'.join(unique))