idiolectalyzer.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. '''
  2. Created on 18 Apr 2016
  3. @author: LE van Braam-Stewart LoneLocust@gmail.com
  4. Idiolectalyzer 0.1
  5. '''
  6. import enchant
  7. import scipy
  8. import numpy
  9. import re
  10. import os
  11. import collections
  12. import string
  13. import fileinput
  14. localpath = os.path.dirname(os.path.realpath(__file__))
  15. def stripData( textSample, howfar ='alphanumeric'):
  16. #Join elements to string if necessary
  17. textSample = ''.join(textSample)
  18. textSample = textSample.strip()
  19. textSample = textSample.replace("\n",' ')
  20. textSample = re.sub(' +',' ', textSample)
  21. pattern = re.compile('[\W ]+')
  22. toAlphanumeric=pattern.sub(' ', textSample)
  23. toNoSpaces=pattern.sub('', textSample)
  24. if howfar == 'alphanumeric':
  25. return toAlphanumeric
  26. elif howfar == 'nospaces':
  27. return toNoSpaces
  28. return
  29. def checkStructureMarkers(textSample, req='none'):
  30. charCount = len(textSample)
  31. def calculateLowercasePercentage(textSample):
  32. textSample = stripData(textSample, 'nospaces')
  33. charCount = len(textSample)
  34. lowercaseCount = len(filter(lambda z: z in string.lowercase, textSample))
  35. lowercaseCount = float(lowercaseCount)
  36. lowercasePercent = (lowercaseCount*100)/charCount
  37. lowercasePercent = int(lowercasePercent)
  38. return lowercasePercent
  39. def calculateDoubleSpaceRatio(textSample):
  40. doubleSpaceCount = len(re.findall(' [^ ]', textSample))
  41. doubleSpaceCount = float(doubleSpaceCount)
  42. doubleSpacePercent = (doubleSpaceCount*100)/charCount
  43. return doubleSpacePercent
  44. def calculateUnusualSpacingRatio(textSample):
  45. unusualSpacingCount = len(re.findall('[^ ] [^ ]', textSample))
  46. unusualSpacingCount = float(unusualSpacingCount)
  47. unusualSpacingPercent = (unusualSpacingCount*100)/charCount
  48. return unusualSpacingPercent
  49. def calculateLinebreakRatio(textSample):
  50. lineBreakCount = textSample.count('\n')
  51. lineBreakCount = float(lineBreakCount)
  52. lineBreakPercent = (lineBreakCount*100)/charCount
  53. return lineBreakPercent
  54. lowercasePercentage = calculateLowercasePercentage(textSample)
  55. doubleSpaceRatio = calculateDoubleSpaceRatio(textSample)
  56. unusualSpacingRatio = calculateUnusualSpacingRatio(textSample)
  57. lineBreakRatio = calculateLinebreakRatio(textSample)
  58. if req=='lowercase':
  59. return lowercasePercentage
  60. elif req == 'doublespace':
  61. return doubleSpaceRatio
  62. elif req == 'unusualspacing':
  63. return unusualSpacingRatio
  64. elif req == 'linebreak':
  65. return lineBreakRatio
  66. else:
  67. return None
  68. return
  69. def countFunctionWords ( textSample ):
  70. functionWords = enchant.request_pwl_dict((localpath+"/wordLists/englishFunctionWords.txt"))
  71. wordList = textSample.split(" ")
  72. wordList = filter(None, wordList) #remove any empty strings from list. Have made this unecessary in my fork of PyEnchant
  73. functionWordCount = 0
  74. for word in wordList:
  75. if functionWords.check(word):
  76. functionWordCount +=1
  77. return functionWordCount
  78. def calculateLexicalDensity( textSample ):
  79. functionWordCount = countFunctionWords( textSample )
  80. totalWordCount = countWordsIn( textSample )
  81. rawLexicalDensity = ((totalWordCount-functionWordCount)*100/totalWordCount)
  82. return rawLexicalDensity
  83. def findRepeatWords ( textSample, minFreq, context='all', maxWords='all'):
  84. def getWordCounts(textSample):
  85. counts = {}
  86. for word in sampleWords:
  87. if word not in counts:
  88. counts[word] = 0
  89. counts[word] += 1
  90. return counts
  91. def deleteSingleWords(counts):
  92. for word, count in counts.items():
  93. if count < minFreq:
  94. del counts[word]
  95. return counts
  96. def trimSize(counts,maxWords):
  97. def getKey(item):
  98. return item[1]
  99. trimmedCounts = counts.items() #convert to list of pairs
  100. trimmedCounts = sorted(trimmedCounts, key=getKey, reverse=True)
  101. trimmedCounts = trimmedCounts[:maxWords] #cut down to maxWords number of elements
  102. return trimmedCounts
  103. if (context == 'non-function'):
  104. #return only function words
  105. pass
  106. elif (context =='all'):
  107. pass
  108. else:
  109. print 'Invalid context. Available options: all, non-function'
  110. return False
  111. totalWords = countWordsIn(textSample)
  112. sampleWords = textSample.split()
  113. counts = getWordCounts(textSample)
  114. counts = deleteSingleWords(counts)
  115. if (maxWords != 'all'):
  116. if (type(maxWords) == int):
  117. counts = trimSize(counts,maxWords)
  118. else:
  119. #should raise error
  120. print 'maxWords must be \'all\' or an integer'
  121. return False
  122. return counts
  123. def howCommonIs ( word, context='all_google' ):
  124. if (context == 'all_google'):
  125. frequencyFile = localpath+"/wordLists/frequency_allgooglengrams.txt"
  126. elif (context == 'google_1965'):
  127. frequencyFile = localpath+"/wordLists/frequency_googlengramssince1965.txt"
  128. else:
  129. print "Invalid context. Available options: all_google, google_1965"
  130. return False
  131. rankedWordList = enchant.request_pwl_dict(frequencyFile)
  132. if (rankedWordList.check(word)):
  133. wordRank = 0
  134. for line in fileinput.input(frequencyFile):
  135. currentLine = line.strip()
  136. if (word==currentLine):
  137. rank=fileinput.filelineno()
  138. fileinput.close()
  139. return rank
  140. else:
  141. return 'unique'
  142. return
  143. def frequencyOfRepeats (counts, context='all_google'):
  144. #returns an unordered list of the frequency of word
  145. counts = dict(counts)
  146. frequencyList = []
  147. for word in counts:
  148. frequency = howCommonIs(word, context=context)
  149. frequencyList.append(frequency)
  150. return frequencyList
  151. def findCommonMisspellings ( textSample, req='count' ):
  152. commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))
  153. wordList = textSample.split(" ")
  154. commonMisspellingsInSample = []
  155. for word in wordList:
  156. if len(word)>0:
  157. if commonMisspellings.check(word):
  158. commonMisspellingsInSample.append(word)
  159. #only proceed if the list is not empty
  160. if commonMisspellingsInSample == []:
  161. return
  162. else:
  163. commonMisspellingsUsed = list(set(commonMisspellingsInSample)) #converts to ordered
  164. commonMisspellingsUsed.sort()
  165. misspellCounts = collections.Counter(commonMisspellingsInSample)
  166. commonMisspellingsInSample.sort()
  167. misspellCounts = collections.Counter(commonMisspellingsInSample) #is Counter object
  168. misspellCounts = dict(misspellCounts) #convert to regular dict
  169. if req == 'list':
  170. return commonMisspellingsUsed
  171. elif req == 'count':
  172. return misspellCounts
  173. elif req == 'none':
  174. return
  175. else:
  176. #should be raise error
  177. return 'invalid req argument (list|count|none)'
  178. return
  179. class textData(object):
  180. def __init__(self):
  181. self.samplewordcount = int(0)
  182. self.samplefunctionwordcount = int(0)
  183. self.rawlexicaldensity = int(0)
  184. self.strippedlexicaldensity = int(0)
  185. self.lowercasepercentage = int(0)
  186. self.doublespaceratio = float(0)
  187. self.unusualspacingratio = float(0)
  188. self.linebreakratio = float(0)
  189. self.repeatwordcounts = {}
  190. self.commonmisspellingscounts = {}
  191. def fill(self, textSample):
  192. self.strippedSample = stripData(textSample)
  193. self.samplewordcount = countWordsIn(textSample)
  194. self.rawlexicaldesnity = calculateLexicalDensity(textSample)
  195. self.strippedlexicaldensity = calculateLexicalDensity(self.strippedSample)
  196. self.lowercasepercentage = checkStructureMarkers(textSample,'lowercase')
  197. self.doublespacingratio = checkStructureMarkers(textSample,'doublespace')
  198. self.unusualspacingratio = checkStructureMarkers(textSample,'unusualspacing')
  199. self.linebreakratio = checkStructureMarkers(textSample,'linebreak')
  200. self.repeatwordcounts = findRepeatWords(self.strippedSample, 2)
  201. self.commonmisspellingscounts = findCommonMisspellings(self.strippedSample, 'count')