123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
- '''
- Created on 18 Apr 2016
- @author: LE van Braam-Stewart LoneLocust@gmail.com
- Idiolectalyzer 0.1
- '''
- import enchant
- import scipy
- import numpy
- import re
- import os
- import collections
- import string
- import fileinput
# Absolute, symlink-resolved directory that contains this module. The word
# list paths used further down are built by appending to this value.
localpath = os.path.split(os.path.realpath(__file__))[0]
# One or more consecutive non-word characters. Whitespace (spaces, newlines)
# is itself non-word, so a single pass over this pattern also collapses
# repeated spaces and line breaks.
_NON_WORD_RUN = re.compile(r'[\W ]+')


def stripData(textSample, howfar='alphanumeric'):
    """Reduce a text sample to its word characters.

    Args:
        textSample: A string, or an iterable of strings that is concatenated
            (without separator) before processing.
        howfar: ``'alphanumeric'`` replaces every run of non-word characters
            with a single space; ``'nospaces'`` removes those runs entirely.

    Returns:
        The cleaned string, or ``None`` when ``howfar`` is not one of the two
        supported values.

    Note:
        Only surrounding whitespace is stripped up front, so a sample that
        starts or ends with punctuation keeps a leading/trailing space in
        ``'alphanumeric'`` mode.
    """
    # Join elements to string if necessary
    text = ''.join(textSample).strip()

    if howfar == 'alphanumeric':
        replacement = ' '
    elif howfar == 'nospaces':
        replacement = ''
    else:
        # Unknown mode: callers have always received None here.
        return None
    return _NON_WORD_RUN.sub(replacement, text)
# Exactly two consecutive spaces (not part of a longer run) followed by a
# non-space character. The count is spelled with a quantifier so the pattern
# cannot be silently altered by tools that collapse repeated whitespace.
_DOUBLE_SPACE = re.compile(r'(?<! ) {2}(?=[^ ])')
# Three or more consecutive spaces sitting between two non-space characters.
_WIDE_GAP = re.compile(r'(?<=[^ ]) {3,}(?=[^ ])')
_LOWERCASE_LETTERS = frozenset(string.ascii_lowercase)


def checkStructureMarkers(textSample, req='none'):
    """Return one layout statistic of ``textSample``, selected by ``req``.

    Only the requested statistic is computed.

    Args:
        textSample: The raw text to measure (a string).
        req: Which statistic to return:
            ``'lowercase'`` - percentage (truncated to ``int``) of lowercase
            ASCII letters among the characters left by
            ``stripData(textSample, 'nospaces')``;
            ``'doublespace'`` - occurrences of exactly two spaces followed by
            a non-space, as a percentage of ``len(textSample)``;
            ``'unusualspacing'`` - occurrences of three or more spaces between
            two non-space characters, as a percentage of ``len(textSample)``;
            ``'linebreak'`` - newline characters as a percentage of
            ``len(textSample)``.

    Returns:
        An ``int`` for ``'lowercase'``, a ``float`` for the other statistics,
        and ``None`` for any other ``req`` (including the default
        ``'none'``). A sample with nothing to measure yields ``0`` / ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    charCount = len(textSample)

    def percentOf(count, total):
        # An empty denominator means there is nothing to measure.
        if not total:
            return 0.0
        return (float(count) * 100) / total

    def calculateLowercasePercentage():
        # The denominator is the stripped text, not the raw sample, so
        # punctuation and whitespace do not dilute the percentage.
        letters = stripData(textSample, 'nospaces')
        lowercaseCount = sum(1 for ch in letters if ch in _LOWERCASE_LETTERS)
        return int(percentOf(lowercaseCount, len(letters)))

    def calculateDoubleSpaceRatio():
        return percentOf(len(_DOUBLE_SPACE.findall(textSample)), charCount)

    def calculateUnusualSpacingRatio():
        return percentOf(len(_WIDE_GAP.findall(textSample)), charCount)

    def calculateLinebreakRatio():
        return percentOf(textSample.count('\n'), charCount)

    calculators = {
        'lowercase': calculateLowercasePercentage,
        'doublespace': calculateDoubleSpaceRatio,
        'unusualspacing': calculateUnusualSpacingRatio,
        'linebreak': calculateLinebreakRatio,
    }
    calculator = calculators.get(req)
    if calculator is None:
        return None
    return calculator()
def countFunctionWords(textSample):
    """Count the words in ``textSample`` accepted by the function-word list.

    The sample is split on single spaces and each non-empty piece is looked
    up in the personal word list loaded from
    ``wordLists/englishFunctionWords.txt`` under ``localpath``.

    Args:
        textSample: Space-separated text (a string).

    Returns:
        The number of pieces for which the word list's ``check`` is truthy.
    """
    functionWords = enchant.request_pwl_dict((localpath+"/wordLists/englishFunctionWords.txt"))
    # Splitting on " " produces '' for every repeated space; those are dropped
    # before the lookup (the original author notes their PyEnchant fork makes
    # this unnecessary).
    tokens = [token for token in textSample.split(" ") if token]
    return sum(1 for token in tokens if functionWords.check(token))
def calculateLexicalDensity(textSample):
    """Return the percentage of words in ``textSample`` that are not function words.

    Computed as ``(total - functionWords) * 100 // total`` using
    ``countWordsIn`` for the total and ``countFunctionWords`` for the
    function-word count.

    Args:
        textSample: The text to measure (a string).

    Returns:
        The floored percentage, or ``0`` when the sample contains no words
        (previously a ``ZeroDivisionError``).
    """
    totalWordCount = countWordsIn(textSample)
    if not totalWordCount:
        # Nothing to measure; also avoids loading the word list needlessly.
        return 0
    functionWordCount = countFunctionWords(textSample)
    # Floor division keeps the truncated result the original integer
    # arithmetic produced, independent of the interpreter's `/` semantics.
    # NOTE(review): assumes countWordsIn returns an int - confirm.
    return (totalWordCount - functionWordCount) * 100 // totalWordCount
def findRepeatWords(textSample, minFreq, context='all', maxWords='all'):
    """Count the words of ``textSample`` that occur at least ``minFreq`` times.

    Args:
        textSample: Text to analyse; it is split on any whitespace.
        minFreq: Minimum number of occurrences for a word to be kept.
        context: ``'all'`` or ``'non-function'``.
        maxWords: ``'all'`` to keep every qualifying word, or an integer to
            keep only that many of the most frequent ones.

    Returns:
        * ``dict`` mapping word -> count when ``maxWords == 'all'``;
        * ``list`` of ``(word, count)`` pairs sorted by descending count and
          sliced to ``maxWords`` entries when ``maxWords`` is an integer;
        * ``False`` (after printing a message) when ``context`` or
          ``maxWords`` is invalid.
    """
    if context not in ('all', 'non-function'):
        print('Invalid context. Available options: all, non-function')
        return False
    # NOTE(review): 'non-function' is accepted but no filtering is
    # implemented; both contexts currently count every word.

    wantsAll = maxWords == 'all'
    # bool is excluded explicitly because it is a subclass of int and was
    # rejected by the original exact-type check.
    isCount = isinstance(maxWords, int) and not isinstance(maxWords, bool)
    if not (wantsAll or isCount):
        #should raise error
        print('maxWords must be \'all\' or an integer')
        return False

    occurrences = collections.Counter(textSample.split())
    # Build a new dict instead of deleting from one while iterating over it.
    counts = {word: count for word, count in occurrences.items()
              if count >= minFreq}
    if wantsAll:
        return counts

    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:maxWords]
def howCommonIs(word, context='all_google'):
    """Look up the position of ``word`` in one of the bundled frequency lists.

    Args:
        word: The word to look up.
        context: ``'all_google'`` or ``'google_1965'``; selects which file
            under ``wordLists/`` is consulted.

    Returns:
        * the 1-based line number of the line that equals ``word`` after
          stripping surrounding whitespace (presumably a smaller number means
          a more common word - confirm against the word list files);
        * ``'unique'`` when the word list's ``check`` rejects the word;
        * ``None`` when ``check`` accepts the word but no line matches it
          exactly;
        * ``False`` (after printing a message) for an unknown ``context``.
    """
    if context == 'all_google':
        frequencyFile = localpath+"/wordLists/frequency_allgooglengrams.txt"
    elif context == 'google_1965':
        frequencyFile = localpath+"/wordLists/frequency_googlengramssince1965.txt"
    else:
        print("Invalid context. Available options: all_google, google_1965")
        return False

    rankedWordList = enchant.request_pwl_dict(frequencyFile)
    if not rankedWordList.check(word):
        return 'unique'

    # A plain file handle keeps this lookup independent of fileinput's
    # module-wide state, so it cannot clash with another active
    # fileinput.input() and is always closed on exit.
    with open(frequencyFile) as frequencyLines:
        for rank, line in enumerate(frequencyLines, 1):
            if line.strip() == word:
                return rank
    return None
def frequencyOfRepeats(counts, context='all_google'):
    """Return the ``howCommonIs`` result for every word in ``counts``.

    Args:
        counts: Anything ``dict()`` accepts - a word -> count mapping or a
            sequence of ``(word, count)`` pairs.
        context: Passed through to ``howCommonIs`` unchanged.

    Returns:
        A list with one ``howCommonIs`` result per distinct word, in the
        iteration order of the resulting dict (not sorted).
    """
    return [howCommonIs(word, context=context) for word in dict(counts)]
def findCommonMisspellings(textSample, req='count'):
    """Find the words of ``textSample`` listed in the common-misspellings list.

    The sample is split on single spaces; every non-empty piece accepted by
    the personal word list loaded from ``wordLists/commonMisspellings.txt``
    counts as a hit.

    Args:
        textSample: Space-separated text (a string).
        req: Shape of the result - ``'list'``, ``'count'`` or ``'none'``.

    Returns:
        * ``None`` when no hit is found (whatever ``req`` is) or when
          ``req == 'none'``;
        * a sorted list of the distinct hits for ``req == 'list'``;
        * a ``dict`` mapping each hit to its number of occurrences for
          ``req == 'count'``;
        * the string ``'invalid req argument (list|count|none)'`` for any
          other ``req`` when at least one hit exists.
    """
    commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))
    found = [word for word in textSample.split(" ")
             if word and commonMisspellings.check(word)]

    #only proceed if the list is not empty
    if not found:
        return None

    if req == 'list':
        return sorted(set(found))
    if req == 'count':
        # Counting the sorted hits once keeps the keys in sorted insertion
        # order, as the original double pass did.
        return dict(collections.Counter(sorted(found)))
    if req == 'none':
        return None
    #should be raise error
    return 'invalid req argument (list|count|none)'
def _legacyAlias(attributeName):
    """Build a read/write property that forwards to ``attributeName``."""
    def getter(self):
        return getattr(self, attributeName)

    def setter(self, value):
        setattr(self, attributeName, value)

    return property(getter, setter)


class textData(object):
    """Container for the statistics computed from one text sample.

    A new instance holds zero/empty values; ``fill`` replaces them with the
    measurements of a concrete sample.
    """

    # fill() used to store two results under misspelled names, leaving the
    # attributes declared in __init__ untouched. The misspelled names are
    # kept as aliases so code that read them keeps working.
    rawlexicaldesnity = _legacyAlias('rawlexicaldensity')
    doublespacingratio = _legacyAlias('doublespaceratio')

    def __init__(self):
        self.strippedSample = ''
        self.samplewordcount = 0
        # NOTE(review): declared but never populated by fill().
        self.samplefunctionwordcount = 0
        self.rawlexicaldensity = 0
        self.strippedlexicaldensity = 0
        self.lowercasepercentage = 0
        self.doublespaceratio = 0.0
        self.unusualspacingratio = 0.0
        self.linebreakratio = 0.0
        self.repeatwordcounts = {}
        self.commonmisspellingscounts = {}

    def fill(self, textSample):
        """Compute every statistic for ``textSample`` and store it on self.

        Word counts, raw lexical density and the layout ratios are measured
        on the sample as given; stripped lexical density, repeated words
        (minimum frequency 2) and common misspellings are measured on the
        ``stripData`` version of the sample.

        Args:
            textSample: The raw text to analyse.
        """
        self.strippedSample = stripData(textSample)
        self.samplewordcount = countWordsIn(textSample)
        self.rawlexicaldensity = calculateLexicalDensity(textSample)
        self.strippedlexicaldensity = calculateLexicalDensity(self.strippedSample)
        self.lowercasepercentage = checkStructureMarkers(textSample, 'lowercase')
        self.doublespaceratio = checkStructureMarkers(textSample, 'doublespace')
        self.unusualspacingratio = checkStructureMarkers(textSample, 'unusualspacing')
        self.linebreakratio = checkStructureMarkers(textSample, 'linebreak')
        self.repeatwordcounts = findRepeatWords(self.strippedSample, 2)
        # The lookup may yield None when nothing is found; keep the
        # attribute a dict, as declared in __init__.
        self.commonmisspellingscounts = (
            findCommonMisspellings(self.strippedSample, 'count') or {})
-
|