# lonelocust/Idiolectalyzer
'''
Created on 18 Apr 2016

@author: LE van Braam-Stewart LoneLocust@gmail.com

Idiolectalyzer 0.1
'''
import enchant
import scipy
import numpy
import re
import os
import collections
import string
import fileinput
# Directory containing this module; the wordLists/ data files used below are
# located relative to it.
localpath = os.path.dirname(os.path.realpath(__file__)) 

def stripData(textSample, howfar='alphanumeric'):
    '''
    Normalise a text sample by collapsing whitespace and removing punctuation.

    textSample -- a string, or an iterable of strings which is joined (with
                  no separator) before processing.
    howfar     -- 'alphanumeric' (default): every run of non-word characters
                  and/or spaces is replaced by a single space;
                  'nospaces': those runs are removed entirely.

    Returns the normalised string, or None when howfar is not one of the two
    recognised options. Underscores are word characters for the regex and are
    therefore kept. The 'alphanumeric' result is not re-stripped, so trailing
    punctuation leaves a trailing space.
    '''
    # Join elements to string if necessary
    textSample = ''.join(textSample).strip()
    textSample = textSample.replace("\n", ' ')
    textSample = re.sub('  +', ' ', textSample)

    if howfar == 'alphanumeric':
        replacement = ' '
    elif howfar == 'nospaces':
        replacement = ''
    else:
        return None
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    return re.sub(r'[\W ]+', replacement, textSample)

def checkStructureMarkers(textSample, req='none'):
    '''
    Measure one structural (formatting) marker of a text sample.

    textSample -- the raw sample as a single string.
    req        -- which marker to compute:
        'lowercase'      percentage, truncated to int, of the characters left
                         by stripData(textSample, 'nospaces') that are ASCII
                         lowercase letters
        'doublespace'    non-overlapping occurrences of two spaces followed
                         by a non-space, per 100 characters of textSample
        'unusualspacing' non-overlapping occurrences of exactly two spaces
                         between two non-space characters, per 100 characters
        'linebreak'      newline characters per 100 characters

    Returns an int for 'lowercase', a float for the other markers, and None
    for any other req (including the default 'none'). Only the requested
    marker is computed. A sample with no characters yields 0 rather than
    raising ZeroDivisionError.
    '''
    def percentOf(count, total):
        # Nothing to measure in an empty sample; avoid dividing by zero.
        if total == 0:
            return 0.0
        return (float(count) * 100) / total

    if req == 'lowercase':
        # This marker is measured against the punctuation-free sample, not
        # against the raw character count used by the other markers.
        stripped = stripData(textSample, 'nospaces')
        lowercaseCount = sum(1 for char in stripped
                             if char in string.ascii_lowercase)
        return int(percentOf(lowercaseCount, len(stripped)))

    charCount = len(textSample)
    if req == 'doublespace':
        return percentOf(len(re.findall('  [^ ]', textSample)), charCount)
    if req == 'unusualspacing':
        return percentOf(len(re.findall('[^ ]  [^ ]', textSample)), charCount)
    if req == 'linebreak':
        return percentOf(textSample.count('\n'), charCount)
    return None

def countFunctionWords ( textSample ):
    '''
    Count the words of textSample that appear in the English function-word list.

    The sample is split on single spaces and each non-empty piece is checked
    against the personal word list loaded from
    wordLists/englishFunctionWords.txt (relative to localpath).

    Returns the number of pieces the word list accepts.
    '''
    wordListPath = localpath+"/wordLists/englishFunctionWords.txt"
    functionWordDict = enchant.request_pwl_dict(wordListPath)
    # split(" ") yields empty strings for consecutive spaces; drop them
    # before they reach the dictionary check.
    tokens = [token for token in textSample.split(" ") if token]
    return sum(1 for token in tokens if functionWordDict.check(token))

def calculateLexicalDensity( textSample ):
    '''
    Return the percentage of words in textSample that are not function words.

    Computed as (total - function) * 100 // total from the counts reported by
    countWordsIn() and countFunctionWords(). Floor division is spelled out so
    the result is the same truncated value on every interpreter version
    (an int when both counts are ints).

    Returns 0 for a sample with no words instead of raising ZeroDivisionError.
    '''
    functionWordCount = countFunctionWords( textSample )
    totalWordCount = countWordsIn( textSample )
    if not totalWordCount:
        return 0
    return (totalWordCount - functionWordCount) * 100 // totalWordCount

def findRepeatWords ( textSample, minFreq, context='all', maxWords='all'):
    '''
    Find the words that occur at least minFreq times in textSample.

    textSample -- string; split on any whitespace to obtain the words.
    minFreq    -- minimum number of occurrences for a word to be reported.
    context    -- 'all' or 'non-function'. Both are accepted, but no
                  function-word filtering is applied yet, so they currently
                  give the same result.
    maxWords   -- 'all' (default) or an int limiting the result to that many
                  of the most frequent words.

    Returns:
        a dict {word: count} when maxWords is 'all';
        a list of (word, count) pairs sorted by descending count and cut to
        maxWords entries when maxWords is an int;
        False (after printing a message) for an invalid context or maxWords.
    '''
    if context not in ('all', 'non-function'):
        print('Invalid context. Available options: all, non-function')
        return False

    wordCounts = collections.Counter(textSample.split())
    # Build a new dict rather than deleting from the one being iterated.
    repeated = {word: count for word, count in wordCounts.items()
                if count >= minFreq}

    if maxWords == 'all':
        return repeated
    # bool is a subclass of int but is not a meaningful limit here.
    if isinstance(maxWords, bool) or not isinstance(maxWords, int):
        #should raise error
        print('maxWords must be \'all\' or an integer')
        return False
    ranked = sorted(repeated.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:maxWords]

def howCommonIs ( word, context='all_google' ):
    '''
    Look up the position of a word in one of the bundled frequency lists.

    word    -- the word to look up.
    context -- which list to use:
        'all_google'  -> wordLists/frequency_allgooglengrams.txt
        'google_1965' -> wordLists/frequency_googlengramssince1965.txt

    Returns:
        the 1-based number of the first line of the list that equals word
        once surrounding whitespace is stripped (presumably its frequency
        rank -- verify against the ordering of the data files);
        'unique' when the enchant dictionary built from the list does not
        accept the word;
        None when the dictionary accepts the word but no line matches it
        exactly;
        False (after printing a message) for an unknown context.
    '''
    if (context == 'all_google'):
        frequencyFile = localpath+"/wordLists/frequency_allgooglengrams.txt"
    elif (context == 'google_1965'):
        frequencyFile = localpath+"/wordLists/frequency_googlengramssince1965.txt"
    else:
        print("Invalid context. Available options: all_google, google_1965")
        return False

    rankedWordList = enchant.request_pwl_dict(frequencyFile)
    if not rankedWordList.check(word):
        return 'unique'

    # Read the file directly instead of through the module-global fileinput
    # state, so a caller that is itself iterating fileinput.input() is not
    # disturbed and the file is closed on every exit path.
    with open(frequencyFile) as rankedFile:
        for rank, line in enumerate(rankedFile, 1):
            if line.strip() == word:
                return rank
    return None

def frequencyOfRepeats (counts, context='all_google'):
    '''
    Look up how common each repeated word is.

    counts  -- anything dict() accepts (a mapping, or an iterable of
               (word, count) pairs); only the words (keys) are used.
    context -- passed through to howCommonIs().

    Returns a list with one howCommonIs() result per distinct word, in the
    key order of the dict built from counts; the words themselves are not
    included in the result.
    '''
    distinctWords = dict(counts)
    return [howCommonIs(word, context=context) for word in distinctWords]

def findCommonMisspellings ( textSample, req='count' ):
    '''
    Find the words of textSample that appear in the common-misspellings list.

    The sample is split on single spaces and every non-empty piece is checked
    against the personal word list loaded from
    wordLists/commonMisspellings.txt (relative to localpath).

    req selects the result when at least one misspelling was found:
        'list'  -- sorted list of the distinct misspellings
        'count' -- dict {misspelling: number of occurrences} (default)
        'none'  -- None
        other   -- the string 'invalid req argument (list|count|none)'

    Returns None, whatever req is, when no misspelling was found.
    '''
    misspellingDict = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))
    hits = [token for token in textSample.split(" ")
            if len(token) > 0 and misspellingDict.check(token)]
    #only proceed if the list is not empty
    if not hits:
        return None

    # Sorted before counting so the counts dict is built in sorted order.
    hits.sort()
    if req == 'list':
        return sorted(set(hits))
    if req == 'count':
        return dict(collections.Counter(hits))
    if req == 'none':
        return None
    #should be raise error
    return 'invalid req argument (list|count|none)'

class textData(object):
    '''
    Container for the measurements taken from one text sample.

    All attributes start at zero/empty values and are populated by fill().
    '''
    def __init__(self):
        self.strippedSample = ''
        self.samplewordcount = 0
        self.samplefunctionwordcount = 0
        self.rawlexicaldensity = 0
        self.strippedlexicaldensity = 0
        self.lowercasepercentage = 0
        self.doublespaceratio = 0.0
        self.unusualspacingratio = 0.0
        self.linebreakratio = 0.0
        self.repeatwordcounts = {}
        self.commonmisspellingscounts = {}

    def fill(self, textSample):
        '''
        Compute every measurement for textSample and store it on the instance.

        Word counts, raw lexical density and the structure markers use the
        sample as given; stripped lexical density, repeat words and common
        misspellings use the stripData() version of it.
        '''
        self.strippedSample = stripData(textSample)
        self.samplewordcount = countWordsIn(textSample)
        self.samplefunctionwordcount = countFunctionWords(textSample)
        self.rawlexicaldensity = calculateLexicalDensity(textSample)
        self.strippedlexicaldensity = calculateLexicalDensity(self.strippedSample)
        self.lowercasepercentage = checkStructureMarkers(textSample,'lowercase')
        self.doublespaceratio = checkStructureMarkers(textSample,'doublespace')
        self.unusualspacingratio = checkStructureMarkers(textSample,'unusualspacing')
        self.linebreakratio = checkStructureMarkers(textSample,'linebreak')
        # Words occurring at least twice in the stripped sample.
        self.repeatwordcounts = findRepeatWords(self.strippedSample, 2)
        # NOTE: findCommonMisspellings returns None (not {}) when nothing is found.
        self.commonmisspellingscounts = findCommonMisspellings(self.strippedSample, 'count')

        # Earlier versions stored these two results under misspelled attribute
        # names; keep them in sync for any caller still reading the old names.
        self.rawlexicaldesnity = self.rawlexicaldensity
        self.doublespacingratio = self.doublespaceratio