5 Commits fb8d7dac43 ... a0e1cf554f

Author SHA1 Message Date
  Laura Stewart a0e1cf554f Unnecessary multi-step test. 8 years ago
  Laura Stewart 61cd2d11d4 Changing data object to a pseudo-struct 8 years ago
  Laura Stewart 3e8829ba47 Starting to refactor into class structure which will be necessary for statistical comparisons. Currently awkward. 8 years ago
  Laura Stewart 6c83630c37 Function to return list of frequencies for repeat words 8 years ago
  Laura Stewart 611dad60b8 findRepeatWords also can specify a max number of words returned. 8 years ago
2 changed files with 61 additions and 15 deletions
  1. 45 5
      idiolectalyzer.py
  2. 16 10
      tests/test_idiolectalyzer.py

+ 45 - 5
idiolectalyzer.py

@@ -120,7 +120,7 @@ def findMostUsedWords( mostUsedSample ):
     print "Most used words not yet implimented"
     return
 
-def findRepeatWords ( textSample, minFreq, context='all'):
+def findRepeatWords ( textSample, minFreq, context='all', maxWords='all'):
     def getWordCounts(textSample):
         counts = {}
         for word in sampleWords:
@@ -133,14 +133,34 @@ def findRepeatWords ( textSample, minFreq, context='all'):
             if count < minFreq:
                 del counts[word]
         return counts
-    
+    def trimSize(counts,maxWords):
+        def getKey(item):
+            return item[1]
+        trimmedCounts = counts.items() #convert to list of pairs
+        trimmedCounts = sorted(trimmedCounts, key=getKey, reverse=True)
+        trimmedCounts = trimmedCounts[:maxWords] #cut down to maxWords number of elements
+        return trimmedCounts
     if (context == 'non-function'):
         #return only function words
         pass
+    elif (context =='all'):
+        pass
+    else:
+        print 'Invalid context. Available options: all, non-function'
+        return False
+        
     totalWords = countWordsIn(textSample)
     sampleWords = textSample.split()
     counts = getWordCounts(textSample)
     counts = deleteSingleWords(counts)
+    
+    if (maxWords != 'all'):
+        if (type(maxWords) == int):
+            counts = trimSize(counts,maxWords)
+        else:
+            #should raise error
+            print 'maxWords must be \'all\' or an integer'
+            return False
     return counts
 
 def howCommonIs ( word, context='all_google' ): 
@@ -165,6 +185,16 @@ def howCommonIs ( word, context='all_google' ):
         return 'unique'
     return
 
+def frequencyOfRepeats (counts, context='all_google'):
+    #returns an unordered list of the frequency of word
+    counts = dict(counts)
+    frequencyList = []
+    for word in counts:
+        frequency = howCommonIs(word, context=context)
+        frequencyList.append(frequency)
+        
+    return frequencyList
+
 def findCommonMisspellings ( textSample, req='none' ):
     commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))
     wordList = textSample.split(" ")
@@ -193,13 +223,23 @@ def findCommonMisspellings ( textSample, req='none' ):
             return 'invalid req argument (list|count|none)'
     return
 
+class textData(object):
+    def __init__(self):
+        self.samplewordcount = int(0)
+        self.samplefunctionwordcount = int(0)
+        self.rawlexicaldensity = int(0)
+        self.strippedlexicaldensity = int(0)
+        self.lowercasepercentage = int(0)
+        self.doublespaceratio = float(0)
+        self.unusualspacingratio = float(0)
+        self.linebreakratio = float(0)
+        self.repeatwordcounts = {}
+        self.commonmisspellingscounts = {}
+                   
 if __name__ == '__main__':
     print "Idiolectalyzer 0.1\n"
 
     textSample1 = getTextSample()
     #textSample2 = getTextSample()
-    
-    lexicalDensity1=calculateLexicalDensity( textSample1 )
-    print "Raw lexical density is" , lexicalDensity1 , "%"
 
     print "\nSo it was written"

+ 16 - 10
tests/test_idiolectalyzer.py

@@ -94,7 +94,14 @@ class testTextAnalysis(unittest.TestCase):
         expectedCounts = {'his': 7, 'every': 5, 'like': 4, 'thou': 3, 'however,': 3, 'was': 6, 'he': 6, 'and': 13, 'which': 3, 'a': 6, 'the': 21}
         counts = idiolectalyzer.findRepeatWords(testText,3)
         self.assertEqual(expectedCounts, counts)
-    
+        expectedCounts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6)]
+        counts = idiolectalyzer.findRepeatWords(testText,3,maxWords=5)
+        self.assertEqual(expectedCounts, counts)
+        counts = idiolectalyzer.findRepeatWords(testText,3,maxWords='invalid')
+        self.assertEqual(False,counts)
+        counts = idiolectalyzer.findRepeatWords(testText,3,context='invalid')
+        self.assertEqual(False,counts)
+        
     def testHowCommonIs(self):
         google1965Expectation = 8131
         allgoogleExpectation = 6321
@@ -111,15 +118,8 @@ class testTextAnalysis(unittest.TestCase):
         testTextFile="mockdata/withspellingerrors.txt"
         testText = readTestText(testTextFile)
         spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
-        countedHeigth = spellingErrorsCount['heigth']
-        expectedHeigth = 7
-        countedBecuase = spellingErrorsCount['becuase']
-        expectedBecuase = 4
-        countedEcstacy = spellingErrorsCount['ecstacy']
-        expectedEcstacy = 1
-        self.assertEqual(countedHeigth,expectedHeigth)
-        self.assertEqual(countedBecuase,expectedBecuase)
-        self.assertEqual(countedEcstacy,expectedEcstacy)
+        expectedCounts = {'ecstacy': 1, 'becuase': 4, 'heigth': 7}
+        self.assertEqual(expectedCounts, spellingErrorsCount)
         
         testTextFile="mockdata/251words.txt"
         testText = readTestText(testTextFile)
@@ -127,5 +127,11 @@ class testTextAnalysis(unittest.TestCase):
         expectedResult = None
         self.assertEqual(spellingErrorsCount,expectedResult)
         
+    def testFrequencyOfRepeats(self):
+        counts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6), ('alsidkfjads', 7)]
+        expectedFrequency = [3, 67, 'unique', 1, 30, 53]
+        frequency = idiolectalyzer.frequencyOfRepeats(counts)
+        self.assertEqual(expectedFrequency,frequency)
+      
 if __name__ == '__main__':
     unittest.main()