3 Commits 997311c64a ... 6143e13ebf

Author SHA1 Message Date
  Laura Stewart 6143e13ebf Test additional data stripping 8 years ago
  Laura Stewart 7e81152a3d Data file with common spelling errors. 8 years ago
  Laura Stewart fc87657d9b Refactoring unnecessarily complicated names 8 years ago
3 changed files with 35 additions and 17 deletions
  1. 25 17
      idiolectalyzer.py
  2. 3 0
      tests/mockdata/withspellingerrors.txt
  3. 7 0
      tests/test_idiolectalyzer.py

+ 25 - 17
idiolectalyzer.py

@@ -9,25 +9,28 @@ Idiolectalyzer 0.1
 import enchant
 import re
 import os
-localpath = os.path.dirname(os.path.realpath(__file__))
+localpath = os.path.dirname(os.path.realpath(__file__)) 
+#from idiolectalyzerclasses import *
 
-def countWordsIn( countWordsSample ):
-    a = len( countWordsSample.split() )
+def countWordsIn( textSample ):
+    a = len( textSample.split() )
     return a
 
-def checkWordCount ( checkCountSample ):
+def checkWordCount ( textSample ):
     goodCount = 0
-    sampleWordcount = countWordsIn(checkCountSample)
+    sampleWordcount = countWordsIn(textSample)
     if sampleWordcount > 1000:
         goodCount = 1
     return goodCount
 
-def stripData( stripSample ):
+def stripData( textSample ):
     #Join elements to string if necessary
-    stripSample = ''.join(stripSample)
-    stripSample = stripSample.replace("\n",' ')
-    stripSample = re.sub('  +',' ', stripSample)
-    return stripSample
+    textSample = ''.join(textSample)
+    textSample = textSample.replace("\n",' ')
+    textSample = re.sub('  +',' ', textSample)
+    pattern = re.compile('[\W ]+')
+    toAlphanumeric=pattern.sub(' ', textSample)
+    return toAlphanumeric
 
 def getTextSample():
     done = 0
@@ -54,19 +57,19 @@ def checkStructureMarkers(textSample):
     lineBreakCount = textSample.count('\n')
     return
 
-def countFunctionWords ( functionWordsSample ):
+def countFunctionWords ( textSample ):
     functionWords = enchant.request_pwl_dict((localpath+"/wordLists/englishFunctionWords.txt"))
-    wordList = functionWordsSample.split(" ")
+    wordList = textSample.split(" ")
     functionWordCount = 0
     for word in wordList:
         if functionWords.check(word.strip()):
             functionWordCount +=1
     return functionWordCount
 
-def calculateLexicalDensity( lexicalSample ):
-    functionWordCount = countFunctionWords( lexicalSample )
+def calculateLexicalDensity( textSample ):
+    functionWordCount = countFunctionWords( textSample )
     print "functionwordcount", functionWordCount
-    totalWordCount = countWordsIn( lexicalSample )
+    totalWordCount = countWordsIn( textSample )
     print "totalwordcount", totalWordCount
     rawLexicalDensity = ((totalWordCount-functionWordCount)*100/totalWordCount)
     print "rawlexicaldensity", rawLexicalDensity
@@ -84,8 +87,13 @@ def howCommonIs ( commonIsSample ):
     print "Finding the use rank of words not yet implimented"
     return
 
-def findCommonMispellings ( commonMisspellingsSample ):
-    print "Identifying commonly misspelled words in string not yet implimented"
+def findCommonMisspellings ( textSample ):
+    commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellingsOxford"))
+    wordList = textSample.split(" ")
+    containsCommonMisspelling = 0
+    for word in wordList:
+        if commonMisspellings.check(word.strip()):
+            print word
     return
     
 print "Idiolectalyzer 0.1\n"

+ 3 - 0
tests/mockdata/withspellingerrors.txt

@@ -0,0 +1,3 @@
+I need to get out agression with an assasination attempt on my chauffer. 
+
+That might seem bizzare, but mind your own buisness.

+ 7 - 0
tests/test_idiolectalyzer.py

@@ -32,8 +32,15 @@ class testTextAnalysis(unittest.TestCase):
         
         lineBreaks = testText.count('\n')
         doubleSpaces = len(re.findall('  [^ ]', testText))
+        commas = testText.count('\.')
         self.assertEqual(lineBreaks,0)
         self.assertEqual(doubleSpaces,0)
+        self.assertEqual(commas,0)
+    
+    def testCommonMisspellings(self):
+        testTextFile="mockdata/withspellingerrors.txt"
+        testText = readTestText(testTextFile)
+        idiolectalyzer.findCommonMisspellings(testText)
 
 if __name__ == '__main__':
     unittest.main()