6 Commits e09d0ba45f ... 48ed5c0f52

Author SHA1 Message Date
  Laura Stewart 48ed5c0f52 HowCommonIs test now functional 8 years ago
  Laura Stewart 5973d9d06e HowCommonIs funct now functional 8 years ago
  Laura Stewart 19b808ee00 Word frequency funct 8 years ago
  Laura Stewart fb87a68666 Allow words with ' 8 years ago
  Laura Stewart 6e0b7b04e1 ngram parser test 8 years ago
  Laura Stewart f6035672a6 Modified mock data to test words with ' 8 years ago

+ 6 - 3
dataparsers/parse_google_ngram.py

@@ -67,10 +67,12 @@ MAX_WORDS = 100000
 MIN_WORD_LENGTH = 1
 
 def unwanted_characters_in_word(word):
-    """Return boolean indicating unwanted characters in the word."""
+    """Return boolean indicating unwanted characters in the word."""  
     for letter in word:
-        if ((letter in string.punctuation) or
-            (letter not in string.letters)):
+        #if ''((letter in string.punctuation) or
+        #    (letter not in string.letters)):
+        if ((letter not in string.letters) and
+            (letter is not r"'")):
             return True
     return False
 
@@ -111,6 +113,7 @@ def build_word_list(input_word_list, word_list_file_name, max_words, min_word_le
     for word in final_word_list:
         out_file.write('{0}\n'.format(word))
     out_file.close()
+    return final_word_list
 
 if __name__ == '__main__':
     usage = "usage: %prog -o word_list.txt input_ngram_file[s]"

+ 23 - 8
idiolectalyzer.py

@@ -10,6 +10,7 @@ import re
 import os
 import collections
 import string
+import fileinput
 localpath = os.path.dirname(os.path.realpath(__file__)) 
 
 def countWordsIn( textSample ):
@@ -123,12 +124,27 @@ def findRepeatwords ( findRepeatsSample ):
     print "Finding repeat words not yet implimented"
     return
 
-def howCommonIs ( word, context='all_google' ):
-    #this will become dependant on context. 
-    frequencyFile = localpath+"/wordLists/frequency_allgooglengrams.txt"
-    print frequencyFile
-    #rankedWordList = enchant.request_pwl_dict()
-    print "Finding the use rank of words not yet implimented"
+def howCommonIs ( word, context='all_google' ): 
+    if (context == 'all_google'):
+        frequencyFile = localpath+"/wordLists/frequency_allgooglengrams.txt"
+    elif (context == 'google_1965'):
+        frequencyFile = localpath+"/wordLists/frequency_googlengramssince1965.txt"
+    else:
+        print "Invalid context. Available options: all_google, google_1965"
+        return False
+    
+    rankedWordList = enchant.request_pwl_dict(frequencyFile)
+    if (rankedWordList.check(word)):
+        wordRank = 0
+        for line in fileinput.input(frequencyFile):
+            currentLine = line.strip()
+            if (word==currentLine):
+                rank=fileinput.filelineno()
+                fileinput.close()
+                return rank
+    else:
+        return 'unique'
+    print "I should not be printed"
     return
 
 def findCommonMisspellings ( textSample, req='none' ):
@@ -158,10 +174,9 @@ def findCommonMisspellings ( textSample, req='none' ):
             #should be raise error
             return 'invalid req argument (list|count|none)'
     return
-    
-print "Idiolectalyzer 0.1\n"
 
 if __name__ == '__main__':
+    print "Idiolectalyzer 0.1\n"
 
     textSample1 = getTextSample()
     #textSample2 = getTextSample()

+ 13 - 0
tests/mockdata/unlikely1gram.txt

@@ -115,6 +115,19 @@ ALBERICO	2005	15	10
 ALBERICO	2006	5	4
 ALBERICO	2007	1	1
 ALBERICO	2008	17	6
+AL'COCK	2001	1	1
+AL'COCK	2002	1	1
+AL'COCK	2003	3	3
+AL'COCK	2004	2	2
+AL'COCK	2005	1	1
+AL'COCK	2006	1	1
+AL'COCK	2007	1	1
+AL'COCK	2008	2	1
+AL'COCK	2009	1	1
+AL'COCK	2010	4	1
+AL'COCK	2011	3	3
+AL'COCK	2012	4	4
+AL'COCK	2013	1	1
 ALCOCK	1706	1	1
 ALCOCK	1740	1	1
 ALCOCK	1752	3	3

+ 10 - 1
tests/test_idiolectalyzer.py

@@ -92,7 +92,16 @@ class testTextAnalysis(unittest.TestCase):
         pass
     
     def testHowCommonIs(self):
-        idiolectalyzer.howCommonIs( "pie" )
+        google1965Expectation = 8128
+        allgoogleExpectation = 6321
+        pieRank = idiolectalyzer.howCommonIs( "pie" )
+        self.assertEqual(pieRank,allgoogleExpectation)
+        pieRank = idiolectalyzer.howCommonIs( "pie",context='google_1965')
+        self.assertEqual(pieRank,google1965Expectation)
+        pieRank = idiolectalyzer.howCommonIs( "pie",context='invalid_list')
+        self.assertFalse(pieRank)
+        pieRank = idiolectalyzer.howCommonIs( "pyropedonecrobestiality")
+        self.assertEqual(pieRank,"unique")
     
     def testCommonMisspellings(self):
         testTextFile="mockdata/withspellingerrors.txt"

+ 6 - 3
tests/test_ngramparser.py

@@ -14,12 +14,15 @@ class testNgramParser(unittest.TestCase):
         generatedFile = "mockdata/frequencyfile.txt"
         startYear = 1980
         minLength = 1
-        maxWords = 3
+        maxWords = 4
+        expectedGeneratedList = ['alcock', 'alberico', 'alkalinizing', "al'cock"]
         word_list = {}
         assert os.path.isfile(ngramFile)
         parse_google_ngram.process_ngram_file(ngramFile, word_list)
         sorted_list = sorted(word_list.iteritems(), key=itemgetter(1), reverse=True)
-        parse_google_ngram.build_word_list(sorted_list, generatedFile, maxWords, minLength)
-        #os.remove(generatedFile)
+        generatedList=parse_google_ngram.build_word_list(sorted_list, generatedFile, maxWords, minLength)
+        assert os.path.isfile(generatedFile)
+        self.assertEqual(generatedList,expectedGeneratedList)
+        os.remove(generatedFile)
 if __name__ == '__main__':
     unittest.main()