SHA1
--- a/dataparsers/parse_google_ngram.py
+++ b/dataparsers/parse_google_ngram.py
@@ -67,10 +67,12 @@ MAX_WORDS = 100000
 
				 MIN_WORD_LENGTH = 1
			
 
				 
			
 
				 def unwanted_characters_in_word(word):
				     """Return True if the word contains an unwanted character.
				 
				     A character is unwanted when it is neither an ASCII letter nor an
				     apostrophe, so a word such as "al'cock" is accepted while digits,
				     whitespace and any other punctuation cause the word to be rejected.
				 
				     Args:
				         word: The string to inspect.
				 
				     Returns:
				         True if at least one character is disallowed, otherwise False
				         (an empty string therefore yields False).
				     """
				     # string.ascii_letters is used instead of string.letters: the latter is
				     # locale-dependent and only exists in Python 2.
				     # The apostrophe is matched by value; the previous ``is not`` test
				     # compared object identity and only appeared to work because CPython
				     # happens to cache one-character strings.
				     allowed_characters = string.ascii_letters + "'"
				     return any(letter not in allowed_characters for letter in word)
			
 
				 
			
@@ -111,6 +113,7 @@ def build_word_list(input_word_list, word_list_file_name, max_words, min_word_le
 
				     for word in final_word_list:
			
 
				         out_file.write('{0}\n'.format(word))
			
 
				     out_file.close()
			
 
				+    return final_word_list
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     usage = "usage: %prog -o word_list.txt input_ngram_file[s]"
			
--- a/idiolectalyzer.py
+++ b/idiolectalyzer.py
@@ -10,6 +10,7 @@ import re
 
				 import os
			
 
				 import collections
			
 
				 import string
			
 
				+import fileinput
			
 
				 localpath = os.path.dirname(os.path.realpath(__file__)) 
			
 
				 
			
 
				 def countWordsIn( textSample ):
			
@@ -123,12 +124,27 @@ def findRepeatwords ( findRepeatsSample ):
 
				     print "Finding repeat words not yet implimented"
			
 
				     return
			
 
				 
			
 
				def howCommonIs ( word, context='all_google' ):
				    """Return the rank of a word in one of the bundled frequency lists.
				
				    The rank is the 1-based number of the line holding the word in the
				    frequency file selected by ``context``.
				
				    Args:
				        word: The word to look up.
				        context: Which list to consult, 'all_google' (default) or
				            'google_1965'.
				
				    Returns:
				        The line number of the word in the selected file, the string
				        'unique' when the word is not in the list, or False when
				        ``context`` is not one of the available options.
				    """
				    if (context == 'all_google'):
				        frequencyFile = localpath+"/wordLists/frequency_allgooglengrams.txt"
				    elif (context == 'google_1965'):
				        frequencyFile = localpath+"/wordLists/frequency_googlengramssince1965.txt"
				    else:
				        print("Invalid context. Available options: all_google, google_1965")
				        return False
				    # The word list doubles as a personal word list dictionary so that
				    # unknown words can be rejected before the file is scanned.
				    rankedWordList = enchant.request_pwl_dict(frequencyFile)
				    if not rankedWordList.check(word):
				        return 'unique'
				    # A plain file handle replaces the module-global fileinput state, so
				    # the file is always closed and a concurrent fileinput user elsewhere
				    # cannot collide with this scan.
				    with open(frequencyFile) as rankedWords:
				        for rank, line in enumerate(rankedWords, 1):
				            if line.strip() == word:
				                return rank
				    # NOTE(review): the dictionary accepted the word but no line matches it
				    # exactly; this used to print a debug message and return None. The word
				    # has no rank in the file, so it is reported like any unlisted word.
				    return 'unique'
			
 
				 
			
 
				 def findCommonMisspellings ( textSample, req='none' ):
			
@@ -158,10 +174,9 @@ def findCommonMisspellings ( textSample, req='none' ):
 
				             #should be raise error
			
 
				             return 'invalid req argument (list|count|none)'
			
 
				     return
			
 
				-    
			
 
				-print "Idiolectalyzer 0.1\n"
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				+    print "Idiolectalyzer 0.1\n"
			
 
				 
			
 
				     textSample1 = getTextSample()
			
 
				     #textSample2 = getTextSample()
			
--- a/tests/mockdata/unlikely1gram.txt
+++ b/tests/mockdata/unlikely1gram.txt
@@ -115,6 +115,19 @@ ALBERICO	2005	15	10
 
				 ALBERICO	2006	5	4
			
 
				 ALBERICO	2007	1	1
			
 
				 ALBERICO	2008	17	6
			
 
				+AL'COCK	2001	1	1
			
 
				+AL'COCK	2002	1	1
			
 
				+AL'COCK	2003	3	3
			
 
				+AL'COCK	2004	2	2
			
 
				+AL'COCK	2005	1	1
			
 
				+AL'COCK	2006	1	1
			
 
				+AL'COCK	2007	1	1
			
 
				+AL'COCK	2008	2	1
			
 
				+AL'COCK	2009	1	1
			
 
				+AL'COCK	2010	4	1
			
 
				+AL'COCK	2011	3	3
			
 
				+AL'COCK	2012	4	4
			
 
				+AL'COCK	2013	1	1
			
 
				 ALCOCK	1706	1	1
			
 
				 ALCOCK	1740	1	1
			
 
				 ALCOCK	1752	3	3
			
--- a/tests/test_idiolectalyzer.py
+++ b/tests/test_idiolectalyzer.py
@@ -92,7 +92,16 @@ class testTextAnalysis(unittest.TestCase):
 
				         pass
			
 
				     
			
 
				     def testHowCommonIs(self):
				         """Check howCommonIs for both contexts, a bad context and an unlisted word."""
				         # Expected ranks of "pie" in each frequency list.
				         expectedAllGoogleRank = 6321
				         expectedSince1965Rank = 8128
				         # Default context is the complete Google n-gram list.
				         defaultContextRank = idiolectalyzer.howCommonIs( "pie" )
				         self.assertEqual(defaultContextRank,expectedAllGoogleRank)
				         since1965Rank = idiolectalyzer.howCommonIs( "pie",context='google_1965')
				         self.assertEqual(since1965Rank,expectedSince1965Rank)
				         # An unknown context must produce a falsy result.
				         invalidContextResult = idiolectalyzer.howCommonIs( "pie",context='invalid_list')
				         self.assertFalse(invalidContextResult)
				         # A word missing from the list is reported as "unique".
				         unlistedWordResult = idiolectalyzer.howCommonIs( "pyropedonecrobestiality")
				         self.assertEqual(unlistedWordResult,"unique")
			
 
				     
			
 
				     def testCommonMisspellings(self):
			
 
				         testTextFile="mockdata/withspellingerrors.txt"
			
--- a/tests/test_ngramparser.py
+++ b/tests/test_ngramparser.py
@@ -14,12 +14,15 @@ class testNgramParser(unittest.TestCase):
 
				         generatedFile = "mockdata/frequencyfile.txt"
			
 
				         startYear = 1980
			
 
				         minLength = 1
			
 
				-        maxWords = 3
			
 
				+        maxWords = 4
			
 
				+        expectedGeneratedList = ['alcock', 'alberico', 'alkalinizing', "al'cock"]
			
 
				         word_list = {}
			
 
				         assert os.path.isfile(ngramFile)
			
 
				         parse_google_ngram.process_ngram_file(ngramFile, word_list)
			
 
				         sorted_list = sorted(word_list.iteritems(), key=itemgetter(1), reverse=True)
			
 
				-        parse_google_ngram.build_word_list(sorted_list, generatedFile, maxWords, minLength)
			
 
				-        #os.remove(generatedFile)
			
 
				+        generatedList=parse_google_ngram.build_word_list(sorted_list, generatedFile, maxWords, minLength)
			
 
				+        assert os.path.isfile(generatedFile)
			
 
				+        self.assertEqual(generatedList,expectedGeneratedList)
			
 
				+        os.remove(generatedFile)
			
 
				 if __name__ == '__main__':
			
 
				     unittest.main()
Author	SHA1 Message	Date
Laura Stewart	48ed5c0f52 HowCommonIs test now functional	8 years ago
Laura Stewart	5973d9d06e HowCommonIs funct now functional	8 years ago
Laura Stewart	19b808ee00 Word frequency funct	8 years ago
Laura Stewart	fb87a68666 Allow words with '	8 years ago
Laura Stewart	6e0b7b04e1 ngram parser test	8 years ago
Laura Stewart	f6035672a6 Modified mock data to test words with '	8 years ago