SHA1
--- a/.gitignore
+++ b/.gitignore
@@ -33,4 +33,5 @@ nosetests.xml
 
				 # Mr Developer
			
 
				 .mr.developer.cfg
			
 
				 .project
			
 
				-.pydevproject
			
 
				+.pydevprojectconvenience scripts
			
 
				+Abortions
			
--- a/idiolectalyzer.py
+++ b/idiolectalyzer.py
@@ -8,8 +8,9 @@ Idiolectalyzer 0.1
 
				 import enchant
			
 
				 import re
			
 
				 import os
			
 
				+import collections
			
 
				+import string
			
 
				 localpath = os.path.dirname(os.path.realpath(__file__)) 
			
 
				-#from idiolectalyzerclasses import *
			
 
				 
			
 
				 def countWordsIn( textSample ):
			
 
				     a = len( textSample.split() )
			
@@ -22,14 +23,20 @@ def checkWordCount ( textSample ):
 
				         goodCount = 1
			
 
				     return goodCount
			
 
				 
			
 
				-def stripData( textSample ):
			
 
				+def stripData( textSample, howfar ='alphanumeric'):
			
 
				     #Join elements to string if necessary
			
 
				     textSample = ''.join(textSample)
			
 
				+    textSample = textSample.strip()
			
 
				     textSample = textSample.replace("\n",' ')
			
 
				     textSample = re.sub('  +',' ', textSample)
			
 
				     pattern = re.compile('[\W ]+')
			
 
				     toAlphanumeric=pattern.sub(' ', textSample)
			
 
				-    return toAlphanumeric
			
 
				+    toNoSpaces=pattern.sub('', textSample)
			
 
				+    if howfar == 'alphanumeric':
			
 
				+        return toAlphanumeric
			
 
				+    elif howfar == 'nospaces':
			
 
				+        return toNoSpaces
			
 
				+    return 
			
 
				 
			
 
				 def getTextSample():
			
 
				     done = 0
			
@@ -50,10 +57,47 @@ def getTextSample():
 
				         sample = stripData(sample)
			
 
				     return sample
			
 
				 
			
 
				-def checkStructureMarkers(textSample):
			
 
				-    doubleSpaceCount = len(re.findall('  [^ ]', textSample))
			
 
				-    unusualSpacingCount = len(re.findall('[^ ]  [^ ]', textSample))
			
 
				-    lineBreakCount = textSample.count('\n')
			
 
				+def checkStructureMarkers(textSample, req='none'):
			
 
				+    charCount = len(textSample)
			
 
				+    def calculateLowercasePercentage(textSample):
			
 
				+        textSample = stripData(textSample, 'nospaces')
			
 
				+        charCount = len(textSample)
			
 
				+        lowercaseCount = len(filter(lambda z: z in string.lowercase, textSample))
			
 
				+        lowercaseCount = float(lowercaseCount)
			
 
				+        lowercasePercent = (lowercaseCount*100)/charCount
			
 
				+        lowercasePercent = int(lowercasePercent)
			
 
				+        return lowercasePercent
			
 
				+    def calculateDoubleSpaceRatio(textSample):
			
 
				+        doubleSpaceCount = len(re.findall('  [^ ]', textSample))
			
 
				+        doubleSpaceCount = float(doubleSpaceCount)
			
 
				+        doubleSpacePercent = (doubleSpaceCount*100)/charCount
			
 
				+        return doubleSpacePercent
			
 
				+    def calculateUnusualSpacingRatio(textSample):
			
 
				+        unusualSpacingCount = len(re.findall('[^ ]  [^ ]', textSample))
			
 
				+        unusualSpacingCount = float(unusualSpacingCount)
			
 
				+        unusualSpacingPercent = (unusualSpacingCount*100)/charCount
			
 
				+        return unusualSpacingPercent
			
 
				+    def calculateLinebreakRatio(textSample):
			
 
				+        lineBreakCount = textSample.count('\n')
			
 
				+        lineBreakCount = float(lineBreakCount)
			
 
				+        lineBreakPercent = (lineBreakCount*100)/charCount
			
 
				+        return lineBreakPercent
			
 
				+    
			
 
				+    lowercasePercentage = calculateLowercasePercentage(textSample)    
			
 
				+    doubleSpaceRatio = calculateDoubleSpaceRatio(textSample)
			
 
				+    unusualSpacingRatio = calculateUnusualSpacingRatio(textSample)
			
 
				+    lineBreakRatio = calculateLinebreakRatio(textSample)
			
 
				+    
			
 
				+    if req=='lowercase':
			
 
				+        return lowercasePercentage
			
 
				+    elif req == 'doublespace':
			
 
				+        return doubleSpaceRatio
			
 
				+    elif req == 'unusualspacing':
			
 
				+        return unusualSpacingRatio
			
 
				+    elif req == 'linebreak':
			
 
				+        return lineBreakRatio
			
 
				+    else:
			
 
				+        return None
			
 
				     return
			
 
				 
			
 
				 def countFunctionWords ( textSample ):
			
@@ -61,7 +105,7 @@ def countFunctionWords ( textSample ):
 
				     wordList = textSample.split(" ")
			
 
				     functionWordCount = 0
			
 
				     for word in wordList:
			
 
				-        if functionWords.check(word.strip()):
			
 
				+        if functionWords.check(word):
			
 
				             functionWordCount +=1
			
 
				     return functionWordCount
			
 
				 
			
@@ -86,13 +130,32 @@ def howCommonIs ( commonIsSample ):
 
				     print "Finding the use rank of words not yet implimented"
			
 
				     return
			
 
				 
			
 
				-def findCommonMisspellings ( textSample ):
			
 
				+def findCommonMisspellings ( textSample, req='none' ):
			
 
				     commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))
			
 
				     wordList = textSample.split(" ")
			
 
				-    containsCommonMisspelling = 0
			
 
				+    commonMisspellingsInSample = []
			
 
				     for word in wordList:
			
 
				-        if commonMisspellings.check(word.strip()):
			
 
				-            print word
			
 
				+        if commonMisspellings.check(word):
			
 
				+            commonMisspellingsInSample.append(word)
			
 
				+    #only proceed if the list is not empty
			
 
				+    if commonMisspellingsInSample == []:
			
 
				+        return
			
 
				+    else:
			
 
				+        commonMisspellingsUsed = list(set(commonMisspellingsInSample)) #converts to ordered
			
 
				+        commonMisspellingsUsed.sort()
			
 
				+        misspellCounts = collections.Counter(commonMisspellingsInSample)
			
 
				+        commonMisspellingsInSample.sort()
			
 
				+        misspellCounts = collections.Counter(commonMisspellingsInSample) #is Counter object
			
 
				+        misspellCounts = dict(misspellCounts) #convert to regular dict
			
 
				+        if req == 'list':
			
 
				+            return commonMisspellingsUsed
			
 
				+        elif req == 'count':
			
 
				+            return misspellCounts
			
 
				+        elif req == 'none':
			
 
				+            return
			
 
				+        else:
			
 
				+            #should be raise error
			
 
				+            return 'invalid req argument (list|count|none)'
			
 
				     return
			
 
				     
			
 
				 print "Idiolectalyzer 0.1\n"
			
@@ -104,5 +167,7 @@ if __name__ == '__main__':
 
				     
			
 
				     lexicalDensity1=calculateLexicalDensity( textSample1 )
			
 
				     print "Raw lexical density is" , lexicalDensity1 , "%"
			
 
				+    
			
 
				+    checkStructureMarkers(textSample1)
			
 
				 
			
 
				     print "\nSo it was written"
			
--- a/tests/mockdata/26213words.txt
+++ b/tests/mockdata/26213words.txt
--- a/tests/mockdata/lotsofpunctuation.txt
+++ b/tests/mockdata/lotsofpunctuation.txt
@@ -0,0 +1,3 @@
 
				+@(*!#&!@The  old man withdrew,  but came back 
			
 
				+
			
 
				+immediately and     offered Zarathustra bread and wine. "A bad country for the hungry," said he; "that is why I live here. Animal and man come unto me, the anchorite. But bid thy companion eat and drink also, he is wearier than thou." Zarathustra answered: "My companion is dead; I shall hardly be able to persuade him to eat." "That doth not concern me," said the old man sullenly; "he that knocketh at my door must take what I offer him. Eat, and fare ye well!"—::
			
--- a/tests/mockdata/withspellingerrors.txt
+++ b/tests/mockdata/withspellingerrors.txt
--- a/tests/test_idiolectalyzer.py
+++ b/tests/test_idiolectalyzer.py
@@ -3,6 +3,7 @@ import sys
 
				 import re
			
 
				 sys.path.append("..")
			
 
				 import idiolectalyzer
			
 
				+import collections
			
 
				 
			
 
				 def readTestText(testTextFile):
			
 
				     with open(testTextFile,'r') as testFile:
			
@@ -25,7 +26,13 @@ class testTextAnalysis(unittest.TestCase):
 
				         testText = readTestText(testTextFile)
			
 
				         countedCount=idiolectalyzer.countWordsIn(testText)
			
 
				         self.assertEqual(countedCount,expectedCount)
			
 
				-    
			
 
				+        
			
 
				+        testTextFile="mockdata/26213words.txt"
			
 
				+        expectedCount = 26213
			
 
				+        testText = readTestText(testTextFile)
			
 
				+        countedCount=idiolectalyzer.countWordsIn(testText)
			
 
				+        self.assertEqual(countedCount,expectedCount)
			
 
				+        
			
 
				     def testCountFunctionWords(self):
			
 
				         testTextFile="mockdata/251words.txt"
			
 
				         expectedCount = 114
			
@@ -53,7 +60,38 @@ class testTextAnalysis(unittest.TestCase):
 
				     def testCommonMisspellings(self):
			
 
				         testTextFile="mockdata/withspellingerrors.txt"
			
 
				         testText = readTestText(testTextFile)
			
 
				-        idiolectalyzer.findCommonMisspellings(testText)
			
 
				-
			
 
				+        spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
			
 
				+        countedHeigth = spellingErrorsCount['heigth']
			
 
				+        expectedHeigth = 7
			
 
				+        countedBecuase = spellingErrorsCount['becuase']
			
 
				+        expectedBecuase = 4
			
 
				+        countedEcstacy = spellingErrorsCount['ecstacy']
			
 
				+        expectedEcstacy = 1
			
 
				+        self.assertEqual(countedHeigth,expectedHeigth)
			
 
				+        self.assertEqual(countedBecuase,expectedBecuase)
			
 
				+        self.assertEqual(countedEcstacy,expectedEcstacy)
			
 
				+        
			
 
				+        testTextFile="mockdata/251words.txt"
			
 
				+        testText = readTestText(testTextFile)
			
 
				+        spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
			
 
				+        expectedResult = None
			
 
				+        self.assertEqual(spellingErrorsCount,expectedResult)
			
 
				+    
			
 
				+    def testStructureMarkers(self):
			
 
				+        testTextFile="mockdata/lotsofpunctuation.txt"
			
 
				+        testText = readTestText(testTextFile)
			
 
				+        lowercase = idiolectalyzer.checkStructureMarkers(testText,'lowercase')
			
 
				+        self.assertEqual(lowercase,96)
			
 
				+        doublespace = idiolectalyzer.checkStructureMarkers(testText,'doublespace')
			
 
				+        self.assertLess(doublespace,.59)
			
 
				+        self.assertGreater(doublespace,.58)
			
 
				+        unusualcount = idiolectalyzer.checkStructureMarkers(testText,'unusualspacing')
			
 
				+        self.assertLess(unusualcount,.39)
			
 
				+        self.assertGreater(unusualcount,.38)
			
 
				+        linebreak = idiolectalyzer.checkStructureMarkers(testText,'linebreak')
			
 
				+        self.assertLess(linebreak,.39)
			
 
				+        self.assertGreater(linebreak,.38)
			
 
				+        #multiple assertions are to avoid performing additional functions in order to test what are returned now as floats
			
 
				+        
			
 
				 if __name__ == '__main__':
			
 
				     unittest.main()
Tekijä	SHA1 Viesti	Päivämäärä
Laura Stewart	2323bef183 Break structure markers into giving useful information	9 vuotta sitten
Laura Stewart	19ad43e781 Common misspellings test case for string without common misspellings.	9 vuotta sitten
Laura Stewart	bfb72fb6cb gitignore additions	9 vuotta sitten
Laura Stewart	35efe61120 Common misspellings function improve	9 vuotta sitten
Laura Stewart	640da2f97c Added test with much longer string.	9 vuotta sitten
Laura Stewart	7b8a445f2b changed spelling error file to pre-formatted as common misspelling counter does expects pre-formatted.	9 vuotta sitten
Laura Stewart	4f91901fc0 Need long file for testing after finding unexpected errors with long strings.	9 vuotta sitten
Laura Stewart	0379fb68e5 Removing inline stripping; should be done in strip function.	9 vuotta sitten
Laura Stewart	dafa2d7a0c mock data odd punctuation	9 vuotta sitten