9 Commitit e1d92c82fa ... 2323bef183

Tekijä SHA1 Viesti Päivämäärä
  Laura Stewart 2323bef183 Break structure markers into giving useful information 9 vuotta sitten
  Laura Stewart 19ad43e781 Common misspellings test case for string without common misspellings. 9 vuotta sitten
  Laura Stewart bfb72fb6cb gitignore additions 9 vuotta sitten
  Laura Stewart 35efe61120 Common misspellings function improve 9 vuotta sitten
  Laura Stewart 640da2f97c Added test with much longer string. 9 vuotta sitten
  Laura Stewart 7b8a445f2b changed spelling error file to pre-formatted as common misspelling counter does expects pre-formatted. 9 vuotta sitten
  Laura Stewart 4f91901fc0 Need long file for testing after finding unexpected errors with long strings. 9 vuotta sitten
  Laura Stewart 0379fb68e5 Removing inline stripping; should be done in strip function. 9 vuotta sitten
  Laura Stewart dafa2d7a0c mock data odd punctuation 9 vuotta sitten

+ 2 - 1
.gitignore

@@ -33,4 +33,5 @@ nosetests.xml
 # Mr Developer
 .mr.developer.cfg
 .project
-.pydevproject
+.pydevprojectconvenience scripts
+Abortions

+ 77 - 12
idiolectalyzer.py

@@ -8,8 +8,9 @@ Idiolectalyzer 0.1
 import enchant
 import re
 import os
+import collections
+import string
 localpath = os.path.dirname(os.path.realpath(__file__)) 
-#from idiolectalyzerclasses import *
 
 def countWordsIn( textSample ):
     a = len( textSample.split() )
@@ -22,14 +23,20 @@ def checkWordCount ( textSample ):
         goodCount = 1
     return goodCount
 
-def stripData( textSample ):
+def stripData( textSample, howfar ='alphanumeric'):
     #Join elements to string if necessary
     textSample = ''.join(textSample)
+    textSample = textSample.strip()
     textSample = textSample.replace("\n",' ')
     textSample = re.sub('  +',' ', textSample)
     pattern = re.compile('[\W ]+')
     toAlphanumeric=pattern.sub(' ', textSample)
-    return toAlphanumeric
+    toNoSpaces=pattern.sub('', textSample)
+    if howfar == 'alphanumeric':
+        return toAlphanumeric
+    elif howfar == 'nospaces':
+        return toNoSpaces
+    return 
 
 def getTextSample():
     done = 0
@@ -50,10 +57,47 @@ def getTextSample():
         sample = stripData(sample)
     return sample
 
-def checkStructureMarkers(textSample):
-    doubleSpaceCount = len(re.findall('  [^ ]', textSample))
-    unusualSpacingCount = len(re.findall('[^ ]  [^ ]', textSample))
-    lineBreakCount = textSample.count('\n')
+def checkStructureMarkers(textSample, req='none'):
+    charCount = len(textSample)
+    def calculateLowercasePercentage(textSample):
+        textSample = stripData(textSample, 'nospaces')
+        charCount = len(textSample)
+        lowercaseCount = len(filter(lambda z: z in string.lowercase, textSample))
+        lowercaseCount = float(lowercaseCount)
+        lowercasePercent = (lowercaseCount*100)/charCount
+        lowercasePercent = int(lowercasePercent)
+        return lowercasePercent
+    def calculateDoubleSpaceRatio(textSample):
+        doubleSpaceCount = len(re.findall('  [^ ]', textSample))
+        doubleSpaceCount = float(doubleSpaceCount)
+        doubleSpacePercent = (doubleSpaceCount*100)/charCount
+        return doubleSpacePercent
+    def calculateUnusualSpacingRatio(textSample):
+        unusualSpacingCount = len(re.findall('[^ ]  [^ ]', textSample))
+        unusualSpacingCount = float(unusualSpacingCount)
+        unusualSpacingPercent = (unusualSpacingCount*100)/charCount
+        return unusualSpacingPercent
+    def calculateLinebreakRatio(textSample):
+        lineBreakCount = textSample.count('\n')
+        lineBreakCount = float(lineBreakCount)
+        lineBreakPercent = (lineBreakCount*100)/charCount
+        return lineBreakPercent
+    
+    lowercasePercentage = calculateLowercasePercentage(textSample)    
+    doubleSpaceRatio = calculateDoubleSpaceRatio(textSample)
+    unusualSpacingRatio = calculateUnusualSpacingRatio(textSample)
+    lineBreakRatio = calculateLinebreakRatio(textSample)
+    
+    if req=='lowercase':
+        return lowercasePercentage
+    elif req == 'doublespace':
+        return doubleSpaceRatio
+    elif req == 'unusualspacing':
+        return unusualSpacingRatio
+    elif req == 'linebreak':
+        return lineBreakRatio
+    else:
+        return None
     return
 
 def countFunctionWords ( textSample ):
@@ -61,7 +105,7 @@ def countFunctionWords ( textSample ):
     wordList = textSample.split(" ")
     functionWordCount = 0
     for word in wordList:
-        if functionWords.check(word.strip()):
+        if functionWords.check(word):
             functionWordCount +=1
     return functionWordCount
 
@@ -86,13 +130,32 @@ def howCommonIs ( commonIsSample ):
     print "Finding the use rank of words not yet implimented"
     return
 
-def findCommonMisspellings ( textSample ):
+def findCommonMisspellings ( textSample, req='none' ):
     commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))
     wordList = textSample.split(" ")
-    containsCommonMisspelling = 0
+    commonMisspellingsInSample = []
     for word in wordList:
-        if commonMisspellings.check(word.strip()):
-            print word
+        if commonMisspellings.check(word):
+            commonMisspellingsInSample.append(word)
+    #only proceed if the list is not empty
+    if commonMisspellingsInSample == []:
+        return
+    else:
+        commonMisspellingsUsed = list(set(commonMisspellingsInSample)) #converts to ordered
+        commonMisspellingsUsed.sort()
+        misspellCounts = collections.Counter(commonMisspellingsInSample)
+        commonMisspellingsInSample.sort()
+        misspellCounts = collections.Counter(commonMisspellingsInSample) #is Counter object
+        misspellCounts = dict(misspellCounts) #convert to regular dict
+        if req == 'list':
+            return commonMisspellingsUsed
+        elif req == 'count':
+            return misspellCounts
+        elif req == 'none':
+            return
+        else:
+            #should be raise error
+            return 'invalid req argument (list|count|none)'
     return
     
 print "Idiolectalyzer 0.1\n"
@@ -104,5 +167,7 @@ if __name__ == '__main__':
     
     lexicalDensity1=calculateLexicalDensity( textSample1 )
     print "Raw lexical density is" , lexicalDensity1 , "%"
+    
+    checkStructureMarkers(textSample1)
 
     print "\nSo it was written"

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 1293 - 0
tests/mockdata/26213words.txt


+ 3 - 0
tests/mockdata/lotsofpunctuation.txt

@@ -0,0 +1,3 @@
+@(*!#&!@The  old man withdrew,  but came back 
+
+immediately and     offered Zarathustra bread and wine. "A bad country for the hungry," said he; "that is why I live here. Animal and man come unto me, the anchorite. But bid thy companion eat and drink also, he is wearier than thou." Zarathustra answered: "My companion is dead; I shall hardly be able to persuade him to eat." "That doth not concern me," said the old man sullenly; "he that knocketh at my door must take what I offer him. Eat, and fare ye well!"—::

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 1 - 3
tests/mockdata/withspellingerrors.txt


+ 41 - 3
tests/test_idiolectalyzer.py

@@ -3,6 +3,7 @@ import sys
 import re
 sys.path.append("..")
 import idiolectalyzer
+import collections
 
 def readTestText(testTextFile):
     with open(testTextFile,'r') as testFile:
@@ -25,7 +26,13 @@ class testTextAnalysis(unittest.TestCase):
         testText = readTestText(testTextFile)
         countedCount=idiolectalyzer.countWordsIn(testText)
         self.assertEqual(countedCount,expectedCount)
-    
+        
+        testTextFile="mockdata/26213words.txt"
+        expectedCount = 26213
+        testText = readTestText(testTextFile)
+        countedCount=idiolectalyzer.countWordsIn(testText)
+        self.assertEqual(countedCount,expectedCount)
+        
     def testCountFunctionWords(self):
         testTextFile="mockdata/251words.txt"
         expectedCount = 114
@@ -53,7 +60,38 @@ class testTextAnalysis(unittest.TestCase):
     def testCommonMisspellings(self):
         testTextFile="mockdata/withspellingerrors.txt"
         testText = readTestText(testTextFile)
-        idiolectalyzer.findCommonMisspellings(testText)
-
+        spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
+        countedHeigth = spellingErrorsCount['heigth']
+        expectedHeigth = 7
+        countedBecuase = spellingErrorsCount['becuase']
+        expectedBecuase = 4
+        countedEcstacy = spellingErrorsCount['ecstacy']
+        expectedEcstacy = 1
+        self.assertEqual(countedHeigth,expectedHeigth)
+        self.assertEqual(countedBecuase,expectedBecuase)
+        self.assertEqual(countedEcstacy,expectedEcstacy)
+        
+        testTextFile="mockdata/251words.txt"
+        testText = readTestText(testTextFile)
+        spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
+        expectedResult = None
+        self.assertEqual(spellingErrorsCount,expectedResult)
+    
+    def testStructureMarkers(self):
+        testTextFile="mockdata/lotsofpunctuation.txt"
+        testText = readTestText(testTextFile)
+        lowercase = idiolectalyzer.checkStructureMarkers(testText,'lowercase')
+        self.assertEqual(lowercase,96)
+        doublespace = idiolectalyzer.checkStructureMarkers(testText,'doublespace')
+        self.assertLess(doublespace,.59)
+        self.assertGreater(doublespace,.58)
+        unusualcount = idiolectalyzer.checkStructureMarkers(testText,'unusualspacing')
+        self.assertLess(unusualcount,.39)
+        self.assertGreater(unusualcount,.38)
+        linebreak = idiolectalyzer.checkStructureMarkers(testText,'linebreak')
+        self.assertLess(linebreak,.39)
+        self.assertGreater(linebreak,.38)
+        #multiple assertions are to avoid performing additional functions in order to test what are returned now as floats
+        
 if __name__ == '__main__':
     unittest.main()