|
@@ -8,8 +8,9 @@ Idiolectalyzer 0.1
|
|
|
import enchant
|
|
|
import re
|
|
|
import os
|
|
|
+import collections
|
|
|
+import string
|
|
|
localpath = os.path.dirname(os.path.realpath(__file__))
|
|
|
-#from idiolectalyzerclasses import *
|
|
|
|
|
|
def countWordsIn( textSample ):
|
|
|
a = len( textSample.split() )
|
|
@@ -22,14 +23,20 @@ def checkWordCount ( textSample ):
|
|
|
goodCount = 1
|
|
|
return goodCount
|
|
|
|
|
|
def stripData(textSample, howfar='alphanumeric'):
    """Reduce a text sample to its word characters.

    textSample -- a string, or an iterable of strings (joined before cleaning).
    howfar     -- 'alphanumeric' (default): every run of non-word characters
                  (punctuation, spaces, line breaks) becomes one space;
                  'nospaces': those runs are removed altogether.

    Returns the cleaned string with no leading or trailing space, or None
    when howfar is not one of the two recognised modes.
    """
    #Join elements to string if necessary
    textSample = ''.join(textSample)

    if howfar == 'alphanumeric':
        separator = ' '
    elif howfar == 'nospaces':
        separator = ''
    else:
        # Unrecognised mode: keep the historical "return nothing" contract.
        return None

    # \W already matches spaces and newlines, so a single substitution does
    # the work of the former replace("\n") / collapse-spaces / sub sequence.
    cleaned = re.sub(r'\W+', separator, textSample)

    # Punctuation at either end would otherwise leave a stray space, which
    # becomes an empty word as soon as a caller splits the result on ' '.
    return cleaned.strip()
|
|
|
|
|
|
def getTextSample():
|
|
|
done = 0
|
|
@@ -50,10 +57,47 @@ def getTextSample():
|
|
|
sample = stripData(sample)
|
|
|
return sample
|
|
|
|
|
|
def checkStructureMarkers(textSample, req='none'):
    """Measure one structural marker of a raw (unstripped) text sample.

    req selects the metric to return:
      'lowercase'      -- int percentage of lowercase ASCII letters among the
                          characters kept by stripData(textSample, 'nospaces')
      'doublespace'    -- matches of the double-space pattern per 100
                          characters of textSample (float)
      'unusualspacing' -- matches of the unusual-spacing pattern per 100
                          characters of textSample (float)
      'linebreak'      -- newline characters per 100 characters (float)
    Any other value, including the default 'none', returns None.

    Only the requested metric is computed. An empty sample (or, for
    'lowercase', a sample with no word characters) yields 0 instead of
    raising ZeroDivisionError.
    """
    charCount = len(textSample)

    def percentOf(count, total):
        # Guard the division so empty input reports 0.0 rather than crashing.
        if not total:
            return 0.0
        return (float(count) * 100) / total

    def calculateLowercasePercentage():
        wordChars = stripData(textSample, 'nospaces')
        # sum() over a generator works on both Python 2 and 3, unlike
        # len(filter(...)); ascii_lowercase exists on both, string.lowercase
        # only on Python 2.
        lowercaseCount = sum(1 for ch in wordChars
                             if ch in string.ascii_lowercase)
        return int(percentOf(lowercaseCount, len(wordChars)))

    def calculateDoubleSpaceRatio():
        # NOTE(review): as written this pattern matches ONE space followed by
        # a non-space. A true double-space check needs two literal spaces --
        # confirm the pattern did not lose whitespace somewhere.
        doubleSpaceCount = len(re.findall(' [^ ]', textSample))
        return percentOf(doubleSpaceCount, charCount)

    def calculateUnusualSpacingRatio():
        # NOTE(review): as written this matches a single space between two
        # non-space characters; confirm the intended run length.
        unusualSpacingCount = len(re.findall('[^ ] [^ ]', textSample))
        return percentOf(unusualSpacingCount, charCount)

    def calculateLinebreakRatio():
        return percentOf(textSample.count('\n'), charCount)

    calculators = {
        'lowercase': calculateLowercasePercentage,
        'doublespace': calculateDoubleSpaceRatio,
        'unusualspacing': calculateUnusualSpacingRatio,
        'linebreak': calculateLinebreakRatio,
    }
    calculator = calculators.get(req)
    if calculator is None:
        return None
    return calculator()
|
|
|
|
|
|
def countFunctionWords ( textSample ):
|
|
@@ -61,7 +105,7 @@ def countFunctionWords ( textSample ):
|
|
|
wordList = textSample.split(" ")
|
|
|
functionWordCount = 0
|
|
|
for word in wordList:
|
|
|
- if functionWords.check(word.strip()):
|
|
|
+ if functionWords.check(word):
|
|
|
functionWordCount +=1
|
|
|
return functionWordCount
|
|
|
|
|
@@ -86,13 +130,32 @@ def howCommonIs ( commonIsSample ):
|
|
|
print "Finding the use rank of words not yet implimented"
|
|
|
return
|
|
|
|
|
|
def findCommonMisspellings(textSample, req='none'):
    """Report which words of textSample appear in the common-misspellings list.

    The list is loaded as a pyenchant personal word list from
    wordLists/commonMisspellings.txt next to this module.

    req -- 'list':  sorted list of the distinct misspellings found
           'count': dict mapping each misspelling found to its occurrences
           'none':  (default) nothing is returned

    Returns None when no listed misspelling occurs in the sample (whatever
    req is). For an unrecognised req with at least one hit, the historical
    error string is returned rather than an exception being raised.
    """
    commonMisspellings = enchant.request_pwl_dict((localpath+"/wordLists/commonMisspellings.txt"))

    # split() without an argument never yields empty tokens, whereas
    # split(" ") does for doubled/leading/trailing spaces -- and pyenchant's
    # check() rejects an empty string (ValueError).
    commonMisspellingsInSample = [word for word in textSample.split()
                                  if commonMisspellings.check(word)]

    #only proceed if the list is not empty
    if not commonMisspellingsInSample:
        return None

    if req == 'list':
        return sorted(set(commonMisspellingsInSample))
    if req == 'count':
        # Plain dict rather than a Counter, as callers have always received.
        return dict(collections.Counter(commonMisspellingsInSample))
    if req == 'none':
        return None
    #should be raise error
    return 'invalid req argument (list|count|none)'
|
|
|
|
|
|
print "Idiolectalyzer 0.1\n"
|
|
@@ -104,5 +167,7 @@ if __name__ == '__main__':
|
|
|
|
|
|
lexicalDensity1=calculateLexicalDensity( textSample1 )
|
|
|
print "Raw lexical density is" , lexicalDensity1 , "%"
|
|
|
+
|
|
|
+ checkStructureMarkers(textSample1)
|
|
|
|
|
|
print "\nSo it was written"
|