|
@@ -9,25 +9,28 @@ Idiolectalyzer 0.1
|
|
|
import enchant
|
|
|
import re
|
|
|
import os
|
|
|
-localpath = os.path.dirname(os.path.realpath(__file__))
|
|
|
+localpath = os.path.dirname(os.path.realpath(__file__))
|
|
|
+#from idiolectalyzerclasses import *
|
|
|
|
|
|
def countWordsIn( textSample ):
    """Return the number of whitespace-separated words in textSample.

    textSample is a string. Words are whatever str.split() with no
    argument yields, so any run of spaces, tabs or newlines counts as a
    single separator and an empty or all-whitespace sample has 0 words.
    """
    return len( textSample.split() )
|
|
|
|
|
|
def checkWordCount ( textSample, threshold=1000 ):
    """Report whether textSample is long enough to analyse.

    textSample is a string; its words are counted with countWordsIn().
    threshold is the word count the sample must exceed (strictly greater
    than); it defaults to 1000, the value that used to be hard-coded.

    Returns the integer 1 when the sample has more than threshold words
    and 0 otherwise (an int flag rather than a bool, as callers may
    compare against 1).
    """
    sampleWordcount = countWordsIn(textSample)
    if sampleWordcount > threshold:
        return 1
    return 0
|
|
|
|
|
|
def stripData( textSample ):
    """Flatten a text sample into space-separated alphanumeric words.

    textSample may be a single string or an iterable of strings (for
    example a list of lines); the pieces are concatenated with no
    separator before cleaning.

    Every run of non-word characters (anything other than letters, digits
    and underscore -- so punctuation, newlines and repeated spaces alike)
    is replaced by one space. Leading and trailing runs are reduced to a
    single space as well; they are not removed.

    Returns the cleaned string.
    """
    #Join elements to string if necessary
    textSample = ''.join(textSample)
    # \W already matches spaces and newlines, so a single pass replaces the
    # separate newline-replace and space-collapse steps with the same result.
    # Raw string: '\W' in a plain literal is an invalid escape on Python 3.
    return re.sub(r'\W+', ' ', textSample)
|
|
|
|
|
|
def getTextSample():
|
|
|
done = 0
|
|
@@ -54,19 +57,19 @@ def checkStructureMarkers(textSample):
|
|
|
lineBreakCount = textSample.count('\n')
|
|
|
return
|
|
|
|
|
|
def countFunctionWords ( textSample ):
    """Count the words of textSample that are in the function-word list.

    textSample is a string. Each whitespace-separated word is looked up in
    the personal word list wordLists/englishFunctionWords.txt, located
    next to this module (see localpath), via pyenchant.

    Returns the number of words for which the lookup succeeds; repeated
    words are counted every time they occur.
    """
    wordListPath = os.path.join(localpath, "wordLists", "englishFunctionWords.txt")
    functionWords = enchant.request_pwl_dict(wordListPath)
    # split() with no argument never yields empty tokens. Splitting on a
    # literal " " does (leading/trailing/double spaces), and enchant's
    # check() raises ValueError when handed an empty string.
    return sum(1 for word in textSample.split() if functionWords.check(word))
|
|
|
|
|
|
-def calculateLexicalDensity( lexicalSample ):
|
|
|
- functionWordCount = countFunctionWords( lexicalSample )
|
|
|
+def calculateLexicalDensity( textSample ):
|
|
|
+ functionWordCount = countFunctionWords( textSample )
|
|
|
print "functionwordcount", functionWordCount
|
|
|
- totalWordCount = countWordsIn( lexicalSample )
|
|
|
+ totalWordCount = countWordsIn( textSample )
|
|
|
print "totalwordcount", totalWordCount
|
|
|
rawLexicalDensity = ((totalWordCount-functionWordCount)*100/totalWordCount)
|
|
|
print "rawlexicaldensity", rawLexicalDensity
|
|
@@ -84,8 +87,13 @@ def howCommonIs ( commonIsSample ):
|
|
|
print "Finding the use rank of words not yet implimented"
|
|
|
return
|
|
|
|
|
|
def findCommonMisspellings ( textSample ):
    """Print every word of textSample found in the common-misspellings list.

    textSample is a string. Each whitespace-separated word is looked up in
    the personal word list wordLists/commonMisspellingsOxford, located
    next to this module (see localpath), via pyenchant. Matching words are
    printed one per line, in order of appearance and including repeats.

    Returns None; the matches are only reported on standard output.
    """
    wordListPath = os.path.join(localpath, "wordLists", "commonMisspellingsOxford")
    commonMisspellings = enchant.request_pwl_dict(wordListPath)
    # split() with no argument never yields empty tokens. Splitting on a
    # literal " " does (leading/trailing/double spaces), and enchant's
    # check() raises ValueError when handed an empty string.
    for word in textSample.split():
        if commonMisspellings.check(word):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(word)
    return
|
|
|
|
|
|
# Program banner. The parenthesised single-argument form prints the same
# text on Python 2 and also parses on Python 3.
print("Idiolectalyzer 0.1\n")
|