botify-hq · oleiade · May 31, 2012 · Jun 8, 2012 · Jun 8, 2012 · Jun 8, 2012
diff --git a/README.markdown b/README.markdown
@@ -55,8 +55,8 @@ Spell Checking
 Some parts of code from http://norvig.com/spell-correct.html
 
 ```
-from twitter_spelling import Correct
-c = Correct(settings_file_location)
+from twitter_spelling.correct import Correct
+c = Correct(settings_file_location, namespace='en')
 c.correct('my expression')
 ```
 

diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(
-    name = "Twitter Spell Checking",
+    name = "twitter-spell-checking",
     version = "0.1",
     packages = find_packages(),
     include_package_data = True,
@@ -12,7 +12,10 @@
     license = "Apache",
     keywords = "twitter license licenser open-source",
     url = "http://github.com/sem-io/python-twitter-spell-checking",
-    install_requires = ['python-twitter'],
+    install_requires = [
+        'python-twitter',
+        'ujson'
+    ],
 
     # Setting up executable/main functions links
     entry_points = {

diff --git a/twitter_spelling/__init__.pyc b/twitter_spelling/__init__.pyc
diff --git a/twitter_spelling/cli.py b/twitter_spelling/cli.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
+
 from datetime import datetime
 import sys
 import argparse
 import os
-import json
+import ujson as json
 import ConfigParser
 
 import twitter
@@ -14,6 +15,7 @@
 # list of tuples object into a dictionary
 items_to_dict = lambda items : {k:v for k,v in items}
 
+
 def fetch(args):
     print 'Welcome to Twitter Spell Checking : Fetching !'
     CONFIG = ConfigParser.ConfigParser()
@@ -22,7 +24,7 @@ def fetch(args):
     settings = items_to_dict(CONFIG.items('twitter'))
     config = items_to_dict(CONFIG.items('namespace:%s' % args.namespace))
     api = twitter.Api(consumer_key=settings['consumer_key'], consumer_secret=settings['consumer_secret'], access_token_key=settings['access_token'], access_token_secret=settings['access_token_secret'])
-    
+
     accounts = [account.replace(' ', '') for account in config['accounts'].split(',')]
     max_tweets_file = os.path.join(os.path.dirname(config['files']), 'max_tweets_%s.txt' % args.namespace)
 
@@ -35,23 +37,23 @@ def save_max_tweets():
         max_tweets = dict()
 
     print max_tweets_file
-    f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a') 
+    f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a')
     for account in accounts:
-        if account in max_tweets and max_tweets[account]>0:
+        if account in max_tweets and max_tweets[account] > 0:
             retrieving = "new"
         else:
             retrieving = "old"
             page = 0
         while True:
             if retrieving == "new":
-                print 'process %s since id %s' %  (account, max_tweets[account])
+                print 'process %s since id %s' % (account, max_tweets[account])
                 try:
                     tweets = api.GetUserTimeline(account, count=200, include_rts=False, since_id=max_tweets[account])
                 except twitter.TwitterError, e:
                     print 'error : %s' % str(e)
                     tweets = []
             else:
-                print 'process %s from zero, page %s' %  (account, page)
+                print 'process %s from zero, page %s' % (account, page)
                 try:
                     tweets = api.GetUserTimeline(account, count=200, include_rts=False, page=page)
                 except twitter.TwitterError, e:
@@ -60,11 +62,11 @@ def save_max_tweets():
             if tweets:
                 for s in tweets:
                     if is_valid(s, account):
-                        f.write(clean(s.text).lower().encode('UTF-8')+'\n')
+                        f.write(clean(s.text).lower().encode('UTF-8') + '\n')
                         if  account not in max_tweets or s.id > max_tweets[account]:
                             max_tweets[account] = s.id
                 if retrieving == "old":
-                    page+=1
+                    page += 1
                 save_max_tweets()
             else:
                 print 'no more tweets for %s' % account
@@ -87,4 +89,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-
diff --git a/twitter_spelling/correct.py b/twitter_spelling/correct.py
@@ -1,28 +1,37 @@
-import re, collections
+# -*- coding: utf-8 -*-
+
+import re
+import collections
 import ConfigParser
-import os 
+import os
 
 # Lambda function which transforms a ConfigParser items
 # list of tuples object into a dictionary
-items_to_dict = lambda items : {k:v for k,v in items}
+items_to_dict = lambda items: {k:v for k,v in items}
+
+
+def words(text):
+    return re.findall('[a-z]+', text.lower())
 
-def words(text): return re.findall('[a-z]+', text.lower()) 
 
 def train(features):
     model = collections.defaultdict(lambda: 1)
     for f in features:
         model[f] += 1
     return model
 
-alphabet = 'abcdefghijklmnopqrstuvwxyz'
 
 def edits1(word):
-   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
-   deletes    = [a + b[1:] for a, b in splits if b]
-   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
-   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
-   inserts    = [a + c + b     for a, b in splits for c in alphabet]
-   return set(deletes + transposes + replaces + inserts)
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [a + b[1:] for a, b in splits if b]
+    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
+    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
+    inserts = [a + c + b     for a, b in splits for c in alphabet]
+
+    return set(deletes + transposes + replaces + inserts)
+
 
 class CorrectWord(object):
     def __init__(self, correct_inst):
@@ -31,12 +40,14 @@ def __init__(self, correct_inst):
     def known_edits2(self, word):
         return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in Correct.NWORDS[self.correct_inst.key])
 
-    def known(self, words): return set(w for w in words if w in Correct.NWORDS[self.correct_inst.key])
+    def known(self, words):
+        """
+        Return the set of entries of `words` that are present in the
+        trained model stored in Correct.NWORDS for the attached
+        Correct instance.
+        """
+        # Resolve the model once instead of once per candidate word.
+        model = Correct.NWORDS[self.correct_inst.key]
+        return set(w for w in words if w in model)
 
     def correct(self, word):
         candidates = self.known([word]) or self.known(edits1(word)) or self.known_edits2(word) or [word]
         return max(candidates, key=Correct.NWORDS[self.correct_inst.key].get)
 
+
 class Correct(object):
 
     NWORDS = dict()
@@ -47,11 +58,31 @@ def __init__(self, settings_file, namespace):
         CONFIG = ConfigParser.ConfigParser()
         CONFIG.read(settings_file)
 
-        config = items_to_dict(CONFIG.items('namespace:%s' % namespace))
-        self.key = '%s:%s' % (settings_file, namespace)
-        if self.key not in Correct.NWORDS:
-            Correct.NWORDS[self.key] = train(words(open(os.path.join(config['files'], 'tweets_%s.txt' % namespace)).read()))
+        self.config = items_to_dict(CONFIG.items('namespace:%s' % namespace))
+        self._key = '%s:%s' % (settings_file, namespace)
+        self._train_with(self._key)
+
+    def _train_with(self, key):
+        """
+        Train the model for `key` unless it is already cached.
+
+        If `key` has no entry in Correct.NWORDS yet, read the file
+        tweets_<namespace>.txt from the directory self.config['files']
+        and store train(words(text)) in Correct.NWORDS[key].
+
+        Note: the corpus is always taken from self.config and
+        self.namespace, whatever `key` is.
+        """
+        if key in Correct.NWORDS:
+            return
+        # NOTE(review): self.namespace is not assigned in the visible part
+        # of __init__ -- confirm it is set before this method runs.
+        tweets_path = os.path.join(self.config['files'],
+                                   'tweets_%s.txt' % self.namespace)
+        # Close the corpus file explicitly instead of leaking the handle.
+        with open(tweets_path) as tweets_file:
+            text = tweets_file.read()
+        # Store under the key that was just checked, not under self._key.
+        Correct.NWORDS[key] = train(words(text))
+
+    @property
+    def key(self):
+        """
+        Rebuild the cache key as "<settings_file>:<namespace>", store it
+        in `_key`, hand it to `_train_with`, and return it.
+        """
+        # NOTE(review): settings_file and namespace are not assigned in the
+        # visible part of __init__ -- confirm they exist on the instance.
+        fresh_key = ':'.join((self.settings_file, self.namespace))
+        self._key = fresh_key
+        self._train_with(fresh_key)  # Compute corrections for generated key
+        return fresh_key
 
-    def correct(self,word):
+    @key.setter
+    def key(self, value):
+        """
+        Set the cache key from `value` and hand it to `_train_with`.
+
+        Note: `value` should be a pair, like (settings_file, namespace);
+        its items are joined with ':'.
+        """
+        joined = ':'.join(value)
+        self._key = joined
+        self._train_with(joined)  # Compute corrections for generated key
+
+    def correct(self, word):
         cw = CorrectWord(self)
         return ' '.join([cw.correct(w) for w in word.split(' ')])
diff --git a/twitter_spelling/correct.pyc b/twitter_spelling/correct.pyc
diff --git a/twitter_spelling/utils.py b/twitter_spelling/utils.py
@@ -1,13 +1,16 @@
-import re
+# -*- coding: utf-8 -*-
+
 import urlparse
 
 import unicodedata
 
+
 def remove_accents(input_str):
     nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
     only_ascii = nkfd_form.encode('ASCII', 'ignore')
     return only_ascii
 
+
 def clean(text):
     new_string = ''
     for i in text.split():
@@ -22,5 +25,6 @@ def clean(text):
             new_string = new_string.strip() + ' ' + i
     return remove_accents(new_string)
 
+
 def is_valid(tweet, user):
     return not tweet.in_reply_to_status_id and tweet.user.screen_name == user