diff --git a/README.markdown b/README.markdown index 707f42c..d201a30 100644 --- a/README.markdown +++ b/README.markdown @@ -55,8 +55,8 @@ Spell Checking Some parts of code from http://norvig.com/spell-correct.html ``` -from twitter_spelling import Correct -c = Correct(settings_file_location) +from twitter_spelling.correct import Correct +c = Correct(settings_file_location, namespace='en') c.correct('my expression') ``` diff --git a/setup.py b/setup.py index 9724594..b95a3bd 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name = "Twitter Spell Checking", + name = "twitter-spell-checking", version = "0.1", packages = find_packages(), include_package_data = True, @@ -12,7 +12,10 @@ license = "Apache", keywords = "twitter license licenser open-source", url = "http://github.com/sem-io/python-twitter-spell-checking", - install_requires = ['python-twitter'], + install_requires = [ + 'python-twitter', + 'ujson' + ], # Setting up executable/main functions links entry_points = { diff --git a/twitter_spelling/__init__.pyc b/twitter_spelling/__init__.pyc new file mode 100644 index 0000000..b6f28e9 Binary files /dev/null and b/twitter_spelling/__init__.pyc differ diff --git a/twitter_spelling/cli.py b/twitter_spelling/cli.py index 9326b0b..7996aab 100644 --- a/twitter_spelling/cli.py +++ b/twitter_spelling/cli.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- + from datetime import datetime import sys import argparse import os -import json +import ujson as json import ConfigParser import twitter @@ -14,6 +15,7 @@ # list of tuples object into a dictionnary items_to_dict = lambda items : {k:v for k,v in items} + def fetch(args): print 'Welcome to Twitter Spell Checking : Fetching !' CONFIG = ConfigParser.ConfigParser() @@ -22,7 +24,7 @@ def fetch(args): settings = items_to_dict(CONFIG.items('twitter')) config = items_to_dict(CONFIG.items('namespace:%s' % args.namespace)) api = twitter.Api(consumer_key=settings['consumer_key'], consumer_secret=settings['consumer_secret'], access_token_key=settings['access_token'], access_token_secret=settings['access_token_secret']) - + accounts = [account.replace(' ', '') for account in config['accounts'].split(',')] max_tweets_file = os.path.join(os.path.dirname(config['files']), 'max_tweets_%s.txt' % args.namespace) @@ -35,23 +37,23 @@ def save_max_tweets(): max_tweets = dict() print max_tweets_file - f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a') + f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a') for account in accounts: - if account in max_tweets and max_tweets[account]>0: + if account in max_tweets and max_tweets[account] > 0: retrieving = "new" else: retrieving = "old" page = 0 while True: if retrieving == "new": - print 'process %s since id %s' % (account, max_tweets[account]) + print 'process %s since id %s' % (account, max_tweets[account]) try: tweets = api.GetUserTimeline(account, count=200, include_rts=False, since_id=max_tweets[account]) except twitter.TwitterError, e: print 'error : %s' % str(e) tweets = [] else: - print 'process %s from zero, page %s' % (account, page) + print 'process %s from zero, page %s' % (account, page) try: tweets = api.GetUserTimeline(account, count=200, include_rts=False, page=page) except twitter.TwitterError, e: @@ -60,11 +62,11 @@ def save_max_tweets(): if tweets: for s in tweets: if is_valid(s, account): - f.write(clean(s.text).lower().encode('UTF-8')+'\n') + f.write(clean(s.text).lower().encode('UTF-8') + '\n') if account not in max_tweets or s.id > max_tweets[account]: max_tweets[account] = s.id if retrieving == "old": - page+=1 + page += 1 save_max_tweets() else: print 'no more tweets for %s' % account @@ -87,4 +89,3 @@ def main(): if __name__ == '__main__': main() - diff --git a/twitter_spelling/correct.py b/twitter_spelling/correct.py index 18f81b8..ea2ebc5 100644 --- a/twitter_spelling/correct.py +++ b/twitter_spelling/correct.py @@ -1,12 +1,18 @@ -import re, collections +# -*- coding: utf-8 -*- + +import re +import collections import ConfigParser -import os +import os # Lambda function which tranforms a ConfigParser items # list of tuples object into a dictionnary -items_to_dict = lambda items : {k:v for k,v in items} +items_to_dict = lambda items: {k:v for k,v in items} + + +def words(text): + return re.findall('[a-z]+', text.lower()) -def words(text): return re.findall('[a-z]+', text.lower()) def train(features): model = collections.defaultdict(lambda: 1) @@ -14,15 +20,18 @@ def train(features): model[f] += 1 return model -alphabet = 'abcdefghijklmnopqrstuvwxyz' def edits1(word): - splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] - deletes = [a + b[1:] for a, b in splits if b] - transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1] - replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b] - inserts = [a + c + b for a, b in splits for c in alphabet] - return set(deletes + transposes + replaces + inserts) + alphabet = 'abcdefghijklmnopqrstuvwxyz' + + splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] + deletes = [a + b[1:] for a, b in splits if b] + transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1] + replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b] + inserts = [a + c + b for a, b in splits for c in alphabet] + + return set(deletes + transposes + replaces + inserts) + class CorrectWord(object): def __init__(self, correct_inst): @@ -31,12 +40,14 @@ def __init__(self, correct_inst): def known_edits2(self, word): return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in Correct.NWORDS[self.correct_inst.key]) - def known(self, words): return set(w for w in words if w in Correct.NWORDS[self.correct_inst.key]) + def known(self, words): + return set(w for w in words if w in Correct.NWORDS[self.correct_inst.key]) def correct(self, word): candidates = self.known([word]) or self.known(edits1(word)) or self.known_edits2(word) or [word] return max(candidates, key=Correct.NWORDS[self.correct_inst.key].get) + class Correct(object): NWORDS = dict() @@ -47,11 +58,31 @@ def __init__(self, settings_file, namespace): CONFIG = ConfigParser.ConfigParser() CONFIG.read(settings_file) - config = items_to_dict(CONFIG.items('namespace:%s' % namespace)) - self.key = '%s:%s' % (settings_file, namespace) - if self.key not in Correct.NWORDS: - Correct.NWORDS[self.key] = train(words(open(os.path.join(config['files'], 'tweets_%s.txt' % namespace)).read())) + self.config = items_to_dict(CONFIG.items('namespace:%s' % namespace)) + self._key = '%s:%s' % (settings_file, namespace) + self._train_with(self._key) + + def _train_with(self, key): + """ + If given `key` has not trained content already, + compute it into NWORDS[key]. + """ + if key not in Correct.NWORDS: + Correct.NWORDS[self._key] = train(words(open(os.path.join(self.config['files'], + 'tweets_%s.txt' % self.namespace)).read())) + + @property + def key(self): + self._key = ':'.join([self.settings_file, self.namespace]) + self._train_with(self._key) # Compute corrections for generated key + return self._key - def correct(self,word): + @key.setter + def key(self, value): + """Nota : value should be a pair, like (settings_file, namespace)""" + self._key = ':'.join(value) + self._train_with(self._key) # Compute corrections for generated key + + def correct(self, word): cw = CorrectWord(self) return ' '.join([cw.correct(w) for w in word.split(' ')]) diff --git a/twitter_spelling/correct.pyc b/twitter_spelling/correct.pyc new file mode 100644 index 0000000..5042346 Binary files /dev/null and b/twitter_spelling/correct.pyc differ diff --git a/twitter_spelling/utils.py b/twitter_spelling/utils.py index 3136d79..fc21fa1 100644 --- a/twitter_spelling/utils.py +++ b/twitter_spelling/utils.py @@ -1,13 +1,16 @@ -import re +# -*- coding: utf-8 -*- + import urlparse import unicodedata + def remove_accents(input_str): nkfd_form = unicodedata.normalize('NFKD', unicode(input_str)) only_ascii = nkfd_form.encode('ASCII', 'ignore') return only_ascii + def clean(text): new_string = '' for i in text.split(): @@ -22,5 +25,6 @@ def clean(text): new_string = new_string.strip() + ' ' + i return remove_accents(new_string) + def is_valid(tweet, user): return not tweet.in_reply_to_status_id and tweet.user.screen_name == user