Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ Spell Checking
Some parts of code from http://norvig.com/spell-correct.html

```
from twitter_spelling import Correct
c = Correct(settings_file_location)
from twitter_spelling.correct import Correct
c = Correct(settings_file_location, namespace='en')
c.correct('my expression')
```

Expand Down
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(
name = "Twitter Spell Checking",
name = "twitter-spell-checking",
version = "0.1",
packages = find_packages(),
include_package_data = True,
Expand All @@ -12,7 +12,10 @@
license = "Apache",
keywords = "twitter license licenser open-source",
url = "http://github.com/sem-io/python-twitter-spell-checking",
install_requires = ['python-twitter'],
install_requires = [
'python-twitter',
'ujson'
],

# Setting up executable/main functions links
entry_points = {
Expand Down
Binary file added twitter_spelling/__init__.pyc
Binary file not shown.
19 changes: 10 additions & 9 deletions twitter_spelling/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-

from datetime import datetime
import sys
import argparse
import os
import json
import ujson as json
import ConfigParser

import twitter
Expand All @@ -14,6 +15,7 @@
# list of tuples object into a dictionary
items_to_dict = lambda items : {k:v for k,v in items}


def fetch(args):
print 'Welcome to Twitter Spell Checking : Fetching !'
CONFIG = ConfigParser.ConfigParser()
Expand All @@ -22,7 +24,7 @@ def fetch(args):
settings = items_to_dict(CONFIG.items('twitter'))
config = items_to_dict(CONFIG.items('namespace:%s' % args.namespace))
api = twitter.Api(consumer_key=settings['consumer_key'], consumer_secret=settings['consumer_secret'], access_token_key=settings['access_token'], access_token_secret=settings['access_token_secret'])

accounts = [account.replace(' ', '') for account in config['accounts'].split(',')]
max_tweets_file = os.path.join(os.path.dirname(config['files']), 'max_tweets_%s.txt' % args.namespace)

Expand All @@ -35,23 +37,23 @@ def save_max_tweets():
max_tweets = dict()

print max_tweets_file
f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a')
f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a')
for account in accounts:
if account in max_tweets and max_tweets[account]>0:
if account in max_tweets and max_tweets[account] > 0:
retrieving = "new"
else:
retrieving = "old"
page = 0
while True:
if retrieving == "new":
print 'process %s since id %s' % (account, max_tweets[account])
print 'process %s since id %s' % (account, max_tweets[account])
try:
tweets = api.GetUserTimeline(account, count=200, include_rts=False, since_id=max_tweets[account])
except twitter.TwitterError, e:
print 'error : %s' % str(e)
tweets = []
else:
print 'process %s from zero, page %s' % (account, page)
print 'process %s from zero, page %s' % (account, page)
try:
tweets = api.GetUserTimeline(account, count=200, include_rts=False, page=page)
except twitter.TwitterError, e:
Expand All @@ -60,11 +62,11 @@ def save_max_tweets():
if tweets:
for s in tweets:
if is_valid(s, account):
f.write(clean(s.text).lower().encode('UTF-8')+'\n')
f.write(clean(s.text).lower().encode('UTF-8') + '\n')
if account not in max_tweets or s.id > max_tweets[account]:
max_tweets[account] = s.id
if retrieving == "old":
page+=1
page += 1
save_max_tweets()
else:
print 'no more tweets for %s' % account
Expand All @@ -87,4 +89,3 @@ def main():

if __name__ == '__main__':
main()

65 changes: 48 additions & 17 deletions twitter_spelling/correct.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,37 @@
import re, collections
# -*- coding: utf-8 -*-

import re
import collections
import ConfigParser
import os
import os

def items_to_dict(items):
    """Build a dictionary from an iterable of (key, value) pairs.

    Intended for the list of tuples returned by ``ConfigParser.items()``.
    When a key appears more than once, the last pair wins.
    """
    # A named ``def`` replaces the lambda bound to a name, and ``dict()``
    # replaces the hand-written pair-by-pair comprehension.
    return dict(items)


def words(text):
    """Return the lowercase a-z letter runs found in *text*, in order.

    The text is lowercased first; anything that is not an ASCII letter
    acts as a separator and is dropped.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)

def train(features):
    """Count how often each feature occurs, starting every count at 1.

    The returned ``defaultdict`` yields 1 for a missing key, so a feature
    seen ``k`` times maps to ``k + 1``.
    """
    counts = collections.defaultdict(lambda: 1)
    for feature in features:
        counts[feature] = counts[feature] + 1
    return counts

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
    """Return the set of strings exactly one edit away from *word*.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase letter, or
    inserting a lowercase letter at any position. Replacing a letter with
    itself is included, so a non-empty *word* is part of the result.
    """
    # Bound before any use: the alphabet is local to this function.
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # Every way of cutting the word in two, including the empty head/tail.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]

    return set(deletes + transposes + replaces + inserts)


class CorrectWord(object):
def __init__(self, correct_inst):
Expand All @@ -31,12 +40,14 @@ def __init__(self, correct_inst):
def known_edits2(self, word):
    """Return the trained words that are two edits away from *word*.

    Every one-edit variant of *word* is edited once more, and only the
    results present in the model for the bound Correct instance's key
    are kept.
    """
    matches = set()
    for first_pass in edits1(word):
        for second_pass in edits1(first_pass):
            if second_pass in Correct.NWORDS[self.correct_inst.key]:
                matches.add(second_pass)
    return matches

def known(self, words):
    """Return the subset of *words* present in the trained model.

    The model is looked up in ``Correct.NWORDS`` under the key of the
    bound Correct instance.
    """
    recognised = set()
    for candidate in words:
        if candidate in Correct.NWORDS[self.correct_inst.key]:
            recognised.add(candidate)
    return recognised

def correct(self, word):
    """Return the most frequent known candidate for *word*.

    Candidates are tried in order and the first non-empty group wins:
    the word itself, known words one edit away, known words two edits
    away, and finally the unchanged word.
    """
    candidates = self.known([word])
    if not candidates:
        candidates = self.known(edits1(word))
    if not candidates:
        candidates = self.known_edits2(word)
    if not candidates:
        # Nothing known nearby: fall back to the input unchanged.
        candidates = [word]
    frequency = Correct.NWORDS[self.correct_inst.key].get
    return max(candidates, key=frequency)


class Correct(object):

NWORDS = dict()
Expand All @@ -47,11 +58,31 @@ def __init__(self, settings_file, namespace):
CONFIG = ConfigParser.ConfigParser()
CONFIG.read(settings_file)

config = items_to_dict(CONFIG.items('namespace:%s' % namespace))
self.key = '%s:%s' % (settings_file, namespace)
if self.key not in Correct.NWORDS:
Correct.NWORDS[self.key] = train(words(open(os.path.join(config['files'], 'tweets_%s.txt' % namespace)).read()))
self.config = items_to_dict(CONFIG.items('namespace:%s' % namespace))
self._key = '%s:%s' % (settings_file, namespace)
self._train_with(self._key)

def _train_with(self, key):
    """Train and cache the word model for *key* if it is not cached yet.

    The corpus is read from ``tweets_<namespace>.txt`` inside the
    directory named by ``self.config['files']``, and the result is stored
    in the class-level ``Correct.NWORDS`` cache under *key*.
    """
    # NOTE(review): ``self.namespace`` is not assigned in the constructor
    # lines visible here; confirm it is set before this method runs.
    if key in Correct.NWORDS:
        return
    corpus_path = os.path.join(self.config['files'],
                               'tweets_%s.txt' % self.namespace)
    # ``with`` closes the corpus file once it has been read.
    with open(corpus_path) as corpus:
        # Store under the key that was tested above, not ``self._key``,
        # so the membership check and the write always agree.
        Correct.NWORDS[key] = train(words(corpus.read()))

@property
def key(self):
self._key = ':'.join([self.settings_file, self.namespace])
self._train_with(self._key) # Compute corrections for generated key
return self._key

def correct(self,word):
@key.setter
def key(self, value):
"""Nota : value should be a pair, like (settings_file, namespace)"""
self._key = ':'.join(value)
self._train_with(self._key) # Compute corrections for generated key

def correct(self, word):
    """Correct each space-separated token of *word* and rejoin them.

    The input is split on single spaces, so consecutive spaces produce
    empty tokens, which are passed to the word corrector like any other.
    """
    checker = CorrectWord(self)
    tokens = word.split(' ')
    return ' '.join(map(checker.correct, tokens))
Binary file added twitter_spelling/correct.pyc
Binary file not shown.
6 changes: 5 additions & 1 deletion twitter_spelling/utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import re
# -*- coding: utf-8 -*-

import urlparse

import unicodedata


def remove_accents(input_str):
    """Return *input_str* encoded as ASCII with non-ASCII characters dropped.

    The input is converted with ``unicode()`` and NFKD-normalised before
    encoding; characters that cannot be encoded as ASCII are ignored.
    """
    decomposed = unicodedata.normalize('NFKD', unicode(input_str))
    return decomposed.encode('ASCII', 'ignore')


def clean(text):
new_string = ''
for i in text.split():
Expand All @@ -22,5 +25,6 @@ def clean(text):
new_string = new_string.strip() + ' ' + i
return remove_accents(new_string)


def is_valid(tweet, user):
    """Tell whether *tweet* is a non-reply whose author is *user*.

    A tweet with a truthy ``in_reply_to_status_id`` is rejected;
    otherwise the result is whether ``tweet.user.screen_name`` equals
    *user*.
    """
    if tweet.in_reply_to_status_id:
        return False
    return tweet.user.screen_name == user