From 8ecd832e5c89f74acef8e9ce8d803a8115044137 Mon Sep 17 00:00:00 2001 From: J Letang Date: Mon, 15 May 2023 04:09:49 +0000 Subject: [PATCH] added 10 more stopwords --- mapper.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mapper.py b/mapper.py index 9fa3def..aee2933 100644 --- a/mapper.py +++ b/mapper.py @@ -1,14 +1,28 @@ #!/usr/bin/env python import sys +import string # get all lines from stdin for line in sys.stdin: # remove leading and trailing whitespace line = line.strip() + # make lowercase + line = line.lower() + + # https://stackoverflow.com/questions/34860982/replace-the-punctuation-with-whitespace + # remove punctuation + punctuations = list(string.punctuation) + for p in punctuations: + if p in line: + line = line.replace(p, '') + # split the line into words; splits on any whitespace words = line.split() # output tuples (word, 1) in tab-delimited format + stopwords = set(['the', 'and', 'is', 'a', 'this', 'that', 'it', 'to', 'in', 'an', 'as']) + for word in words: - print '%s\t%s' % (word, "1") + if word not in stopwords: + print '%s\t%s' % (word, "1")