Skip to content

Commit 995c78b

Browse files
committed
Enhance emoji detection by utilizing a predefined emoji list and updating related tests
1 parent 3cadd01 commit 995c78b

File tree

3 files changed

+81
-10
lines changed

3 files changed

+81
-10
lines changed

src/team_comm_tools/features/reddit_tags.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
import string
33
import re
4-
4+
from team_comm_tools.utils.preprocess import EMOJIS
55

66
def count_all_caps(text):
77
"""
@@ -191,6 +191,9 @@ def count_emojis(text):
191191
Returns:
192192
int: The number of emojis in the input text.
193193
"""
194-
emoji_pattern = r'[:;]-?\)+'
195-
emojis = re.findall(emoji_pattern, text)
194+
# emoji_pattern = r'[:;]-?\)+'
195+
emoji_list = sorted(EMOJIS, key=len, reverse=True)
196+
emoji_pattern = "|".join(re.escape(e) for e in emoji_list)
197+
compiled_pattern = re.compile(emoji_pattern)
198+
emojis = re.findall(compiled_pattern, text)
196199
return len(emojis)

src/team_comm_tools/utils/preprocess.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
import re
22
import pandas as pd
3-
import warnings
4-
3+
# import warnings
4+
5+
EMOJIS = {
6+
"(:", "(;", "):", "/:", ":(", ":)", ":/", ";)", # 8 emojis from LIWC 2017
7+
";(", # variants
8+
":-)", ":-(", ":-/", ";-)", # with noses
9+
":D", ":P", ":p", ":-D", ":-P", ":-p", # big grin & tongue out
10+
":O", ":-O", ":o", ":-o", # shock
11+
"XD", "xD", "xd", # laughing variants
12+
"<3", "</3", # hearts
13+
}
514

615
def preprocess_conversation_columns(df: pd.DataFrame, column_names: dict, grouping_keys: list,
716
cumulative_grouping: bool = False, within_task: bool = False) -> pd.DataFrame:
@@ -112,21 +121,19 @@ def preprocess_text(text: str) -> str:
112121
:return: The processed text in lowercase, containing only alphanumeric characters, spaces, underscores, and the preserved emojis (which are lowercased along with the rest of the text).
113122
:rtype: str
114123
"""
115-
emojis_to_preserve = {
116-
"(:", "(;", "):", "/:", ":(", ":)", ":/", ";)"
117-
}
118124

119125
emoji_placeholders = {}
120126
# Replace each emoji with a unique placeholder
121-
for i, emoji in enumerate(emojis_to_preserve):
127+
for i, emoji in enumerate(EMOJIS):
122128
placeholder = f"EMOJI_{i}"
123129
emoji_placeholders[placeholder] = emoji
124130
text = text.replace(emoji, placeholder)
125131

126132
# Clean the text by removing unwanted characters, except placeholders
127133
text = re.sub(r"[^a-zA-Z0-9 EMOJI_]+", '', text)
128134
# Restore the preserved emojis, substituting longer placeholders first so that e.g. EMOJI_1 is not matched inside EMOJI_10
129-
for placeholder, emoji in emoji_placeholders.items():
135+
for placeholder in sorted(emoji_placeholders.keys(), key=len, reverse=True):
136+
emoji = emoji_placeholders[placeholder]
130137
text = text.replace(placeholder, emoji)
131138

132139
return text.lower()

tests/data/cleaned_data/test_chat_level.csv

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,3 +1332,64 @@ K,1,This is the same text.,positivity_zscore_conversation,
13321332
10_mix,first_person,"belief% sed— forgot{ euismod£ dolor/ couldn't¥ ipsum^ amet— affectation] Lorem> ipsum} drank^ Lorem) sed' forever[ amet/ mailed? euismod< sed? mightve[ dolor[ sed; affect' dolor> sed\ consistently' Lorem| euismod: may: tempor, colon( ipsum# xanax¥ dolor] cruel, amet... think– dolor< tempor, strikingly% euismod< cocks£ ipsum~ insides! ipsum. aren't- Lorem. sandy% adipiscing... consectetur) illuminating+ amet# steadiness= euismod¥ sed( except~ consectetur' tempor) most} adipiscing? here's"" tempor' dolor# understand; sed{ ipsum",first_person_lexical_wordcount,0
13331333
10_mix,nltk_english_stopwords,"belief% sed— forgot{ euismod£ dolor/ couldn't¥ ipsum^ amet— affectation] Lorem> ipsum} drank^ Lorem) sed' forever[ amet/ mailed? euismod< sed? mightve[ dolor[ sed; affect' dolor> sed\ consistently' Lorem| euismod: may: tempor, colon( ipsum# xanax¥ dolor] cruel, amet... think– dolor< tempor, strikingly% euismod< cocks£ ipsum~ insides! ipsum. aren't- Lorem. sandy% adipiscing... consectetur) illuminating+ amet# steadiness= euismod¥ sed( except~ consectetur' tempor) most} adipiscing? here's"" tempor' dolor# understand; sed{ ipsum",nltk_english_stopwords_lexical_wordcount,5
13341334
10_mix,hedge_words,"belief% sed— forgot{ euismod£ dolor/ couldn't¥ ipsum^ amet— affectation] Lorem> ipsum} drank^ Lorem) sed' forever[ amet/ mailed? euismod< sed? mightve[ dolor[ sed; affect' dolor> sed\ consistently' Lorem| euismod: may: tempor, colon( ipsum# xanax¥ dolor] cruel, amet... think– dolor< tempor, strikingly% euismod< cocks£ ipsum~ insides! ipsum. aren't- Lorem. sandy% adipiscing... consectetur) illuminating+ amet# steadiness= euismod¥ sed( except~ consectetur' tempor) most} adipiscing? here's"" tempor' dolor# understand; sed{ ipsum",hedge_words_lexical_wordcount,0
1335+
emoji_test,emoji_user_a,Hello :),num_emoji,1
1336+
emoji_test,emoji_user_b,Nice to see you :-)),num_emoji,1
1337+
emoji_test,emoji_user_a,Hey there ;-) How are you? :),num_emoji,2
1338+
emoji_test,emoji_user_b,No emoji here.,num_emoji,0
1339+
emoji_test,emoji_user_a,;) :) :-))),num_emoji,3
1340+
emoji_test,emoji_user_b,Mixed emotions: :-( ;( :( ,num_emoji,3
1341+
emoji_test,emoji_user_a,Too many smiles :) :) :),num_emoji,3
1342+
emoji_test,emoji_user_b,Tricky one: :-)-),num_emoji,1
1343+
emoji_test,emoji_user_a,Extra characters :-))abc:-),num_emoji,2
1344+
emoji_test,emoji_user_b,Combo ;-):-);-),num_emoji,3
1345+
emoji_test,emoji_user_a,Sad day :(,num_emoji,1
1346+
emoji_test,emoji_user_b,Confused face :/,num_emoji,1
1347+
emoji_test,emoji_user_a,Double smiles :) :),num_emoji,2
1348+
emoji_test,emoji_user_b,This is fine :D,num_emoji,1
1349+
emoji_test,emoji_user_a,Edge case :-):-):-D,num_emoji,3
1350+
emoji_test,emoji_user_b,Broken smile :-))abc:-),num_emoji,2
1351+
emoji_test,emoji_user_a,Mixed (: ;),num_emoji,2
1352+
emoji_test,emoji_user_b,Only symbols $%&!,num_emoji,0
1353+
emoji_test,emoji_user_a,Nested :)hello:),num_emoji,2
1354+
emoji_test,emoji_user_b,Sad then happy :( :),num_emoji,2
1355+
emoji_test,emoji_user_a,No spaces:;-):),num_emoji,2
1356+
emoji_test,emoji_user_b,Triple threat :):):),num_emoji,3
1357+
emoji_test,emoji_user_a,Reverse smile ): (:,num_emoji,2
1358+
emoji_test,emoji_user_b,"Hey! I'm doing great :-), thanks for asking. How about you? ;)",num_emoji,2
1359+
emoji_test,emoji_user_a,Ugh... today was rough :( Everything just kept going wrong :-(,num_emoji,2
1360+
emoji_test,emoji_user_b,LOL! That was hilarious :D:D:D!! Can't stop laughing!,num_emoji,3
1361+
emoji_test,emoji_user_a,Okay... so he said 'I'll be there :)' — and then never showed up.,num_emoji,1
1362+
emoji_test,emoji_user_b,"Meeting recaD:
1363+
- Client happy :)
1364+
- Budget approved ;)
1365+
- Launch on track :-D",num_emoji,3
1366+
emoji_test,emoji_user_a,"Meeting recap:
1367+
- Client happy :)
1368+
- Budget approved ;)
1369+
- Launch on track :-D",num_emoji,3
1370+
emoji_test,emoji_user_b,Mixed signals: first he smiled :) then frowned :( then smiled again :)),num_emoji,3
1371+
emoji_test,emoji_user_a,Edge-case test —;-):-(;-))... any of these match?,num_emoji,3
1372+
emoji_test,emoji_user_b,"Email thread:
1373+
> Sure, that works for me :)
1374+
> Thanks! :-)
1375+
> See you then ;)",num_emoji,3
1376+
emoji_test,emoji_user_a,Total chaos!!! :(( :(( :(( — but at least coffee :-),num_emoji,4
1377+
emoji_test,emoji_user_b,"He walked in and said, 'Good morning :-)' — like nothing happened!",num_emoji,1
1378+
emoji_test,emoji_user_a,Haha :D that was funny!,num_emoji,1
1379+
emoji_test,emoji_user_b,You got me :-P,num_emoji,1
1380+
emoji_test,emoji_user_a,Just kidding :p :P,num_emoji,2
1381+
emoji_test,emoji_user_b,All the reactions: :D :-D :P :-P,num_emoji,4
1382+
emoji_test,emoji_user_a,Lowercase alert :-p :p,num_emoji,2
1383+
emoji_test,emoji_user_b,No way! :O,num_emoji,1
1384+
emoji_test,emoji_user_a,You're kidding :-O,num_emoji,1
1385+
emoji_test,emoji_user_b,Mixed shock faces: :o :-o,num_emoji,2
1386+
emoji_test,emoji_user_a,Wow :O :o :-O :-o,num_emoji,4
1387+
emoji_test,emoji_user_b,OMG XD that cracked me up,num_emoji,1
1388+
emoji_test,emoji_user_a,xD lol,num_emoji,1
1389+
emoji_test,emoji_user_b,xd XD xD,num_emoji,3
1390+
emoji_test,emoji_user_a,Try this combo: xdxd,num_emoji,2
1391+
emoji_test,emoji_user_b,Real XDPro was here,num_emoji,1
1392+
emoji_test,emoji_user_a,I love this <3,num_emoji,1
1393+
emoji_test,emoji_user_b,Broken heart </3,num_emoji,1
1394+
emoji_test,emoji_user_a,Love and heartbreak: <3 </3,num_emoji,2
1395+
emoji_test,emoji_user_b,Wow<3cool</3,num_emoji,2

0 commit comments

Comments
 (0)