Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions pythainlp/text_cleaning/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"""
text cleaning in NLP
"""
from nlp import extract_tokens, insert_tokens
import re

"""
_____________________________________________________________________________
| ประเภทข้อมูล | ตัวอย่างที่ต้องจัดการ |
| ------------ | ------------------------------------------------------- |
| อักขระพิเศษ | !!!, ..., \~, ^\_^ |
| อีโมจิ | 😊, 😊, ^\_^ |
| ตัวเลข | 3 |
| URL | [https://example.com](https://example.com) |
| Hashtag | #เที่ยวไทย, #สนุกสุดๆ |
| ตัวพิมพ์ใหญ่ | “HELLO World” → “hello world” (เปลี่ยนทั้งหมดเป็นพิมพ์เล็ก) |
| คำซ้ำ/คำยืด | “มากกกกกก” → “มาก” หรือ “ดีมาก” |
| ช่องว่างเกิน | (เช่นเว้นหลายช่อง) |
| stopwords | เช่น “แล้วก็”, “แบบว่า”, “คือ” |
_____________________________________________________________________________

"""


test = """
example 1 :

สวัสดีค่ะ!!! วันนี้อากาศดีมากๆๆ 😊😊 จะไปเที่ยวกับเพื่อนๆมา 3 คน ที่จังหวัดเชียงใหม่!!!
ลองเข้าไปดูรูปได้นะที่ https://facebook.com #เที่ยวไทย #สนุกสุดๆ
แล้วก็แบบว่า...คือมันดีมากกกกกก~ ขอบคุณค่ะ ^_^
"""
"""
example 2 :

OMG!!! This is sooooo AMAZING!!! 😍😍😍 Check it out: https://wow.com/article/123
I can't believe it..... @john_doe was right all along!!! 😂😂😂
Go to https://example.com RIGHT NOW!!! You won't regret it. #unbelievable #wowww
Sooooo goooooddddddd.... I'm in loooooveeee 💕💕💕
Visit www.testsite.org to learn more. This is just... WOW!!! 😱😱

"""

thai_special_chars_unicode = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can also try to use the list defined here:

# Paiyannoi, Maiyamok, Phinthu, Thanthakhat, Nikhahit, Yamakkan:
# These signs can be part of a word
thai_signs = "\u0e2f\u0e3a\u0e46\u0e4c\u0e4d\u0e4e" # 6 chars

(it doesn't include ฯลฯ though)

"ๆ": "\u0E46",
"ฯ": "\u0E2F",
"ฯลฯ": "\u0E2Fล\u0E2F",
"๏": "\u0E4F",
"๛": "\u0E5B",
"๚": "\u0E5A",
"๚ะ": "\u0E5Aะ",
}

emoji_sentiment = {
"positive": [
"😊", "😁", "😂", "🤣", "😄", "😍", "😘", "😻", "👍", "👏", "💕", "❤️", "😇", "😎", "🥰", "😃", "☺️"
],

"negative": [
"😢", "😭", "😠", "😡", "😤", "👎", "💔", "😞", "😖", "😩", "😣", "😫", "😓", "😰", "😱", "😿"
],

"neutral": [
"😐", "😶", "🤔", "😑", "😬", "😴", "😕", "😒", "🙄", "😮", "🤨", "😲"
]
}

def replace_emoji_with_sentiment(sentence: str, emoji_dict: dict) -> str:
for emo in emoji_dict["positive"]:
sentence = sentence.replace(emo, " <<EMO_POS>> ")
for emo in emoji_dict["negative"]:
sentence = sentence.replace(emo, " <<EMO_NEG>> ")
for emo in emoji_dict["neutral"]:
sentence = sentence.replace(emo, " <<EMO_NEU>> ")
return sentence
Comment on lines +53 to +74
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
emoji_sentiment = {
"positive": [
"😊", "😁", "😂", "🤣", "😄", "😍", "😘", "😻", "👍", "👏", "💕", "❤️", "😇", "😎", "🥰", "😃", "☺️"
],
"negative": [
"😢", "😭", "😠", "😡", "😤", "👎", "💔", "😞", "😖", "😩", "😣", "😫", "😓", "😰", "😱", "😿"
],
"neutral": [
"😐", "😶", "🤔", "😑", "😬", "😴", "😕", "😒", "🙄", "😮", "🤨", "😲"
]
}
def replace_emoji_with_sentiment(sentence: str, emoji_dict: dict) -> str:
for emo in emoji_dict["positive"]:
sentence = sentence.replace(emo, " <<EMO_POS>> ")
for emo in emoji_dict["negative"]:
sentence = sentence.replace(emo, " <<EMO_NEG>> ")
for emo in emoji_dict["neutral"]:
sentence = sentence.replace(emo, " <<EMO_NEU>> ")
return sentence
emoji_sentiment = {
"POS": [
"😊", "😁", "😂", "🤣", "😄", "😍", "😘", "😻", "👍", "👏", "💕", "❤️", "😇", "😎", "🥰", "😃", "☺️"
],
"NEG": [
"😢", "😭", "😠", "😡", "😤", "👎", "💔", "😞", "😖", "😩", "😣", "😫", "😓", "😰", "😱", "😿"
],
"NEU": [
"😐", "😶", "🤔", "😑", "😬", "😴", "😕", "😒", "🙄", "😮", "🤨", "😲"
]
}
# Create an emoji-sentiment map from `emoji_sentiment`
emoji_to_tag = {}
for sentiment, emojis in emoji_sentiment.items():
tag = f" <<EMO_{sentiment}>> "
for emo in emojis:
emoji_to_tag[emo] = tag
# Alternatively, we can have the static map predefined.
def replace_emoji_with_sentiment(sentence: str) -> str:
for emo, tag in emoji_to_tag.items():
sentence = sentence.replace(emo, tag)
return sentence

We can also try to reduce the number of text passes from three to one by having an emoji-sentiment map.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! would it be okay if I add this code to the text normalization function?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please take it and use it in any way you want :)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I think this is the same as emojiconv.py


def remove_thai_special_chars(text: str, pattern : str) -> str:
return re.sub(pattern, '', text)

def normalize_text(
text: str,
emoji_dict: dict = None,
url_mode: str = "token", # "token", "keep", "remove"
replace_url_token: str = "<<URL>>"
) -> str:
import re

def extract_tokens(text: str) -> tuple[str, list[str]]:
pattern = r'<<[^<>]+>>'
tokens = re.findall(pattern, text)
text_wo_tokens = re.sub(pattern, '', text)
return text_wo_tokens, tokens

def insert_tokens(text: str, tokens: list[str]) -> str:
return (text + ' ' + ' '.join(tokens)).strip()

text = text.lower()

url_pattern = r'https?://\S+|www\.\S+'

if url_mode == "token":
text = re.sub(url_pattern, replace_url_token, text)
elif url_mode == "remove":
text = re.sub(url_pattern, '', text)

text = re.sub(r'@\w+', '<<MENTION>>', text)
text = re.sub(r'#([\wก-๙]+)', r'<<HASHTAG_\1>>', text)

# Emoji → Sentiment token
if emoji_dict:
text = replace_emoji_with_sentiment(sentence=text, emoji_dict=emoji_dict)

# แยก token ก่อน clean
text_wo_tokens, tokens = extract_tokens(text)

# Clean ส่วนที่ไม่ใช่ token
text_wo_tokens = re.sub(r'[^\w\sก-๙.:/]', '', text_wo_tokens)

# ลบอักขระพิเศษเฉพาะไทย
th_special_chars_to_remove = [k for k in thai_special_chars_unicode if len(k) == 1 and k != 'ะ']
pattern = '[' + ''.join(re.escape(c) for c in th_special_chars_to_remove) + ']'
text_wo_tokens = re.sub(pattern, '', text_wo_tokens)

# Normalize คำยืด
text_wo_tokens = re.sub(r'(.)\1{2,}', r'\1\1', text_wo_tokens)
text_wo_tokens = re.sub(r'\s+', ' ', text_wo_tokens).strip()

return insert_tokens(text_wo_tokens, tokens)

print("Original")
print(test)

print("แบบเก็บ URL เต็ม:")
print(normalize_text(test, emoji_dict=emoji_sentiment, url_mode="keep"))

print("แบบแปลงเป็น <<URL>>:")
print(normalize_text(test, emoji_dict=emoji_sentiment, url_mode="token"))

print("แบบลบ URL ทิ้ง:")
print(normalize_text(test, emoji_dict=emoji_sentiment, url_mode="remove"))
1 change: 1 addition & 0 deletions pythainlp/text_cleaning/nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .token_utils import extract_tokens, insert_tokens
3 changes: 3 additions & 0 deletions pythainlp/text_cleaning/nlp/token_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import re
from typing import List, Tuple

Loading