"""
Title: Sentiment analysis with LSTM on the IMDB dataset
Author: Madhur Jain
Date created: 2025/11/19
Last modified: 2025/11/19
Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews.
"""

"""
## Introduction

LSTM stands for Long Short-Term Memory: while making predictions, the layer
keeps not only a short-term memory but also a long-term memory.

An LSTM uses sigmoid and tanh activation functions:

- the sigmoid function squashes values into the range 0 to 1;
- the tanh function squashes values into the range -1 to 1.

This design keeps the gradient of the long-term memory from vanishing and the
short-term memory from exploding during gradient descent.

An LSTM unit works in 3 stages:

- 1st stage: determines what percentage of the long-term memory is remembered.
  This is called the *forget gate*.
- 2nd stage: determines how the long-term memory is updated.
  This is called the *input gate*.
- 3rd stage: updates the short-term memory, which is also the output of the
  whole unit. This is called the *output gate*.

If you want to learn more, I recommend watching the lectures of the Stanford
Online "Statistical analysis with Python" course, available for free on YouTube.
"""

import json
import os
from zipfile import ZipFile

# The backend has to be selected before Keras is imported.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.models import Sequential
from keras.utils import pad_sequences

# Size of the vocabulary kept by the tokenizer; must match the embedding's
# `input_dim`.
VOCAB_SIZE = 5000
# Every review is padded/truncated to this many tokens; must match the model's
# input length.
MAX_LEN = 200

"""
## Load the dataset

Get `kaggle.json` from your Kaggle account: Settings -> Create New Token.
"""

# Convert the JSON credentials file into a Python dictionary.
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

# Expose the Kaggle credentials as environment variables.
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

# NOTE: this script does not download the archive itself; it expects
# "imdb-dataset-of-50k-movie-reviews.zip" to already be present in the working
# directory (for example, downloaded from Kaggle beforehand).

# Unzip the dataset file into the current working directory.
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
    zip_ref.extractall()

# Load the dataset from where it was just extracted.
data = pd.read_csv("IMDB Dataset.csv")

print(data.shape)

data.info()

print(data.head())

print(data["sentiment"].value_counts())

# Encode the labels as integers: positive -> 1, negative -> 0.
data["sentiment"] = data["sentiment"].map({"positive": 1, "negative": 0})

print(data.head())

print(data["sentiment"].value_counts())


"""
## Splitting into training and test sets
"""

# Hold out a random 20% of the rows for testing. The remaining 80% are shuffled
# as well, because `validation_split` in `fit()` takes the *last* rows as-is.
test_data = data.sample(frac=0.2, random_state=42)
train_data = data.drop(test_data.index).sample(frac=1.0, random_state=42)

print(train_data.shape)
print(test_data.shape)


"""
## Data processing
"""

# Text has to be tokenized (in short: words are converted to integers) before
# it can be fed to the model. The vocabulary is fitted on the training set only.
# NOTE(review): `Tokenizer` is a legacy utility reached through `tf.keras` here;
# confirm it is available in the installed TensorFlow/Keras version.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_data["review"])
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_data["review"]), maxlen=MAX_LEN
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_data["review"]), maxlen=MAX_LEN
)

print(X_train)
print(X_test)

Y_train = train_data["sentiment"].to_numpy()
Y_test = test_data["sentiment"].to_numpy()

print(Y_train)
print(Y_test)


"""
## LSTM (Long Short-Term Memory) model
"""

# Build the model: embedding -> LSTM -> single sigmoid unit.
model = Sequential(
    [
        keras.Input(shape=(MAX_LEN,)),
        layers.Embedding(input_dim=VOCAB_SIZE, output_dim=128),
        layers.LSTM(128, dropout=0.2),
        layers.Dense(1, activation="sigmoid"),
    ]
)

model.summary()

# Compile the model.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

"""
## Training the model
"""

model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

"""
## Model evaluation
"""

loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

"""
### Predicting values
"""


def predict_sentiment(review):
    """Classify a single review as "positive" or "negative".

    The review is tokenized with the fitted `tokenizer`, padded to `MAX_LEN`
    tokens and passed through `model`.

    Args:
        review: The review text to classify.

    Returns:
        `"positive"` if the model output is greater than 0.5,
        `"negative"` otherwise.
    """
    # Tokenize and pad the review exactly like the training data.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LEN)
    prediction = model.predict(padded_sequence)
    return "positive" if prediction[0][0] > 0.5 else "negative"


# Examples

example_reviews = [
    "This movie was fantastic. I loved it.",
    "This movie was not that good",
    "Great movie but could have added a better action scene",
    "Mid movie",
    "I laughing while shitting damn what a watch",
]

for new_review in example_reviews:
    sentiment = predict_sentiment(new_review)
    print(f"The sentiment of the review is: {sentiment}")