Project Overview
Difficulty: Advanced
Goal: Analyze sentiment (positive/negative/neutral) in text
Applications: Social media monitoring, customer feedback, product reviews
Time Required: 4-5 hours
What You'll Learn
- Text preprocessing and cleaning
- Feature extraction (TF-IDF, word embeddings)
- Building and training sentiment classifiers
- Handling imbalanced datasets
- Model evaluation metrics
- Real-time sentiment prediction
Step 1: Setup and Data Loading
# sentiment_analysis.py
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Sample dataset (in practice, use IMDB, Twitter, or Amazon reviews)
data = pd.DataFrame({
'text': [
"I absolutely love this product! Best purchase ever!",
"Terrible experience. Would not recommend to anyone.",
"It's okay, nothing special but works as expected.",
"Amazing quality and fast shipping. Very satisfied!",
"Waste of money. Poor quality and broke after one use.",
"Decent product for the price. No complaints.",
"Fantastic! Exceeded all my expectations!",
"Horrible customer service. Very disappointed.",
"Good value but could be better.",
"Outstanding! Will definitely buy again!"
],
'sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative',
'neutral', 'positive', 'negative', 'neutral', 'positive']
})
print("Dataset:")
print(data)
print("\nSentiment distribution:")
print(data['sentiment'].value_counts())
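The 10-row DataFrame above only keeps the walkthrough self-contained. To run the same pipeline on a real corpus, swap it for a labeled CSV; the sketch below is a minimal, optional pattern in which reviews.csv and its column names are placeholders to adjust to your own file.
# Optional: load a real labeled dataset instead of the toy DataFrame above
import os

if os.path.exists('reviews.csv'):
    data = pd.read_csv('reviews.csv')                   # expects 'text' and 'sentiment' columns
    data = data.dropna(subset=['text', 'sentiment'])    # drop incomplete rows
    print(data['sentiment'].value_counts())             # check the class balance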
Step 2: Text Preprocessing
# Text cleaning and preprocessing
class TextPreprocessor:
def __init__(self):
self.lemmatizer = WordNetLemmatizer()
self.stop_words = set(stopwords.words('english'))
def clean_text(self, text):
"""Clean and preprocess text"""
# Convert to lowercase
text = text.lower()
# Remove URLs
text = re.sub(r'http\S+|www\S+|https\S+', '', text)
# Remove mentions and hashtags
text = re.sub(r'@\w+|#\w+', '', text)
# Remove special characters and digits
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
def tokenize_and_lemmatize(self, text):
"""Tokenize and lemmatize text"""
words = text.split()
# Remove stopwords and lemmatize
words = [self.lemmatizer.lemmatize(word)
for word in words
if word not in self.stop_words]
return ' '.join(words)
def preprocess(self, text):
"""Complete preprocessing pipeline"""
text = self.clean_text(text)
text = self.tokenize_and_lemmatize(text)
return text
# Apply preprocessing
preprocessor = TextPreprocessor()
data['cleaned_text'] = data['text'].apply(preprocessor.preprocess)
print("\nPreprocessed data:")
print(data[['text', 'cleaned_text']].head())
Step 3: Feature Extraction
# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
# Fit and transform the text
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['sentiment']
print(f"\nFeature matrix shape: {X.shape}")
print(f"Number of unique words: {len(vectorizer.get_feature_names_out())}")
# Show some features
feature_names = vectorizer.get_feature_names_out()
print(f"\nSample features: {list(feature_names[:10])}")
# Split data
# With only 10 toy samples, a stratified split needs at least one test sample
# per class (3 classes), so test_size=0.3 here; ~0.2 is typical on a real dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
Step 4: Train Multiple Models
# Train and compare different models
models = {
'Naive Bayes': MultinomialNB(),
'Logistic Regression': LogisticRegression(max_iter=1000),
'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}
results = {}
for name, model in models.items():
# Train
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
results[name] = accuracy
print(f"\n{'='*50}")
print(f"{name}")
print(f"{'='*50}")
print(f"Accuracy: {accuracy:.2%}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Compare models
print("\nModel Comparison:")
for name, acc in results.items():
print(f"{name}: {acc:.2%}")
# Select best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name}")
Step 5: Visualize Results
# Confusion Matrix
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=best_model.classes_,
yticklabels=best_model.classes_)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()
# Model comparison bar chart
plt.figure(figsize=(10, 6))
models_list = list(results.keys())
accuracies = list(results.values())
plt.bar(models_list, accuracies, color=['#2563eb', '#7c3aed', '#10b981'])
plt.title('Model Performance Comparison')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
for i, v in enumerate(accuracies):
plt.text(i, v + 0.02, f'{v:.2%}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()
# Feature importance (for Logistic Regression)
if best_model_name == 'Logistic Regression':
    # Average the absolute coefficients across all three classes, so the ranking
    # is not tied to a single (alphabetically first) class
    importance = np.abs(best_model.coef_).mean(axis=0)
    feature_importance = pd.DataFrame({
        'feature': vectorizer.get_feature_names_out(),
        'importance': importance
    }).sort_values('importance', ascending=False).head(20)
plt.figure(figsize=(10, 8))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance')
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()
Step 6: Real-time Prediction Function
# Create prediction function
def predict_sentiment(text, model, vectorizer, preprocessor):
"""Predict sentiment of new text"""
# Preprocess
cleaned = preprocessor.preprocess(text)
# Vectorize
features = vectorizer.transform([cleaned])
# Predict
sentiment = model.predict(features)[0]
probabilities = model.predict_proba(features)[0]
# Get probability for predicted class
class_idx = list(model.classes_).index(sentiment)
confidence = probabilities[class_idx]
return sentiment, confidence
# Test with new examples
test_texts = [
"This is absolutely amazing! I love it so much!",
"Terrible product. Very disappointed with the quality.",
"It's alright, nothing to complain about.",
"Best thing I've ever bought! Highly recommend!",
"Not worth the money. Save yourself the trouble."
]
print("\n" + "="*60)
print("SENTIMENT PREDICTIONS")
print("="*60)
for text in test_texts:
sentiment, confidence = predict_sentiment(
text, best_model, vectorizer, preprocessor
)
# Emoji mapping
emoji_map = {
'positive': '😊',
'negative': '😞',
'neutral': '😐'
}
print(f"\nText: {text}")
print(f"Sentiment: {sentiment} {emoji_map[sentiment]}")
print(f"Confidence: {confidence:.2%}")
Step 7: Advanced - Using BERT (Optional)
# Advanced: Use pre-trained BERT model
# pip install transformers torch
from transformers import pipeline
# Load a pre-trained sentiment analysis pipeline
# Note: this checkpoint is binary (POSITIVE/NEGATIVE); unlike the TF-IDF models
# above, it does not predict a 'neutral' class
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)
def predict_with_bert(text):
"""Predict sentiment using BERT"""
result = sentiment_pipeline(text)[0]
return result['label'], result['score']
# Test BERT model
print("\n" + "="*60)
print("BERT PREDICTIONS")
print("="*60)
for text in test_texts:
label, score = predict_with_bert(text)
print(f"\nText: {text}")
print(f"Sentiment: {label}")
print(f"Confidence: {score:.2%}")
Step 8: Save and Deploy Model
# Save the model and components
import pickle
# Save model
with open('sentiment_model.pkl', 'wb') as f:
pickle.dump(best_model, f)
# Save vectorizer
with open('vectorizer.pkl', 'wb') as f:
pickle.dump(vectorizer, f)
# Save preprocessor
with open('preprocessor.pkl', 'wb') as f:
pickle.dump(preprocessor, f)
print("Model saved successfully!")
# Load and use later
def load_sentiment_analyzer():
"""Load saved model"""
with open('sentiment_model.pkl', 'rb') as f:
model = pickle.load(f)
with open('vectorizer.pkl', 'rb') as f:
vectorizer = pickle.load(f)
with open('preprocessor.pkl', 'rb') as f:
preprocessor = pickle.load(f)
return model, vectorizer, preprocessor
# Test loading
loaded_model, loaded_vectorizer, loaded_preprocessor = load_sentiment_analyzer()
test_text = "This is fantastic!"
sentiment, conf = predict_sentiment(
test_text, loaded_model, loaded_vectorizer, loaded_preprocessor
)
print(f"\nLoaded model prediction: {sentiment} ({conf:.2%})")
Step 9: Build Web API (Flask)
# sentiment_api.py
from flask import Flask, request, jsonify
# Helpers are defined in sentiment_analysis.py; TextPreprocessor must be importable
# for pickle to load preprocessor.pkl (ideally move these into their own module so
# importing them does not re-run the training script)
from sentiment_analysis import TextPreprocessor, load_sentiment_analyzer, predict_sentiment

app = Flask(__name__)
# Load the model and its components once, at startup
model, vectorizer, preprocessor = load_sentiment_analyzer()
@app.route('/predict', methods=['POST'])
def predict():
"""API endpoint for sentiment prediction"""
try:
data = request.json
text = data.get('text', '')
if not text:
return jsonify({'error': 'No text provided'}), 400
sentiment, confidence = predict_sentiment(
text, model, vectorizer, preprocessor
)
return jsonify({
'text': text,
'sentiment': sentiment,
'confidence': float(confidence)
})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/batch_predict', methods=['POST'])
def batch_predict():
"""Batch prediction endpoint"""
try:
data = request.json
texts = data.get('texts', [])
results = []
for text in texts:
sentiment, confidence = predict_sentiment(
text, model, vectorizer, preprocessor
)
results.append({
'text': text,
'sentiment': sentiment,
'confidence': float(confidence)
})
return jsonify({'results': results})
except Exception as e:
return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
app.run(debug=True, port=5000)
# Test API:
# curl -X POST http://localhost:5000/predict \
# -H "Content-Type: application/json" \
# -d '{"text": "This is amazing!"}'
Real-World Applications
Industry Use Cases
- Social Media: Monitor brand sentiment on Twitter, Facebook
- E-commerce: Analyze product reviews and ratings
- Customer Service: Prioritize urgent negative feedback
- Market Research: Understand consumer opinions
- Finance: Analyze news sentiment for stock predictions
- Politics: Gauge public opinion on policies
Enhancement Ideas
- 🎯 Add emotion detection (joy, anger, sadness, fear)
- 🎯 Implement aspect-based sentiment (analyze specific features)
- 🎯 Add multi-language support
- 🎯 Handle sarcasm and irony detection
- 🎯 Build real-time Twitter sentiment tracker
- 🎯 Create sentiment trends dashboard
- 🎯 Add context awareness for better accuracy
- 🎯 Implement active learning for continuous improvement
Dataset Resources
- IMDB Reviews: 50K movie reviews for binary sentiment
- Twitter Sentiment140: 1.6M tweets
- Amazon Reviews: Millions of product reviews
- Yelp Reviews: Restaurant reviews with ratings
- Stanford Sentiment Treebank: Fine-grained sentiment
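As a concrete starting point, the IMDB set can be pulled with the Hugging Face datasets library; a minimal loading sketch (assumes pip install datasets; IMDB labels are 0 = negative and 1 = positive, so there is no neutral class):
# Load the IMDB review dataset (50K reviews, binary labels)
from datasets import load_dataset

imdb = load_dataset("imdb")
print(imdb)  # DatasetDict with 'train', 'test' (and 'unsupervised') splits

df = imdb["train"].to_pandas()
df["sentiment"] = df["label"].map({0: "negative", 1: "positive"})
print(df[["text", "sentiment"]].head())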