Project Overview
Difficulty: Advanced
Goal: Analyze sentiment (positive/negative/neutral) in text
Applications: Social media monitoring, customer feedback, product reviews
Time Required: 4-5 hours
What You'll Learn
- Text preprocessing and cleaning
- Feature extraction (TF-IDF, word embeddings)
- Building and training sentiment classifiers
- Handling imbalanced datasets
- Model evaluation metrics
- Real-time sentiment prediction
Step 1: Setup and Data Loading
# sentiment_analysis.py
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Sample dataset (in practice, use IMDB, Twitter, or Amazon reviews)
data = pd.DataFrame({
'text': [
"I absolutely love this product! Best purchase ever!",
"Terrible experience. Would not recommend to anyone.",
"It's okay, nothing special but works as expected.",
"Amazing quality and fast shipping. Very satisfied!",
"Waste of money. Poor quality and broke after one use.",
"Decent product for the price. No complaints.",
"Fantastic! Exceeded all my expectations!",
"Horrible customer service. Very disappointed.",
"Good value but could be better.",
"Outstanding! Will definitely buy again!"
],
'sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative',
'neutral', 'positive', 'negative', 'neutral', 'positive']
})
print("Dataset:")
print(data)
print("\nSentiment distribution:")
print(data['sentiment'].value_counts())
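The 10-row DataFrame above only keeps the walkthrough self-contained. To run the same pipeline on a real corpus, swap it for a labeled CSV; the sketch below is a minimal, optional pattern in which reviews.csv and its column names are placeholders to adjust to your own file.
# Optional: load a real labeled dataset instead of the toy DataFrame above
import os

if os.path.exists('reviews.csv'):
    data = pd.read_csv('reviews.csv')                   # expects 'text' and 'sentiment' columns
    data = data.dropna(subset=['text', 'sentiment'])    # drop incomplete rows
    print(data['sentiment'].value_counts())             # check the class balance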
Step 2: Text Preprocessing
# Text cleaning and preprocessing
class TextPreprocessor:
def __init__(self):
self.lemmatizer = WordNetLemmatizer()
self.stop_words = set(stopwords.words('english'))
def clean_text(self, text):
"""Clean and preprocess text"""
# Convert to lowercase
text = text.lower()
# Remove URLs
text = re.sub(r'http\S+|www\S+|https\S+', '', text)
# Remove mentions and hashtags
text = re.sub(r'@\w+|#\w+', '', text)
# Remove special characters and digits
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
def tokenize_and_lemmatize(self, text):
"""Tokenize and lemmatize text"""
words = text.split()
# Remove stopwords and lemmatize
words = [self.lemmatizer.lemmatize(word)
for word in words
if word not in self.stop_words]
return ' '.join(words)
def preprocess(self, text):
"""Complete preprocessing pipeline"""
text = self.clean_text(text)
text = self.tokenize_and_lemmatize(text)
return text
# Apply preprocessing
preprocessor = TextPreprocessor()
data['cleaned_text'] = data['text'].apply(preprocessor.preprocess)
print("\nPreprocessed data:")
print(data[['text', 'cleaned_text']].head())
Step 3: Feature Extraction
# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
# Fit and transform the text
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['sentiment']
print(f"\nFeature matrix shape: {X.shape}")
print(f"Number of unique words: {len(vectorizer.get_feature_names_out())}")
# Show some features
feature_names = vectorizer.get_feature_names_out()
print(f"\nSample features: {list(feature_names[:10])}")
# Split data
# With only 10 toy samples, a stratified split needs at least one test sample
# per class (3 classes), so test_size=0.3 here; ~0.2 is typical on a real dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
Step 4: Train Multiple Models
# Train and compare different models
models = {
'Naive Bayes': MultinomialNB(),
'Logistic Regression': LogisticRegression(max_iter=1000),
'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}
results = {}
for name, model in models.items():
# Train
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
results[name] = accuracy
print(f"\n{'='*50}")
print(f"{name}")
print(f"{'='*50}")
print(f"Accuracy: {accuracy:.2%}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Compare models
print("\nModel Comparison:")
for name, acc in results.items():
print(f"{name}: {acc:.2%}")
# Select best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name}")
Step 5: Visualize Results
# Confusion Matrix
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=best_model.classes_,
yticklabels=best_model.classes_)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()
# Model comparison bar chart
plt.figure(figsize=(10, 6))
models_list = list(results.keys())
accuracies = list(results.values())
plt.bar(models_list, accuracies, color=['#2563eb', '#7c3aed', '#10b981'])
plt.title('Model Performance Comparison')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
for i, v in enumerate(accuracies):
plt.text(i, v + 0.02, f'{v:.2%}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()
# Feature importance (for Logistic Regression)
if best_model_name == 'Logistic Regression':
    # Average the absolute coefficients across all three classes, so the ranking
    # is not tied to a single (alphabetically first) class
    importance = np.abs(best_model.coef_).mean(axis=0)
    feature_importance = pd.DataFrame({
        'feature': vectorizer.get_feature_names_out(),
        'importance': importance
    }).sort_values('importance', ascending=False).head(20)
plt.figure(figsize=(10, 8))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance')
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()
Step 6: Real-time Prediction Function
# Create prediction function
def predict_sentiment(text, model, vectorizer, preprocessor):
"""Predict sentiment of new text"""
# Preprocess
cleaned = preprocessor.preprocess(text)
# Vectorize
features = vectorizer.transform([cleaned])
# Predict
sentiment = model.predict(features)[0]
probabilities = model.predict_proba(features)[0]
# Get probability for predicted class
class_idx = list(model.classes_).index(sentiment)
confidence = probabilities[class_idx]
return sentiment, confidence
# Test with new examples
test_texts = [
"This is absolutely amazing! I love it so much!",
"Terrible product. Very disappointed with the quality.",
"It's alright, nothing to complain about.",
"Best thing I've ever bought! Highly recommend!",
"Not worth the money. Save yourself the trouble."
]
print("\n" + "="*60)
print("SENTIMENT PREDICTIONS")
print("="*60)
for text in test_texts:
sentiment, confidence = predict_sentiment(
text, best_model, vectorizer, preprocessor
)
# Emoji mapping
emoji_map = {
'positive': '😊',
'negative': '😞',
'neutral': '😐'
}
print(f"\nText: {text}")
print(f"Sentiment: {sentiment} {emoji_map[sentiment]}")
print(f"Confidence: {confidence:.2%}")
Step 7: Advanced - Using BERT (Optional)
# Advanced: Use pre-trained BERT model
# pip install transformers torch
from transformers import pipeline
# Load a pre-trained sentiment analysis pipeline
# Note: this checkpoint is binary (POSITIVE/NEGATIVE); unlike the TF-IDF models
# above, it does not predict a 'neutral' class
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)
def predict_with_bert(text):
"""Predict sentiment using BERT"""
result = sentiment_pipeline(text)[0]
return result['label'], result['score']
# Test BERT model
print("\n" + "="*60)
print("BERT PREDICTIONS")
print("="*60)
for text in test_texts:
label, score = predict_with_bert(text)
print(f"\nText: {text}")
print(f"Sentiment: {label}")
print(f"Confidence: {score:.2%}")
Step 8: Save and Deploy Model
# Save the model and components
import pickle
# Save model
with open('sentiment_model.pkl', 'wb') as f:
pickle.dump(best_model, f)
# Save vectorizer
with open('vectorizer.pkl', 'wb') as f:
pickle.dump(vectorizer, f)
# Save preprocessor
with open('preprocessor.pkl', 'wb') as f:
pickle.dump(preprocessor, f)
print("Model saved successfully!")
# Load and use later
def load_sentiment_analyzer():
"""Load saved model"""
with open('sentiment_model.pkl', 'rb') as f:
model = pickle.load(f)
with open('vectorizer.pkl', 'rb') as f:
vectorizer = pickle.load(f)
with open('preprocessor.pkl', 'rb') as f:
preprocessor = pickle.load(f)
return model, vectorizer, preprocessor
# Test loading
loaded_model, loaded_vectorizer, loaded_preprocessor = load_sentiment_analyzer()
test_text = "This is fantastic!"
sentiment, conf = predict_sentiment(
test_text, loaded_model, loaded_vectorizer, loaded_preprocessor
)
print(f"\nLoaded model prediction: {sentiment} ({conf:.2%})")
Step 9: Build Web API (Flask)
# sentiment_api.py
from flask import Flask, request, jsonify
# Helpers are defined in sentiment_analysis.py; TextPreprocessor must be importable
# for pickle to load preprocessor.pkl (ideally move these into their own module so
# importing them does not re-run the training script)
from sentiment_analysis import TextPreprocessor, load_sentiment_analyzer, predict_sentiment

app = Flask(__name__)
# Load the model and its components once, at startup
model, vectorizer, preprocessor = load_sentiment_analyzer()
@app.route('/predict', methods=['POST'])
def predict():
"""API endpoint for sentiment prediction"""
try:
data = request.json
text = data.get('text', '')
if not text:
return jsonify({'error': 'No text provided'}), 400
sentiment, confidence = predict_sentiment(
text, model, vectorizer, preprocessor
)
return jsonify({
'text': text,
'sentiment': sentiment,
'confidence': float(confidence)
})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/batch_predict', methods=['POST'])
def batch_predict():
"""Batch prediction endpoint"""
try:
data = request.json
texts = data.get('texts', [])
results = []
for text in texts:
sentiment, confidence = predict_sentiment(
text, model, vectorizer, preprocessor
)
results.append({
'text': text,
'sentiment': sentiment,
'confidence': float(confidence)
})
return jsonify({'results': results})
except Exception as e:
return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
app.run(debug=True, port=5000)
# Test API:
# curl -X POST http://localhost:5000/predict \
# -H "Content-Type: application/json" \
# -d '{"text": "This is amazing!"}'
Real-World Applications
Industry Use Cases
- Social Media: Monitor brand sentiment on Twitter, Facebook
- E-commerce: Analyze product reviews and ratings
- Customer Service: Prioritize urgent negative feedback
- Market Research: Understand consumer opinions
- Finance: Analyze news sentiment for stock predictions
- Politics: Gauge public opinion on policies
Enhancement Ideas
- 🎯 Add emotion detection (joy, anger, sadness, fear)
- 🎯 Implement aspect-based sentiment (analyze specific features)
- 🎯 Add multi-language support
- 🎯 Handle sarcasm and irony detection
- 🎯 Build real-time Twitter sentiment tracker
- 🎯 Create sentiment trends dashboard
- 🎯 Add context awareness for better accuracy
- 🎯 Implement active learning for continuous improvement
Dataset Resources
- IMDB Reviews: 50K movie reviews for binary sentiment
- Twitter Sentiment140: 1.6M tweets
- Amazon Reviews: Millions of product reviews
- Yelp Reviews: Restaurant reviews with ratings
- Stanford Sentiment Treebank: Fine-grained sentiment
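As a concrete starting point, the IMDB set can be pulled with the Hugging Face datasets library; a minimal loading sketch (assumes pip install datasets; IMDB labels are 0 = negative and 1 = positive, so there is no neutral class):
# Load the IMDB review dataset (50K reviews, binary labels)
from datasets import load_dataset

imdb = load_dataset("imdb")
print(imdb)  # DatasetDict with 'train', 'test' (and 'unsupervised') splits

df = imdb["train"].to_pandas()
df["sentiment"] = df["label"].map({0: "negative", 1: "positive"})
print(df[["text", "sentiment"]].head())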