💬 NLP with Machine Learning

Text analysis and natural language processing

What is NLP?

Natural Language Processing (NLP) enables computers to understand, interpret, and generate human language. ML powers sentiment analysis, chatbots, translation, and more.

Applications:

  • Sentiment analysis: Customer reviews, social media
  • Text classification: Spam detection, topic categorization
  • Named Entity Recognition: Extract names, locations, dates
  • Machine translation: Language-to-language conversion
  • Question answering: Chatbots and virtual assistants

📝 Text Preprocessing

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download resources (run once)
# nltk.download('punkt')
# nltk.download('punkt_tab')  # needed by word_tokenize on newer NLTK releases
# nltk.download('stopwords')
# nltk.download('wordnet')

text = "The cats are running faster than dogs! They're amazing animals."

# 1. Lowercase
text_lower = text.lower()
print(f"Lowercase: {text_lower}")

# 2. Remove punctuation
text_clean = re.sub(r'[^\w\s]', '', text_lower)
print(f"No punctuation: {text_clean}")

# 3. Tokenization
tokens = word_tokenize(text_clean)
print(f"Tokens: {tokens}")

# 4. Remove stopwords
stop_words = set(stopwords.words('english'))
tokens_filtered = [w for w in tokens if w not in stop_words]
print(f"No stopwords: {tokens_filtered}")

# 5. Stemming (crude suffix chopping; may produce non-words)
stemmer = PorterStemmer()
stems = [stemmer.stem(w) for w in tokens_filtered]
print(f"Stems: {stems}")

# 6. Lemmatization (maps words to their dictionary form)
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(w) for w in tokens_filtered]
print(f"Lemmas: {lemmas}")
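In practice these steps get wrapped into one reusable helper. A minimal sketch (the `preprocess` name and the example call are my own), reusing the `stop_words` and `lemmatizer` objects defined above:

def preprocess(raw_text):
    """Lowercase, strip punctuation, tokenize, drop stopwords, lemmatize."""
    cleaned = re.sub(r'[^\w\s]', '', raw_text.lower())
    words = word_tokenize(cleaned)
    return [lemmatizer.lemmatize(w) for w in words if w not in stop_words]

print(preprocess("The cats are running faster than dogs!"))
# -> ['cat', 'running', 'faster', 'dog'] (the lemmatizer defaults to noun POS)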

🔢 Bag of Words (BoW)

from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "I love machine learning",
    "Machine learning is amazing",
    "I hate spam emails",
    "Python is great for ML"
]

# Create BoW
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

print(f"Vocabulary: {vectorizer.get_feature_names_out()}")
print(f"Shape: {X.shape}")
print(f"\nBoW matrix:")
print(X.toarray())

# With n-grams
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))  # unigrams + bigrams
X_ngram = vectorizer_ngram.fit_transform(documents)
print(f"\nWith bigrams: {X_ngram.shape}")
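Each row of the matrix is just term counts aligned to the learned vocabulary; a rough hand-rolled equivalent with `collections.Counter` (illustrative only, since the real tokenizer also drops punctuation and one-letter tokens):

from collections import Counter

vocab = vectorizer.get_feature_names_out()
for doc in documents:
    counts = Counter(doc.lower().split())           # naive whitespace tokenization
    row = [counts.get(term, 0) for term in vocab]   # align counts to the vocabulary
    print(doc, '->', row)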

📊 TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: Term Frequency - Inverse Document Frequency
# Down-weights words that appear in many documents, up-weights distinctive ones
tfidf = TfidfVectorizer(max_features=100)
X_tfidf = tfidf.fit_transform(documents)

print(f"TF-IDF shape: {X_tfidf.shape}")
print(f"Feature names: {tfidf.get_feature_names_out()}")
print(f"\nTF-IDF matrix:")
print(X_tfidf.toarray())
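As a sanity check, scikit-learn's default IDF (smooth_idf=True) can be reproduced by hand for a single term; a small sketch, with the term choice ('machine') arbitrary:

import numpy as np

n_docs = len(documents)
term = 'machine'
term_idx = list(tfidf.get_feature_names_out()).index(term)

# Default formula: idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
df = sum(term in doc.lower() for doc in documents)   # documents containing the term
idf_manual = np.log((1 + n_docs) / (1 + df)) + 1
print(f"Manual idf('{term}'): {idf_manual:.3f}")
print(f"Vectorizer idf_:      {tfidf.idf_[term_idx]:.3f}")

The final matrix entries are these IDFs times the raw term counts, with each row L2-normalized.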

😊 Sentiment Analysis

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample movie reviews
reviews = [
    ("This movie is excellent!", 1),
    ("I loved it, amazing performance", 1),
    ("Best film I've seen this year", 1),
    ("Terrible movie, waste of time", 0),
    ("Boring and predictable", 0),
    ("Worst movie ever made", 0),
    ("Great acting and plot", 1),
    ("Disappointing and dull", 0)
] * 10  # Repeat for more data

texts, labels = zip(*reviews)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Vectorize
tfidf = TfidfVectorizer(max_features=500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, 
      target_names=['Negative', 'Positive']))

# Predict new review
new_reviews = ["This movie is fantastic!", "I hated this film"]
new_tfidf = tfidf.transform(new_reviews)
predictions = clf.predict(new_tfidf)
for review, pred in zip(new_reviews, predictions):
    sentiment = "Positive" if pred == 1 else "Negative"
    print(f"'{review}' -> {sentiment}")
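Because logistic regression is linear, its coefficients reveal which TF-IDF features pull a review toward each class. A quick inspection sketch:

import numpy as np

feature_names = tfidf.get_feature_names_out()
coefs = clf.coef_[0]                     # single coefficient row for a binary problem

top_pos = np.argsort(coefs)[-5:][::-1]   # words pushing toward 'Positive'
top_neg = np.argsort(coefs)[:5]          # words pushing toward 'Negative'
print("Positive cues:", [feature_names[i] for i in top_pos])
print("Negative cues:", [feature_names[i] for i in top_neg])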

📧 Spam Classification

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Sample spam/ham messages
messages = [
    ("Win free money now!", "spam"),
    ("Meeting tomorrow at 3pm", "ham"),
    ("Claim your prize immediately", "spam"),
    ("Can you review my code?", "ham"),
    ("Congratulations! You won $1000", "spam"),
    ("Dinner plans tonight?", "ham"),
    ("Click here for free iPhone", "spam"),
    ("Project deadline extended", "ham")
] * 15

texts, labels = zip(*messages)
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Pipeline: TF-IDF + Naive Bayes
spam_classifier = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=300)),
    ('clf', MultinomialNB())
])

spam_classifier.fit(X_train, y_train)
print(f"Accuracy: {spam_classifier.score(X_test, y_test):.3f}")

# Test
test_messages = [
    "Free lottery winner, claim now!",
    "Let's schedule a meeting"
]
predictions = spam_classifier.predict(test_messages)
for msg, pred in zip(test_messages, predictions):
    print(f"'{msg}' -> {pred}")
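The pipeline also exposes class probabilities and can be cross-validated directly on the raw texts; a short sketch (5 folds chosen arbitrarily):

from sklearn.model_selection import cross_val_score

# Probability of 'spam' for each test message
spam_idx = list(spam_classifier.classes_).index('spam')
for msg, p in zip(test_messages, spam_classifier.predict_proba(test_messages)):
    print(f"'{msg}' -> P(spam) = {p[spam_idx]:.2f}")

# Cross-validation runs vectorization and fitting inside each fold
scores = cross_val_score(spam_classifier, list(texts), list(labels), cv=5)
print(f"CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")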

🔤 Word Embeddings - Word2Vec

# pip install gensim
from gensim.models import Word2Vec

sentences = [
    ['machine', 'learning', 'is', 'fun'],
    ['deep', 'learning', 'is', 'powerful'],
    ['python', 'is', 'great', 'for', 'machine', 'learning'],
    ['neural', 'networks', 'are', 'amazing']
] * 20  # Repeat for training

# Train Word2Vec
model = Word2Vec(sentences, vector_size=50, window=3, 
                min_count=1, epochs=100)

# Get vector for word
vector = model.wv['machine']
print(f"Vector for 'machine': {vector[:5]}...")  # First 5 dims

# Find similar words
similar = model.wv.most_similar('learning', topn=3)
print(f"\nMost similar to 'learning': {similar}")

# Word arithmetic (analogies). The classic "king - man + woman ≈ queen" needs
# vectors trained on a large corpus; 'king', 'man', and 'woman' are not in this
# toy vocabulary, so we demo the API with words the model has actually seen.
result = model.wv.most_similar(
    positive=['deep', 'networks'],
    negative=['python'],
    topn=1
)
print(f"\ndeep - python + networks = {result}")
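A common trick is to average word vectors into a crude sentence embedding for downstream models; a minimal sketch on the toy model:

import numpy as np

def sentence_vector(words, w2v):
    """Average the vectors of in-vocabulary words (zero vector if none match)."""
    vecs = [w2v.wv[w] for w in words if w in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(w2v.vector_size)

s1 = sentence_vector(['machine', 'learning', 'is', 'fun'], model)
s2 = sentence_vector(['deep', 'learning', 'is', 'powerful'], model)

# Cosine similarity between the two sentence embeddings
cos = np.dot(s1, s2) / (np.linalg.norm(s1) * np.linalg.norm(s2))
print(f"Sentence similarity: {cos:.3f}")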

🏷️ Named Entity Recognition (NER)

import spacy

# Load model: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California on April 1, 1976."

doc = nlp(text)

print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

# Visualize (render returns HTML markup when not running in a notebook)
from spacy import displacy
html = displacy.render(doc, style='ent', jupyter=False)
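The entity objects also carry character offsets, which makes it easy to collect them into a structured form:

# Group recognized entities by label, keeping character offsets
entities = {}
for ent in doc.ents:
    entities.setdefault(ent.label_, []).append((ent.text, ent.start_char, ent.end_char))

for label, items in entities.items():
    print(f"{label}: {items}")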

🎯 Topic Modeling - LDA

from sklearn.decomposition import LatentDirichletAllocation

documents = [
    "Python programming language is great for data science",
    "Machine learning algorithms solve complex problems",
    "Deep learning neural networks are powerful",
    "JavaScript is used for web development",
    "React and Vue are JavaScript frameworks",
    "HTML CSS for frontend development",
    "Scikit-learn library for machine learning",
    "TensorFlow and PyTorch for deep learning"
] * 10

# Vectorize
vectorizer = CountVectorizer(max_features=50, stop_words='english')
X = vectorizer.fit_transform(documents)

# LDA model
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X)

# Display topics
feature_names = vectorizer.get_feature_names_out()
print("Topics:")
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[-5:][::-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
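lda.transform gives the topic mixture of unseen text once it passes through the same fitted vectorizer; a quick sketch (the two example strings are made up):

new_docs = ["Neural networks with PyTorch", "CSS styling for web pages"]
new_X = vectorizer.transform(new_docs)    # reuse the fitted CountVectorizer
topic_dist = lda.transform(new_X)         # one row of topic proportions per document

for doc_text, dist in zip(new_docs, topic_dist):
    print(f"'{doc_text}' -> topic mix: " + ", ".join(f"{p:.2f}" for p in dist))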

🤖 Text Generation

from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

text = "To be or not to be, that is the question. " * 50

# Create character mappings
chars = sorted(set(text))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}

# Prepare sequences
seq_length = 40
sequences = []
next_chars = []

for i in range(len(text) - seq_length):
    sequences.append([char_to_idx[c] for c in text[i:i+seq_length]])
    next_chars.append(char_to_idx[text[i+seq_length]])

X = np.array(sequences)
y = np.array(next_chars)

# One-hot encode
X_encoded = keras.utils.to_categorical(X, num_classes=len(chars))
y_encoded = keras.utils.to_categorical(y, num_classes=len(chars))

# Build model
model = keras.Sequential([
    layers.Input(shape=(seq_length, len(chars))),
    layers.LSTM(128),
    layers.Dense(len(chars), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy')
model.fit(X_encoded, y_encoded, epochs=50, batch_size=128, verbose=0)

# Generate text (short seeds are left-padded with spaces to seq_length,
# since the model expects fixed-length input windows)
def generate_text(model, start_text, length=100):
    generated = start_text
    for _ in range(length):
        seed = generated[-seq_length:].rjust(seq_length)
        x = np.array([[char_to_idx[c] for c in seed]])
        x = keras.utils.to_categorical(x, num_classes=len(chars))
        pred = model.predict(x, verbose=0)[0]
        next_char = idx_to_char[np.argmax(pred)]
        generated += next_char
    return generated

print(generate_text(model, "To be or not to be"))
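Greedy argmax decoding tends to repeat itself; sampling from the softmax with a temperature usually gives more varied output. A sketch (the 0.8 temperature is an arbitrary choice):

def sample_with_temperature(probs, temperature=0.8):
    """Sample a character index after rescaling the predicted distribution."""
    logits = np.log(probs + 1e-8) / temperature
    scaled = np.exp(logits) / np.sum(np.exp(logits))
    return np.random.choice(len(probs), p=scaled)

def generate_text_sampled(model, start_text, length=100, temperature=0.8):
    generated = start_text
    for _ in range(length):
        seed = generated[-seq_length:].rjust(seq_length)
        x = keras.utils.to_categorical(
            np.array([[char_to_idx[c] for c in seed]]), num_classes=len(chars))
        pred = model.predict(x, verbose=0)[0]
        generated += idx_to_char[sample_with_temperature(pred, temperature)]
    return generated

print(generate_text_sampled(model, "To be or not to be"))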

💡 Best Practices

  • Fit vectorizers (CountVectorizer, TfidfVectorizer) on the training set only, then transform the test set
  • Wrap preprocessing and the model in a Pipeline so the same steps run at training and prediction time
  • Start with TF-IDF plus a linear model or Naive Bayes as a baseline before reaching for embeddings or deep learning
  • Prefer lemmatization over stemming when the resulting tokens need to stay readable

🎯 Key Takeaways

  • Text must be converted to numbers (Bag of Words, TF-IDF, embeddings) before any model can use it
  • TF-IDF down-weights words that appear everywhere; Word2Vec captures semantic similarity between words
  • Classic classifiers like Logistic Regression and Naive Bayes handle sentiment and spam tasks well, even on small datasets
  • spaCy provides NER out of the box, LDA uncovers topics, and LSTMs can generate text character by character