💬 Natural Language Processing Basics

Text Processing, Embeddings, and Transformers

What is NLP?

Natural Language Processing (NLP) enables computers to understand, interpret, and generate human language. It powers chatbots, translation services, sentiment analysis, and AI assistants like Siri and Alexa. NLP bridges the gap between human communication and machine understanding.

Real-World Applications:

  • Virtual assistants: Siri, Alexa, Google Assistant
  • Translation: Google Translate, DeepL
  • Chatbots: Customer service automation
  • Content moderation: Detecting spam, hate speech
  • Healthcare: Analyzing medical records, drug discovery
  • Finance: Sentiment analysis of news, reports

šŸ“ Text Preprocessing

import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK data
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Sample text
text = """Natural Language Processing is amazing! It helps computers understand 
human language. NLP has many applications: chatbots, translation, and more."""

print("Original text:")
print(text)
print()

# 1. Lowercase conversion
text_lower = text.lower()
print("1. Lowercase:")
print(text_lower[:50] + "...")
print()

# 2. Remove special characters and numbers
text_clean = re.sub(r'[^a-zA-Z\s]', '', text_lower)
print("2. Remove special chars:")
print(text_clean[:50] + "...")
print()

# 3. Tokenization (split into words)
tokens = word_tokenize(text_clean)
print(f"3. Tokenization: {len(tokens)} tokens")
print(tokens[:10])
print()

# 4. Remove stopwords (common words like 'is', 'the', 'a')
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]
print(f"4. Remove stopwords: {len(filtered_tokens)} tokens remaining")
print(filtered_tokens[:10])
print()

# 5. Stemming (reduce to root form)
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in filtered_tokens]
print("5. Stemming:")
print(stemmed[:10])
print()

# 6. Lemmatization (smarter than stemming)
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("6. Lemmatization:")
print(lemmatized[:10])
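
# Side-by-side comparison on a few illustrative words (a quick sketch; these
# words are not from the sample text, they're chosen to show the difference:
# stemming chops suffixes, lemmatization maps to a dictionary form)
print("\nStemming vs. lemmatization:")
for word in ['studies', 'running', 'feet']:
    print(f"  {word}: stem={stemmer.stem(word)}, lemma={lemmatizer.lemmatize(word)}")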

# Complete preprocessing pipeline
def preprocess_text(text):
    """Complete text preprocessing"""
    
    # Lowercase
    text = text.lower()
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

# Use pipeline
processed = preprocess_text(text)
print(f"\nāœ“ Final processed text ({len(processed)} tokens):")
print(' '.join(processed))

🔤 Text Representation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Sample documents
documents = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love deep learning",
    "Deep learning and machine learning are related"
]

# 1. Bag of Words (BoW)
print("1. Bag of Words Representation:")
print("="*50)

bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(documents)

print("Vocabulary:", bow_vectorizer.get_feature_names_out())
print("\nBoW Matrix:")
print(bow_matrix.toarray())
print()

# 2. TF-IDF (Term Frequency-Inverse Document Frequency)
print("2. TF-IDF Representation:")
print("="*50)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())
print()

# TF-IDF gives higher scores to:
# - Words that appear frequently in a document (TF)
# - Words that are rare across all documents (IDF)

print("TF-IDF scores interpretation:")
print("  • High score: Important word for this document")
print("  • Low score: Common word across documents")
print("  • Zero: Word doesn't appear in document")

🌟 Word Embeddings

# Word embeddings capture semantic meaning

from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Sample sentences for training
sentences = [
    ['king', 'queen', 'man', 'woman'],
    ['paris', 'france', 'london', 'england'],
    ['cat', 'kitten', 'dog', 'puppy'],
    ['good', 'better', 'bad', 'worse'],
    ['big', 'bigger', 'small', 'smaller']
]

# Train Word2Vec model
model = Word2Vec(
    sentences,
    vector_size=50,    # Dimension of word vectors
    window=2,          # Context window size
    min_count=1,       # Minimum word frequency
    workers=4
)

print("Word2Vec Model Trained!")
print(f"Vocabulary size: {len(model.wv)}")
print()

# Get word vector
king_vector = model.wv['king']
print(f"'king' vector (first 10 dims): {king_vector[:10]}")
print()

# Find similar words
print("Words similar to 'king':")
similar = model.wv.most_similar('king', topn=3)
for word, similarity in similar:
    print(f"  {word}: {similarity:.4f}")
print()

# Famous word analogy: king - man + woman ≈ queen
# (a toy corpus this small may not reproduce it; pre-trained vectors do)
result = model.wv.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=1
)
print(f"king - man + woman = {result[0][0]}")
print()

# Visualize embeddings with PCA
def visualize_embeddings(model, words):
    """Plot word embeddings in 2D"""
    
    # Get word vectors
    word_vectors = np.array([model.wv[word] for word in words if word in model.wv])
    words = [word for word in words if word in model.wv]
    
    # Reduce to 2D
    pca = PCA(n_components=2)
    vectors_2d = pca.fit_transform(word_vectors)
    
    # Plot
    plt.figure(figsize=(10, 8))
    plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], alpha=0.5)
    
    for i, word in enumerate(words):
        plt.annotate(word, (vectors_2d[i, 0], vectors_2d[i, 1]))
    
    plt.title('Word Embeddings Visualization')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid(True)
    plt.show()

# visualize_embeddings(model, model.wv.index_to_key)

Pre-trained Word Embeddings:

# Use pre-trained embeddings (trained on billions of words)

# Popular pre-trained embeddings:
embeddings_info = {
    'Word2Vec (Google)': {
        'Size': '3M words, 300 dims',
        'Trained on': 'Google News',
        'Download': 'gensim-data'
    },
    'GloVe (Stanford)': {
        'Size': '400K-2M words, 50-300 dims',
        'Trained on': 'Wikipedia, Twitter',
        'Download': 'nlp.stanford.edu/projects/glove'
    },
    'FastText (Facebook)': {
        'Size': '1M-2M words, 300 dims',
        'Trained on': 'Wikipedia, Common Crawl',
        'Download': 'fasttext.cc'
    }
}

print("Pre-trained Word Embeddings:")
for name, info in embeddings_info.items():
    print(f"\n{name}:")
    for key, value in info.items():
        print(f"  {key}: {value}")

# Using pre-trained embeddings
"""
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True
)

# Now use it!
vector = model['computer']
similar = model.most_similar('computer', topn=5)
"""

🤖 Text Classification

# Sentiment analysis example

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Sample dataset (normally use larger dataset like IMDB reviews)
texts = [
    "This movie was fantastic! I loved it.",
    "Terrible film, waste of time.",
    "Amazing performance by the actors!",
    "Boring and predictable plot.",
    "One of the best movies I've ever seen!",
    "Disappointing and poorly made.",
    "Incredible storytelling and visuals.",
    "Not worth watching, very bad.",
]

labels = [1, 0, 1, 0, 1, 0, 1, 0]  # 1 = positive, 0 = negative

# Vectorize text
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(texts)
y = np.array(labels)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Train classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict
y_pred = classifier.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")
print()

# Test on new text
new_texts = [
    "This is absolutely wonderful!",
    "I really disliked this movie."
]

new_X = vectorizer.transform(new_texts)
predictions = classifier.predict(new_X)

print("Predictions on new text:")
for text, pred in zip(new_texts, predictions):
    sentiment = "Positive" if pred == 1 else "Negative"
    print(f"  '{text}' → {sentiment}")

🧠 Deep Learning for NLP

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data
texts = [
    "I love this product",
    "This is terrible",
    "Amazing quality",
    "Very disappointed",
    "Highly recommend",
    "Waste of money"
]
labels = [1, 0, 1, 0, 1, 0]  # 1 = positive, 0 = negative

# Tokenize text
tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)

# Convert to sequences
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=10, padding='post')

print("Original text:", texts[0])
print("Tokenized:", sequences[0])
print("Padded:", padded[0])
print()

# Build LSTM model
model = keras.Sequential([
    layers.Embedding(input_dim=1000, output_dim=16, input_length=10),
    layers.LSTM(32, return_sequences=True),
    layers.LSTM(16),
    layers.Dense(24, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# LSTM captures sequential information
# - Remembers context from earlier words
# - Understands word order matters
# - Better than BoW/TF-IDF for complex tasks

print("\nLSTM advantages:")
print("  • Understands word order")
print("  • Captures long-range dependencies")
print("  • Handles variable-length input")
print("  • Better for complex tasks")

🔄 Transformers & Modern NLP

# Transformers revolutionized NLP

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Easy way: Use pipelines
print("1. Using Hugging Face Pipelines:")
print("="*50)

# Sentiment analysis
sentiment_analyzer = pipeline("sentiment-analysis")

texts = [
    "I absolutely love this!",
    "This is the worst thing ever.",
    "It's okay, nothing special."
]

for text in texts:
    result = sentiment_analyzer(text)[0]
    print(f"Text: {text}")
    print(f"  Sentiment: {result['label']}, Score: {result['score']:.4f}\n")

# Other pipeline tasks
print("\nAvailable Pipeline Tasks:")
tasks = [
    'sentiment-analysis',
    'text-generation',
    'question-answering',
    'summarization',
    'translation',
    'ner (named entity recognition)',
    'fill-mask',
    'zero-shot-classification'
]

for task in tasks:
    print(f"  • {task}")

print()
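
# Quick sketch of one more pipeline from the list: zero-shot classification
# accepts candidate labels at inference time (labels below are made up;
# a default model is downloaded on first use)
zero_shot = pipeline("zero-shot-classification")
result = zero_shot(
    "The new phone has an impressive camera and battery life.",
    candidate_labels=["technology", "sports", "politics"]
)
print(f"Top label: {result['labels'][0]} (score: {result['scores'][0]:.4f})")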

# Advanced: Load specific model
print("2. Using Specific Models:")
print("="*50)

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize input
text = "This is an amazing product!"
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Get predictions (the PyTorch model returns raw logits; softmax → probabilities)
with torch.no_grad():
    outputs = model(**inputs)
predictions = torch.softmax(outputs.logits, dim=-1)[0].numpy()

labels = ['Negative', 'Positive']
for label, score in zip(labels, predictions):
    print(f"  {label}: {score:.4f}")

print("\nāœ“ Transformers are the current state-of-the-art for NLP!")

Popular Transformer Models:

# Overview of popular models

models = {
    'BERT': {
        'Full name': 'Bidirectional Encoder Representations from Transformers',
        'Strength': 'Understanding context (both directions)',
        'Best for': 'Classification, Q&A, NER',
        'Variants': 'RoBERTa, DistilBERT, ALBERT'
    },
    'GPT': {
        'Full name': 'Generative Pre-trained Transformer',
        'Strength': 'Text generation',
        'Best for': 'Writing, completion, chat',
        'Variants': 'GPT-2, GPT-3, GPT-4'
    },
    'T5': {
        'Full name': 'Text-to-Text Transfer Transformer',
        'Strength': 'Versatile (all tasks as text-to-text)',
        'Best for': 'Translation, summarization, Q&A',
        'Variants': 'mT5 (multilingual), ByT5 (byte-level)'
    },
    'XLNet': {
        'Full name': 'Generalized Autoregressive Pretraining (builds on Transformer-XL)',
        'Strength': 'Permutation language modeling; outperforms BERT on several benchmarks',
        'Best for': 'Long documents, context understanding',
        'Variants': 'Transformer-XL (predecessor)'
    }
}

print("Transformer Models Overview:")
for name, info in models.items():
    print(f"\n{name}:")
    for key, value in info.items():
        print(f"  {key}: {value}")

🚀 Complete NLP Project

# End-to-end sentiment analysis project

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import pipeline

# Load dataset (or use your own)
# Using 20 newsgroups as example
# data = fetch_20newsgroups(subset='train', categories=['sci.space', 'rec.sport.hockey'])

# For this example, let's use custom data
texts = [
    "The product quality is outstanding",
    "Terrible customer service",
    "Best purchase I've made",
    "Complete waste of money",
    "Exceeded my expectations",
    "Very dissatisfied"
] * 10  # Repeat for larger dataset

labels = [1, 0, 1, 0, 1, 0] * 10

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print()

# Method 1: Traditional ML
print("Method 1: TF-IDF + Logistic Regression")
print("="*50)

vectorizer = TfidfVectorizer(max_features=100)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train_vec, y_train)

accuracy_ml = clf.score(X_test_vec, y_test)
print(f"Accuracy: {accuracy_ml:.2%}\n")

# Method 2: Transformer model
print("Method 2: Pre-trained Transformer")
print("="*50)

classifier = pipeline("sentiment-analysis")

correct = 0
for text, true_label in zip(X_test, y_test):
    result = classifier(text)[0]
    pred_label = 1 if result['label'] == 'POSITIVE' else 0
    if pred_label == true_label:
        correct += 1

accuracy_transformer = correct / len(X_test)
print(f"Accuracy: {accuracy_transformer:.2%}\n")

# Compare methods
print("Comparison:")
print(f"  Traditional ML: {accuracy_ml:.2%}")
print(f"  Transformer: {accuracy_transformer:.2%}")
print("\nTransformers usually perform better but are slower!")

# Save model (traditional ML)
import joblib
joblib.dump(clf, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
print("\nāœ“ Model saved!")

🎯 Key Takeaways