What is Anomaly Detection?
Anomaly detection identifies data points that differ significantly from the majority of the data. It is used in fraud detection, network security, quality control, and system monitoring.
Applications:
- Fraud detection: Credit card fraud, insurance claims
- Network security: Intrusion detection
- Manufacturing: Defect detection
- Healthcare: Disease outbreak detection
- System monitoring: Server performance anomalies
🌳 Isolation Forest
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
# Generate normal data + outliers
X_normal, _ = make_blobs(n_samples=300, centers=1, n_features=2,
                         cluster_std=1, random_state=42)
X_outliers = np.random.uniform(low=-8, high=8, size=(20, 2))
X = np.vstack([X_normal, X_outliers])
# Isolation Forest
iso_forest = IsolationForest(
    contamination=0.1,  # Expected proportion of outliers
    random_state=42,
    n_estimators=100
)
# Fit and predict (-1 for outliers, 1 for inliers)
y_pred = iso_forest.fit_predict(X)
# Get anomaly scores (lower = more anomalous)
scores = iso_forest.score_samples(X)
print(f"Number of anomalies detected: {(y_pred == -1).sum()}")
print(f"Score range: [{scores.min():.3f}, {scores.max():.3f}]")
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X[y_pred == 1, 0], X[y_pred == 1, 1],
            c='blue', label='Normal', alpha=0.6)
plt.scatter(X[y_pred == -1, 0], X[y_pred == -1, 1],
            c='red', label='Anomaly', s=100, marker='x')
plt.title('Isolation Forest Anomaly Detection')
plt.legend()
plt.show()
How It Works:
- Builds random trees by randomly selecting features and split values
- Anomalies are isolated faster (fewer splits are needed to separate them)
- The average path length across trees is used as the anomaly score (see the sketch below)
- Fast and works well in high dimensions
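In the original paper (Liu et al., 2008), the expected path length E[h(x)] is normalized by c(n), the average path length of an unsuccessful binary-search-tree search over n samples, giving the score s(x, n) = 2^(-E[h(x)] / c(n)). A minimal sketch of that normalization; c_factor and isolation_score are illustrative helpers, not part of scikit-learn's API:
def c_factor(n):
    """Average path length of an unsuccessful BST search over n samples."""
    if n <= 1:
        return 0.0
    harmonic = np.log(n - 1) + 0.5772156649  # H(n-1) via Euler-Mascheroni constant
    return 2.0 * harmonic - 2.0 * (n - 1) / n

def isolation_score(avg_path_length, n_samples):
    """Score near 1 -> anomaly; score around 0.5 -> ordinary point."""
    return 2.0 ** (-avg_path_length / c_factor(n_samples))

# A point isolated after ~3 splits in 256-sample trees scores high:
print(isolation_score(3.0, 256))   # ~0.82 -> anomalous
print(isolation_score(10.0, 256))  # ~0.51 -> typical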
🎯 Local Outlier Factor (LOF)
from sklearn.neighbors import LocalOutlierFactor
# LOF: Compares local density of point to neighbors
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.1
)
# Fit and predict
y_pred_lof = lof.fit_predict(X)
# Negative outlier factor (lower = more anomalous)
scores_lof = lof.negative_outlier_factor_
print(f"Number of anomalies: {(y_pred_lof == -1).sum()}")
print(f"LOF scores range: [{scores_lof.min():.3f}, {scores_lof.max():.3f}]")
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X[y_pred_lof == 1, 0], X[y_pred_lof == 1, 1],
            c='blue', label='Normal', alpha=0.6)
plt.scatter(X[y_pred_lof == -1, 0], X[y_pred_lof == -1, 1],
            c='red', label='Anomaly', s=100, marker='x')
plt.title('Local Outlier Factor')
plt.legend()
plt.show()
How It Works:
- Measures how much a point's local density deviates from that of its neighbors
- Compares the density around a point to the density around its k nearest neighbors
- Points in regions sparser than their neighborhoods are flagged as outliers
- Good for data with clusters of varying density (a simplified sketch follows this list)
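The textbook LOF definition can be reproduced in a few lines with NearestNeighbors. This is a simplified sketch; lof_scores is an illustrative helper, not scikit-learn API, and it assumes no duplicate points:
from sklearn.neighbors import NearestNeighbors

def lof_scores(X, k=20):
    """Simplified LOF: ratio of neighbors' density to each point's own density."""
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
    dist, idx = nn.kneighbors(X)          # column 0 is the point itself
    dist, idx = dist[:, 1:], idx[:, 1:]
    k_dist = dist[:, -1]                  # distance to the k-th neighbor
    # Reachability distance: at least the neighbor's own k-distance
    reach = np.maximum(dist, k_dist[idx])
    lrd = 1.0 / reach.mean(axis=1)        # local reachability density
    return lrd[idx].mean(axis=1) / lrd    # LOF ~ 1 normal, >> 1 outlier

scores_manual = lof_scores(X, k=20)
print("Largest LOF values:", np.sort(scores_manual)[-5:])  # candidate outliers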
🔵 One-Class SVM
from sklearn.svm import OneClassSVM
# One-Class SVM: Finds decision boundary around normal data
oc_svm = OneClassSVM(
    kernel='rbf',
    gamma='auto',
    nu=0.1  # Upper bound on fraction of outliers
)
y_pred_svm = oc_svm.fit_predict(X)
print(f"Number of anomalies: {(y_pred_svm == -1).sum()}")
# Decision function (distance from boundary)
scores_svm = oc_svm.decision_function(X)
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X[y_pred_svm == 1, 0], X[y_pred_svm == 1, 1],
            c='blue', label='Normal', alpha=0.6)
plt.scatter(X[y_pred_svm == -1, 0], X[y_pred_svm == -1, 1],
            c='red', label='Anomaly', s=100, marker='x')
plt.title('One-Class SVM')
plt.legend()
plt.show()
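Because decision_function returns a signed distance from the learned boundary, evaluating it on a grid makes the region the model considers normal visible. A sketch; the ±8 grid limits assume the data generated above:
xx, yy = np.meshgrid(np.linspace(-8, 8, 200), np.linspace(-8, 8, 200))
Z = oc_svm.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, levels=20, cmap='RdBu', alpha=0.5)
plt.contour(xx, yy, Z, levels=[0], colors='black')  # the decision boundary
plt.scatter(X[:, 0], X[:, 1], c='blue', s=10)
plt.title('One-Class SVM Decision Function')
plt.show()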
🧠 Autoencoder for Anomaly Detection
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Build autoencoder
input_dim = X_scaled.shape[1]
encoding_dim = 1
# Encoder
encoder = keras.Sequential([
    layers.Dense(8, activation='relu', input_shape=(input_dim,)),
    layers.Dense(4, activation='relu'),
    layers.Dense(encoding_dim, activation='relu')
])
# Decoder
decoder = keras.Sequential([
    layers.Dense(4, activation='relu', input_shape=(encoding_dim,)),
    layers.Dense(8, activation='relu'),
    layers.Dense(input_dim, activation='linear')
])
# Autoencoder
autoencoder = keras.Sequential([encoder, decoder])
autoencoder.compile(optimizer='adam', loss='mse')
# Train on (approximately) normal data only
X_train = X_scaled[y_pred == 1]  # samples Isolation Forest labeled as inliers
history = autoencoder.fit(
    X_train, X_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)
# Reconstruction error
X_reconstructed = autoencoder.predict(X_scaled)
reconstruction_error = np.mean(np.square(X_scaled - X_reconstructed), axis=1)
# Threshold for anomalies
threshold = np.percentile(reconstruction_error, 90)
y_pred_ae = (reconstruction_error > threshold).astype(int)
y_pred_ae = np.where(y_pred_ae == 1, -1, 1)
print(f"Threshold: {threshold:.4f}")
print(f"Anomalies detected: {(y_pred_ae == -1).sum()}")
# Plot reconstruction error
plt.figure(figsize=(10, 6))
plt.hist(reconstruction_error[y_pred == 1], bins=50,
         alpha=0.6, label='Normal')
plt.hist(reconstruction_error[y_pred == -1], bins=50,
         alpha=0.6, label='Anomaly')
plt.axvline(threshold, color='r', linestyle='--', label='Threshold')
plt.xlabel('Reconstruction Error')
plt.ylabel('Frequency')
plt.legend()
plt.show()
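The 90th-percentile threshold above bakes in the assumed 10% contamination. When a clean training set is available, an alternative is to derive the cutoff from the reconstruction error on normal data alone, e.g. mean plus three standard deviations; this is a common heuristic, not a rule:
# Hypothetical alternative: threshold from the training (normal) errors only
train_err = np.mean(np.square(X_train - autoencoder.predict(X_train)), axis=1)
threshold_alt = train_err.mean() + 3 * train_err.std()
y_pred_alt = np.where(reconstruction_error > threshold_alt, -1, 1)
print(f"Alternative threshold: {threshold_alt:.4f}, "
      f"anomalies: {(y_pred_alt == -1).sum()}")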
📊 Statistical Methods
Z-Score Method
# Detect outliers using standard deviation
def zscore_anomalies(data, threshold=3):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    z_scores = np.abs((data - mean) / std)
    # Any feature with z-score > threshold flags the row
    return np.any(z_scores > threshold, axis=1)
y_pred_zscore = zscore_anomalies(X, threshold=3)
print(f"Z-score anomalies: {y_pred_zscore.sum()}")
Interquartile Range (IQR)
# IQR method for outlier detection
def iqr_anomalies(data):
    Q1 = np.percentile(data, 25, axis=0)
    Q3 = np.percentile(data, 75, axis=0)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = np.any((data < lower_bound) | (data > upper_bound), axis=1)
    return outliers
y_pred_iqr = iqr_anomalies(X)
print(f"IQR anomalies: {y_pred_iqr.sum()}")
📈 Time Series Anomaly Detection
# Generate time series with anomalies
np.random.seed(42)
n = 1000
t = np.linspace(0, 100, n)
signal = np.sin(t) + 0.1 * np.random.randn(n)
# Add anomalies
anomaly_indices = [200, 400, 600, 800]
signal[anomaly_indices] += np.random.uniform(2, 4, len(anomaly_indices))
# Sliding window approach
window_size = 20
def detect_ts_anomalies(data, window_size, threshold=3):
    anomalies = []
    for i in range(window_size, len(data)):
        window = data[i-window_size:i]
        mean = np.mean(window)
        std = np.std(window)
        z_score = abs((data[i] - mean) / std)
        if z_score > threshold:
            anomalies.append(i)
    return anomalies
anomalies = detect_ts_anomalies(signal, window_size)
# Plot
plt.figure(figsize=(15, 6))
plt.plot(t, signal, label='Signal')
plt.scatter(t[anomalies], signal[anomalies],
            c='red', s=100, marker='x', label='Anomalies')
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Time Series Anomaly Detection')
plt.legend()
plt.show()
print(f"Detected {len(anomalies)} anomalies")
📊 Algorithm Comparison
| Method | Speed | Scalability | Best For |
|---|---|---|---|
| Isolation Forest | Fast | Large, high-dimensional data | General purpose, large datasets |
| LOF | Medium | Small to medium datasets | Varying-density clusters |
| One-Class SVM | Slow | Small datasets | Clear boundary around normal data |
| Autoencoder | Slow (training) | Large, high-dimensional data | Complex patterns, images |
| Z-Score | Very fast | Any size | Gaussian data, quick checks |
| IQR | Very fast | Any size | Univariate data, robust to outliers |
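One way to act on "try multiple methods" (see Best Practices below) is a simple consensus vote over the predictions computed earlier; the two-of-three majority rule here is an illustrative choice, not a standard:
# Points flagged by at least two of the three sklearn detectors
votes = ((y_pred == -1).astype(int)
         + (y_pred_lof == -1).astype(int)
         + (y_pred_svm == -1).astype(int))
consensus = votes >= 2
print(f"Consensus anomalies (flagged by 2+ detectors): {consensus.sum()}")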
💡 Best Practices
- Know your contamination: Estimate % of outliers in advance
- Try multiple methods: Different algorithms for different patterns
- Scale features: Important for distance-based methods (LOF, SVM)
- Validate results: Manually inspect detected anomalies
- Use domain knowledge: What makes sense as an anomaly?
- Consider computational cost: Isolation Forest scales well to large data
- Handle imbalanced data: anomalies are often rare
- Monitor over time: anomaly patterns may change
🎯 Key Takeaways
- Isolation Forest is fast and works well in high dimensions
- LOF is good for clusters of varying density
- One-Class SVM learns a decision boundary around normal data
- Autoencoders learn complex patterns and flag points by reconstruction error
- Statistical methods (Z-score, IQR) are simple, fast baselines
- Set the contamination parameter based on the expected outlier rate