What is Model Deployment?
Model deployment is the process of making your trained model available for real-world use, turning notebook experiments into production APIs, web apps, or mobile apps.
Deployment Options:
- REST API: Flask, FastAPI for web services
- Cloud platforms: AWS, GCP, Azure
- Containers: Docker for reproducibility
- Serverless: AWS Lambda, Google Cloud Functions
- Edge devices: Mobile, IoT devices
💾 Save & Load Models
import pickle
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Train a model
X, y = make_classification(n_samples=1000, n_features=20)
model = RandomForestClassifier(n_estimators=100)
model.fit(X, y)
# Save with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
# Load with pickle
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
# Save with joblib (recommended for sklearn models; more efficient with large NumPy arrays)
joblib.dump(model, 'model.joblib')
loaded_model = joblib.load('model.joblib')
# Test
prediction = loaded_model.predict(X[:5])
print(f"Predictions: {prediction}")
🌐 Flask API
# app.py
from flask import Flask, request, jsonify
import joblib
import numpy as np
app = Flask(__name__)
# Load model at startup
model = joblib.load('model.joblib')
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get data from request
        data = request.get_json()
        features = np.array(data['features']).reshape(1, -1)
        # Make prediction
        prediction = model.predict(features)
        probability = model.predict_proba(features)
        return jsonify({
            'prediction': int(prediction[0]),
            'probability': probability[0].tolist(),
            'status': 'success'
        })
    except Exception as e:
        return jsonify({
            'error': str(e),
            'status': 'error'
        }), 400

@app.route('/health', methods=['GET'])
def health():
    return jsonify({'status': 'healthy'})

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
# Run: python app.py
# Test: curl -X POST http://localhost:5000/predict \
# -H "Content-Type: application/json" \
# -d '{"features": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}'
⚡ FastAPI (Modern Alternative)
# main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import numpy as np
from typing import List
app = FastAPI(title="ML Model API")
# Load model
model = joblib.load('model.joblib')
class PredictionInput(BaseModel):
    features: List[float]

class PredictionOutput(BaseModel):
    prediction: int
    probability: List[float]

@app.post("/predict", response_model=PredictionOutput)
async def predict(input_data: PredictionInput):
    try:
        features = np.array(input_data.features).reshape(1, -1)
        prediction = model.predict(features)
        probability = model.predict_proba(features)
        return PredictionOutput(
            prediction=int(prediction[0]),
            probability=probability[0].tolist()
        )
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@app.get("/health")
async def health():
    return {"status": "healthy"}
# Run: uvicorn main:app --reload
# Docs: http://localhost:8000/docs (automatic interactive API docs!)
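A quick smoke test against uvicorn's default port (8000), mirroring the Flask example:
# Test: curl -X POST http://localhost:8000/predict \
#   -H "Content-Type: application/json" \
#   -d '{"features": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}'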
🐳 Docker Containerization
# Dockerfile
FROM python:3.9-slim
WORKDIR /app
# Copy requirements
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application
COPY app.py .
COPY model.joblib .
EXPOSE 5000
CMD ["python", "app.py"]
# Build: docker build -t ml-model-api .
# Run: docker run -p 5000:5000 ml-model-api
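The CMD above starts Flask's built-in development server. For production traffic a WSGI server such as gunicorn is the usual choice; a sketch of the alternative instruction (add gunicorn to requirements.txt if you use it):
# Production variant: serve the Flask app defined in app.py with gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]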
requirements.txt
flask==2.3.0
scikit-learn==1.3.0
joblib==1.3.0
numpy==1.24.0
☁️ AWS Deployment
1. AWS Lambda
# lambda_function.py
import json
import boto3
import joblib
import numpy as np
# Load model from S3
s3 = boto3.client('s3')
s3.download_file('my-bucket', 'model.joblib', '/tmp/model.joblib')
model = joblib.load('/tmp/model.joblib')
def lambda_handler(event, context):
    try:
        body = json.loads(event['body'])
        features = np.array(body['features']).reshape(1, -1)
        prediction = model.predict(features)
        probability = model.predict_proba(features)
        return {
            'statusCode': 200,
            'body': json.dumps({
                'prediction': int(prediction[0]),
                'probability': probability[0].tolist()
            })
        }
    except Exception as e:
        return {
            'statusCode': 400,
            'body': json.dumps({'error': str(e)})
        }
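Note that scikit-learn plus its dependencies is often too large to bundle directly in the function zip, so it is typically shipped as a Lambda layer or a container image. Once deployed, the function can be invoked from the CLI; the function name below is a hypothetical placeholder:
# Invoke with a test event (AWS CLI v2; function name is an assumption)
aws lambda invoke \
  --function-name ml-model-predict \
  --cli-binary-format raw-in-base64-out \
  --payload '{"body": "{\"features\": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}"}' \
  response.json
cat response.json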
2. AWS SageMaker
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
# Create SageMaker session
sagemaker_session = sagemaker.Session()
role = 'arn:aws:iam::YOUR_ACCOUNT:role/SageMakerRole'
# Create SKLearn estimator
sklearn = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type='ml.m5.large',
    framework_version='1.0-1',
    py_version='py3'
)
# Train
sklearn.fit({'train': 's3://bucket/train.csv'})
# Deploy
predictor = sklearn.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium'
)
# Predict (data: 2D array of feature rows)
result = predictor.predict(data)
print(result)
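Endpoints bill for as long as they run, so tear them down once you are done experimenting:
# Delete the endpoint to stop incurring charges
predictor.delete_endpoint()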
🔥 Google Cloud Platform
# Deploy to Cloud Run
# 1. Build container
gcloud builds submit --tag gcr.io/PROJECT_ID/ml-model
# 2. Deploy
gcloud run deploy ml-model-api \
--image gcr.io/PROJECT_ID/ml-model \
--platform managed \
--region us-central1 \
--allow-unauthenticated
# 3. Test
curl -X POST https://ml-model-api-xxx.run.app/predict \
-H "Content-Type: application/json" \
-d '{"features": [...]}'
⚡ Streamlit Web App
# app.py
import streamlit as st
import joblib
import numpy as np
# Load model
@st.cache_resource
def load_model():
    return joblib.load('model.joblib')

model = load_model()
st.title('🤖 ML Model Predictor')
st.write('Enter features to get prediction')
# Input form
with st.form('prediction_form'):
    features = []
    cols = st.columns(4)
    for i in range(20):
        with cols[i % 4]:
            feature = st.number_input(f'Feature {i+1}', value=0.0)
            features.append(feature)
    submit = st.form_submit_button('Predict')

if submit:
    # Make prediction
    X = np.array(features).reshape(1, -1)
    prediction = model.predict(X)
    probability = model.predict_proba(X)
    # Display results
    st.success(f'Prediction: Class {prediction[0]}')
    st.write(f'Probability: {probability[0]}')
    # Visualize
    st.bar_chart(probability[0])
# Run: streamlit run app.py
📱 Model Optimization
Quantization
# Reduce model size for edge deployment
import os
import tensorflow as tf
# Load the trained Keras model
model = tf.keras.models.load_model('model.h5')
# Convert to TFLite with default (dynamic-range) quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
# Save
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)
print(f"Original size: {os.path.getsize('model.h5') / 1024:.2f} KB")
print(f"Quantized size: {os.path.getsize('model.tflite') / 1024:.2f} KB")
ONNX Export
# Export to ONNX for cross-platform compatibility
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
# Define input type: batches of 20 float features
initial_type = [('float_input', FloatTensorType([None, 20]))]
# Convert the trained scikit-learn estimator (the RandomForestClassifier from earlier)
onnx_model = convert_sklearn(model, initial_types=initial_type)
# Save
with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
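The exported file can then be served with ONNX Runtime, which removes the scikit-learn dependency at inference time; a minimal sketch:
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
# One sample with the 20 float features declared above
sample = np.random.rand(1, 20).astype(np.float32)
outputs = sess.run(None, {input_name: sample})
print(outputs[0])  # predicted class label(s)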
🔐 Production Best Practices
- Authentication: API keys, OAuth for security (see the sketch after this list)
- Rate limiting: Prevent abuse
from flask_limiter import Limiter

limiter = Limiter(app, key_func=lambda: request.remote_addr)

@app.route('/predict', methods=['POST'])
@limiter.limit("10 per minute")
def predict():
    # ...
- Logging: Track requests and errors
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    logger.info(f"Prediction request from {request.remote_addr}")
    # ...
- Monitoring: Track latency, errors, model drift
- Versioning: Keep track of model versions
- A/B testing: Compare model versions
- Rollback plan: Quick revert if issues
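For the authentication bullet above, a minimal API-key check for the Flask app is sketched below; the header name and key handling are purely illustrative (a real service would use a secrets manager and a constant-time comparison):
import os
from functools import wraps
from flask import request, jsonify

API_KEY = os.environ.get('API_KEY', 'change-me')  # illustrative only

def require_api_key(view):
    @wraps(view)
    def wrapped(*args, **kwargs):
        # Reject requests that do not carry the expected key
        if request.headers.get('X-API-Key') != API_KEY:
            return jsonify({'error': 'unauthorized'}), 401
        return view(*args, **kwargs)
    return wrapped

@app.route('/predict', methods=['POST'])
@require_api_key
def predict():
    ...  # prediction logic as in the Flask API section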
📊 Monitoring
# Track predictions and performance
import time
from prometheus_client import Counter, Histogram
# Metrics
prediction_counter = Counter('predictions_total', 'Total predictions')
prediction_latency = Histogram('prediction_latency_seconds', 'Prediction latency')
@app.route('/predict', methods=['POST'])
def predict():
    start_time = time.time()
    # Make prediction
    features = np.array(request.get_json()['features']).reshape(1, -1)
    result = model.predict(features)
    # Record metrics
    prediction_counter.inc()
    prediction_latency.observe(time.time() - start_time)
    return jsonify({'prediction': int(result[0])})
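To actually expose these metrics to a Prometheus scraper, prometheus_client can render them on a separate route; a minimal sketch for the same Flask app:
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

@app.route('/metrics')
def metrics():
    # Prometheus scrapes this endpoint
    return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST}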
💡 Deployment Checklist
- ✅ Save model in portable format (pickle, joblib, ONNX)
- ✅ Create API with Flask/FastAPI
- ✅ Add input validation and error handling
- ✅ Containerize with Docker
- ✅ Set up logging and monitoring
- ✅ Add authentication and rate limiting
- ✅ Test API thoroughly
- ✅ Deploy to cloud platform
- ✅ Set up CI/CD pipeline
- ✅ Monitor in production
🎯 Key Takeaways
- Save models with joblib or pickle
- Create REST API with Flask or FastAPI
- Containerize with Docker for reproducibility
- Deploy to cloud (AWS, GCP, Azure)
- Add monitoring and logging
- Implement security (auth, rate limiting)
- Use Streamlit for quick demo apps