What is Model Deployment?
Model deployment is the process of making your trained model available for real-world use, turning notebook experiments into production APIs, web apps, or mobile apps.
Deployment Options:
- REST API: Flask, FastAPI for web services
- Cloud platforms: AWS, GCP, Azure
- Containers: Docker for reproducibility
- Serverless: AWS Lambda, Google Cloud Functions
- Edge devices: Mobile, IoT devices
💾 Save & Load Models
import pickle
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Train a model
X, y = make_classification(n_samples=1000, n_features=20)
model = RandomForestClassifier(n_estimators=100)
model.fit(X, y)
# Save with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
# Load with pickle
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
# Save with joblib (recommended for sklearn models; more efficient with large NumPy arrays)
joblib.dump(model, 'model.joblib')
loaded_model = joblib.load('model.joblib')
# Test
prediction = loaded_model.predict(X[:5])
print(f"Predictions: {prediction}")
🌐 Flask API
# app.py
from flask import Flask, request, jsonify
import joblib
import numpy as np
app = Flask(__name__)
# Load model at startup
model = joblib.load('model.joblib')
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get data from request
        data = request.get_json()
        features = np.array(data['features']).reshape(1, -1)
        # Make prediction
        prediction = model.predict(features)
        probability = model.predict_proba(features)
        return jsonify({
            'prediction': int(prediction[0]),
            'probability': probability[0].tolist(),
            'status': 'success'
        })
    except Exception as e:
        return jsonify({
            'error': str(e),
            'status': 'error'
        }), 400

@app.route('/health', methods=['GET'])
def health():
    return jsonify({'status': 'healthy'})

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
# Run: python app.py
# Test: curl -X POST http://localhost:5000/predict \
# -H "Content-Type: application/json" \
# -d '{"features": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}'
⚡ FastAPI (Modern Alternative)
# main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import numpy as np
from typing import List
app = FastAPI(title="ML Model API")
# Load model
model = joblib.load('model.joblib')
class PredictionInput(BaseModel):
    features: List[float]

class PredictionOutput(BaseModel):
    prediction: int
    probability: List[float]

@app.post("/predict", response_model=PredictionOutput)
async def predict(input_data: PredictionInput):
    try:
        features = np.array(input_data.features).reshape(1, -1)
        prediction = model.predict(features)
        probability = model.predict_proba(features)
        return PredictionOutput(
            prediction=int(prediction[0]),
            probability=probability[0].tolist()
        )
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@app.get("/health")
async def health():
    return {"status": "healthy"}
# Run: uvicorn main:app --reload
# Docs: http://localhost:8000/docs (automatic interactive API docs!)
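A quick smoke test against uvicorn's default port (8000), mirroring the Flask example:
# Test: curl -X POST http://localhost:8000/predict \
#   -H "Content-Type: application/json" \
#   -d '{"features": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}'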
🐳 Docker Containerization
# Dockerfile
FROM python:3.9-slim
WORKDIR /app
# Copy requirements
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application
COPY app.py .
COPY model.joblib .
EXPOSE 5000
CMD ["python", "app.py"]
# Build: docker build -t ml-model-api .
# Run: docker run -p 5000:5000 ml-model-api
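The CMD above starts Flask's built-in development server. For production traffic a WSGI server such as gunicorn is the usual choice; a sketch of the alternative instruction (add gunicorn to requirements.txt if you use it):
# Production variant: serve the Flask app defined in app.py with gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]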
requirements.txt
flask==2.3.0
scikit-learn==1.3.0
joblib==1.3.0
numpy==1.24.0
☁️ AWS Deployment
1. AWS Lambda
# lambda_function.py
import json
import boto3
import joblib
import numpy as np
# Load model from S3
s3 = boto3.client('s3')
s3.download_file('my-bucket', 'model.joblib', '/tmp/model.joblib')
model = joblib.load('/tmp/model.joblib')
def lambda_handler(event, context):
    try:
        body = json.loads(event['body'])
        features = np.array(body['features']).reshape(1, -1)
        prediction = model.predict(features)
        probability = model.predict_proba(features)
        return {
            'statusCode': 200,
            'body': json.dumps({
                'prediction': int(prediction[0]),
                'probability': probability[0].tolist()
            })
        }
    except Exception as e:
        return {
            'statusCode': 400,
            'body': json.dumps({'error': str(e)})
        }
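Note that scikit-learn plus its dependencies is often too large to bundle directly in the function zip, so it is typically shipped as a Lambda layer or a container image. Once deployed, the function can be invoked from the CLI; the function name below is a hypothetical placeholder:
# Invoke with a test event (AWS CLI v2; function name is an assumption)
aws lambda invoke \
  --function-name ml-model-predict \
  --cli-binary-format raw-in-base64-out \
  --payload '{"body": "{\"features\": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}"}' \
  response.json
cat response.json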
2. AWS SageMaker
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
# Create SageMaker session
sagemaker_session = sagemaker.Session()
role = 'arn:aws:iam::YOUR_ACCOUNT:role/SageMakerRole'
# Create SKLearn estimator
sklearn = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type='ml.m5.large',
    framework_version='1.0-1',
    py_version='py3'
)
# Train
sklearn.fit({'train': 's3://bucket/train.csv'})
# Deploy
predictor = sklearn.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium'
)
# Predict (data: 2D array of feature rows)
result = predictor.predict(data)
print(result)
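Endpoints bill for as long as they run, so tear them down once you are done experimenting:
# Delete the endpoint to stop incurring charges
predictor.delete_endpoint()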
🔥 Google Cloud Platform
# Deploy to Cloud Run
# 1. Build container
gcloud builds submit --tag gcr.io/PROJECT_ID/ml-model
# 2. Deploy
gcloud run deploy ml-model-api \
--image gcr.io/PROJECT_ID/ml-model \
--platform managed \
--region us-central1 \
--allow-unauthenticated
# 3. Test
curl -X POST https://ml-model-api-xxx.run.app/predict \
-H "Content-Type: application/json" \
-d '{"features": [...]}'
⚡ Streamlit Web App
# app.py
import streamlit as st
import joblib
import numpy as np
# Load model
@st.cache_resource
def load_model():
    return joblib.load('model.joblib')

model = load_model()
st.title('🤖 ML Model Predictor')
st.write('Enter features to get prediction')
# Input form
with st.form('prediction_form'):
    features = []
    cols = st.columns(4)
    for i in range(20):
        with cols[i % 4]:
            feature = st.number_input(f'Feature {i+1}', value=0.0)
            features.append(feature)
    submit = st.form_submit_button('Predict')

if submit:
    # Make prediction
    X = np.array(features).reshape(1, -1)
    prediction = model.predict(X)
    probability = model.predict_proba(X)
    # Display results
    st.success(f'Prediction: Class {prediction[0]}')
    st.write(f'Probability: {probability[0]}')
    # Visualize
    st.bar_chart(probability[0])
# Run: streamlit run app.py
📱 Model Optimization
Quantization
# Reduce model size for edge deployment
import os
import tensorflow as tf
# Load the trained Keras model
model = tf.keras.models.load_model('model.h5')
# Convert to TFLite with default (dynamic-range) quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
# Save
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)
print(f"Original size: {os.path.getsize('model.h5') / 1024:.2f} KB")
print(f"Quantized size: {os.path.getsize('model.tflite') / 1024:.2f} KB")
ONNX Export
# Export to ONNX for cross-platform compatibility
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
# Define input type: batches of 20 float features
initial_type = [('float_input', FloatTensorType([None, 20]))]
# Convert the trained scikit-learn estimator (the RandomForestClassifier from earlier)
onnx_model = convert_sklearn(model, initial_types=initial_type)
# Save
with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
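The exported file can then be served with ONNX Runtime, which removes the scikit-learn dependency at inference time; a minimal sketch:
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
# One sample with the 20 float features declared above
sample = np.random.rand(1, 20).astype(np.float32)
outputs = sess.run(None, {input_name: sample})
print(outputs[0])  # predicted class label(s)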
🔐 Production Best Practices
- Authentication: API keys, OAuth for security (see the sketch after this list)
- Rate limiting: Prevent abuse
from flask_limiter import Limiter

limiter = Limiter(app, key_func=lambda: request.remote_addr)

@app.route('/predict', methods=['POST'])
@limiter.limit("10 per minute")
def predict():
    # ...
- Logging: Track requests and errors
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    logger.info(f"Prediction request from {request.remote_addr}")
    # ...
- Monitoring: Track latency, errors, model drift
- Versioning: Keep track of model versions
- A/B testing: Compare model versions
- Rollback plan: Quick revert if issues
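For the authentication bullet above, a minimal API-key check for the Flask app is sketched below; the header name and key handling are purely illustrative (a real service would use a secrets manager and a constant-time comparison):
import os
from functools import wraps
from flask import request, jsonify

API_KEY = os.environ.get('API_KEY', 'change-me')  # illustrative only

def require_api_key(view):
    @wraps(view)
    def wrapped(*args, **kwargs):
        # Reject requests that do not carry the expected key
        if request.headers.get('X-API-Key') != API_KEY:
            return jsonify({'error': 'unauthorized'}), 401
        return view(*args, **kwargs)
    return wrapped

@app.route('/predict', methods=['POST'])
@require_api_key
def predict():
    ...  # prediction logic as in the Flask API section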
📊 Monitoring
# Track predictions and performance
import time
from prometheus_client import Counter, Histogram
# Metrics
prediction_counter = Counter('predictions_total', 'Total predictions')
prediction_latency = Histogram('prediction_latency_seconds', 'Prediction latency')
@app.route('/predict', methods=['POST'])
def predict():
    start_time = time.time()
    # Make prediction
    features = np.array(request.get_json()['features']).reshape(1, -1)
    result = model.predict(features)
    # Record metrics
    prediction_counter.inc()
    prediction_latency.observe(time.time() - start_time)
    return jsonify({'prediction': int(result[0])})
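To actually expose these metrics to a Prometheus scraper, prometheus_client can render them on a separate route; a minimal sketch for the same Flask app:
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

@app.route('/metrics')
def metrics():
    # Prometheus scrapes this endpoint
    return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST}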
💡 Deployment Checklist
- ✅ Save model in portable format (pickle, joblib, ONNX)
- ✅ Create API with Flask/FastAPI
- ✅ Add input validation and error handling
- ✅ Containerize with Docker
- ✅ Set up logging and monitoring
- ✅ Add authentication and rate limiting
- ✅ Test API thoroughly
- ✅ Deploy to cloud platform
- ✅ Set up CI/CD pipeline
- ✅ Monitor in production
🎯 Key Takeaways
- Save models with joblib or pickle
- Create REST API with Flask or FastAPI
- Containerize with Docker for reproducibility
- Deploy to cloud (AWS, GCP, Azure)
- Add monitoring and logging
- Implement security (auth, rate limiting)
- Use Streamlit for quick demo apps