What are ControlNet and LoRA?
ControlNet adds spatial control to image generation, allowing you to guide the process with pose, depth, edges, and more. LoRA (Low-Rank Adaptation) enables efficient fine-tuning without retraining entire models.
Why These Matter:
- ControlNet: Precise control over image structure and composition
- LoRA: Train custom styles with minimal compute and storage
- Combination: The two can be used together, ControlNet steering structure and LoRA steering style
- Efficiency: Fast training and small file sizes
🎮 ControlNet Overview
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler
import torch
from PIL import Image
import cv2
import numpy as np
# Available ControlNet models
controlnet_models = {
    'canny': 'lllyasviel/sd-controlnet-canny',        # Edge detection
    'depth': 'lllyasviel/sd-controlnet-depth',        # Depth maps
    'pose': 'lllyasviel/sd-controlnet-openpose',      # Human pose
    'scribble': 'lllyasviel/sd-controlnet-scribble',  # Rough sketches
    'seg': 'lllyasviel/sd-controlnet-seg',            # Segmentation masks
    'normal': 'lllyasviel/sd-controlnet-normal',      # Normal maps
    'mlsd': 'lllyasviel/sd-controlnet-mlsd',          # Straight lines
    'hed': 'lllyasviel/sd-controlnet-hed',            # HED boundary
}

print("ControlNet Models:")
for name, model_id in controlnet_models.items():
    print(f"  {name}: {model_id}")

print("\nControlNet enables spatial control through conditional inputs!")
🖼️ Canny Edge ControlNet
# Load Canny ControlNet
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny",
    torch_dtype=torch.float16
)

# Create pipeline
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    safety_checker=None
)
# Use efficient scheduler
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
print("ControlNet pipeline loaded!")
# Load and process input image
def get_canny_edges(image_path, low_threshold=100, high_threshold=200):
    """Extract Canny edges from an image"""
    image = np.array(Image.open(image_path).convert("RGB"))
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # Apply Canny edge detection
    edges = cv2.Canny(gray, low_threshold, high_threshold)
    # Convert back to RGB so the pipeline receives a 3-channel image
    edges = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(edges)

# Example: Generate image from edges
def generate_from_edges(edge_image, prompt, num_steps=20):
    """Generate an image guided by an edge map"""
    output = pipe(
        prompt=prompt,
        image=edge_image,
        num_inference_steps=num_steps,
        controlnet_conditioning_scale=1.0,  # How closely to follow the edges
    ).images[0]
    return output
# Example usage
edge_map = get_canny_edges("input_image.jpg")
result = generate_from_edges(
    edge_map,
    prompt="a beautiful woman in elegant dress, professional photo, 4k",
    num_steps=25
)
result.save("controlnet_output.png")
print("Generated image with edge control!")
🧍 Pose ControlNet
from controlnet_aux import OpenposeDetector
# Load pose detector
openpose = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
# Load pose ControlNet
pose_controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-openpose",
    torch_dtype=torch.float16
)

pose_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=pose_controlnet,
    torch_dtype=torch.float16
)
pose_pipe.enable_model_cpu_offload()
def extract_pose(image_path):
    """Extract a pose skeleton from an image"""
    image = Image.open(image_path)
    pose = openpose(image)
    return pose

def generate_with_pose(pose_image, prompt, negative_prompt="", steps=25):
    """Generate an image matching the pose"""
    output = pose_pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=pose_image,
        num_inference_steps=steps,
        controlnet_conditioning_scale=1.0,
    ).images[0]
    return output
# Example: Transfer pose to new character
pose_skeleton = extract_pose("dancer.jpg")
# Generate new image with same pose
result = generate_with_pose(
    pose_skeleton,
    prompt="anime character dancing, studio ghibli style, detailed",
    negative_prompt="low quality, blurry, distorted",
    steps=30
)
result.save("pose_transfer.png")
print("Created new character with transferred pose!")
🎨 Multiple ControlNets
# Load multiple ControlNets
controlnet_canny = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)
controlnet_depth = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16
)

# Create multi-control pipeline: the pipeline accepts a list of ControlNets
# and wraps them in a MultiControlNetModel internally
multi_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=[controlnet_canny, controlnet_depth],
    torch_dtype=torch.float16
)
multi_pipe.enable_model_cpu_offload()
def generate_multi_control(canny_image, depth_image, prompt,
                           canny_scale=0.8, depth_scale=0.6):
    """Generate with multiple conditioning inputs"""
    output = multi_pipe(
        prompt=prompt,
        image=[canny_image, depth_image],
        num_inference_steps=25,
        controlnet_conditioning_scale=[canny_scale, depth_scale],
    ).images[0]
    return output
# Example: Use both edge and depth
edges = get_canny_edges("scene.jpg")
depth_map = get_depth_map("scene.jpg")  # Requires a depth estimator; see the sketch below

result = generate_multi_control(
    edges, depth_map,
    prompt="futuristic cyberpunk city at night, neon lights, 8k",
    canny_scale=0.7,
    depth_scale=0.5
)
result.save("multi_control.png")
print("Generated with edge + depth control!")
🚀 LoRA: Low-Rank Adaptation
from peft import LoraConfig, get_peft_model
from diffusers import DiffusionPipeline
# LoRA Configuration
lora_config = LoraConfig(
    r=8,            # Rank of the decomposition (higher = more capacity)
    lora_alpha=32,  # Scaling factor
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],  # Which layers to adapt
    lora_dropout=0.1,
    bias="none",
)
print("LoRA Configuration:")
print(f" Rank: {lora_config.r}")
print(f" Alpha: {lora_config.lora_alpha}")
print(f" Target modules: {lora_config.target_modules}")
# Key LoRA advantages:
advantages = {
    'Parameter Efficiency': 'Only train <1% of model parameters',
    'Storage': 'LoRA weights typically 2-200 MB vs 4+ GB full model',
    'Training Speed': '10-100x faster than full fine-tuning',
    'Combinable': 'Can merge multiple LoRAs together',
    'Reversible': 'Easy to switch between LoRAs',
}

print("\nLoRA Advantages:")
for key, value in advantages.items():
    print(f"  {key}: {value}")
🎓 Training Custom LoRA
import torch
import numpy as np
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class CustomImageDataset(Dataset):
    """Dataset for LoRA training: tab-separated lines of image path and caption"""
    def __init__(self, image_dir, captions_file, size=512):
        self.image_dir = image_dir
        self.size = size
        # Load image paths and captions
        with open(captions_file, 'r') as f:
            lines = f.readlines()
        self.data = []
        for line in lines:
            img_path, caption = line.strip().split('\t')
            self.data.append((img_path, caption))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, caption = self.data[idx]
        # Load, resize, and scale the image to [-1, 1] as the VAE expects
        image = Image.open(os.path.join(self.image_dir, img_path)).convert("RGB")
        image = image.resize((self.size, self.size))
        image = torch.from_numpy(np.array(image)).float().permute(2, 0, 1) / 127.5 - 1
        return {'image': image, 'caption': caption}
# Setup training
def setup_lora_training(model_id="runwayml/stable-diffusion-v1-5"):
    """Initialize the UNet for LoRA training"""
    # Load the UNet in float32: training half-precision weights with plain AdamW
    # is unstable (use accelerate/mixed precision if you need fp16 training)
    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")

    # Apply LoRA to the UNet; get_peft_model freezes the base weights and
    # leaves only the injected low-rank matrices trainable
    unet = get_peft_model(unet, lora_config)

    # Double-check that nothing outside the LoRA layers requires gradients
    for name, param in unet.named_parameters():
        if 'lora' not in name:
            param.requires_grad = False

    # Count trainable parameters
    trainable = sum(p.numel() for p in unet.parameters() if p.requires_grad)
    total = sum(p.numel() for p in unet.parameters())
    print(f"\nTrainable parameters: {trainable:,} ({trainable/total:.2%} of total)")
    return unet
# Load model with LoRA
lora_unet = setup_lora_training()
print("\nReady to train custom LoRA!")
🔥 LoRA Training Loop
from torch.optim import AdamW
from diffusers import DDPMScheduler
def train_lora(unet, train_dataloader, num_epochs=10, lr=1e-4,
               model_id="runwayml/stable-diffusion-v1-5"):
    """Train the LoRA adapter (simplified single-GPU loop)"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    unet = unet.to(device)

    # Frozen helpers: the VAE encodes images into latents, CLIP encodes captions
    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae").to(device)
    tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder").to(device)
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

    # Setup optimizer (only LoRA parameters)
    optimizer = AdamW(
        [p for p in unet.parameters() if p.requires_grad],
        lr=lr
    )

    # Noise scheduler
    noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

    print(f"Training for {num_epochs} epochs...")
    for epoch in range(num_epochs):
        unet.train()
        total_loss = 0

        for batch_idx, batch in enumerate(train_dataloader):
            images = batch['image'].to(device)

            # Encode images to latents and captions to text embeddings
            with torch.no_grad():
                latents = vae.encode(images).latent_dist.sample() * vae.config.scaling_factor
                input_ids = tokenizer(
                    list(batch['caption']), padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True, return_tensors="pt"
                ).input_ids.to(device)
                encoder_hidden_states = text_encoder(input_ids)[0]

            # Sample noise and random timesteps
            noise = torch.randn_like(latents)
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps,
                (latents.shape[0],), device=device
            ).long()

            # Add noise to the latents (forward diffusion)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Predict the noise with the LoRA-augmented UNet
            noise_pred = unet(noisy_latents, timesteps,
                              encoder_hidden_states=encoder_hidden_states).sample

            # Calculate loss against the true noise
            loss = torch.nn.functional.mse_loss(noise_pred, noise)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if batch_idx % 50 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}\n")

    print("LoRA training completed!")
    return unet
# Example dataset
dataset = CustomImageDataset(
    image_dir="./training_images",
    captions_file="./captions.txt",
    size=512
)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)
# Train (uncomment to run)
# trained_lora = train_lora(lora_unet, train_loader, num_epochs=5)
print("LoRA training setup complete!")
💾 Saving and Loading LoRA
from safetensors.torch import save_file, load_file
def save_lora_weights(model, save_path):
    """Save only the LoRA weights"""
    # Extract LoRA parameters (detach so they can be serialized)
    lora_state_dict = {}
    for name, param in model.named_parameters():
        if 'lora' in name and param.requires_grad:
            lora_state_dict[name] = param.detach().cpu()
    # Save to safetensors (preferred) or PyTorch format
    if save_path.endswith('.safetensors'):
        save_file(lora_state_dict, save_path)
    else:
        torch.save(lora_state_dict, save_path)
    size_mb = os.path.getsize(save_path) / (1024 * 1024)
    print(f"Saved LoRA weights: {save_path} ({size_mb:.2f} MB)")

def load_lora_weights(model, lora_path):
    """Load LoRA weights into a model that already has matching LoRA layers"""
    # Load weights
    if lora_path.endswith('.safetensors'):
        lora_state_dict = load_file(lora_path)
    else:
        lora_state_dict = torch.load(lora_path)
    # strict=False: only the LoRA keys are present, the base weights stay untouched
    model.load_state_dict(lora_state_dict, strict=False)
    print(f"Loaded LoRA from: {lora_path}")
    return model
# Save trained LoRA
save_lora_weights(lora_unet, "my_custom_style.safetensors")
# Load LoRA for inference
def generate_with_lora(lora_path, prompt, steps=25):
    """Generate an image using the custom LoRA"""
    # Load base pipeline
    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    )
    # Re-create the LoRA layers with the same config used for training,
    # then load the saved weights (otherwise the parameter names would not match)
    pipe.unet = get_peft_model(pipe.unet, lora_config)
    pipe.unet = load_lora_weights(pipe.unet, lora_path)
    pipe = pipe.to("cuda")
    # Generate
    image = pipe(prompt, num_inference_steps=steps).images[0]
    return image
# Use custom LoRA
result = generate_with_lora(
    "my_custom_style.safetensors",
    prompt="a magical forest in my_style",
    steps=30
)
result.save("lora_output.png")
print("Generated with custom LoRA!")
🎭 Using Pre-trained LoRAs
from diffusers import DiffusionPipeline
def load_civitai_lora(base_model, lora_path, lora_scale=0.8):
    """Load a LoRA in diffusers/CivitAI format"""
    pipe = DiffusionPipeline.from_pretrained(
        base_model,
        torch_dtype=torch.float16
    )
    # Load LoRA weights
    pipe.load_lora_weights(lora_path)
    # Bake the LoRA into the weights at the chosen scale (how much influence it has)
    pipe.fuse_lora(lora_scale=lora_scale)
    pipe = pipe.to("cuda")
    return pipe
# Example: Popular LoRA styles
lora_examples = {
    'Anime Style': 'animefull_latest.safetensors',
    'Studio Ghibli': 'ghibli_style.safetensors',
    'Cyberpunk': 'cyberpunk_anime.safetensors',
    'Oil Painting': 'oil_painting.safetensors',
    'Pixel Art': 'pixel_art.safetensors',
}
# Load and use LoRA
pipe = load_civitai_lora(
    "runwayml/stable-diffusion-v1-5",
    "anime_style.safetensors",
    lora_scale=0.9
)

# Generate with LoRA style
image = pipe(
    "a beautiful landscape, sunset, mountains",
    num_inference_steps=30,
    guidance_scale=7.5
).images[0]
image.save("lora_styled.png")
print("Generated with pre-trained LoRA style!")
print("\nPopular LoRA repositories:")
print(" - CivitAI: https://civitai.com")
print(" - HuggingFace: https://huggingface.co/models?other=lora")
print(" - LoRA Library: Community-trained styles")
🔗 Combining ControlNet + LoRA
def generate_with_controlnet_and_lora(
    controlnet_model,
    lora_path,
    control_image,
    prompt,
    lora_scale=0.8,
    control_scale=1.0
):
    """Combine ControlNet for structure with a LoRA for style"""
    # Load ControlNet
    controlnet = ControlNetModel.from_pretrained(
        controlnet_model, torch_dtype=torch.float16
    )
    # Load pipeline with ControlNet
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        torch_dtype=torch.float16
    )
    # Add the LoRA and bake it in at the chosen scale
    pipe.load_lora_weights(lora_path)
    pipe.fuse_lora(lora_scale=lora_scale)
    pipe = pipe.to("cuda")
    # Generate
    output = pipe(
        prompt=prompt,
        image=control_image,
        num_inference_steps=30,
        controlnet_conditioning_scale=control_scale,
    ).images[0]
    return output
# Example: Pose control + anime style
pose_image = extract_pose("person.jpg")
result = generate_with_controlnet_and_lora(
    controlnet_model="lllyasviel/sd-controlnet-openpose",
    lora_path="anime_style.safetensors",
    control_image=pose_image,
    prompt="anime character, detailed, colorful",
    lora_scale=0.9,
    control_scale=0.8
)
result.save("controlnet_lora_combo.png")
print("Perfect combination: Structure from ControlNet + Style from LoRA!")
📊 Comparison Table
| Aspect | ControlNet | LoRA |
|---|---|---|
| Purpose | Spatial/structural control | Style/concept learning |
| Training Data | Condition images + outputs | 10-100+ images of style/subject |
| File Size | ~1.4 GB (full model) | 2-200 MB (adapter only) |
| Training Time | Days on powerful GPU | Minutes to hours |
| Use Case | Pose, depth, edge guidance | Custom styles, characters |
| Combinable | Yes (multi-control) | Yes (merge LoRAs) |
🎯 Best Practices
- ControlNet scale 0.5-1.5: Higher = stricter adherence to control
- LoRA scale 0.6-1.0: Balance style without overpowering
- Training data quality: 20-50 high-quality images better than 100 poor ones
- Preprocessing: Consistent image sizes and quality
- Multiple LoRAs: Can combine 2-3 LoRAs at lower scales (see the sketch after this list)
- Experimentation: Test different scales and combinations
- Community models: Try existing LoRAs before training
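A sketch of combining two LoRAs with the diffusers adapter API (requires a recent diffusers with the PEFT backend installed; the file names and adapter names below are placeholders):

# Combine two LoRAs at reduced weights via named adapters; paths are illustrative.
from diffusers import DiffusionPipeline
import torch

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Load each LoRA under its own adapter name
pipe.load_lora_weights("anime_style.safetensors", adapter_name="anime")
pipe.load_lora_weights("oil_painting.safetensors", adapter_name="oil")

# Activate both with per-adapter weights (kept below 1.0 so neither dominates)
pipe.set_adapters(["anime", "oil"], adapter_weights=[0.7, 0.5])

image = pipe(
    "portrait of a traveler, golden hour",
    num_inference_steps=30,
).images[0]
image.save("combined_loras.png")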