What are ControlNet and LoRA?
ControlNet adds spatial control to image generation, allowing you to guide the process with pose, depth, edges, and more. LoRA (Low-Rank Adaptation) enables efficient fine-tuning without retraining entire models.
Why These Matter:
- ControlNet: Precise control over image structure and composition
- LoRA: Train custom styles with minimal compute and storage
- Combination: The two can be used together, ControlNet steering structure and LoRA steering style
- Efficiency: Fast training and small file sizes
🎮 ControlNet Overview
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler
import torch
from PIL import Image
import cv2
import numpy as np
# Available ControlNet models
controlnet_models = {
    'canny': 'lllyasviel/sd-controlnet-canny',        # Edge detection
    'depth': 'lllyasviel/sd-controlnet-depth',        # Depth maps
    'pose': 'lllyasviel/sd-controlnet-openpose',      # Human pose
    'scribble': 'lllyasviel/sd-controlnet-scribble',  # Rough sketches
    'seg': 'lllyasviel/sd-controlnet-seg',            # Segmentation masks
    'normal': 'lllyasviel/sd-controlnet-normal',      # Normal maps
    'mlsd': 'lllyasviel/sd-controlnet-mlsd',          # Straight lines
    'hed': 'lllyasviel/sd-controlnet-hed',            # HED boundary
}

print("ControlNet Models:")
for name, model_id in controlnet_models.items():
    print(f"  {name}: {model_id}")

print("\nControlNet enables spatial control through conditional inputs!")
🖼️ Canny Edge ControlNet
# Load Canny ControlNet
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny",
    torch_dtype=torch.float16
)

# Create pipeline
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    safety_checker=None
)
# Use efficient scheduler
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
print("ControlNet pipeline loaded!")
# Load and process input image
def get_canny_edges(image_path, low_threshold=100, high_threshold=200):
    """Extract Canny edges from an image"""
    image = np.array(Image.open(image_path).convert("RGB"))
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # Apply Canny edge detection
    edges = cv2.Canny(gray, low_threshold, high_threshold)
    # Convert back to RGB so the pipeline receives a 3-channel image
    edges = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(edges)

# Example: Generate image from edges
def generate_from_edges(edge_image, prompt, num_steps=20):
    """Generate an image guided by an edge map"""
    output = pipe(
        prompt=prompt,
        image=edge_image,
        num_inference_steps=num_steps,
        controlnet_conditioning_scale=1.0,  # How closely to follow the edges
    ).images[0]
    return output
# Example usage
edge_map = get_canny_edges("input_image.jpg")
result = generate_from_edges(
    edge_map,
    prompt="a beautiful woman in elegant dress, professional photo, 4k",
    num_steps=25
)
result.save("controlnet_output.png")
print("Generated image with edge control!")
🧍 Pose ControlNet
from controlnet_aux import OpenposeDetector
# Load pose detector
openpose = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
# Load pose ControlNet
pose_controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-openpose",
    torch_dtype=torch.float16
)

pose_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=pose_controlnet,
    torch_dtype=torch.float16
)
pose_pipe.enable_model_cpu_offload()
def extract_pose(image_path):
    """Extract a pose skeleton from an image"""
    image = Image.open(image_path)
    pose = openpose(image)
    return pose

def generate_with_pose(pose_image, prompt, negative_prompt="", steps=25):
    """Generate an image matching the pose"""
    output = pose_pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=pose_image,
        num_inference_steps=steps,
        controlnet_conditioning_scale=1.0,
    ).images[0]
    return output
# Example: Transfer pose to new character
pose_skeleton = extract_pose("dancer.jpg")
# Generate new image with same pose
result = generate_with_pose(
    pose_skeleton,
    prompt="anime character dancing, studio ghibli style, detailed",
    negative_prompt="low quality, blurry, distorted",
    steps=30
)
result.save("pose_transfer.png")
print("Created new character with transferred pose!")
🎨 Multiple ControlNets
# Load multiple ControlNets
controlnet_canny = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)
controlnet_depth = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16
)

# Create multi-control pipeline: the pipeline accepts a list of ControlNets
# and wraps them in a MultiControlNetModel internally
multi_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=[controlnet_canny, controlnet_depth],
    torch_dtype=torch.float16
)
multi_pipe.enable_model_cpu_offload()
def generate_multi_control(canny_image, depth_image, prompt,
                           canny_scale=0.8, depth_scale=0.6):
    """Generate with multiple conditioning inputs"""
    output = multi_pipe(
        prompt=prompt,
        image=[canny_image, depth_image],
        num_inference_steps=25,
        controlnet_conditioning_scale=[canny_scale, depth_scale],
    ).images[0]
    return output
# Example: Use both edge and depth
edges = get_canny_edges("scene.jpg")
depth_map = get_depth_map("scene.jpg")  # Requires a depth estimator; see the sketch below

result = generate_multi_control(
    edges, depth_map,
    prompt="futuristic cyberpunk city at night, neon lights, 8k",
    canny_scale=0.7,
    depth_scale=0.5
)
result.save("multi_control.png")
print("Generated with edge + depth control!")
🚀 LoRA: Low-Rank Adaptation
from peft import LoraConfig, get_peft_model
from diffusers import DiffusionPipeline
# LoRA Configuration
lora_config = LoraConfig(
    r=8,            # Rank of the decomposition (higher = more capacity)
    lora_alpha=32,  # Scaling factor
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],  # Which layers to adapt
    lora_dropout=0.1,
    bias="none",
)
print("LoRA Configuration:")
print(f" Rank: {lora_config.r}")
print(f" Alpha: {lora_config.lora_alpha}")
print(f" Target modules: {lora_config.target_modules}")
# Key LoRA advantages:
advantages = {
    'Parameter Efficiency': 'Only train <1% of model parameters',
    'Storage': 'LoRA weights typically 2-200 MB vs 4+ GB full model',
    'Training Speed': '10-100x faster than full fine-tuning',
    'Combinable': 'Can merge multiple LoRAs together',
    'Reversible': 'Easy to switch between LoRAs',
}

print("\nLoRA Advantages:")
for key, value in advantages.items():
    print(f"  {key}: {value}")
🎓 Training Custom LoRA
import torch
import numpy as np
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class CustomImageDataset(Dataset):
    """Dataset for LoRA training: tab-separated lines of image path and caption"""
    def __init__(self, image_dir, captions_file, size=512):
        self.image_dir = image_dir
        self.size = size
        # Load image paths and captions
        with open(captions_file, 'r') as f:
            lines = f.readlines()
        self.data = []
        for line in lines:
            img_path, caption = line.strip().split('\t')
            self.data.append((img_path, caption))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, caption = self.data[idx]
        # Load, resize, and scale the image to [-1, 1] as the VAE expects
        image = Image.open(os.path.join(self.image_dir, img_path)).convert("RGB")
        image = image.resize((self.size, self.size))
        image = torch.from_numpy(np.array(image)).float().permute(2, 0, 1) / 127.5 - 1
        return {'image': image, 'caption': caption}
# Setup training
def setup_lora_training(model_id="runwayml/stable-diffusion-v1-5"):
    """Initialize the UNet for LoRA training"""
    # Load the UNet in float32: training half-precision weights with plain AdamW
    # is unstable (use accelerate/mixed precision if you need fp16 training)
    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")

    # Apply LoRA to the UNet; get_peft_model freezes the base weights and
    # leaves only the injected low-rank matrices trainable
    unet = get_peft_model(unet, lora_config)

    # Double-check that nothing outside the LoRA layers requires gradients
    for name, param in unet.named_parameters():
        if 'lora' not in name:
            param.requires_grad = False

    # Count trainable parameters
    trainable = sum(p.numel() for p in unet.parameters() if p.requires_grad)
    total = sum(p.numel() for p in unet.parameters())
    print(f"\nTrainable parameters: {trainable:,} ({trainable/total:.2%} of total)")
    return unet
# Load model with LoRA
lora_unet = setup_lora_training()
print("\nReady to train custom LoRA!")
🔥 LoRA Training Loop
from torch.optim import AdamW
from diffusers import DDPMScheduler
def train_lora(unet, train_dataloader, num_epochs=10, lr=1e-4,
               model_id="runwayml/stable-diffusion-v1-5"):
    """Train the LoRA adapter (simplified single-GPU loop)"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    unet = unet.to(device)

    # Frozen helpers: the VAE encodes images into latents, CLIP encodes captions
    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae").to(device)
    tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder").to(device)
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

    # Setup optimizer (only LoRA parameters)
    optimizer = AdamW(
        [p for p in unet.parameters() if p.requires_grad],
        lr=lr
    )

    # Noise scheduler
    noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

    print(f"Training for {num_epochs} epochs...")
    for epoch in range(num_epochs):
        unet.train()
        total_loss = 0

        for batch_idx, batch in enumerate(train_dataloader):
            images = batch['image'].to(device)

            # Encode images to latents and captions to text embeddings
            with torch.no_grad():
                latents = vae.encode(images).latent_dist.sample() * vae.config.scaling_factor
                input_ids = tokenizer(
                    list(batch['caption']), padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True, return_tensors="pt"
                ).input_ids.to(device)
                encoder_hidden_states = text_encoder(input_ids)[0]

            # Sample noise and random timesteps
            noise = torch.randn_like(latents)
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps,
                (latents.shape[0],), device=device
            ).long()

            # Add noise to the latents (forward diffusion)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Predict the noise with the LoRA-augmented UNet
            noise_pred = unet(noisy_latents, timesteps,
                              encoder_hidden_states=encoder_hidden_states).sample

            # Calculate loss against the true noise
            loss = torch.nn.functional.mse_loss(noise_pred, noise)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if batch_idx % 50 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}\n")

    print("LoRA training completed!")
    return unet
# Example dataset
dataset = CustomImageDataset(
    image_dir="./training_images",
    captions_file="./captions.txt",
    size=512
)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)
# Train (uncomment to run)
# trained_lora = train_lora(lora_unet, train_loader, num_epochs=5)
print("LoRA training setup complete!")
💾 Saving and Loading LoRA
from safetensors.torch import save_file, load_file
def save_lora_weights(model, save_path):
    """Save only the LoRA weights"""
    # Extract LoRA parameters (detach so they can be serialized)
    lora_state_dict = {}
    for name, param in model.named_parameters():
        if 'lora' in name and param.requires_grad:
            lora_state_dict[name] = param.detach().cpu()
    # Save to safetensors (preferred) or PyTorch format
    if save_path.endswith('.safetensors'):
        save_file(lora_state_dict, save_path)
    else:
        torch.save(lora_state_dict, save_path)
    size_mb = os.path.getsize(save_path) / (1024 * 1024)
    print(f"Saved LoRA weights: {save_path} ({size_mb:.2f} MB)")

def load_lora_weights(model, lora_path):
    """Load LoRA weights into a model that already has matching LoRA layers"""
    # Load weights
    if lora_path.endswith('.safetensors'):
        lora_state_dict = load_file(lora_path)
    else:
        lora_state_dict = torch.load(lora_path)
    # strict=False: only the LoRA keys are present, the base weights stay untouched
    model.load_state_dict(lora_state_dict, strict=False)
    print(f"Loaded LoRA from: {lora_path}")
    return model
# Save trained LoRA
save_lora_weights(lora_unet, "my_custom_style.safetensors")
# Load LoRA for inference
def generate_with_lora(lora_path, prompt, steps=25):
    """Generate an image using the custom LoRA"""
    # Load base pipeline
    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    )
    # Re-create the LoRA layers with the same config used for training,
    # then load the saved weights (otherwise the parameter names would not match)
    pipe.unet = get_peft_model(pipe.unet, lora_config)
    pipe.unet = load_lora_weights(pipe.unet, lora_path)
    pipe = pipe.to("cuda")
    # Generate
    image = pipe(prompt, num_inference_steps=steps).images[0]
    return image
# Use custom LoRA
result = generate_with_lora(
    "my_custom_style.safetensors",
    prompt="a magical forest in my_style",
    steps=30
)
result.save("lora_output.png")
print("Generated with custom LoRA!")
🎭 Using Pre-trained LoRAs
from diffusers import DiffusionPipeline
def load_civitai_lora(base_model, lora_path, lora_scale=0.8):
    """Load a LoRA in diffusers/CivitAI format"""
    pipe = DiffusionPipeline.from_pretrained(
        base_model,
        torch_dtype=torch.float16
    )
    # Load LoRA weights
    pipe.load_lora_weights(lora_path)
    # Bake the LoRA into the weights at the chosen scale (how much influence it has)
    pipe.fuse_lora(lora_scale=lora_scale)
    pipe = pipe.to("cuda")
    return pipe
# Example: Popular LoRA styles
lora_examples = {
    'Anime Style': 'animefull_latest.safetensors',
    'Studio Ghibli': 'ghibli_style.safetensors',
    'Cyberpunk': 'cyberpunk_anime.safetensors',
    'Oil Painting': 'oil_painting.safetensors',
    'Pixel Art': 'pixel_art.safetensors',
}
# Load and use LoRA
pipe = load_civitai_lora(
    "runwayml/stable-diffusion-v1-5",
    "anime_style.safetensors",
    lora_scale=0.9
)

# Generate with LoRA style
image = pipe(
    "a beautiful landscape, sunset, mountains",
    num_inference_steps=30,
    guidance_scale=7.5
).images[0]
image.save("lora_styled.png")
print("Generated with pre-trained LoRA style!")
print("\nPopular LoRA repositories:")
print(" - CivitAI: https://civitai.com")
print(" - HuggingFace: https://huggingface.co/models?other=lora")
print(" - LoRA Library: Community-trained styles")
🔗 Combining ControlNet + LoRA
def generate_with_controlnet_and_lora(
    controlnet_model,
    lora_path,
    control_image,
    prompt,
    lora_scale=0.8,
    control_scale=1.0
):
    """Combine ControlNet for structure with a LoRA for style"""
    # Load ControlNet
    controlnet = ControlNetModel.from_pretrained(
        controlnet_model, torch_dtype=torch.float16
    )
    # Load pipeline with ControlNet
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        torch_dtype=torch.float16
    )
    # Add the LoRA and bake it in at the chosen scale
    pipe.load_lora_weights(lora_path)
    pipe.fuse_lora(lora_scale=lora_scale)
    pipe = pipe.to("cuda")
    # Generate
    output = pipe(
        prompt=prompt,
        image=control_image,
        num_inference_steps=30,
        controlnet_conditioning_scale=control_scale,
    ).images[0]
    return output
# Example: Pose control + anime style
pose_image = extract_pose("person.jpg")
result = generate_with_controlnet_and_lora(
    controlnet_model="lllyasviel/sd-controlnet-openpose",
    lora_path="anime_style.safetensors",
    control_image=pose_image,
    prompt="anime character, detailed, colorful",
    lora_scale=0.9,
    control_scale=0.8
)
result.save("controlnet_lora_combo.png")
print("Perfect combination: Structure from ControlNet + Style from LoRA!")
📊 Comparison Table
| Aspect | ControlNet | LoRA |
|---|---|---|
| Purpose | Spatial/structural control | Style/concept learning |
| Training Data | Condition images + outputs | 10-100+ images of style/subject |
| File Size | ~1.4 GB (full model) | 2-200 MB (adapter only) |
| Training Time | Days on powerful GPU | Minutes to hours |
| Use Case | Pose, depth, edge guidance | Custom styles, characters |
| Combinable | Yes (multi-control) | Yes (merge LoRAs) |
🎯 Best Practices
- ControlNet scale 0.5-1.5: Higher = stricter adherence to control
- LoRA scale 0.6-1.0: Balance style without overpowering
- Training data quality: 20-50 high-quality images better than 100 poor ones
- Preprocessing: Consistent image sizes and quality
- Multiple LoRAs: Can combine 2-3 LoRAs at lower scales (see the sketch after this list)
- Experimentation: Test different scales and combinations
- Community models: Try existing LoRAs before training
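A sketch of combining two LoRAs with the diffusers adapter API (requires a recent diffusers with the PEFT backend installed; the file names and adapter names below are placeholders):

# Combine two LoRAs at reduced weights via named adapters; paths are illustrative.
from diffusers import DiffusionPipeline
import torch

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Load each LoRA under its own adapter name
pipe.load_lora_weights("anime_style.safetensors", adapter_name="anime")
pipe.load_lora_weights("oil_painting.safetensors", adapter_name="oil")

# Activate both with per-adapter weights (kept below 1.0 so neither dominates)
pipe.set_adapters(["anime", "oil"], adapter_weights=[0.7, 0.5])

image = pipe(
    "portrait of a traveler, golden hour",
    num_inference_steps=30,
).images[0]
image.save("combined_loras.png")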