What is AI Video Generation?
AI video generation uses machine learning to create, edit, and enhance videos. From text-to-video synthesis to automated editing and deepfakes, AI is transforming video production. The sections below walk through each approach with code; a short environment setup sketch follows the list of key technologies.
Key Technologies:
- Text-to-Video: Generate videos from text descriptions
- Image-to-Video: Animate static images
- Video-to-Video: Style transfer and editing
- Deepfakes: Face swapping and manipulation
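The snippets in this section assume a working Python environment with the relevant libraries installed. Below is a minimal setup sketch: the pip package names are the standard PyPI ones for the libraries used later (adjust versions to your hardware), and the check confirms whether a CUDA GPU is available, since the local pipelines are impractically slow on CPU.
# Environment check for the examples in this section.
# Assumed PyPI package names (install the ones you need):
#   pip install diffusers transformers accelerate torch opencv-python moviepy \
#       openai-whisper insightface onnxruntime pydub scenedetect requests
import torch

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No CUDA GPU detected - local generation will be very slow; consider the hosted APIs")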
Stable Video Diffusion
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
import torch
# Load Stable Video Diffusion model
pipe = StableVideoDiffusionPipeline.from_pretrained(
"stabilityai/stable-video-diffusion-img2vid-xt",
torch_dtype=torch.float16,
variant="fp16"
)
# Enable memory optimizations
pipe.enable_model_cpu_offload()
# pipe.enable_vae_slicing() # For even lower memory
print("Stable Video Diffusion loaded!")
print("Model: img2vid-xt (image to video)")
print("Capabilities: Animate static images into short videos")
def generate_video_from_image(
image_path,
output_path="output_video.mp4",
num_frames=25,
fps=7,
motion_bucket_id=127,
noise_aug_strength=0.02,
decode_chunk_size=8
):
"""
Generate video from image using Stable Video Diffusion
Args:
image_path: Input image (will be resized to 1024x576)
output_path: Output video file
num_frames: Number of frames (14-25)
fps: Frames per second
motion_bucket_id: Amount of motion (1-255, higher = more motion)
noise_aug_strength: Noise augmentation (0.0-1.0)
decode_chunk_size: Memory/quality tradeoff
"""
# Load and prepare image
image = load_image(image_path)
image = image.resize((1024, 576))
print(f"Generating video from: {image_path}")
print(f"Settings: {num_frames} frames @ {fps} fps")
print(f"Motion level: {motion_bucket_id}/255")
# Generate video frames
frames = pipe(
image=image,
num_frames=num_frames,
decode_chunk_size=decode_chunk_size,
motion_bucket_id=motion_bucket_id,
noise_aug_strength=noise_aug_strength,
).frames[0]
# Export to video
export_to_video(frames, output_path, fps=fps)
print(f"ā Video saved to: {output_path}")
print(f"Duration: {num_frames/fps:.1f} seconds")
return frames
# Example: Animate an image
video_frames = generate_video_from_image(
image_path="landscape.jpg",
output_path="animated_landscape.mp4",
num_frames=25,
fps=7,
motion_bucket_id=127 # Moderate motion
)
print("\nā Image animated into video!")
RunwayML Gen-2
import requests
import time
import os
class RunwayMLClient:
"""Client for Runway Gen-2 API"""
def __init__(self, api_key=None):
self.api_key = api_key or os.getenv('RUNWAYML_API_KEY')
self.base_url = "https://api.runwayml.com/v1"
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
def text_to_video(self, prompt, duration=4, aspect_ratio="16:9"):
"""
Generate video from text prompt
Args:
prompt: Description of video
duration: Length in seconds (4 or 8)
aspect_ratio: "16:9" or "9:16"
"""
payload = {
"promptText": prompt,
"duration": duration,
"ratio": aspect_ratio,
"watermark": False
}
# Submit generation request
response = requests.post(
f"{self.base_url}/generate",
headers=self.headers,
json=payload
)
task_id = response.json()['id']
print(f"Generation started: {task_id}")
# Poll for completion
while True:
status_response = requests.get(
f"{self.base_url}/tasks/{task_id}",
headers=self.headers
)
status = status_response.json()
if status['status'] == 'SUCCEEDED':
video_url = status['output'][0]
print(f"ā Video generated: {video_url}")
return video_url
elif status['status'] == 'FAILED':
print(f"ā Generation failed: {status.get('error')}")
return None
print(f"Status: {status['status']} ({status.get('progress', 0)}%)")
time.sleep(3)
def image_to_video(self, image_url, prompt=None, duration=4):
"""Animate image into video"""
payload = {
"promptImage": image_url,
"promptText": prompt or "animate this image",
"duration": duration
}
response = requests.post(
f"{self.base_url}/generate",
headers=self.headers,
json=payload
)
task_id = response.json()['id']
print(f"Animation started: {task_id}")
# Wait for completion (similar polling as above)
# ... implementation
return task_id
# Example usage
client = RunwayMLClient()
# Text to video
video_url = client.text_to_video(
prompt="a serene ocean sunset with waves gently rolling onto the beach",
duration=4,
aspect_ratio="16:9"
)
# Image to video
# video_url = client.image_to_video(
# image_url="https://example.com/image.jpg",
# prompt="camera slowly pans across the scene",
# duration=4
# )
print("\nRunwayML Gen-2 offers:")
print(" ⢠Text-to-video generation")
print(" ⢠Image-to-video animation")
print(" ⢠Video-to-video style transfer")
print(" ⢠4K resolution support")
print("\nGet API key: https://runwayml.com")
Face Animation with D-ID
import requests
import os
import time
class DIDClient:
"""Client for D-ID talking head API"""
def __init__(self, api_key=None):
self.api_key = api_key or os.getenv('DID_API_KEY')
self.base_url = "https://api.d-id.com"
self.headers = {
"Authorization": f"Basic {self.api_key}",
"Content-Type": "application/json"
}
def create_talking_head(
self,
image_url,
text=None,
audio_url=None,
voice_id="en-US-JennyNeural"
):
"""
Animate face to speak text or audio
Args:
image_url: URL of face image
text: Text to speak (if no audio)
audio_url: Audio URL (if no text)
voice_id: Voice for TTS
"""
payload = {
"source_url": image_url,
}
# Add script (text or audio)
if audio_url:
payload["script"] = {
"type": "audio",
"audio_url": audio_url
}
else:
payload["script"] = {
"type": "text",
"input": text,
"provider": {
"type": "microsoft",
"voice_id": voice_id
}
}
# Create talk
response = requests.post(
f"{self.base_url}/talks",
headers=self.headers,
json=payload
)
talk_id = response.json()['id']
print(f"Talking head creation started: {talk_id}")
# Poll for result
while True:
status_response = requests.get(
f"{self.base_url}/talks/{talk_id}",
headers=self.headers
)
result = status_response.json()
if result['status'] == 'done':
video_url = result['result_url']
print(f"ā Talking head created: {video_url}")
return video_url
elif result['status'] == 'error':
print(f"ā Error: {result.get('error')}")
return None
print(f"Status: {result['status']}")
time.sleep(2)
def list_voices(self):
"""List available voices"""
response = requests.get(
f"{self.base_url}/tts/voices",
headers=self.headers
)
voices = response.json()
print("Available voices:")
for voice in voices[:5]:
print(f" {voice['voice_id']}: {voice['name']} ({voice['language']})")
return voices
# Example usage
did_client = DIDClient()
# Create talking head from image + text
video = did_client.create_talking_head(
image_url="https://example.com/portrait.jpg",
text="Hello! I am an AI-generated talking head. This technology can make any image speak.",
voice_id="en-US-JennyNeural"
)
# List available voices
# did_client.list_voices()
print("\nD-ID enables:")
print(" ⢠Talking head videos from photos")
print(" ⢠Lip-sync to any audio")
print(" ⢠100+ voices in multiple languages")
print(" ⢠Custom avatar creation")
Video Style Transfer
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
import cv2
import numpy as np
from PIL import Image
def video_style_transfer(
video_path,
style_prompt,
output_path="styled_video.mp4",
fps=24
):
"""
Apply style transfer to video using Stable Diffusion
Args:
video_path: Input video
style_prompt: Style description
output_path: Output video
fps: Frames per second
"""
# Load video
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Processing video: {total_frames} frames @ {width}x{height}")
# Setup output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
# Load ControlNet for consistency
controlnet = ControlNetModel.from_pretrained(
"lllyasviel/sd-controlnet-canny",
torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5",
controlnet=controlnet,
torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
frame_count = 0
while True:
ret, frame = cap.read()
if not ret:
break
# Convert frame to canny edges for consistency
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 100, 200)
edges_rgb = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
edges_pil = Image.fromarray(edges_rgb)
# Apply style
styled = pipe(
prompt=style_prompt,
image=edges_pil,
num_inference_steps=20,
controlnet_conditioning_scale=0.8
).images[0]
# Convert back to video frame
styled_frame = cv2.cvtColor(np.array(styled), cv2.COLOR_RGB2BGR)
styled_frame = cv2.resize(styled_frame, (width, height))
out.write(styled_frame)
frame_count += 1
if frame_count % 10 == 0:
print(f"Processed {frame_count}/{total_frames} frames")
cap.release()
out.release()
print(f"ā Styled video saved: {output_path}")
# Example usage
# video_style_transfer(
# video_path="input_video.mp4",
# style_prompt="anime style, studio ghibli, hand-drawn animation",
# output_path="anime_styled_video.mp4"
# )
print("Video style transfer pipeline ready!")
print("\nStyle examples:")
print(" ⢠'oil painting by Van Gogh, impressionist'")
print(" ⢠'anime style, studio ghibli'")
print(" ⢠'cyberpunk neon aesthetic'")
print(" ⢠'pencil sketch, black and white'")
print(" ⢠'pixel art, 8-bit retro game'")
Deepfake Face Swapping
# Note: Use responsibly and ethically!
from insightface.app import FaceAnalysis
from insightface.model_zoo import get_model
import cv2
class FaceSwapper:
"""Face swapping for videos"""
def __init__(self):
# Initialize face analysis
self.app = FaceAnalysis(name='buffalo_l')
self.app.prepare(ctx_id=0, det_size=(640, 640))
# Load face swapper model
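        # Note: the inswapper_128.onnx weights are distributed separately from insightface;
        # download the file yourself and pass its local path here if automatic download is unavailable.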
self.swapper = get_model('inswapper_128.onnx')
print("Face Swapper initialized")
def swap_face_in_image(self, source_img, target_img):
"""
Swap face from source to target image
Args:
source_img: Image with face to use
target_img: Image to swap face into
"""
# Detect faces
source_faces = self.app.get(source_img)
target_faces = self.app.get(target_img)
if len(source_faces) == 0:
print("No face found in source image")
return target_img
if len(target_faces) == 0:
print("No face found in target image")
return target_img
# Get the first face from each
source_face = source_faces[0]
target_face = target_faces[0]
# Swap face
result = self.swapper.get(target_img, target_face, source_face, paste_back=True)
return result
def swap_face_in_video(self, source_img_path, video_path, output_path):
"""
Swap face throughout video
Args:
source_img_path: Path to source face image
video_path: Input video
output_path: Output video with swapped face
"""
# Load source image
source_img = cv2.imread(source_img_path)
source_faces = self.app.get(source_img)
if len(source_faces) == 0:
print("No face found in source image")
return
source_face = source_faces[0]
# Open video
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Setup output
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
frame_count = 0
print(f"Processing {total_frames} frames...")
while True:
ret, frame = cap.read()
if not ret:
break
# Detect face in frame
target_faces = self.app.get(frame)
if len(target_faces) > 0:
# Swap face
result = self.swapper.get(
frame,
target_faces[0],
source_face,
paste_back=True
)
out.write(result)
else:
# No face detected, use original frame
out.write(frame)
frame_count += 1
if frame_count % 30 == 0:
print(f"Progress: {frame_count}/{total_frames} frames")
cap.release()
out.release()
print(f"ā Face swapped video saved: {output_path}")
# Example usage
# swapper = FaceSwapper()
# Swap face in single image
# source = cv2.imread("person_a.jpg")
# target = cv2.imread("person_b.jpg")
# result = swapper.swap_face_in_image(source, target)
# cv2.imwrite("swapped.jpg", result)
# Swap face in video
# swapper.swap_face_in_video(
# source_img_path="source_face.jpg",
# video_path="target_video.mp4",
# output_path="face_swapped_video.mp4"
# )
print("\nā ļø ETHICAL CONSIDERATIONS:")
print(" ⢠Get consent before swapping faces")
print(" ⢠Don't create misleading content")
print(" ⢠Respect privacy and dignity")
print(" ⢠Follow local laws and regulations")
print(" ⢠Label synthetic media clearly")
AI Video Editing
from moviepy.editor import VideoFileClip, concatenate_videoclips, TextClip, CompositeVideoClip
import whisper
class AIVideoEditor:
"""AI-powered video editing assistant"""
def __init__(self):
# Load Whisper for transcription
self.whisper_model = whisper.load_model("base")
def transcribe_video(self, video_path):
"""
Transcribe video audio with timestamps
Returns segments with start/end times and text
"""
# Extract audio
video = VideoFileClip(video_path)
audio_path = "temp_audio.wav"
video.audio.write_audiofile(audio_path)
# Transcribe
result = self.whisper_model.transcribe(
audio_path,
task="transcribe",
verbose=False
)
print(f"ā Transcribed {len(result['segments'])} segments")
return result['segments']
def auto_caption_video(self, video_path, output_path="captioned.mp4"):
"""Add automatic captions to video"""
# Transcribe
segments = self.transcribe_video(video_path)
# Load video
video = VideoFileClip(video_path)
# Create caption clips
caption_clips = []
for segment in segments:
start = segment['start']
end = segment['end']
text = segment['text'].strip()
# Create text clip
txt_clip = TextClip(
text,
fontsize=40,
color='white',
bg_color='black',
                size=(int(video.w * 0.8), None),
method='caption'
).set_position(('center', 'bottom')).set_start(start).set_end(end)
caption_clips.append(txt_clip)
# Composite video with captions
final = CompositeVideoClip([video] + caption_clips)
# Export
final.write_videofile(output_path, codec='libx264', audio_codec='aac')
print(f"ā Captioned video saved: {output_path}")
def remove_silence(self, video_path, output_path="no_silence.mp4",
silence_threshold=-40):
"""
Remove silent parts from video
Args:
silence_threshold: Volume threshold in dB
"""
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
# Load video
video = VideoFileClip(video_path)
# Extract audio
audio_path = "temp_audio.wav"
video.audio.write_audiofile(audio_path)
# Detect non-silent parts
audio = AudioSegment.from_wav(audio_path)
nonsilent_ranges = detect_nonsilent(
audio,
min_silence_len=500, # milliseconds
silence_thresh=silence_threshold
)
# Create clips from non-silent parts
clips = []
for start_ms, end_ms in nonsilent_ranges:
start_sec = start_ms / 1000.0
end_sec = end_ms / 1000.0
clips.append(video.subclip(start_sec, end_sec))
# Concatenate clips
final = concatenate_videoclips(clips)
# Export
final.write_videofile(output_path, codec='libx264', audio_codec='aac')
print(f"ā Removed silence. Original: {video.duration:.1f}s, New: {final.duration:.1f}s")
return final
def extract_highlights(self, video_path, num_clips=5, clip_duration=10):
"""
Extract interesting segments from video
Uses scene detection
"""
from scenedetect import detect, ContentDetector
# Detect scenes
scene_list = detect(video_path, ContentDetector())
print(f"Detected {len(scene_list)} scenes")
# Load video
video = VideoFileClip(video_path)
# Extract clips
highlights = []
for i, scene in enumerate(scene_list[:num_clips]):
start = scene[0].get_seconds()
end = min(start + clip_duration, scene[1].get_seconds())
clip = video.subclip(start, end)
highlights.append(clip)
# Concatenate highlights
final = concatenate_videoclips(highlights)
output_path = "highlights.mp4"
final.write_videofile(output_path, codec='libx264')
print(f"ā Extracted {num_clips} highlight clips")
return final
# Example usage
editor = AIVideoEditor()
# Auto-caption video
# editor.auto_caption_video("interview.mp4", "captioned_interview.mp4")
# Remove silence
# editor.remove_silence("podcast.mp4", "podcast_no_silence.mp4")
# Extract highlights
# editor.extract_highlights("long_video.mp4", num_clips=5)
print("\nā AI Video Editor ready!")
print("\nFeatures:")
print(" ⢠Automatic transcription and captioning")
print(" ⢠Silence removal for cleaner videos")
print(" ⢠Highlight extraction from long videos")
print(" ⢠Scene detection and analysis")
Platform Comparison
| Platform | Capability | Quality | Best For |
|---|---|---|---|
| Stable Video Diffusion | Image-to-video | Very Good | Animating images, open-source |
| Runway Gen-2 | Text/image-to-video | Excellent | Professional content creation |
| Pika Labs | Text-to-video | Very Good | Creative video generation |
| D-ID | Talking heads | Excellent | Avatar videos, presentations |
| InsightFace | Face swapping | Very Good | Deepfakes, face replacement |
Key Takeaways
- Multiple approaches: Text-to-video, image-to-video, video-to-video
- Stable Video Diffusion: Open-source image animation
- Runway Gen-2: Industry-leading text-to-video
- Talking heads: Animate photos with speech
- Style transfer: Transform video aesthetics
- AI editing: Auto-captions, silence removal, highlights
- Deepfakes: Powerful but requires ethical use
- Use cases: Marketing, education, entertainment, accessibility