What is AI Video Generation?
AI video generation uses machine learning to create, edit, and enhance videos. From text-to-video synthesis to automated editing and deepfakes, AI is transforming video production. The sections below walk through each approach with code; a short environment setup sketch follows the list of key technologies.
Key Technologies:
- Text-to-Video: Generate videos from text descriptions
- Image-to-Video: Animate static images
- Video-to-Video: Style transfer and editing
- Deepfakes: Face swapping and manipulation
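The snippets in this section assume a working Python environment with the relevant libraries installed. Below is a minimal setup sketch: the pip package names are the standard PyPI ones for the libraries used later (adjust versions to your hardware), and the check confirms whether a CUDA GPU is available, since the local pipelines are impractically slow on CPU.
# Environment check for the examples in this section.
# Assumed PyPI package names (install the ones you need):
#   pip install diffusers transformers accelerate torch opencv-python moviepy \
#       openai-whisper insightface onnxruntime pydub scenedetect requests
import torch

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No CUDA GPU detected - local generation will be very slow; consider the hosted APIs")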
Stable Video Diffusion
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
import torch
# Load Stable Video Diffusion model
pipe = StableVideoDiffusionPipeline.from_pretrained(
"stabilityai/stable-video-diffusion-img2vid-xt",
torch_dtype=torch.float16,
variant="fp16"
)
# Enable memory optimizations
pipe.enable_model_cpu_offload()
# pipe.enable_vae_slicing() # For even lower memory
print("Stable Video Diffusion loaded!")
print("Model: img2vid-xt (image to video)")
print("Capabilities: Animate static images into short videos")
def generate_video_from_image(
image_path,
output_path="output_video.mp4",
num_frames=25,
fps=7,
motion_bucket_id=127,
noise_aug_strength=0.02,
decode_chunk_size=8
):
"""
Generate video from image using Stable Video Diffusion
Args:
image_path: Input image (will be resized to 1024x576)
output_path: Output video file
num_frames: Number of frames (14-25)
fps: Frames per second
motion_bucket_id: Amount of motion (1-255, higher = more motion)
noise_aug_strength: Noise augmentation (0.0-1.0)
decode_chunk_size: Memory/quality tradeoff
"""
# Load and prepare image
image = load_image(image_path)
image = image.resize((1024, 576))
print(f"Generating video from: {image_path}")
print(f"Settings: {num_frames} frames @ {fps} fps")
print(f"Motion level: {motion_bucket_id}/255")
# Generate video frames
frames = pipe(
image=image,
num_frames=num_frames,
decode_chunk_size=decode_chunk_size,
motion_bucket_id=motion_bucket_id,
noise_aug_strength=noise_aug_strength,
).frames[0]
# Export to video
export_to_video(frames, output_path, fps=fps)
print(f"ā Video saved to: {output_path}")
print(f"Duration: {num_frames/fps:.1f} seconds")
return frames
# Example: Animate an image
video_frames = generate_video_from_image(
image_path="landscape.jpg",
output_path="animated_landscape.mp4",
num_frames=25,
fps=7,
motion_bucket_id=127 # Moderate motion
)
print("\nā Image animated into video!")
RunwayML Gen-2
import requests
import time
import os
class RunwayMLClient:
"""Client for Runway Gen-2 API"""
def __init__(self, api_key=None):
self.api_key = api_key or os.getenv('RUNWAYML_API_KEY')
self.base_url = "https://api.runwayml.com/v1"
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
def text_to_video(self, prompt, duration=4, aspect_ratio="16:9"):
"""
Generate video from text prompt
Args:
prompt: Description of video
duration: Length in seconds (4 or 8)
aspect_ratio: "16:9" or "9:16"
"""
payload = {
"promptText": prompt,
"duration": duration,
"ratio": aspect_ratio,
"watermark": False
}
# Submit generation request
response = requests.post(
f"{self.base_url}/generate",
headers=self.headers,
json=payload
)
task_id = response.json()['id']
print(f"Generation started: {task_id}")
# Poll for completion
while True:
status_response = requests.get(
f"{self.base_url}/tasks/{task_id}",
headers=self.headers
)
status = status_response.json()
if status['status'] == 'SUCCEEDED':
video_url = status['output'][0]
print(f"ā Video generated: {video_url}")
return video_url
elif status['status'] == 'FAILED':
print(f"ā Generation failed: {status.get('error')}")
return None
print(f"Status: {status['status']} ({status.get('progress', 0)}%)")
time.sleep(3)
def image_to_video(self, image_url, prompt=None, duration=4):
"""Animate image into video"""
payload = {
"promptImage": image_url,
"promptText": prompt or "animate this image",
"duration": duration
}
response = requests.post(
f"{self.base_url}/generate",
headers=self.headers,
json=payload
)
task_id = response.json()['id']
print(f"Animation started: {task_id}")
# Wait for completion (similar polling as above)
# ... implementation
return task_id
# Example usage
client = RunwayMLClient()
# Text to video
video_url = client.text_to_video(
prompt="a serene ocean sunset with waves gently rolling onto the beach",
duration=4,
aspect_ratio="16:9"
)
# Image to video
# video_url = client.image_to_video(
# image_url="https://example.com/image.jpg",
# prompt="camera slowly pans across the scene",
# duration=4
# )
print("\nRunwayML Gen-2 offers:")
print(" ⢠Text-to-video generation")
print(" ⢠Image-to-video animation")
print(" ⢠Video-to-video style transfer")
print(" ⢠4K resolution support")
print("\nGet API key: https://runwayml.com")
Face Animation with D-ID
import requests
import os
import time
class DIDClient:
"""Client for D-ID talking head API"""
def __init__(self, api_key=None):
self.api_key = api_key or os.getenv('DID_API_KEY')
self.base_url = "https://api.d-id.com"
self.headers = {
"Authorization": f"Basic {self.api_key}",
"Content-Type": "application/json"
}
def create_talking_head(
self,
image_url,
text=None,
audio_url=None,
voice_id="en-US-JennyNeural"
):
"""
Animate face to speak text or audio
Args:
image_url: URL of face image
text: Text to speak (if no audio)
audio_url: Audio URL (if no text)
voice_id: Voice for TTS
"""
payload = {
"source_url": image_url,
}
# Add script (text or audio)
if audio_url:
payload["script"] = {
"type": "audio",
"audio_url": audio_url
}
else:
payload["script"] = {
"type": "text",
"input": text,
"provider": {
"type": "microsoft",
"voice_id": voice_id
}
}
# Create talk
response = requests.post(
f"{self.base_url}/talks",
headers=self.headers,
json=payload
)
talk_id = response.json()['id']
print(f"Talking head creation started: {talk_id}")
# Poll for result
while True:
status_response = requests.get(
f"{self.base_url}/talks/{talk_id}",
headers=self.headers
)
result = status_response.json()
if result['status'] == 'done':
video_url = result['result_url']
print(f"ā Talking head created: {video_url}")
return video_url
elif result['status'] == 'error':
print(f"ā Error: {result.get('error')}")
return None
print(f"Status: {result['status']}")
time.sleep(2)
def list_voices(self):
"""List available voices"""
response = requests.get(
f"{self.base_url}/tts/voices",
headers=self.headers
)
voices = response.json()
print("Available voices:")
for voice in voices[:5]:
print(f" {voice['voice_id']}: {voice['name']} ({voice['language']})")
return voices
# Example usage
did_client = DIDClient()
# Create talking head from image + text
video = did_client.create_talking_head(
image_url="https://example.com/portrait.jpg",
text="Hello! I am an AI-generated talking head. This technology can make any image speak.",
voice_id="en-US-JennyNeural"
)
# List available voices
# did_client.list_voices()
print("\nD-ID enables:")
print(" ⢠Talking head videos from photos")
print(" ⢠Lip-sync to any audio")
print(" ⢠100+ voices in multiple languages")
print(" ⢠Custom avatar creation")
Video Style Transfer
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
import cv2
import numpy as np
from PIL import Image
def video_style_transfer(
video_path,
style_prompt,
output_path="styled_video.mp4",
fps=24
):
"""
Apply style transfer to video using Stable Diffusion
Args:
video_path: Input video
style_prompt: Style description
output_path: Output video
fps: Frames per second
"""
# Load video
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Processing video: {total_frames} frames @ {width}x{height}")
# Setup output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
# Load ControlNet for consistency
controlnet = ControlNetModel.from_pretrained(
"lllyasviel/sd-controlnet-canny",
torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5",
controlnet=controlnet,
torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
frame_count = 0
while True:
ret, frame = cap.read()
if not ret:
break
# Convert frame to canny edges for consistency
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 100, 200)
edges_rgb = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
edges_pil = Image.fromarray(edges_rgb)
# Apply style
styled = pipe(
prompt=style_prompt,
image=edges_pil,
num_inference_steps=20,
controlnet_conditioning_scale=0.8
).images[0]
# Convert back to video frame
styled_frame = cv2.cvtColor(np.array(styled), cv2.COLOR_RGB2BGR)
styled_frame = cv2.resize(styled_frame, (width, height))
out.write(styled_frame)
frame_count += 1
if frame_count % 10 == 0:
print(f"Processed {frame_count}/{total_frames} frames")
cap.release()
out.release()
print(f"ā Styled video saved: {output_path}")
# Example usage
# video_style_transfer(
# video_path="input_video.mp4",
# style_prompt="anime style, studio ghibli, hand-drawn animation",
# output_path="anime_styled_video.mp4"
# )
print("Video style transfer pipeline ready!")
print("\nStyle examples:")
print(" ⢠'oil painting by Van Gogh, impressionist'")
print(" ⢠'anime style, studio ghibli'")
print(" ⢠'cyberpunk neon aesthetic'")
print(" ⢠'pencil sketch, black and white'")
print(" ⢠'pixel art, 8-bit retro game'")
Deepfake Face Swapping
# Note: Use responsibly and ethically!
from insightface.app import FaceAnalysis
from insightface.model_zoo import get_model
import cv2
class FaceSwapper:
"""Face swapping for videos"""
def __init__(self):
# Initialize face analysis
self.app = FaceAnalysis(name='buffalo_l')
self.app.prepare(ctx_id=0, det_size=(640, 640))
# Load face swapper model
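        # Note: the inswapper_128.onnx weights are distributed separately from insightface;
        # download the file yourself and pass its local path here if automatic download is unavailable.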
self.swapper = get_model('inswapper_128.onnx')
print("Face Swapper initialized")
def swap_face_in_image(self, source_img, target_img):
"""
Swap face from source to target image
Args:
source_img: Image with face to use
target_img: Image to swap face into
"""
# Detect faces
source_faces = self.app.get(source_img)
target_faces = self.app.get(target_img)
if len(source_faces) == 0:
print("No face found in source image")
return target_img
if len(target_faces) == 0:
print("No face found in target image")
return target_img
# Get the first face from each
source_face = source_faces[0]
target_face = target_faces[0]
# Swap face
result = self.swapper.get(target_img, target_face, source_face, paste_back=True)
return result
def swap_face_in_video(self, source_img_path, video_path, output_path):
"""
Swap face throughout video
Args:
source_img_path: Path to source face image
video_path: Input video
output_path: Output video with swapped face
"""
# Load source image
source_img = cv2.imread(source_img_path)
source_faces = self.app.get(source_img)
if len(source_faces) == 0:
print("No face found in source image")
return
source_face = source_faces[0]
# Open video
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Setup output
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
frame_count = 0
print(f"Processing {total_frames} frames...")
while True:
ret, frame = cap.read()
if not ret:
break
# Detect face in frame
target_faces = self.app.get(frame)
if len(target_faces) > 0:
# Swap face
result = self.swapper.get(
frame,
target_faces[0],
source_face,
paste_back=True
)
out.write(result)
else:
# No face detected, use original frame
out.write(frame)
frame_count += 1
if frame_count % 30 == 0:
print(f"Progress: {frame_count}/{total_frames} frames")
cap.release()
out.release()
print(f"ā Face swapped video saved: {output_path}")
# Example usage
# swapper = FaceSwapper()
# Swap face in single image
# source = cv2.imread("person_a.jpg")
# target = cv2.imread("person_b.jpg")
# result = swapper.swap_face_in_image(source, target)
# cv2.imwrite("swapped.jpg", result)
# Swap face in video
# swapper.swap_face_in_video(
# source_img_path="source_face.jpg",
# video_path="target_video.mp4",
# output_path="face_swapped_video.mp4"
# )
print("\nā ļø ETHICAL CONSIDERATIONS:")
print(" ⢠Get consent before swapping faces")
print(" ⢠Don't create misleading content")
print(" ⢠Respect privacy and dignity")
print(" ⢠Follow local laws and regulations")
print(" ⢠Label synthetic media clearly")
AI Video Editing
from moviepy.editor import VideoFileClip, concatenate_videoclips, TextClip, CompositeVideoClip
import whisper
class AIVideoEditor:
"""AI-powered video editing assistant"""
def __init__(self):
# Load Whisper for transcription
self.whisper_model = whisper.load_model("base")
def transcribe_video(self, video_path):
"""
Transcribe video audio with timestamps
Returns segments with start/end times and text
"""
# Extract audio
video = VideoFileClip(video_path)
audio_path = "temp_audio.wav"
video.audio.write_audiofile(audio_path)
# Transcribe
result = self.whisper_model.transcribe(
audio_path,
task="transcribe",
verbose=False
)
print(f"ā Transcribed {len(result['segments'])} segments")
return result['segments']
def auto_caption_video(self, video_path, output_path="captioned.mp4"):
"""Add automatic captions to video"""
# Transcribe
segments = self.transcribe_video(video_path)
# Load video
video = VideoFileClip(video_path)
# Create caption clips
caption_clips = []
for segment in segments:
start = segment['start']
end = segment['end']
text = segment['text'].strip()
# Create text clip
txt_clip = TextClip(
text,
fontsize=40,
color='white',
bg_color='black',
                size=(int(video.w * 0.8), None),
method='caption'
).set_position(('center', 'bottom')).set_start(start).set_end(end)
caption_clips.append(txt_clip)
# Composite video with captions
final = CompositeVideoClip([video] + caption_clips)
# Export
final.write_videofile(output_path, codec='libx264', audio_codec='aac')
print(f"ā Captioned video saved: {output_path}")
def remove_silence(self, video_path, output_path="no_silence.mp4",
silence_threshold=-40):
"""
Remove silent parts from video
Args:
silence_threshold: Volume threshold in dB
"""
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
# Load video
video = VideoFileClip(video_path)
# Extract audio
audio_path = "temp_audio.wav"
video.audio.write_audiofile(audio_path)
# Detect non-silent parts
audio = AudioSegment.from_wav(audio_path)
nonsilent_ranges = detect_nonsilent(
audio,
min_silence_len=500, # milliseconds
silence_thresh=silence_threshold
)
# Create clips from non-silent parts
clips = []
for start_ms, end_ms in nonsilent_ranges:
start_sec = start_ms / 1000.0
end_sec = end_ms / 1000.0
clips.append(video.subclip(start_sec, end_sec))
# Concatenate clips
final = concatenate_videoclips(clips)
# Export
final.write_videofile(output_path, codec='libx264', audio_codec='aac')
print(f"ā Removed silence. Original: {video.duration:.1f}s, New: {final.duration:.1f}s")
return final
def extract_highlights(self, video_path, num_clips=5, clip_duration=10):
"""
Extract interesting segments from video
Uses scene detection
"""
from scenedetect import detect, ContentDetector
# Detect scenes
scene_list = detect(video_path, ContentDetector())
print(f"Detected {len(scene_list)} scenes")
# Load video
video = VideoFileClip(video_path)
# Extract clips
highlights = []
for i, scene in enumerate(scene_list[:num_clips]):
start = scene[0].get_seconds()
end = min(start + clip_duration, scene[1].get_seconds())
clip = video.subclip(start, end)
highlights.append(clip)
# Concatenate highlights
final = concatenate_videoclips(highlights)
output_path = "highlights.mp4"
final.write_videofile(output_path, codec='libx264')
print(f"ā Extracted {num_clips} highlight clips")
return final
# Example usage
editor = AIVideoEditor()
# Auto-caption video
# editor.auto_caption_video("interview.mp4", "captioned_interview.mp4")
# Remove silence
# editor.remove_silence("podcast.mp4", "podcast_no_silence.mp4")
# Extract highlights
# editor.extract_highlights("long_video.mp4", num_clips=5)
print("\nā AI Video Editor ready!")
print("\nFeatures:")
print(" ⢠Automatic transcription and captioning")
print(" ⢠Silence removal for cleaner videos")
print(" ⢠Highlight extraction from long videos")
print(" ⢠Scene detection and analysis")
Platform Comparison
| Platform | Capability | Quality | Best For |
|---|---|---|---|
| Stable Video Diffusion | Image-to-video | Very Good | Animating images, open-source |
| Runway Gen-2 | Text/image-to-video | Excellent | Professional content creation |
| Pika Labs | Text-to-video | Very Good | Creative video generation |
| D-ID | Talking heads | Excellent | Avatar videos, presentations |
| InsightFace | Face swapping | Very Good | Deepfakes, face replacement |
Key Takeaways
- Multiple approaches: Text-to-video, image-to-video, video-to-video
- Stable Video Diffusion: Open-source image animation
- Runway Gen-2: Industry-leading text-to-video
- Talking heads: Animate photos with speech
- Style transfer: Transform video aesthetics
- AI editing: Auto-captions, silence removal, highlights
- Deepfakes: Powerful but requires ethical use
- Use cases: Marketing, education, entertainment, accessibility