πŸ—£οΈ Text-to-Speech & Voice Cloning

Transform text into natural-sounding speech

What is Text-to-Speech (TTS)?

Text-to-Speech technology converts written text into spoken audio. Modern AI-powered TTS systems produce incredibly natural, expressive speech with emotion, emphasis, and even voice cloning capabilities.

Key Technologies:

  • Neural TTS: Deep learning for natural prosody
  • Voice Cloning: Replicate any voice from samples
  • Multilingual: Support for 50+ languages
  • Real-time: Low-latency streaming synthesis

πŸŽ™οΈ Basic TTS with pyttsx3

import pyttsx3

# Initialize TTS engine (offline, cross-platform).
# `engine` is a module-level object reused by the statements that follow.
engine = pyttsx3.init()

# Get available voices and print each one with its list index, so a voice
# can later be selected by position.
voices = engine.getProperty('voices')
print("Available voices:")
for i, voice in enumerate(voices):
    print(f"  {i}: {voice.name} ({voice.gender}, {voice.age})")

# Configure voice properties
def configure_voice(engine, voice_index=0, rate=150, volume=1.0):
    """
    Configure a TTS engine's voice, speaking rate and volume.

    The voice list is read from ``engine`` itself instead of a module-level
    ``voices`` variable, so the function keeps working when that global is
    missing or has been rebound to something else.

    Args:
        engine: Object exposing ``getProperty`` / ``setProperty``
            (e.g. the value returned by ``pyttsx3.init()``)
        voice_index: Index into the engine's voice list
        rate: Speed in words per minute (this function defaults to 150)
        volume: Volume (0.0 to 1.0)

    Raises:
        IndexError: If ``voice_index`` does not address an available voice
    """
    available = engine.getProperty('voices')
    count = len(available)

    # Negative indexes stay valid (Python list semantics), but an
    # out-of-range index gets a message that names the actual limits.
    if not -count <= voice_index < count:
        raise IndexError(
            f"voice_index {voice_index} is out of range; "
            f"engine reports {count} voice(s)"
        )

    engine.setProperty('voice', available[voice_index].id)
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)

# Set voice
configure_voice(engine, voice_index=0, rate=150, volume=0.9)

# Speak text: say() queues the utterance, runAndWait() processes the queue
text = "Hello! I am a text-to-speech system. I can convert any text into natural speech."
engine.say(text)
engine.runAndWait()

# Save to file: the save is only queued here; the second runAndWait() call
# is what processes it.
# NOTE(review): the audio container pyttsx3 writes may depend on the platform
# speech driver rather than on the '.mp3' extension — confirm before relying on it.
engine.save_to_file(text, 'output_speech.mp3')
engine.runAndWait()

# Status message previously contained a mis-encoded check mark
print("✓ Speech generated and saved!")
print("\nNote: pyttsx3 uses system voices (offline but less natural)")

🌐 Google Text-to-Speech (gTTS)

from gtts import gTTS
import os
from playsound import playsound

def generate_speech_gtts(text, language='en', slow=False, output='speech.mp3'):
    """
    Generate speech using Google TTS and save it to a file.

    Args:
        text: Text to convert
        language: Language code (en, es, fr, de, ja, etc.)
        slow: Whether to speak slowly
        output: Output filename

    Returns:
        The ``output`` filename that was written.
    """

    # Build the request object, then write the synthesized audio to disk
    tts = gTTS(text=text, lang=language, slow=slow)
    tts.save(output)

    # Status message previously contained a mis-encoded check mark
    print(f"✓ Saved speech to: {output}")
    return output

# Example: English
text_en = "The quick brown fox jumps over the lazy dog."
generate_speech_gtts(text_en, language='en', output='english.mp3')

# Example: Spanish
# (literal repaired: it was mis-encoded, so the wrong text would be synthesized)
text_es = "La inteligencia artificial está transformando el mundo."
generate_speech_gtts(text_es, language='es', output='spanish.mp3')

# Example: Japanese
# (literal repaired: it was mis-encoded, so the wrong text would be synthesized)
text_ja = "人工知能は世界を変えています。"
generate_speech_gtts(text_ja, language='ja', output='japanese.mp3')

# Supported languages: tts_langs() is used here as a code -> name mapping
from gtts.lang import tts_langs
languages = tts_langs()

print(f"\ngTTS supports {len(languages)} languages:")
for code, name in list(languages.items())[:10]:
    print(f"  {code}: {name}")
print("  ... and more!")

# Play audio (optional)
# playsound('english.mp3')

print("\n✓ gTTS is free but requires internet connection")

🎨 Advanced TTS with Coqui TTS

from TTS.api import TTS
import torch

# List available models
# NOTE(review): slicing assumes list_models() returns a list; some TTS
# releases may return a manager object instead — confirm for the pinned version.
print("Available TTS models:")
print(TTS().list_models()[:5])  # Show first 5

# Load model
# Choose based on needs: quality vs speed
model_name = "tts_models/en/ljspeech/tacotron2-DDC"  # Good quality
# model_name = "tts_models/en/ljspeech/fast_pitch"  # Faster

# Module-level model instance; generate_speech_coqui() reads this global.
tts = TTS(model_name=model_name)

print(f"\nLoaded model: {model_name}")
# NOTE(review): this only reports whether CUDA is available; nothing in this
# block moves `tts` to the GPU — confirm whether that is intended.
print(f"Using GPU: {torch.cuda.is_available()}")

def generate_speech_coqui(text, output_path="output.wav", speaker=None):
    """
    Generate speech with Coqui TTS using the module-level ``tts`` model.

    Args:
        text: Text to synthesize
        output_path: Where the audio file is written
        speaker: Speaker name for multi-speaker models (None otherwise)

    Returns:
        The ``output_path`` that was written.
    """

    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker=speaker  # For multi-speaker models
    )

    # Status message previously contained a mis-encoded check mark
    print(f"✓ Generated: {output_path}")
    return output_path

# Generate speech
text = """
Artificial intelligence is revolutionizing how we interact with technology.
From voice assistants to content creation, AI is everywhere.
"""

generate_speech_coqui(text, output_path="ai_speech.wav")

# Multi-speaker model example (a second, separate model instance)
multi_speaker_model = "tts_models/en/vctk/vits"
tts_multi = TTS(model_name=multi_speaker_model)

# List available speakers
speakers = tts_multi.speakers
print(f"\nAvailable speakers: {len(speakers)}")
print(f"Sample speakers: {speakers[:5]}")

# Generate with specific speaker (first entry of the speaker list)
tts_multi.tts_to_file(
    text="Hello! I am speaker number one.",
    file_path="speaker1.wav",
    speaker=speakers[0]
)

# Status message previously contained a mis-encoded check mark
print("\n✓ Coqui TTS offers state-of-the-art quality!")

🎭 Voice Cloning with Coqui

# Voice cloning requires a voice sample
# XTTS model supports voice cloning

# Load XTTS model (supports voice cloning).
# `tts_clone` is the module-level model instance that clone_voice() calls.
xtts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
tts_clone = TTS(model_name=xtts_model)

def clone_voice(text, speaker_wav, language='en', output_path='cloned.wav'):
    """
    Clone a voice and generate speech with the module-level ``tts_clone`` model.

    Args:
        text: Text to speak
        speaker_wav: Path to voice sample (3-10 seconds recommended), or an
            iterable of such paths (NOTE(review): multi-sample support
            depends on the installed TTS version — confirm)
        language: Language code
        output_path: Output file

    Returns:
        The ``output_path`` that was written.

    Raises:
        FileNotFoundError: If a given voice sample path does not exist
    """
    import os

    # Fail fast on a missing sample instead of deep inside model inference.
    if speaker_wav is None:
        samples = []
    elif isinstance(speaker_wav, (str, os.PathLike)):
        samples = [speaker_wav]
    else:
        samples = list(speaker_wav)

    missing = [str(path) for path in samples if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(f"Voice sample not found: {', '.join(missing)}")

    print(f"Cloning voice from: {speaker_wav}")
    print(f"Generating: {text[:50]}...")

    tts_clone.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav,
        language=language
    )

    # Status message previously contained a mis-encoded check mark
    print(f"✓ Voice cloned and saved to: {output_path}")
    return output_path

# Example: Clone voice
# First, record or provide a voice sample
sample_text = """
Welcome to the future of voice synthesis. 
With just a short voice sample, AI can replicate your unique speaking style.
This technology opens up incredible possibilities for content creation.
"""

# Clone voice (uncomment with actual voice sample)
# clone_voice(
#     text=sample_text,
#     speaker_wav="voice_sample.wav",
#     language='en',
#     output_path='cloned_speech.wav'
# )

# Bullet characters below were previously mis-encoded
print("\nVoice Cloning Tips:")
print("  • Use 3-10 seconds of clear audio")
print("  • Single speaker, no background noise")
print("  • Neutral emotion for best results")
print("  • Higher quality sample = better cloning")

# Supported languages for XTTS
xtts_languages = [
    'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr',
    'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'
]

print(f"\nXTTS supports {len(xtts_languages)} languages for cloning!")

🎤 ElevenLabs API

from elevenlabs import generate, play, set_api_key, voices, Voice, VoiceSettings
import os

# Set API key from the ELEVENLABS_API_KEY environment variable.
# os.getenv() returns None when the variable is unset.
# NOTE(review): confirm how set_api_key() treats None.
set_api_key(os.getenv('ELEVENLABS_API_KEY'))
# Get key from: https://elevenlabs.io

def list_available_voices():
    """List all available voices"""
    
    all_voices = voices()
    
    print("Available ElevenLabs Voices:")
    for voice in all_voices:
        print(f"  {voice.name} ({voice.voice_id})")
        print(f"    Category: {voice.category}")
        print(f"    Description: {voice.description[:60]}...")
        print()
    
    return all_voices

# List voices: runs at import time, prints the catalogue and keeps the
# returned collection in a module-level name.
available_voices = list_available_voices()

def generate_elevenlabs_speech(
    text,
    voice_name="Rachel",  # Or use voice_id
    model="eleven_multilingual_v2",
    output_path="elevenlabs_output.mp3"
):
    """
    Generate speech with ElevenLabs and save it to ``output_path``.

    Args:
        text: Text to synthesize
        voice_name: Voice name (or voice id) passed to ``generate``
        model: ElevenLabs model identifier
        output_path: File the audio is written to (binary mode)

    Models:
        - eleven_monolingual_v1: English only, fast
        - eleven_multilingual_v1: 29 languages
        - eleven_multilingual_v2: Better quality, more languages
        - eleven_turbo_v2: Fastest, lower latency
        NOTE(review): model descriptions are carried over from the original
        text; confirm them against the current ElevenLabs model list.

    Returns:
        The audio object returned by ``generate`` (written as-is to the file).
    """

    audio = generate(
        text=text,
        voice=voice_name,
        model=model
    )

    # Save audio
    with open(output_path, 'wb') as f:
        f.write(audio)

    # Status message previously contained a mis-encoded check mark
    print(f"✓ Generated with ElevenLabs: {output_path}")
    return audio

# Generate speech
# Rebinds the module-level `text` used by the earlier examples.
text = """
ElevenLabs provides incredibly natural and expressive text-to-speech.
The voices are nearly indistinguishable from human speech,
with proper emotion, pacing, and intonation.
"""

# The helper writes the file and also returns the audio, kept here in `audio`.
audio = generate_elevenlabs_speech(
    text=text,
    voice_name="Rachel",
    model="eleven_multilingual_v2",
    output_path="elevenlabs_demo.mp3"
)

# Advanced: Custom voice settings
def generate_with_settings(text, voice_id, output_path="custom_voice.mp3",
                           stability=0.5, similarity_boost=0.75, style=0.5,
                           use_speaker_boost=True,
                           model="eleven_multilingual_v2"):
    """
    Generate speech with custom ElevenLabs voice settings.

    The settings used to be hard-coded; they are now parameters whose
    defaults are the previous constants, so existing calls behave the same.

    Args:
        text: Text to synthesize
        voice_id: ElevenLabs voice id
        output_path: File the audio is written to (binary mode)
        stability: 0-1, higher = more consistent
        similarity_boost: 0-1, higher = closer to original
        style: 0-1, exaggeration of speaking style
        use_speaker_boost: Enhance voice clarity
        model: ElevenLabs model identifier

    Returns:
        The audio object returned by ``generate``.
    """

    voice_settings = VoiceSettings(
        stability=stability,
        similarity_boost=similarity_boost,
        style=style,
        use_speaker_boost=use_speaker_boost
    )

    audio = generate(
        text=text,
        voice=Voice(voice_id=voice_id, settings=voice_settings),
        model=model
    )

    with open(output_path, 'wb') as f:
        f.write(audio)

    # Status message previously contained a mis-encoded check mark
    print(f"✓ Generated with custom settings: {output_path}")
    return audio

# Status message previously contained a mis-encoded check mark
print("\n✓ ElevenLabs offers the highest quality TTS!")

🎬 Bark: Text-to-Audio Model

from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile as wavfile
import numpy as np

# Load Bark model (processor and model come from the same checkpoint name)
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")

# Move to GPU if available; `device` is reused when preparing model inputs
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Bullet characters below were previously mis-encoded
print("Bark Model Loaded!")
print("Bark can generate:")
print("  • Speech in multiple languages")
print("  • Music")
print("  • Background noise")
print("  • Sound effects")

def generate_bark_speech(text, voice_preset="v2/en_speaker_6", output="bark_output.wav"):
    """
    Generate speech with Bark and write it to a WAV file.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        text: Prompt text (may contain bracketed cues such as "[laughs]")
        voice_preset: Bark voice preset name
        output: Output WAV filename

    Voice presets:
        - v2/en_speaker_0 to v2/en_speaker_9: English voices
        - v2/es_speaker_0 to v2/es_speaker_9: Spanish voices
        - v2/fr_speaker_0 to v2/fr_speaker_9: French voices
        - ... and more languages

    Returns:
        The generated audio as a NumPy array (after ``squeeze()``).
    """

    # Process text and place the tensors on the same device as the model
    inputs = processor(text, voice_preset=voice_preset).to(device)

    # Generate audio, then bring it back to the CPU as a NumPy array
    audio_array = model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()

    # Save audio using the sample rate from the model's generation config
    sample_rate = model.generation_config.sample_rate
    wavfile.write(output, rate=sample_rate, data=audio_array)

    # Status message previously contained a mis-encoded check mark
    print(f"✓ Generated with Bark: {output}")
    return audio_array

# Generate speech
text = "Hello, I am Bark! [laughs] I can even add non-speech sounds and emotions!"

generate_bark_speech(
    text=text,
    voice_preset="v2/en_speaker_6",
    output="bark_demo.wav"
)

# Bark special features
# (the music-note characters were mis-encoded; repaired so the prompt
# carries the intended symbols)
special_examples = {
    'Laughter': "That's hilarious! [laughs]",
    'Music': "♪ La la la ♪ [music]",
    'Gasps': "[gasps] Oh my goodness!",
    'Clears throat': "[clears throat] Let me explain...",
    'Sighs': "[sighs] That's unfortunate.",
}

print("\nBark Special Features:")
for feature, example in special_examples.items():
    print(f"  {feature}: {example}")

# Generate with emotion
emotional_text = """
[excited] This is absolutely amazing! 
[whispers] But let me tell you a secret... 
[laughs] AI can be really fun to work with!
"""

generate_bark_speech(
    text=emotional_text,
    voice_preset="v2/en_speaker_9",
    output="bark_emotional.wav"
)

print("\n✓ Bark is unique in generating non-speech audio!")

πŸŽ™οΈ Real-time Streaming TTS

import asyncio
from elevenlabs import stream

async def stream_text_to_speech(text, voice="Rachel"):
    """
    Stream TTS for real-time playback
    Useful for long-form content and live applications
    """
    
    # Split text into sentences for streaming
    sentences = text.split('. ')
    
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue
            
        print(f"Streaming sentence {i+1}/{len(sentences)}...")
        
        # Generate and stream audio
        audio_stream = generate(
            text=sentence + '.',
            voice=voice,
            model="eleven_turbo_v2",  # Use turbo for low latency
            stream=True
        )
        
        # In production, send to audio player
        stream(audio_stream)
    
    print("βœ“ Streaming complete!")

# Example: Stream long text
# Multi-sentence sample input for stream_text_to_speech().
long_text = """
Artificial intelligence is transforming the world. 
From healthcare to education, AI is making an impact.
Voice synthesis technology has reached impressive levels of quality.
We can now generate natural-sounding speech in real-time.
This opens up new possibilities for accessibility and content creation.
"""

# Stream speech (uncomment to run)
# stream_text_to_speech is a coroutine function, hence asyncio.run().
# asyncio.run(stream_text_to_speech(long_text))

# Chunked generation for long texts
def generate_long_text_tts(text, chunk_size=500, output_dir="tts_chunks"):
    """Generate TTS for very long texts in chunks"""
    
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Split into chunks
    words = text.split()
    chunks = [' '.join(words[i:i+chunk_size]) 
              for i in range(0, len(words), chunk_size)]
    
    audio_files = []
    
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        
        output_path = f"{output_dir}/chunk_{i+1:03d}.mp3"
        
        audio = generate_elevenlabs_speech(
            text=chunk,
            voice_name="Rachel",
            output_path=output_path
        )
        
        audio_files.append(output_path)
    
    print(f"\nβœ“ Generated {len(chunks)} audio chunks")
    return audio_files

# Status message previously contained a mis-encoded check mark
print("\n✓ Streaming TTS reduces latency for real-time applications!")

🛠️ TTS Production Application

class TTSApp:
    """
    Production-ready TTS application

    Dispatches to this module's provider helpers
    (generate_elevenlabs_speech, generate_speech_coqui, generate_speech_gtts)
    and keeps an in-memory cache of their return values.
    """

    def __init__(self, provider='elevenlabs', voice='Rachel'):
        """
        Args:
            provider: 'elevenlabs', 'coqui', or anything else (falls back to gTTS)
            voice: Voice name; only forwarded to the ElevenLabs helper
        """
        self.provider = provider
        self.voice = voice
        # (provider, voice, text) -> value returned by the provider helper
        self.audio_cache = {}

    def _cache_key(self, text):
        """Return the cache key for ``text`` under the current provider/voice."""
        # The text itself is part of the key: unlike hash(text), two different
        # texts can never collide on the same entry.
        return (self.provider, self.voice, text)

    @staticmethod
    def _write_cached(cached, output_path):
        """Materialize a cached result at ``output_path``; return False if impossible."""
        import os
        import shutil

        if isinstance(cached, (bytes, bytearray)):
            with open(output_path, 'wb') as f:
                f.write(cached)
            return True

        if isinstance(cached, str) and os.path.isfile(cached):
            # Cached value is a path to a previously written file
            if os.path.abspath(cached) != os.path.abspath(output_path):
                shutil.copyfile(cached, output_path)
            return True

        return False

    @staticmethod
    def _volume_to_db(volume):
        """Convert a linear volume multiplier into a gain in decibels."""
        import math

        if volume < 0:
            raise ValueError(f"volume must be non-negative, got {volume}")
        if volume == 0:
            # log10(0) is undefined; zero volume means infinite attenuation
            return float('-inf')
        return 20 * math.log10(volume)

    def text_to_speech(self, text, output_path=None, cache=True):
        """
        Generate speech with caching

        Args:
            text: Input text
            output_path: Where to save audio
            cache: Whether to cache generated audio

        Returns:
            Whatever the selected provider helper returns (the ElevenLabs
            helper returns audio data, the Coqui and gTTS helpers return the
            output path), or the cached value on a cache hit.

        Note:
            Without ``output_path`` every call writes to the same
            "temp_audio.*" file, so a cached *path* may later point at audio
            from a different call.
        """

        # Check cache
        cache_key = self._cache_key(text)

        if cache and cache_key in self.audio_cache:
            cached = self.audio_cache[cache_key]
            # A hit must still honour an explicit output_path; if the cached
            # result cannot be written there, fall through and regenerate.
            if output_path is None or self._write_cached(cached, output_path):
                print("✓ Retrieved from cache")
                return cached

        # Generate audio based on provider
        if self.provider == 'elevenlabs':
            audio = generate_elevenlabs_speech(
                text=text,
                voice_name=self.voice,
                output_path=output_path or "temp_audio.mp3"
            )
        elif self.provider == 'coqui':
            audio = generate_speech_coqui(
                text=text,
                output_path=output_path or "temp_audio.wav"
            )
        else:
            audio = generate_speech_gtts(
                text=text,
                output=output_path or "temp_audio.mp3"
            )

        # Cache if enabled
        if cache:
            self.audio_cache[cache_key] = audio

        return audio

    def generate_audiobook(self, chapters, output_dir='audiobook'):
        """
        Generate audiobook from chapters

        Args:
            chapters: Sequence of dicts with 'title' and 'text' keys
            output_dir: Directory for the chapter files (created if missing)
        """

        import os
        os.makedirs(output_dir, exist_ok=True)

        for i, chapter in enumerate(chapters, start=1):
            print(f"\nGenerating Chapter {i}/{len(chapters)}...")

            output_path = f"{output_dir}/chapter_{i:02d}.mp3"

            # Caching is disabled: each chapter is generated exactly once
            self.text_to_speech(
                text=chapter['text'],
                output_path=output_path,
                cache=False
            )

            print(f"✓ Chapter {i} complete: {chapter['title']}")

        print(f"\n✓ Audiobook generated in '{output_dir}/'")

    def add_background_music(self, speech_path, music_path, output_path,
                            speech_volume=1.0, music_volume=0.3):
        """
        Mix speech with background music

        Args:
            speech_path: Audio file containing the speech
            music_path: Audio file containing the music
            output_path: Where the mix is written (exported as mp3)
            speech_volume: Linear volume multiplier for the speech (>= 0)
            music_volume: Linear volume multiplier for the music (>= 0)

        Returns:
            The ``output_path`` that was written.

        Raises:
            ValueError: If a volume is negative, or the music track is
                empty while the speech is not
        """

        from pydub import AudioSegment

        # Validate volumes before any file is decoded
        speech_gain = self._volume_to_db(speech_volume)
        music_gain = self._volume_to_db(music_volume)

        # Load audio files
        speech = AudioSegment.from_file(speech_path)
        music = AudioSegment.from_file(music_path)

        # Adjust volumes (adding a number to a segment applies a gain in dB)
        speech = speech + speech_gain
        music = music + music_gain

        # Loop music to match speech length
        if len(music) < len(speech):
            if len(music) == 0:
                raise ValueError(f"Music file has no audio: {music_path}")
            repeats = len(speech) // len(music) + 1
            music = music * repeats

        # Trim music to speech length
        music = music[:len(speech)]

        # Mix audio
        mixed = speech.overlay(music)

        # Export
        mixed.export(output_path, format='mp3')

        print(f"✓ Mixed audio saved to: {output_path}")
        return output_path

# Example usage
app = TTSApp(provider='elevenlabs', voice='Rachel')

# Generate single audio
text = "Welcome to our podcast about artificial intelligence and its impact on society."
app.text_to_speech(text, output_path="podcast_intro.mp3")

# Generate audiobook chapters: each entry provides the 'title' and 'text'
# keys that generate_audiobook() reads
chapters = [
    {
        'title': 'Introduction',
        'text': 'Welcome to Chapter One. In this chapter, we explore the basics...'
    },
    {
        'title': 'Deep Learning',
        'text': 'Chapter Two focuses on deep learning fundamentals...'
    },
]

# app.generate_audiobook(chapters)

# Status message previously contained a mis-encoded check mark
print("\n✓ TTS application ready for production use!")

📊 TTS Provider Comparison

| Provider | Quality | Cost | Best For |
| --- | --- | --- | --- |
| pyttsx3 | Basic | Free, offline | Simple apps, no internet |
| gTTS | Good | Free | Quick prototypes, many languages |
| Coqui TTS | Very Good | Free, open-source | Self-hosted, customizable |
| ElevenLabs | Excellent | $5-$330/mo | Production, highest quality |
| Bark | Very Good | Free, open-source | Emotional speech, sound effects |
| Azure TTS | Excellent | Pay per use | Enterprise, many voices |

🎯 Key Takeaways