Text-to-Speech & Voice Cloning

What is Text-to-Speech (TTS)?

Text-to-Speech technology converts written text into spoken audio. Modern AI-powered TTS systems produce natural, expressive speech with emotion and emphasis, and many of them can also clone a voice from a short audio sample.

Key Technologies:
- Neural TTS: Deep learning for natural prosody
- Voice Cloning: Replicate any voice from samples
- Multilingual: Support for 50+ languages
- Real-time: Low-latency streaming synthesis

            

🎙️ Basic TTS with pyttsx3

import pyttsx3

# Initialize TTS engine (offline, cross-platform)
engine = pyttsx3.init()

# Get available voices
# NOTE: `voices` is a module-level name. The ElevenLabs section further down
# imports a function that is also called `voices`, which rebinds this name
# if both sections run in the same module.
voices = engine.getProperty('voices')
print("Available voices:")
# Print each voice with its index; configure_voice() takes that index as
# `voice_index`.
for i, voice in enumerate(voices):
    # NOTE(review): gender/age may be unset for some system voices — confirm
    # per driver.
    print(f"  {i}: {voice.name} ({voice.gender}, {voice.age})")

# Configure voice properties
def configure_voice(engine, voice_index=0, rate=150, volume=1.0):
    """
    Configure TTS engine

    Args:
        engine: Engine object exposing getProperty() / setProperty()
        voice_index: Index into the engine's own voice list
        rate: Speed in words per minute (this function defaults to 150)
        volume: Volume (0.0 to 1.0)

    Raises:
        IndexError: If voice_index does not refer to an available voice
    """
    # Ask the engine for its voices instead of reading a module-level
    # `voices` name: that global belongs to one particular engine and is
    # rebound when `voices` is imported from elevenlabs later in the file.
    available = engine.getProperty('voices')

    try:
        chosen = available[voice_index]
    except IndexError:
        raise IndexError(
            f"voice_index {voice_index} is out of range; "
            f"engine reports {len(available)} voice(s)"
        ) from None

    engine.setProperty('voice', chosen.id)
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)

# Set voice
configure_voice(engine, voice_index=0, rate=150, volume=0.9)

# Speak text
text = "Hello! I am a text-to-speech system. I can convert any text into natural speech."
engine.say(text)
# Each request (say / save_to_file) is followed by its own runAndWait() call.
engine.runAndWait()

# Save to file
# NOTE(review): the audio container pyttsx3 writes appears to depend on the
# platform driver — confirm the result really is MP3 before relying on the
# .mp3 extension.
engine.save_to_file(text, 'output_speech.mp3')
engine.runAndWait()

print("✓ Speech generated and saved!")
print("\nNote: pyttsx3 uses system voices (offline but less natural)")

🌐 Google Text-to-Speech (gTTS)

from gtts import gTTS
import os
from playsound import playsound

def generate_speech_gtts(text, language='en', slow=False, output='speech.mp3'):
    """
    Synthesize `text` with Google TTS and write the audio to `output`

    Args:
        text: Text to convert
        language: Language code (en, es, fr, de, ja, etc.)
        slow: Whether to speak slowly
        output: Output filename

    Returns:
        The `output` filename that was written
    """

    # Build the request and persist the audio in a single expression
    gTTS(text=text, lang=language, slow=slow).save(output)

    print(f"✓ Saved speech to: {output}")
    return output

# Each example below writes its own file under a relative path (resolved
# against the current working directory); the returned filename is not used.

# Example: English
text_en = "The quick brown fox jumps over the lazy dog."
generate_speech_gtts(text_en, language='en', output='english.mp3')

# Example: Spanish
text_es = "La inteligencia artificial está transformando el mundo."
generate_speech_gtts(text_es, language='es', output='spanish.mp3')

# Example: Japanese
text_ja = "人工知能は世界を変えています。"
generate_speech_gtts(text_ja, language='ja', output='japanese.mp3')

# Supported languages
from gtts.lang import tts_langs
# The result is used as a mapping of language code -> language name
# (see the .items() loop below).
languages = tts_langs()

print(f"\ngTTS supports {len(languages)} languages:")
# Only the first 10 entries are printed to keep the output short.
for code, name in list(languages.items())[:10]:
    print(f"  {code}: {name}")
print("  ... and more!")

# Play audio (optional)
# playsound('english.mp3')

print("\n✓ gTTS is free but requires internet connection")

🎨 Advanced TTS with Coqui TTS

from TTS.api import TTS
import torch

# List available models
print("Available TTS models:")
# NOTE(review): the result is sliced, so this assumes list_models() returns
# a sequence — confirm against the installed Coqui TTS version.
print(TTS().list_models()[:5])  # Show first 5

# Load model
# Choose based on needs: quality vs speed
model_name = "tts_models/en/ljspeech/tacotron2-DDC"  # Good quality
# model_name = "tts_models/en/ljspeech/fast_pitch"  # Faster

# Module-level model instance; generate_speech_coqui() reads this `tts` name.
tts = TTS(model_name=model_name)

print(f"\nLoaded model: {model_name}")
# This only reports whether CUDA is available; nothing in this snippet moves
# the model onto the GPU.
print(f"Using GPU: {torch.cuda.is_available()}")

def generate_speech_coqui(text, output_path="output.wav", speaker=None):
    """
    Synthesize `text` with the module-level Coqui `tts` model

    Args:
        text: Text to speak
        output_path: Audio file to write
        speaker: Speaker name, for multi-speaker models

    Returns:
        The `output_path` that was written
    """

    # Collect the call arguments first, then hand them over in one go
    synthesis_args = {
        "text": text,
        "file_path": output_path,
        "speaker": speaker,
    }
    tts.tts_to_file(**synthesis_args)

    print(f"✓ Generated: {output_path}")
    return output_path

# Generate speech
# The triple-quoted literal begins and ends with a newline; it is passed to
# the model unchanged. This also rebinds the module-level `text` name.
text = """
Artificial intelligence is revolutionizing how we interact with technology.
From voice assistants to content creation, AI is everywhere.
"""

generate_speech_coqui(text, output_path="ai_speech.wav")

# Multi-speaker model example
multi_speaker_model = "tts_models/en/vctk/vits"
# A second model instance, independent of the `tts` instance loaded earlier.
tts_multi = TTS(model_name=multi_speaker_model)

# List available speakers
# `speakers` is used with len(), slicing and indexing below.
speakers = tts_multi.speakers
print(f"\nAvailable speakers: {len(speakers)}")
print(f"Sample speakers: {speakers[:5]}")

# Generate with specific speaker
# speakers[0] raises IndexError if the model reports no speakers.
tts_multi.tts_to_file(
    text="Hello! I am speaker number one.",
    file_path="speaker1.wav",
    speaker=speakers[0]
)

print("\n✓ Coqui TTS offers state-of-the-art quality!")

🎭 Voice Cloning with Coqui

# Voice cloning requires a voice sample
# XTTS model supports voice cloning

# Load XTTS model (supports voice cloning)
xtts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
# Module-level instance; clone_voice() reads this `tts_clone` name.
tts_clone = TTS(model_name=xtts_model)

def clone_voice(text, speaker_wav, language='en', output_path='cloned.wav'):
    """
    Speak `text` in the voice captured by a reference recording

    Uses the module-level `tts_clone` model.

    Args:
        text: Text to speak
        speaker_wav: Path to voice sample (3-10 seconds recommended)
        language: Language code
        output_path: Output file

    Returns:
        The `output_path` that was written
    """

    # Only the first 50 characters are echoed to keep the log line short
    preview = text[:50]
    print(f"Cloning voice from: {speaker_wav}")
    print(f"Generating: {preview}...")

    request = dict(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav,
        language=language,
    )
    tts_clone.tts_to_file(**request)

    print(f"✓ Voice cloned and saved to: {output_path}")
    return output_path

# Example: Clone voice
# First, record or provide a voice sample
# `sample_text` is only referenced by the commented-out call below.
sample_text = """
Welcome to the future of voice synthesis. 
With just a short voice sample, AI can replicate your unique speaking style.
This technology opens up incredible possibilities for content creation.
"""

# Clone voice (uncomment with actual voice sample)
# The call is left disabled because it needs a real recording at
# "voice_sample.wav".
# clone_voice(
#     text=sample_text,
#     speaker_wav="voice_sample.wav",
#     language='en',
#     output_path='cloned_speech.wav'
# )

# Practical guidance for recording the reference sample.
print("\nVoice Cloning Tips:")
print("  • Use 3-10 seconds of clear audio")
print("  • Single speaker, no background noise")
print("  • Neutral emotion for best results")
print("  • Higher quality sample = better cloning")

# Supported languages for XTTS
# NOTE(review): this is a hand-maintained list, not read from the model; the
# XTTS v2 model card may list further codes (e.g. 'hi') — confirm.
xtts_languages = [
    'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr',
    'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'
]

print(f"\nXTTS supports {len(xtts_languages)} languages for cloning!")

🎤 ElevenLabs API

from elevenlabs import generate, play, set_api_key, voices, Voice, VoiceSettings
import os

# Set API key
# os.getenv() returns None when ELEVENLABS_API_KEY is not set, in which case
# None is what gets passed to set_api_key().
set_api_key(os.getenv('ELEVENLABS_API_KEY'))
# Get key from: https://elevenlabs.io

def list_available_voices():
    """
    List all available voices

    Prints name, id, category and a short description for every voice
    returned by the ElevenLabs `voices()` call.

    Returns:
        The collection returned by `voices()`, unchanged
    """

    all_voices = voices()

    print("Available ElevenLabs Voices:")
    for voice in all_voices:
        # A missing description (None) would make the slice below raise
        # TypeError, so fall back to an empty string.
        description = voice.description or ""
        # Only add the ellipsis when text was actually cut off.
        if len(description) > 60:
            description = description[:60] + "..."

        print(f"  {voice.name} ({voice.voice_id})")
        print(f"    Category: {voice.category}")
        print(f"    Description: {description or '(none)'}")
        print()

    return all_voices

# List voices
# Runs when the module is executed: prints every voice and stores the
# returned collection in `available_voices`.
available_voices = list_available_voices()

def generate_elevenlabs_speech(
    text,
    voice_name="Rachel",  # Or use voice_id
    model="eleven_multilingual_v2",
    output_path="elevenlabs_output.mp3"
):
    """
    Synthesize `text` with ElevenLabs and store the audio on disk

    Args:
        text: Text to speak
        voice_name: Voice name (or voice id)
        model: Model identifier, see below
        output_path: File the audio is written to (binary mode)

    Returns:
        The audio object returned by `generate`

    Models:
        - eleven_monolingual_v1: English only, fast
        - eleven_multilingual_v1: 29 languages
        - eleven_multilingual_v2: Better quality, more languages
        - eleven_turbo_v2: Fastest, lower latency
    """

    request = {"text": text, "voice": voice_name, "model": model}
    audio = generate(**request)

    # Persist the audio exactly as returned
    with open(output_path, 'wb') as audio_file:
        audio_file.write(audio)

    print(f"✓ Generated with ElevenLabs: {output_path}")
    return audio

# Generate speech
# Rebinds the module-level `text` name; the literal keeps its leading and
# trailing newline.
text = """
ElevenLabs provides incredibly natural and expressive text-to-speech.
The voices are nearly indistinguishable from human speech,
with proper emotion, pacing, and intonation.
"""

# The returned audio is kept in `audio` in addition to being written to
# "elevenlabs_demo.mp3" by the helper.
audio = generate_elevenlabs_speech(
    text=text,
    voice_name="Rachel",
    model="eleven_multilingual_v2",
    output_path="elevenlabs_demo.mp3"
)

# Advanced: Custom voice settings
def generate_with_settings(
    text,
    voice_id,
    output_path="custom_voice.mp3",
    stability=0.5,
    similarity_boost=0.75,
    style=0.5,
    use_speaker_boost=True,
    model="eleven_multilingual_v2",
):
    """
    Generate with custom voice settings

    The voice settings used to be fixed inside the function; they are now
    parameters whose defaults reproduce the previous behaviour.

    Args:
        text: Text to speak
        voice_id: ElevenLabs voice id
        output_path: File the audio is written to (binary mode)
        stability: 0-1, higher = more consistent
        similarity_boost: 0-1, higher = closer to original
        style: 0-1, exaggeration of speaking style
        use_speaker_boost: Enhance voice clarity
        model: Model identifier passed to `generate`

    Returns:
        The audio object returned by `generate`
    """

    settings = VoiceSettings(
        stability=stability,
        similarity_boost=similarity_boost,
        style=style,
        use_speaker_boost=use_speaker_boost,
    )

    audio = generate(
        text=text,
        voice=Voice(voice_id=voice_id, settings=settings),
        model=model,
    )

    with open(output_path, 'wb') as f:
        f.write(audio)

    print(f"✓ Generated with custom settings: {output_path}")
    return audio

# Closing status message for the ElevenLabs section (informational only).
print("\n✓ ElevenLabs offers the highest quality TTS!")

🎬 Bark: Text-to-Audio Model

from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile as wavfile
import numpy as np

# Load Bark model
# NOTE: this rebinds the module-level name `model`; generate_bark_speech()
# reads `processor`, `model` and `device` from module scope.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")

# Move to GPU if available
# NOTE: `torch` is not imported in this section; it relies on the
# `import torch` from the Coqui section earlier in the file.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print("Bark Model Loaded!")
print("Bark can generate:")
print("  • Speech in multiple languages")
print("  • Music")
print("  • Background noise")
print("  • Sound effects")

def generate_bark_speech(text, voice_preset="v2/en_speaker_6", output="bark_output.wav"):
    """
    Synthesize `text` with Bark and write it to a WAV file

    Relies on the module-level `processor`, `model` and `device`.

    Args:
        text: Prompt text
        voice_preset: Speaker preset name, see below
        output: WAV file to write

    Returns:
        The generated waveform as a NumPy array

    Voice presets:
        - v2/en_speaker_0 to v2/en_speaker_9: English voices
        - v2/es_speaker_0 to v2/es_speaker_9: Spanish voices
        - v2/fr_speaker_0 to v2/fr_speaker_9: French voices
        - ... and more languages
    """

    # Encode the prompt and place the tensors on the model's device
    encoded = processor(text, voice_preset=voice_preset).to(device)

    # Generate, then bring the result back to the CPU as a squeezed array
    waveform = model.generate(**encoded).cpu().numpy().squeeze()

    # The sample rate comes from the model's generation config
    wavfile.write(
        output,
        rate=model.generation_config.sample_rate,
        data=waveform,
    )

    print(f"✓ Generated with Bark: {output}")
    return waveform

# Generate speech
# Rebinds the module-level `text` name with a prompt containing a bracketed
# non-speech cue.
text = "Hello, I am Bark! [laughs] I can even add non-speech sounds and emotions!"

generate_bark_speech(
    text=text,
    voice_preset="v2/en_speaker_6",
    output="bark_demo.wav"
)

# Bark special features
# Display-only table: these prompts are printed below, not synthesized.
special_examples = {
    'Laughter': "That's hilarious! [laughs]",
    'Music': "♪ La la la ♪ [music]",
    'Gasps': "[gasps] Oh my goodness!",
    'Clears throat': "[clears throat] Let me explain...",
    'Sighs': "[sighs] That's unfortunate.",
}

print("\nBark Special Features:")
for feature, example in special_examples.items():
    print(f"  {feature}: {example}")

# Generate with emotion
# NOTE(review): confirm that [excited] and [whispers] are cues Bark actually
# responds to — they differ from the tags listed in `special_examples`.
emotional_text = """
[excited] This is absolutely amazing! 
[whispers] But let me tell you a secret... 
[laughs] AI can be really fun to work with!
"""

generate_bark_speech(
    text=emotional_text,
    voice_preset="v2/en_speaker_9",
    output="bark_emotional.wav"
)

print("\n✓ Bark is unique in generating non-speech audio!")

🎙️ Real-time Streaming TTS

import asyncio
from elevenlabs import stream

async def stream_text_to_speech(text, voice="Rachel"):
    """
    Stream TTS for real-time playback
    Useful for long-form content and live applications

    The text is split into sentences, and each sentence is generated and
    streamed in turn.

    Args:
        text: Text to speak; may span several lines
        voice: ElevenLabs voice name

    Note:
        The body contains no `await`, so although this is a coroutine the
        generate/stream calls run synchronously inside it.
    """
    import re  # local import: this section does not import re at file level

    # Split after sentence-ending punctuation followed by ANY whitespace.
    # Splitting on the literal '. ' missed sentences separated by a newline
    # and, combined with re-appending '.', doubled the final period.
    fragments = re.split(r'(?<=[.!?])\s+', text.strip())
    sentences = [fragment.strip() for fragment in fragments if fragment.strip()]

    total = len(sentences)  # counts only sentences that will be spoken
    for number, sentence in enumerate(sentences, start=1):
        # Terminate unpunctuated text so every request ends a sentence.
        if sentence[-1] not in '.!?':
            sentence += '.'

        print(f"Streaming sentence {number}/{total}...")

        # Generate and stream audio
        audio_stream = generate(
            text=sentence,
            voice=voice,
            model="eleven_turbo_v2",  # Use turbo for low latency
            stream=True
        )

        # In production, send to audio player
        stream(audio_stream)

    print("✓ Streaming complete!")

# Example: Stream long text
# Sentences in this literal are separated by line breaks; the first line
# also has a trailing space before its line break.
long_text = """
Artificial intelligence is transforming the world. 
From healthcare to education, AI is making an impact.
Voice synthesis technology has reached impressive levels of quality.
We can now generate natural-sounding speech in real-time.
This opens up new possibilities for accessibility and content creation.
"""

# Stream speech (uncomment to run)
# asyncio.run(stream_text_to_speech(long_text))

# Chunked generation for long texts
def generate_long_text_tts(text, chunk_size=500, output_dir="tts_chunks"):
    """
    Generate TTS for very long texts in chunks

    Args:
        text: Input text; it is split on whitespace
        chunk_size: Maximum number of WORDS (not characters) per chunk;
            must be at least 1
        output_dir: Directory for the chunk files; created if missing

    Returns:
        List of chunk file paths, in reading order (empty for empty text)

    Raises:
        ValueError: If chunk_size is smaller than 1
    """
    import os

    # Fail early with a clear message: zero would raise an opaque range()
    # error and a negative value would silently produce no audio at all.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive word count, got {chunk_size}"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Split into chunks of at most `chunk_size` words
    words = text.split()
    chunks = [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

    audio_files = []
    total = len(chunks)

    for number, chunk in enumerate(chunks, start=1):
        print(f"Processing chunk {number}/{total}...")

        output_path = os.path.join(output_dir, f"chunk_{number:03d}.mp3")

        # The helper writes the file itself; only the path is kept here.
        generate_elevenlabs_speech(
            text=chunk,
            voice_name="Rachel",
            output_path=output_path
        )

        audio_files.append(output_path)

    print(f"\n✓ Generated {total} audio chunks")
    return audio_files

# Closing status message for the streaming section (informational only).
print("\n✓ Streaming TTS reduces latency for real-time applications!")

🛠️ TTS Production Application

class TTSApp:
    """
    Production-ready TTS application

    Puts one interface in front of the module-level provider helpers
    (generate_elevenlabs_speech, generate_speech_coqui,
    generate_speech_gtts), adds an in-memory result cache, and offers
    audiobook generation and speech/music mixing.
    """

    # Extension of the files each provider's helper writes. Any provider
    # name not listed here is served by gTTS, which writes MP3.
    _EXTENSIONS = {'elevenlabs': 'mp3', 'coqui': 'wav'}

    def __init__(self, provider='elevenlabs', voice='Rachel'):
        """
        Args:
            provider: 'elevenlabs', 'coqui', or any other value for gTTS
            voice: Voice name; only the ElevenLabs branch passes it on
        """
        self.provider = provider
        self.voice = voice
        # Maps (provider, voice, text) -> value returned by the provider helper
        self.audio_cache = {}

    def _extension(self):
        """Return the audio file extension matching the current provider."""
        return self._EXTENSIONS.get(self.provider, 'mp3')

    @staticmethod
    def _gain_db(volume):
        """Convert a linear volume factor to a dB gain (0 -> -inf, i.e. mute)."""
        import math

        if volume < 0:
            raise ValueError(f"volume must be non-negative, got {volume}")
        return 20 * math.log10(volume) if volume > 0 else float('-inf')

    def text_to_speech(self, text, output_path=None, cache=True):
        """
        Generate speech with caching

        Args:
            text: Input text
            output_path: Where to save audio; defaults to a "temp_audio"
                file with the provider's extension
            cache: Whether to cache generated audio

        Returns:
            Whatever the provider helper returns: the audio object for
            ElevenLabs, the output file path for Coqui and gTTS.

        Note:
            A cache hit returns the stored value as-is and does not write
            `output_path`.
        """

        # Key on the text itself rather than hash(text): two different
        # texts with the same hash would otherwise share one cache entry.
        cache_key = (self.provider, self.voice, text)

        if cache and cache_key in self.audio_cache:
            print("✓ Retrieved from cache")
            return self.audio_cache[cache_key]

        target = output_path or f"temp_audio.{self._extension()}"

        # Generate audio based on provider
        if self.provider == 'elevenlabs':
            audio = generate_elevenlabs_speech(
                text=text,
                voice_name=self.voice,
                output_path=target
            )
        elif self.provider == 'coqui':
            audio = generate_speech_coqui(
                text=text,
                output_path=target
            )
        else:
            # Deliberate fallback: every other provider name uses gTTS
            audio = generate_speech_gtts(
                text=text,
                output=target
            )

        # Cache if enabled
        if cache:
            self.audio_cache[cache_key] = audio

        return audio

    def generate_audiobook(self, chapters, output_dir='audiobook'):
        """
        Generate audiobook from chapters

        Args:
            chapters: Sequence of dicts with 'title' and 'text' keys
            output_dir: Directory for the chapter files; created if missing
        """

        import os
        os.makedirs(output_dir, exist_ok=True)

        # Name files after the format the provider really writes, so Coqui
        # output is not saved with an .mp3 extension.
        extension = self._extension()
        total = len(chapters)

        for number, chapter in enumerate(chapters, start=1):
            print(f"\nGenerating Chapter {number}/{total}...")

            output_path = os.path.join(
                output_dir, f"chapter_{number:02d}.{extension}"
            )

            # Caching is disabled: each chapter is generated exactly once
            self.text_to_speech(
                text=chapter['text'],
                output_path=output_path,
                cache=False
            )

            print(f"✓ Chapter {number} complete: {chapter['title']}")

        print(f"\n✓ Audiobook generated in '{output_dir}/'")

    def add_background_music(self, speech_path, music_path, output_path,
                            speech_volume=1.0, music_volume=0.3):
        """
        Mix speech with background music

        Args:
            speech_path: Speech audio file
            music_path: Music audio file; looped if shorter than the speech
            output_path: Destination file; always exported as MP3
            speech_volume: Linear volume factor for speech (0 mutes)
            music_volume: Linear volume factor for music (0 mutes)

        Returns:
            `output_path`

        Raises:
            ValueError: If a volume is negative, or the music track is
                empty while the speech is not
        """

        # Validate and convert volumes before touching any file
        speech_gain = self._gain_db(speech_volume)
        music_gain = self._gain_db(music_volume)

        from pydub import AudioSegment

        # Load audio files
        speech = AudioSegment.from_file(speech_path)
        music = AudioSegment.from_file(music_path)

        # Adjust volumes (adding a number to a segment applies a dB gain)
        speech = speech + speech_gain
        music = music + music_gain

        # Loop music to match speech length
        if len(music) < len(speech):
            if len(music) == 0:
                raise ValueError("music track is empty and cannot be looped")
            repeats = len(speech) // len(music) + 1
            music = music * repeats

        # Trim music to speech length
        music = music[:len(speech)]

        # Mix audio
        mixed = speech.overlay(music)

        # Export
        mixed.export(output_path, format='mp3')

        print(f"✓ Mixed audio saved to: {output_path}")
        return output_path

# Example usage
app = TTSApp(provider='elevenlabs', voice='Rachel')

# Generate single audio
# Caching is left at its default (enabled), so the result is also stored in
# app.audio_cache.
text = "Welcome to our podcast about artificial intelligence and its impact on society."
app.text_to_speech(text, output_path="podcast_intro.mp3")

# Generate audiobook chapters
# Each chapter dict carries the 'title' and 'text' keys that
# generate_audiobook() reads.
chapters = [
    {
        'title': 'Introduction',
        'text': 'Welcome to Chapter One. In this chapter, we explore the basics...'
    },
    {
        'title': 'Deep Learning',
        'text': 'Chapter Two focuses on deep learning fundamentals...'
    },
]

# app.generate_audiobook(chapters)

print("\n✓ TTS application ready for production use!")

📊 TTS Provider Comparison

Provider	Quality	Cost	Best For
pyttsx3	Basic	Free, offline	Simple apps, no internet
gTTS	Good	Free	Quick prototypes, many languages
Coqui TTS	Very Good	Free, open-source	Self-hosted, customizable
ElevenLabs	Excellent	$5-$330/mo	Production, highest quality
Bark	Very Good	Free, open-source	Emotional speech, sound effects
Azure TTS	Excellent	Pay per use	Enterprise, many voices

🎯 Key Takeaways

Multiple options: From free basic TTS to premium voice cloning
Quality varies: Neural TTS sounds much more natural
Voice cloning: Create custom voices with 3-10 second samples
ElevenLabs: Current leader in quality and naturalness
Coqui/Bark: Best open-source alternatives
Streaming: Essential for real-time applications
Use cases: Audiobooks, podcasts, accessibility, content creation
Ethical considerations: Get consent for voice cloning