What is Text-to-Speech (TTS)?
Text-to-Speech technology converts written text into spoken audio. Modern AI-powered TTS systems produce remarkably natural, expressive speech with control over emotion and emphasis, and some can even clone voices from short samples.
Key Technologies:
- Neural TTS: Deep learning for natural prosody
- Voice Cloning: Replicate any voice from samples
- Multilingual: Support for 50+ languages
- Real-time: Low-latency streaming synthesis
🎙️ Basic TTS with pyttsx3
import pyttsx3
# Initialize TTS engine (offline, cross-platform)
engine = pyttsx3.init()
# Get available voices
voices = engine.getProperty('voices')
print("Available voices:")
for i, voice in enumerate(voices):
print(f" {i}: {voice.name} ({voice.gender}, {voice.age})")
# Configure voice properties
def configure_voice(engine, voice_index=0, rate=150, volume=1.0):
    """
    Configure TTS engine properties
    Args:
        voice_index: Index into the engine's voice list
        rate: Speed in words per minute (the engine default is 200)
        volume: Volume (0.0 to 1.0)
    """
    voices = engine.getProperty('voices')  # fetch here instead of relying on a global
    engine.setProperty('voice', voices[voice_index].id)
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)
# Set voice
configure_voice(engine, voice_index=0, rate=150, volume=0.9)
# Speak text
text = "Hello! I am a text-to-speech system. I can convert any text into natural speech."
engine.say(text)
engine.runAndWait()
# Save to file (pyttsx3 writes the platform's native audio format,
# so use a .wav extension; an .mp3 suffix would not transcode the audio)
engine.save_to_file(text, 'output_speech.wav')
engine.runAndWait()
print("✅ Speech generated and saved!")
print("\nNote: pyttsx3 uses system voices (offline but less natural)")
🌍 Google Text-to-Speech (gTTS)
from gtts import gTTS
import os
from playsound import playsound
def generate_speech_gtts(text, language='en', slow=False, output='speech.mp3'):
"""
Generate speech using Google TTS
Args:
text: Text to convert
language: Language code (en, es, fr, de, ja, etc.)
slow: Whether to speak slowly
output: Output filename
"""
# Create TTS object
tts = gTTS(text=text, lang=language, slow=slow)
# Save audio
tts.save(output)
print(f"β Saved speech to: {output}")
return output
# Example: English
text_en = "The quick brown fox jumps over the lazy dog."
generate_speech_gtts(text_en, language='en', output='english.mp3')
# Example: Spanish
text_es = "La inteligencia artificial estΓ‘ transformando el mundo."
generate_speech_gtts(text_es, language='es', output='spanish.mp3')
# Example: Japanese
text_ja = "人工知能は世界を変えています。"
generate_speech_gtts(text_ja, language='ja', output='japanese.mp3')
# Supported languages
from gtts.lang import tts_langs
languages = tts_langs()
print(f"\ngTTS supports {len(languages)} languages:")
for code, name in list(languages.items())[:10]:
print(f" {code}: {name}")
print(" ... and more!")
# Play audio (optional)
# playsound('english.mp3')
print("\nβ gTTS is free but requires internet connection")
🎨 Advanced TTS with Coqui TTS
from TTS.api import TTS
import torch
# List available models (note: on newer Coqui releases, list_models()
# returns a ModelManager object rather than a plain list)
print("Available TTS models:")
print(TTS().list_models()[:5])  # Show first 5
# Load model
# Choose based on needs: quality vs speed
model_name = "tts_models/en/ljspeech/tacotron2-DDC" # Good quality
# model_name = "tts_models/en/ljspeech/fast_pitch" # Faster
tts = TTS(model_name=model_name, gpu=torch.cuda.is_available())
print(f"\nLoaded model: {model_name}")
print(f"Using GPU: {torch.cuda.is_available()}")
def generate_speech_coqui(text, output_path="output.wav", speaker=None):
"""Generate speech with Coqui TTS"""
tts.tts_to_file(
text=text,
file_path=output_path,
speaker=speaker # For multi-speaker models
)
print(f"β Generated: {output_path}")
return output_path
# Generate speech
text = """
Artificial intelligence is revolutionizing how we interact with technology.
From voice assistants to content creation, AI is everywhere.
"""
generate_speech_coqui(text, output_path="ai_speech.wav")
# Multi-speaker model example
multi_speaker_model = "tts_models/en/vctk/vits"
tts_multi = TTS(model_name=multi_speaker_model)
# List available speakers
speakers = tts_multi.speakers
print(f"\nAvailable speakers: {len(speakers)}")
print(f"Sample speakers: {speakers[:5]}")
# Generate with specific speaker
tts_multi.tts_to_file(
text="Hello! I am speaker number one.",
file_path="speaker1.wav",
speaker=speakers[0]
)
print("\nβ Coqui TTS offers state-of-the-art quality!")
🎭 Voice Cloning with Coqui
# Voice cloning requires a voice sample
# XTTS model supports voice cloning
# Load XTTS model (supports voice cloning)
xtts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
tts_clone = TTS(model_name=xtts_model)
def clone_voice(text, speaker_wav, language='en', output_path='cloned.wav'):
"""
Clone a voice and generate speech
Args:
text: Text to speak
speaker_wav: Path to voice sample (3-10 seconds recommended)
language: Language code
output_path: Output file
"""
print(f"Cloning voice from: {speaker_wav}")
print(f"Generating: {text[:50]}...")
tts_clone.tts_to_file(
text=text,
file_path=output_path,
speaker_wav=speaker_wav,
language=language
)
print(f"β Voice cloned and saved to: {output_path}")
return output_path
# Example: Clone voice
# First, record or provide a voice sample
sample_text = """
Welcome to the future of voice synthesis.
With just a short voice sample, AI can replicate your unique speaking style.
This technology opens up incredible possibilities for content creation.
"""
# Clone voice (uncomment with actual voice sample)
# clone_voice(
# text=sample_text,
# speaker_wav="voice_sample.wav",
# language='en',
# output_path='cloned_speech.wav'
# )
print("\nVoice Cloning Tips:")
print(" β’ Use 3-10 seconds of clear audio")
print(" β’ Single speaker, no background noise")
print(" β’ Neutral emotion for best results")
print(" β’ Higher quality sample = better cloning")
# Supported languages for XTTS
xtts_languages = [
'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr',
'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'
]
print(f"\nXTTS supports {len(xtts_languages)} languages for cloning!")
🎤 ElevenLabs API
from elevenlabs import generate, play, set_api_key, voices, Voice, VoiceSettings
import os
# Set API key (get a key from https://elevenlabs.io)
# Note: this section uses the pre-1.0 elevenlabs SDK interface
set_api_key(os.getenv('ELEVENLABS_API_KEY'))
def list_available_voices():
"""List all available voices"""
all_voices = voices()
print("Available ElevenLabs Voices:")
for voice in all_voices:
print(f" {voice.name} ({voice.voice_id})")
print(f" Category: {voice.category}")
print(f" Description: {voice.description[:60]}...")
print()
return all_voices
# List voices
available_voices = list_available_voices()
def generate_elevenlabs_speech(
text,
voice_name="Rachel", # Or use voice_id
model="eleven_multilingual_v2",
output_path="elevenlabs_output.mp3"
):
"""
Generate speech with ElevenLabs
Models:
- eleven_monolingual_v1: English only, fast
    - eleven_multilingual_v1: first-generation multilingual model
    - eleven_multilingual_v2: 29 languages, better quality
- eleven_turbo_v2: Fastest, lower latency
"""
audio = generate(
text=text,
voice=voice_name,
model=model
)
# Save audio
with open(output_path, 'wb') as f:
f.write(audio)
print(f"β Generated with ElevenLabs: {output_path}")
return audio
# Generate speech
text = """
ElevenLabs provides incredibly natural and expressive text-to-speech.
The voices are nearly indistinguishable from human speech,
with proper emotion, pacing, and intonation.
"""
audio = generate_elevenlabs_speech(
text=text,
voice_name="Rachel",
model="eleven_multilingual_v2",
output_path="elevenlabs_demo.mp3"
)
# Advanced: Custom voice settings
def generate_with_settings(text, voice_id, output_path="custom_voice.mp3"):
"""Generate with custom voice settings"""
audio = generate(
text=text,
voice=Voice(
voice_id=voice_id,
settings=VoiceSettings(
stability=0.5, # 0-1, higher = more consistent
similarity_boost=0.75, # 0-1, higher = closer to original
style=0.5, # 0-1, exaggeration of speaking style
use_speaker_boost=True # Enhance voice clarity
)
),
model="eleven_multilingual_v2"
)
with open(output_path, 'wb') as f:
f.write(audio)
print(f"β Generated with custom settings: {output_path}")
return audio
print("\nβ ElevenLabs offers the highest quality TTS!")
🎬 Bark: Text-to-Audio Model
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile as wavfile
import numpy as np
import torch
# Load Bark model
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print("Bark Model Loaded!")
print("Bark can generate:")
print(" β’ Speech in multiple languages")
print(" β’ Music")
print(" β’ Background noise")
print(" β’ Sound effects")
def generate_bark_speech(text, voice_preset="v2/en_speaker_6", output="bark_output.wav"):
"""
Generate speech with Bark
Voice presets:
- v2/en_speaker_0 to v2/en_speaker_9: English voices
- v2/es_speaker_0 to v2/es_speaker_9: Spanish voices
- v2/fr_speaker_0 to v2/fr_speaker_9: French voices
- ... and more languages
"""
# Process text
inputs = processor(text, voice_preset=voice_preset).to(device)
# Generate audio
audio_array = model.generate(**inputs)
audio_array = audio_array.cpu().numpy().squeeze()
# Save audio (Bark uses 24kHz sample rate)
sample_rate = model.generation_config.sample_rate
wavfile.write(output, rate=sample_rate, data=audio_array)
print(f"β Generated with Bark: {output}")
return audio_array
# Generate speech
text = "Hello, I am Bark! [laughs] I can even add non-speech sounds and emotions!"
generate_bark_speech(
text=text,
voice_preset="v2/en_speaker_6",
output="bark_demo.wav"
)
# Bark special features
special_examples = {
'Laughter': "That's hilarious! [laughs]",
'Music': "βͺ La la la βͺ [music]",
'Gasps': "[gasps] Oh my goodness!",
'Clears throat': "[clears throat] Let me explain...",
'Sighs': "[sighs] That's unfortunate.",
}
print("\nBark Special Features:")
for feature, example in special_examples.items():
print(f" {feature}: {example}")
# Generate with emotion
emotional_text = """
[excited] This is absolutely amazing!
[whispers] But let me tell you a secret...
[laughs] AI can be really fun to work with!
"""
generate_bark_speech(
text=emotional_text,
voice_preset="v2/en_speaker_9",
output="bark_emotional.wav"
)
print("\nβ Bark is unique in generating non-speech audio!")
🎙️ Real-time Streaming TTS
from elevenlabs import stream

def stream_text_to_speech(text, voice="Rachel"):
"""
Stream TTS for real-time playback
Useful for long-form content and live applications
"""
# Split text into sentences for streaming
sentences = text.split('. ')
for i, sentence in enumerate(sentences):
if not sentence.strip():
continue
print(f"Streaming sentence {i+1}/{len(sentences)}...")
# Generate and stream audio
audio_stream = generate(
text=sentence + '.',
voice=voice,
model="eleven_turbo_v2", # Use turbo for low latency
stream=True
)
        # Play the stream as chunks arrive; the SDK's stream() helper
        # plays via mpv, which must be installed locally
        stream(audio_stream)
print("β Streaming complete!")
# Example: Stream long text
long_text = """
Artificial intelligence is transforming the world.
From healthcare to education, AI is making an impact.
Voice synthesis technology has reached impressive levels of quality.
We can now generate natural-sounding speech in real-time.
This opens up new possibilities for accessibility and content creation.
"""
# Stream speech (uncomment to run)
# stream_text_to_speech(long_text)
# Chunked generation for long texts
def generate_long_text_tts(text, chunk_size=500, output_dir="tts_chunks"):
"""Generate TTS for very long texts in chunks"""
import os
os.makedirs(output_dir, exist_ok=True)
# Split into chunks
words = text.split()
chunks = [' '.join(words[i:i+chunk_size])
for i in range(0, len(words), chunk_size)]
audio_files = []
for i, chunk in enumerate(chunks):
print(f"Processing chunk {i+1}/{len(chunks)}...")
output_path = f"{output_dir}/chunk_{i+1:03d}.mp3"
audio = generate_elevenlabs_speech(
text=chunk,
voice_name="Rachel",
output_path=output_path
)
audio_files.append(output_path)
print(f"\nβ Generated {len(chunks)} audio chunks")
return audio_files
print("\nβ Streaming TTS reduces latency for real-time applications!")
🛠️ TTS Production Application
class TTSApp:
"""Production-ready TTS application"""
def __init__(self, provider='elevenlabs', voice='Rachel'):
self.provider = provider
self.voice = voice
self.audio_cache = {}
def text_to_speech(self, text, output_path=None, cache=True):
"""
Generate speech with caching
Args:
text: Input text
output_path: Where to save audio
cache: Whether to cache generated audio
"""
        # Check cache (note: hash() is salted per process, so this key is
        # only stable for the in-memory cache, not across runs)
        cache_key = f"{self.provider}_{self.voice}_{hash(text)}"
if cache and cache_key in self.audio_cache:
print("β Retrieved from cache")
return self.audio_cache[cache_key]
# Generate audio based on provider
if self.provider == 'elevenlabs':
audio = generate_elevenlabs_speech(
text=text,
voice_name=self.voice,
output_path=output_path or "temp_audio.mp3"
)
elif self.provider == 'coqui':
audio = generate_speech_coqui(
text=text,
output_path=output_path or "temp_audio.wav"
)
else:
audio = generate_speech_gtts(
text=text,
output=output_path or "temp_audio.mp3"
)
# Cache if enabled
if cache:
self.audio_cache[cache_key] = audio
return audio
def generate_audiobook(self, chapters, output_dir='audiobook'):
"""Generate audiobook from chapters"""
import os
os.makedirs(output_dir, exist_ok=True)
for i, chapter in enumerate(chapters):
print(f"\nGenerating Chapter {i+1}/{len(chapters)}...")
output_path = f"{output_dir}/chapter_{i+1:02d}.mp3"
self.text_to_speech(
text=chapter['text'],
output_path=output_path,
cache=False
)
print(f"β Chapter {i+1} complete: {chapter['title']}")
print(f"\nβ Audiobook generated in '{output_dir}/'")
def add_background_music(self, speech_path, music_path, output_path,
speech_volume=1.0, music_volume=0.3):
"""Mix speech with background music"""
        from pydub import AudioSegment
        import numpy as np  # used for the dB gain conversion below
# Load audio files
speech = AudioSegment.from_file(speech_path)
music = AudioSegment.from_file(music_path)
        # Convert linear volume factors to dB gain (pydub's "+" adds decibels)
        speech = speech + (20 * np.log10(speech_volume))
        music = music + (20 * np.log10(music_volume))
# Loop music to match speech length
if len(music) < len(speech):
repeats = int(len(speech) / len(music)) + 1
music = music * repeats
# Trim music to speech length
music = music[:len(speech)]
# Mix audio
mixed = speech.overlay(music)
# Export
mixed.export(output_path, format='mp3')
print(f"β Mixed audio saved to: {output_path}")
return output_path
# Example usage
app = TTSApp(provider='elevenlabs', voice='Rachel')
# Generate single audio
text = "Welcome to our podcast about artificial intelligence and its impact on society."
app.text_to_speech(text, output_path="podcast_intro.mp3")
# Generate audiobook chapters
chapters = [
{
'title': 'Introduction',
'text': 'Welcome to Chapter One. In this chapter, we explore the basics...'
},
{
'title': 'Deep Learning',
'text': 'Chapter Two focuses on deep learning fundamentals...'
},
]
# app.generate_audiobook(chapters)
print("\nβ TTS application ready for production use!")
📊 TTS Provider Comparison
| Provider | Quality | Cost | Best For |
|---|---|---|---|
| pyttsx3 | Basic | Free, offline | Simple apps, no internet |
| gTTS | Good | Free | Quick prototypes, many languages |
| Coqui TTS | Very Good | Free, open-source | Self-hosted, customizable |
| ElevenLabs | Excellent | $5-$330/mo | Production, highest quality |
| Bark | Very Good | Free, open-source | Emotional speech, sound effects |
| Azure TTS | Excellent | Pay per use | Enterprise, many voices |
🎯 Key Takeaways
- Multiple options: From free basic TTS to premium voice cloning
- Quality varies: Neural TTS sounds far more natural than traditional system voices
- Voice cloning: Create custom voices with 3-10 second samples
- ElevenLabs: Current leader in quality and naturalness
- Coqui/Bark: Best open-source alternatives
- Streaming: Essential for real-time applications
- Use cases: Audiobooks, podcasts, accessibility, content creation
- Ethical considerations: Get consent for voice cloning