Files
novafarma/scripts/generate_cinematic_voice.py

173 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
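CINEMATIC VOICE GENERATOR - Natural Human Voice
Feeds SSML-annotated scripts (pauses, emphasis, prosody) to edge-tts and saves
the narration as MP3 files.
Reverb and ambient layering are not applied here; they are manual follow-up
steps that the script prints when it finishes.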
"""
import asyncio
import os
from pathlib import Path
# edge-tts is the only third-party dependency of this script; nothing below can
# run without it, so abort immediately with an actionable message.
try:
    import edge_tts
    from edge_tts import VoicesManager
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False
    # Raising SystemExit with a string prints it to stderr and exits with
    # status 1.  Unlike the bare ``exit()`` helper, it does not rely on the
    # ``site`` module, which is missing under ``python -S`` and in frozen builds.
    raise SystemExit(
        "⚠️ edge-tts not installed. Install with: pip install edge-tts"
    )
# Narrator voice profile: Slovenian male neural voice, slowed down and lowered
# for a noir feel.
NARRATOR_VOICE = "sl-SI-RokNeural"  # Slovenian male voice
NARRATOR_RATE = "-15%"  # slower delivery for dramatic effect
NARRATOR_PITCH = "-5Hz"  # deeper tone

# Destination folder for the generated narration, two directory levels above
# this file: <root>/assets/audio/voices/narrator.  It is created at import
# time (including missing parents) so later writes cannot fail on a missing
# directory.
VOICE_DIR = Path(__file__).parent.parent.joinpath(
    "assets", "audio", "voices", "narrator"
)
VOICE_DIR.mkdir(parents=True, exist_ok=True)
# INTRO CUTSCENE SCRIPT (with natural pauses)
# SSML document (xml:lang sl-SI) for the intro narration.  The whole text sits
# in one <prosody> element (rate -20%, pitch -5Hz, volume loud); <break> tags
# insert pauses of 200-1000 ms between phrases and two phrases carry
# <emphasis level="strong">.
# NOTE(review): "Iskam" looks like a non-standard form of "Iščem" - confirm
# with a native Slovenian speaker before recording.
INTRO_SCRIPT = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sl-SI">
<prosody rate="-20%" pitch="-5Hz" volume="loud">
Leta <break time="300ms"/> dva tisoč štiriinosemdeset...
<break time="500ms"/>
Svet, <break time="200ms"/> kot smo ga poznali, <break time="300ms"/> je prenehal obstajati.
<break time="800ms"/>
Zombie apokalipsa <break time="400ms"/> ni bila tisto, <break time="200ms"/> kar nas je skoraj uničila.
<break time="600ms"/>
Bilo je <emphasis level="strong">nekaj drugega</emphasis>.
<break time="500ms"/>
Nekaj <break time="300ms"/> veliko hujšega.
<break time="1000ms"/>
Zdaj <break time="400ms"/> sem sam.
<break time="500ms"/>
Iskam <break time="300ms"/> svojo <emphasis level="strong">Ano</emphasis>.
<break time="800ms"/>
In odkrivam <break time="400ms"/> resnico <break time="300ms"/> o tem, <break time="200ms"/> kaj se je resnično zgodilo.
<break time="1000ms"/>
</prosody>
</speak>
"""
# KAI'S MEMORIES (emotional, broken)
# SSML document (xml:lang sl-SI) for Kai's memory monologue.  One <prosody>
# element (rate -25%, pitch -3Hz, volume medium) wraps the text; <break> tags
# insert pauses of 200-1200 ms and two words carry <emphasis> (one moderate,
# one strong).
# NOTE(review): "tvoje smeh" and "nepomnim si" look ungrammatical (possibly
# "tvojega smeha" / "ne spomnim se") - confirm with a native Slovenian speaker.
KAI_MEMORY = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sl-SI">
<prosody rate="-25%" pitch="-3Hz" volume="medium">
Ana <break time="500ms"/> kje si?
<break time="800ms"/>
Spominjam se <break time="400ms"/> tvoje <emphasis level="moderate">smeh</emphasis>.
<break time="600ms"/>
Tvoje <break time="300ms"/> prijazne <break time="200ms"/> oči.
<break time="1000ms"/>
Ampak <break time="500ms"/> nepomnim si <break time="400ms"/> kako si <emphasis level="strong">izginila</emphasis>.
<break time="800ms"/>
Nekaj <break time="300ms"/> je narobe <break time="200ms"/> z mojimi spomini.
<break time="1200ms"/>
</prosody>
</speak>
"""
# NARRATOR - DARK DISCOVERY
# SSML document (xml:lang sl-SI) for the church discovery narration.  One
# <prosody> element (rate -15%, pitch -6Hz, volume loud) wraps the text;
# <break> tags insert pauses of 200-1200 ms and four phrases carry <emphasis>.
# The text must stay in Latin script: a Slovenian voice cannot be expected to
# read Cyrillic, so "sveta" is spelled with Latin letters.
DARK_DISCOVERY = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sl-SI">
<prosody rate="-15%" pitch="-6Hz" volume="loud">
Ko sem <break time="300ms"/> prvič <break time="200ms"/> vstopil v <emphasis level="strong">cerkev</emphasis>,
<break time="600ms"/>
sem vedel, <break time="400ms"/> da ta kraj <break time="300ms"/> skriva <emphasis level="strong">skrivnosti</emphasis>.
<break time="800ms"/>
Župnik <break time="300ms"/> je vedel več, <break time="200ms"/> kot je želel povedati.
<break time="600ms"/>
Govoril je <break time="400ms"/> o letu <emphasis level="strong">dva tisoč štiriinosemdeset</emphasis>.
<break time="500ms"/>
O <break time="300ms"/> koncu sveta.
<break time="1000ms"/>
In o tem, <break time="400ms"/> da <emphasis level="moderate">nisem sam</emphasis>.
<break time="1200ms"/>
</prosody>
</speak>
"""
def _ssml_to_plain_text(ssml_text):
    """Return the spoken text of *ssml_text* with all markup removed."""
    import re
    from html import unescape

    # Replace each tag with a space so the words on either side never fuse.
    text = re.sub(r"<[^>]+>", " ", ssml_text or "")
    # edge-tts escapes its input again, so hand it literal characters.
    text = unescape(text)
    text = " ".join(text.split())
    # Dropping a closing tag leaves "word ." - re-attach the punctuation.
    return re.sub(r"\s+([.,;:!?])", r"\1", text)


def _prosody_pitch(ssml_text, default="+0Hz"):
    """Return the pitch of the first <prosody> tag, or *default* if unusable."""
    import re

    # Only the signed "<n>Hz" form is accepted by edge-tts.
    found = re.search(r'<prosody\b[^>]*\bpitch="([+-]\d+Hz)"', ssml_text or "")
    return found.group(1) if found else default


async def generate_voice_with_ssml(ssml_text, voice, output_path, rate="-15%", pitch=None):
    """Synthesise *ssml_text* with edge-tts and save it to *output_path*.

    edge-tts XML-escapes the text it is given and no longer supports custom
    SSML, so passing the raw markup would make the voice read the tags aloud.
    The markup is therefore reduced to its plain text; pacing comes from the
    punctuation plus the ``rate``/``pitch`` options the library does support.

    Args:
        ssml_text: SSML (or plain) text of the line to speak.
        voice: edge-tts voice name, e.g. ``"sl-SI-RokNeural"``.
        output_path: Destination file (``str`` or ``Path``).
        rate: Speaking-rate offset in edge-tts format (``"-15%"``).
        pitch: Pitch offset in edge-tts format (``"-5Hz"``).  When ``None``,
            the pitch of the first ``<prosody>`` tag in *ssml_text* is used,
            falling back to ``"+0Hz"``.

    Returns:
        ``True`` if the audio file was written, ``False`` on any failure
        (the error is printed rather than raised so a batch can continue).
    """
    output_path = Path(output_path)
    spoken_text = _ssml_to_plain_text(ssml_text)
    if not spoken_text:
        # Nothing to synthesise; fail without contacting the service.
        print(f"❌ Error: no speakable text for {output_path.name}")
        return False
    if pitch is None:
        pitch = _prosody_pitch(ssml_text)
    try:
        communicate = edge_tts.Communicate(spoken_text, voice, rate=rate, pitch=pitch)
        await communicate.save(str(output_path))
    except Exception as e:  # deliberate best-effort boundary: report, don't crash
        print(f"❌ Error: {e}")
        return False
    print(f"✅ Generated: {output_path.name}")
    return True
async def generate_all_narrator_voices():
    """Generate every narrator line into VOICE_DIR and print a run summary.

    The lines are synthesised one after another.  Each result is checked so
    the closing banner reports failures instead of always claiming success.
    Follow-up instructions (reverb, ambience, game integration) are printed at
    the end; none of those steps is performed here.
    """
    divider = "=" * 60
    print("\n🎬 CINEMATIC VOICE GENERATOR")
    print(divider)
    print(f"Voice: {NARRATOR_VOICE} (Deep Slovenian Male)")
    print("Style: Noir, Slow-Paced, Emotional")
    print("Effects: SSML pauses, emphasis, prosody control")
    print(divider)
    print()

    # (output filename, script, speaking-rate offset)
    voices = [
        ("intro_cutscene.mp3", INTRO_SCRIPT, NARRATOR_RATE),
        ("kai_memory_ana.mp3", KAI_MEMORY, "-25%"),
        ("discovery_church.mp3", DARK_DISCOVERY, "-15%"),
    ]

    failed = []
    for filename, script, rate in voices:
        output_path = VOICE_DIR / filename
        print(f"🎙️ Generating: {filename}")
        succeeded = await generate_voice_with_ssml(script, NARRATOR_VOICE, output_path, rate)
        if not succeeded:
            failed.append(filename)
        print()

    print(divider)
    if failed:
        print(f"⚠️ VOICE GENERATION FINISHED WITH ERRORS ({len(failed)}/{len(voices)} failed)")
        for filename in failed:
            print(f"  - {filename}")
    else:
        print("✅ VOICE GENERATION COMPLETE!")
    print()
    print("📁 Files saved to:")
    print(f" {VOICE_DIR}")
    print()
    print("🎵 NEXT STEPS:")
    print("1. Add reverb effect (use Audacity or ffmpeg)")
    print("2. Layer with wind/fire ambience")
    print("3. Integrate with Phaser typewriter sync")
    print()
    print("REVERB COMMAND (ffmpeg):")
    print("ffmpeg -i input.mp3 -af 'aecho=0.8:0.9:1000:0.3' output_reverb.mp3")
    print()
async def main():
    """Run the narration batch, or report the missing edge-tts dependency."""
    if EDGE_TTS_AVAILABLE:
        await generate_all_narrator_voices()
        return
    # Reached only if the edge-tts import failed.
    print("ERROR: edge-tts not installed")
# Start the async entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())