| |
| |
| |
|
|
| use clap::{Parser, Subcommand}; |
| use indextts::{ |
| pipeline::{IndexTTS, SynthesisOptions}, |
| Config, Result, |
| }; |
| use std::path::PathBuf; |
|
|
| #[derive(Parser)] |
| #[command( |
| name = "indextts", |
| about = "High-performance Text-to-Speech engine in Rust", |
| version, |
| author |
| )] |
| struct Cli { |
| #[command(subcommand)] |
| command: Commands, |
| } |
|
|
| #[derive(Subcommand)] |
| enum Commands { |
| |
| Synthesize { |
| |
| #[arg(short, long)] |
| text: String, |
|
|
| |
| #[arg(short = 'v', long)] |
| voice: PathBuf, |
|
|
| |
| #[arg(short, long, default_value = "output.wav")] |
| output: PathBuf, |
|
|
| |
| #[arg(short, long)] |
| config: Option<PathBuf>, |
|
|
| |
| #[arg(short, long, default_value = "models")] |
| model_dir: PathBuf, |
|
|
| |
| #[arg(long)] |
| emotion: Option<String>, |
|
|
| |
| #[arg(long, default_value = "1.0")] |
| emotion_alpha: f32, |
|
|
| |
| #[arg(long, default_value = "50")] |
| top_k: usize, |
|
|
| |
| #[arg(long, default_value = "0.95")] |
| top_p: f32, |
|
|
| |
| #[arg(long, default_value = "1.1")] |
| repetition_penalty: f32, |
|
|
| |
| #[arg(long)] |
| fp16: bool, |
|
|
| |
| #[arg(short, long, default_value = "cpu")] |
| device: String, |
| }, |
|
|
| |
| SynthesizeFile { |
| |
| #[arg(short, long)] |
| input: PathBuf, |
|
|
| |
| #[arg(short = 'v', long)] |
| voice: PathBuf, |
|
|
| |
| #[arg(short, long, default_value = "output.wav")] |
| output: PathBuf, |
|
|
| |
| #[arg(short, long)] |
| config: Option<PathBuf>, |
|
|
| |
| #[arg(short, long, default_value = "models")] |
| model_dir: PathBuf, |
|
|
| |
| #[arg(long, default_value = "200")] |
| silence_ms: u32, |
| }, |
|
|
| |
| InitConfig { |
| |
| #[arg(short, long, default_value = "config.yaml")] |
| output: PathBuf, |
| }, |
|
|
| |
| Info, |
|
|
| |
| Benchmark { |
| |
| #[arg(short, long, default_value = "10")] |
| iterations: usize, |
| }, |
| } |
|
|
| fn main() -> Result<()> { |
| |
| env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); |
|
|
| let cli = Cli::parse(); |
|
|
| match cli.command { |
| Commands::Synthesize { |
| text, |
| voice, |
| output, |
| config, |
| model_dir, |
| emotion, |
| emotion_alpha, |
| top_k, |
| top_p, |
| repetition_penalty, |
| fp16: _, |
| device: _, |
| } => { |
| log::info!("IndexTTS Synthesizer"); |
| log::info!("===================="); |
|
|
| |
| let cfg = if let Some(config_path) = config { |
| Config::load(config_path)? |
| } else { |
| let mut cfg = Config::default(); |
| cfg.model_dir = model_dir; |
| cfg |
| }; |
|
|
| |
| let tts = IndexTTS::new(cfg)?; |
|
|
| |
| let emotion_vec = emotion.map(|s| { |
| s.split(',') |
| .filter_map(|v| v.trim().parse::<f32>().ok()) |
| .collect::<Vec<f32>>() |
| }); |
|
|
| |
| let options = SynthesisOptions { |
| emotion_vector: emotion_vec, |
| emotion_alpha, |
| sampling: indextts::model::SamplingStrategy::TopKP { k: top_k, p: top_p }, |
| repetition_penalty, |
| ..Default::default() |
| }; |
|
|
| |
| log::info!("Text: {}", &text[..text.len().min(100)]); |
| log::info!("Voice: {}", voice.display()); |
| log::info!("Output: {}", output.display()); |
|
|
| let result = tts.synthesize_to_file( |
| &text, |
| voice.to_str().unwrap(), |
| output.to_str().unwrap(), |
| &options, |
| )?; |
|
|
| log::info!("Duration: {}", result.duration_formatted()); |
| log::info!("Processing time: {:.2}s", result.processing_time); |
| log::info!("Real-time factor: {:.3}x", result.rtf); |
|
|
| println!("✓ Synthesis complete: {}", output.display()); |
| } |
|
|
| Commands::SynthesizeFile { |
| input, |
| voice, |
| output, |
| config, |
| model_dir, |
| silence_ms, |
| } => { |
| log::info!("IndexTTS File Synthesizer"); |
| log::info!("=========================="); |
|
|
| |
| let text = std::fs::read_to_string(&input)?; |
|
|
| |
| let cfg = if let Some(config_path) = config { |
| Config::load(config_path)? |
| } else { |
| let mut cfg = Config::default(); |
| cfg.model_dir = model_dir; |
| cfg |
| }; |
|
|
| |
| let tts = IndexTTS::new(cfg)?; |
|
|
| |
| let options = SynthesisOptions { |
| segment_silence_ms: silence_ms, |
| ..Default::default() |
| }; |
|
|
| |
| log::info!("Input file: {}", input.display()); |
| log::info!("Text length: {} characters", text.len()); |
|
|
| let result = tts.synthesize_long( |
| &text, |
| voice.to_str().unwrap(), |
| &options, |
| )?; |
|
|
| result.save(&output)?; |
|
|
| log::info!("Duration: {}", result.duration_formatted()); |
| log::info!("Processing time: {:.2}s", result.processing_time); |
| log::info!("Real-time factor: {:.3}x", result.rtf); |
|
|
| println!("✓ Synthesis complete: {}", output.display()); |
| } |
|
|
| Commands::InitConfig { output } => { |
| log::info!("Creating default configuration..."); |
|
|
| let config = Config::default(); |
| config.save(&output)?; |
|
|
| println!("✓ Configuration saved to: {}", output.display()); |
| } |
|
|
| Commands::Info => { |
| println!("IndexTTS - High-performance Text-to-Speech Engine"); |
| println!("=================================================="); |
| println!("Version: {}", indextts::VERSION); |
| println!("Platform: {}", std::env::consts::OS); |
| println!("Architecture: {}", std::env::consts::ARCH); |
| println!(); |
| println!("Features:"); |
| println!(" - Multi-language support (Chinese, English, mixed)"); |
| println!(" - Zero-shot voice cloning"); |
| println!(" - 8-dimensional emotion control"); |
| println!(" - High-quality neural vocoding (BigVGAN)"); |
| println!(" - SIMD-optimized audio processing"); |
| println!(" - Parallel processing with Rayon"); |
| println!(); |
| println!("Sample Rate: {} Hz", indextts::SAMPLE_RATE); |
| println!("Mel Bands: {}", indextts::N_MELS); |
| println!("FFT Size: {}", indextts::N_FFT); |
| println!("Hop Length: {}", indextts::HOP_LENGTH); |
| println!(); |
| println!("CPU Cores: {}", num_cpus::get()); |
| println!("Physical Cores: {}", num_cpus::get_physical()); |
| } |
|
|
| Commands::Benchmark { iterations } => { |
| log::info!("Running benchmarks ({} iterations)...", iterations); |
|
|
| |
| benchmark_mel_spectrogram(iterations); |
|
|
| |
| benchmark_tokenization(iterations); |
|
|
| |
| benchmark_vocoder(iterations); |
|
|
| println!("✓ Benchmarks complete"); |
| } |
| } |
|
|
| Ok(()) |
| } |
|
|
| fn benchmark_mel_spectrogram(iterations: usize) { |
| use indextts::audio::{mel_spectrogram, AudioConfig}; |
| use std::time::Instant; |
|
|
| println!("\nMel-Spectrogram Benchmark"); |
| println!("-------------------------"); |
|
|
| let config = AudioConfig::default(); |
| let num_samples = config.sample_rate as usize; |
| let signal: Vec<f32> = (0..num_samples) |
| .map(|i| (i as f32 * 0.01).sin()) |
| .collect(); |
|
|
| let start = Instant::now(); |
| for _ in 0..iterations { |
| let _ = mel_spectrogram(&signal, &config); |
| } |
| let elapsed = start.elapsed(); |
|
|
| let per_iter = elapsed.as_secs_f32() / iterations as f32; |
| println!(" Signal length: {} samples ({:.2}s)", num_samples, num_samples as f32 / config.sample_rate as f32); |
| println!(" Iterations: {}", iterations); |
| println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
| println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
| println!(" Throughput: {:.1}x real-time", 1.0 / per_iter); |
| } |
|
|
| fn benchmark_tokenization(iterations: usize) { |
| use indextts::text::{TextNormalizer, TextTokenizer, TokenizerConfig}; |
| use std::time::Instant; |
|
|
| println!("\nTokenization Benchmark"); |
| println!("----------------------"); |
|
|
| let normalizer = TextNormalizer::new(); |
| let tokenizer = TextTokenizer::new(TokenizerConfig::default()).unwrap(); |
|
|
| let test_texts = vec![ |
| "Hello world, this is a test of the text-to-speech system.", |
| "The quick brown fox jumps over the lazy dog.", |
| "你好世界,这是一个测试。", |
| "Mixed language: Hello 世界 and 你好 world.", |
| ]; |
|
|
| let start = Instant::now(); |
| for _ in 0..iterations { |
| for text in &test_texts { |
| let normalized = normalizer.normalize(text).unwrap(); |
| let _tokens = tokenizer.encode(&normalized).unwrap(); |
| } |
| } |
| let elapsed = start.elapsed(); |
|
|
| let total_chars: usize = test_texts.iter().map(|t| t.len()).sum(); |
| let per_iter = elapsed.as_secs_f32() / iterations as f32; |
| println!(" Texts: {}", test_texts.len()); |
| println!(" Total characters: {}", total_chars); |
| println!(" Iterations: {}", iterations); |
| println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
| println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
| println!( |
| " Throughput: {:.0} chars/sec", |
| (total_chars * iterations) as f32 / elapsed.as_secs_f32() |
| ); |
| } |
|
|
| fn benchmark_vocoder(iterations: usize) { |
| use indextts::vocoder::{create_bigvgan_22k, Vocoder}; |
| use ndarray::Array2; |
| use std::time::Instant; |
|
|
| println!("\nVocoder Benchmark"); |
| println!("-----------------"); |
|
|
| let vocoder = create_bigvgan_22k(); |
| let num_frames = 100; |
| let mel = Array2::zeros((80, num_frames)); |
|
|
| let start = Instant::now(); |
| for _ in 0..iterations { |
| let _ = vocoder.synthesize(&mel); |
| } |
| let elapsed = start.elapsed(); |
|
|
| let audio_duration = num_frames as f32 * vocoder.hop_length() as f32 / vocoder.sample_rate() as f32; |
| let per_iter = elapsed.as_secs_f32() / iterations as f32; |
| println!(" Mel frames: {}", num_frames); |
| println!(" Audio duration: {:.2}s", audio_duration); |
| println!(" Iterations: {}", iterations); |
| println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
| println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
| println!(" RTF: {:.3}x", per_iter / audio_duration); |
| } |
|
|