| |
| |
| |
|
|
| mod synthesis; |
|
|
| pub use synthesis::{IndexTTS, SynthesisOptions, SynthesisResult}; |
|
|
| use crate::{Error, Result}; |
| use std::path::{Path, PathBuf}; |
|
|
| |
/// Identifies one stage of the pipeline.
///
/// Variants are declared in the same order in which [`PipelineStage::all`]
/// returns them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PipelineStage {
    TextNormalization,
    Tokenization,
    SemanticEncoding,
    SpeakerConditioning,
    GptGeneration,
    AcousticExpansion,
    Vocoding,
    PostProcessing,
}

impl PipelineStage {
    /// Returns the human-readable label of this stage.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::TextNormalization => "Text Normalization",
            Self::Tokenization => "Tokenization",
            Self::SemanticEncoding => "Semantic Encoding",
            Self::SpeakerConditioning => "Speaker Conditioning",
            Self::GptGeneration => "GPT Generation",
            Self::AcousticExpansion => "Acoustic Expansion",
            Self::Vocoding => "Vocoding",
            Self::PostProcessing => "Post Processing",
        }
    }

    /// Returns every stage in declaration order as a newly allocated vector.
    pub fn all() -> Vec<PipelineStage> {
        [
            Self::TextNormalization,
            Self::Tokenization,
            Self::SemanticEncoding,
            Self::SpeakerConditioning,
            Self::GptGeneration,
            Self::AcousticExpansion,
            Self::Vocoding,
            Self::PostProcessing,
        ]
        .to_vec()
    }
}
|
|
| |
| pub type ProgressCallback = Box<dyn Fn(PipelineStage, f32) + Send + Sync>; |
|
|
| |
/// Configuration values for the pipeline.
///
/// `Default` supplies the values noted on each field; `validate` checks the
/// two numeric limits.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Model directory path (default: `models`).
    pub model_dir: PathBuf,
    /// FP16 toggle (default: `false`).
    pub use_fp16: bool,
    /// Device identifier string (default: `"cpu"`).
    pub device: String,
    /// Cache toggle (default: `true`).
    pub enable_cache: bool,
    /// Upper limit on text length (default: `500`); must be non-zero to
    /// pass validation.
    pub max_text_length: usize,
    /// Upper limit on audio duration (default: `30.0`, presumably seconds —
    /// TODO confirm); must be positive to pass validation.
    pub max_audio_duration: f32,
}

impl Default for PipelineConfig {
    /// Builds the baseline configuration: CPU device, FP16 off, caching on.
    fn default() -> Self {
        PipelineConfig {
            model_dir: "models".into(),
            device: String::from("cpu"),
            use_fp16: false,
            enable_cache: true,
            max_text_length: 500,
            max_audio_duration: 30.0,
        }
    }
}
|
|
| impl PipelineConfig { |
| |
| pub fn with_model_dir<P: AsRef<Path>>(mut self, path: P) -> Self { |
| self.model_dir = path.as_ref().to_path_buf(); |
| self |
| } |
|
|
| |
| pub fn with_fp16(mut self, enable: bool) -> Self { |
| self.use_fp16 = enable; |
| self |
| } |
|
|
| |
| pub fn with_device(mut self, device: &str) -> Self { |
| self.device = device.to_string(); |
| self |
| } |
|
|
| |
| pub fn validate(&self) -> Result<()> { |
| if !self.model_dir.exists() { |
| log::warn!( |
| "Model directory does not exist: {}", |
| self.model_dir.display() |
| ); |
| } |
|
|
| if self.max_text_length == 0 { |
| return Err(Error::Config("max_text_length must be > 0".into())); |
| } |
|
|
| if self.max_audio_duration <= 0.0 { |
| return Err(Error::Config("max_audio_duration must be > 0".into())); |
| } |
|
|
| Ok(()) |
| } |
| } |
|
|
| |
| pub fn segment_text(text: &str, max_segment_len: usize) -> Vec<String> { |
| use crate::text::TextNormalizer; |
|
|
| let normalizer = TextNormalizer::new(); |
| let sentences = normalizer.split_sentences(text); |
|
|
| let mut segments = Vec::new(); |
| let mut current_segment = String::new(); |
|
|
| for sentence in sentences { |
| if current_segment.len() + sentence.len() > max_segment_len && !current_segment.is_empty() |
| { |
| segments.push(current_segment.trim().to_string()); |
| current_segment = sentence; |
| } else { |
| if !current_segment.is_empty() { |
| current_segment.push(' '); |
| } |
| current_segment.push_str(&sentence); |
| } |
| } |
|
|
| if !current_segment.trim().is_empty() { |
| segments.push(current_segment.trim().to_string()); |
| } |
|
|
| segments |
| } |
|
|
| |
/// Joins audio segments into one buffer, inserting silence between them.
///
/// Between each pair of adjacent segments,
/// `silence_duration_ms * sample_rate / 1000` zero samples are inserted
/// (integer division). No silence is added before the first or after the
/// last segment. An empty `segments` slice yields an empty vector.
pub fn concatenate_audio(
    segments: &[Vec<f32>],
    silence_duration_ms: u32,
    sample_rate: u32,
) -> Vec<f32> {
    // A u32 * u32 product always fits in u64, so widening first avoids the
    // overflow the same multiplication could hit in a 32-bit `usize`.
    let silence_wide = u64::from(silence_duration_ms) * u64::from(sample_rate) / 1000;
    // Clamp instead of wrapping if the gap cannot be represented in `usize`.
    let silence_samples = silence_wide.min(usize::MAX as u64) as usize;

    // Reserve the full output size once instead of growing repeatedly.
    let gap_count = segments.len().saturating_sub(1);
    let audio_samples: usize = segments.iter().map(Vec::len).sum();
    let capacity = audio_samples.saturating_add(silence_samples.saturating_mul(gap_count));
    let mut result = Vec::with_capacity(capacity);

    let mut remaining = segments.iter();
    if let Some(first) = remaining.next() {
        result.extend_from_slice(first);
        for segment in remaining {
            // Zero-fill the gap in place; no separate silence buffer needed.
            let padded_len = result.len() + silence_samples;
            result.resize(padded_len, 0.0f32);
            result.extend_from_slice(segment);
        }
    }

    result
}
|
|
| |
/// Estimates a duration as the character count of `text` divided by
/// `chars_per_second`.
///
/// Characters are counted as Unicode scalar values (`str::chars`), not bytes.
///
/// Edge cases:
/// - Empty `text` always yields `0.0`, whatever the rate.
/// - A NaN, zero, or negative `chars_per_second` with non-empty text yields
///   `f32::INFINITY`, so the estimate is never NaN or negative.
pub fn estimate_duration(text: &str, chars_per_second: f32) -> f32 {
    let char_count = text.chars().count();
    if char_count == 0 {
        // Avoids 0.0 / 0.0 == NaN when the rate is also zero.
        return 0.0;
    }

    // A non-positive or NaN rate cannot produce a meaningful finite estimate;
    // report an unbounded duration rather than a negative or NaN value.
    if chars_per_second.is_nan() || chars_per_second <= 0.0 {
        return f32::INFINITY;
    }

    char_count as f32 / chars_per_second
}
|
|