| |
| |
| |
| |
|
|
| use lazy_static::lazy_static; |
| use std::collections::HashMap; |
|
|
| lazy_static! { |
| |
| static ref G2P_DICT: HashMap<&'static str, Vec<&'static str>> = { |
| let mut m = HashMap::new(); |
| |
| m.insert("hello", vec!["HH", "AH0", "L", "OW1"]); |
| m.insert("world", vec!["W", "ER1", "L", "D"]); |
| m.insert("the", vec!["DH", "AH0"]); |
| m.insert("a", vec!["AH0"]); |
| m.insert("is", vec!["IH1", "Z"]); |
| m.insert("to", vec!["T", "UW1"]); |
| m.insert("and", vec!["AH0", "N", "D"]); |
| m.insert("in", vec!["IH0", "N"]); |
| m.insert("that", vec!["DH", "AE1", "T"]); |
| m.insert("have", vec!["HH", "AE1", "V"]); |
| m.insert("for", vec!["F", "AO1", "R"]); |
| m.insert("not", vec!["N", "AA1", "T"]); |
| m.insert("with", vec!["W", "IH1", "DH"]); |
| m.insert("you", vec!["Y", "UW1"]); |
| m.insert("this", vec!["DH", "IH1", "S"]); |
| m.insert("but", vec!["B", "AH1", "T"]); |
| m.insert("from", vec!["F", "R", "AH1", "M"]); |
| m.insert("they", vec!["DH", "EY1"]); |
| m.insert("we", vec!["W", "IY1"]); |
| m.insert("say", vec!["S", "EY1"]); |
| m.insert("she", vec!["SH", "IY1"]); |
| m.insert("or", vec!["AO1", "R"]); |
| m.insert("an", vec!["AE1", "N"]); |
| m.insert("will", vec!["W", "IH1", "L"]); |
| m.insert("my", vec!["M", "AY1"]); |
| m.insert("one", vec!["W", "AH1", "N"]); |
| m.insert("all", vec!["AO1", "L"]); |
| m.insert("would", vec!["W", "UH1", "D"]); |
| m.insert("there", vec!["DH", "EH1", "R"]); |
| m.insert("their", vec!["DH", "EH1", "R"]); |
| m |
| }; |
|
|
| |
| static ref PINYIN_MAP: HashMap<&'static str, (&'static str, &'static str)> = { |
| let mut m = HashMap::new(); |
| |
| m.insert("ba", ("b", "a")); |
| m.insert("pa", ("p", "a")); |
| m.insert("ma", ("m", "a")); |
| m.insert("fa", ("f", "a")); |
| m.insert("da", ("d", "a")); |
| m.insert("ta", ("t", "a")); |
| m.insert("na", ("n", "a")); |
| m.insert("la", ("l", "a")); |
| m.insert("ga", ("g", "a")); |
| m.insert("ka", ("k", "a")); |
| m.insert("ha", ("h", "a")); |
| m.insert("zha", ("zh", "a")); |
| m.insert("cha", ("ch", "a")); |
| m.insert("sha", ("sh", "a")); |
| m.insert("za", ("z", "a")); |
| m.insert("ca", ("c", "a")); |
| m.insert("sa", ("s", "a")); |
| m.insert("ni", ("n", "i")); |
| m.insert("hao", ("h", "ao")); |
| m.insert("shi", ("sh", "i")); |
| m.insert("jie", ("j", "ie")); |
| m.insert("zhong", ("zh", "ong")); |
| m.insert("guo", ("g", "uo")); |
| m.insert("ren", ("r", "en")); |
| m.insert("ming", ("m", "ing")); |
| m.insert("de", ("d", "e")); |
| m.insert("yi", ("", "i")); |
| m.insert("er", ("", "er")); |
| m.insert("san", ("s", "an")); |
| m.insert("si", ("s", "i")); |
| m.insert("wu", ("", "u")); |
| m.insert("liu", ("l", "iu")); |
| m.insert("qi", ("q", "i")); |
| m.insert("jiu", ("j", "iu")); |
| m |
| }; |
| } |
|
|
| |
| pub fn g2p_english(word: &str) -> Vec<String> { |
| let lower = word.to_lowercase(); |
|
|
| if let Some(phones) = G2P_DICT.get(lower.as_str()) { |
| phones.iter().map(|s| s.to_string()).collect() |
| } else { |
| |
| word.chars() |
| .map(|c| c.to_uppercase().to_string()) |
| .collect() |
| } |
| } |
|
|
| |
| pub fn text_to_phonemes(text: &str) -> Vec<String> { |
| let mut phonemes = Vec::new(); |
|
|
| let words: Vec<&str> = text.split_whitespace().collect(); |
|
|
| for (i, word) in words.iter().enumerate() { |
| let clean_word: String = word |
| .chars() |
| .filter(|c| c.is_alphabetic()) |
| .collect(); |
|
|
| if !clean_word.is_empty() { |
| phonemes.extend(g2p_english(&clean_word)); |
| } |
|
|
| |
| if i < words.len() - 1 { |
| phonemes.push(" ".to_string()); |
| } |
| } |
|
|
| phonemes |
| } |
|
|
| |
/// Tone of a Mandarin pinyin syllable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Tone {
    /// Tone 1 (macron mark, e.g. `ā`, or trailing digit `1`).
    First,
    /// Tone 2 (acute mark, e.g. `á`, or trailing digit `2`).
    Second,
    /// Tone 3 (caron mark, e.g. `ǎ`, or trailing digit `3`).
    Third,
    /// Tone 4 (grave mark, e.g. `à`, or trailing digit `4`).
    Fourth,
    /// No tone mark, or a trailing digit other than 1-4.
    Neutral,
}

/// Splits a pinyin syllable into its toneless spelling and its [`Tone`].
///
/// Two notations are understood:
///
/// * **Diacritics** - the first entry of the tone-mark table that occurs in
///   `pinyin` determines the tone; every occurrence of that marked vowel is
///   replaced by the plain vowel. Upper-case marked vowels (`Ā`, `Ǐ`, ...)
///   are recognised as well and are replaced by the upper-case plain vowel,
///   so the letter case of the input is preserved.
/// * **Trailing digit** - if the (mark-stripped) string ends in an ASCII
///   digit, the digit is removed and decides the tone: `1`-`4` map to the
///   four tones, any other digit maps to [`Tone::Neutral`]. A trailing digit
///   takes precedence over a diacritic.
///
/// Input with neither notation is returned unchanged with
/// [`Tone::Neutral`].
pub fn extract_tone(pinyin: &str) -> (String, Tone) {
    // (marked vowel, plain vowel, tone) - lower-case forms only; the
    // upper-case forms are derived on the fly.
    const TONE_MARKS: [(char, char, Tone); 24] = [
        ('ā', 'a', Tone::First),
        ('á', 'a', Tone::Second),
        ('ǎ', 'a', Tone::Third),
        ('à', 'a', Tone::Fourth),
        ('ē', 'e', Tone::First),
        ('é', 'e', Tone::Second),
        ('ě', 'e', Tone::Third),
        ('è', 'e', Tone::Fourth),
        ('ī', 'i', Tone::First),
        ('í', 'i', Tone::Second),
        ('ǐ', 'i', Tone::Third),
        ('ì', 'i', Tone::Fourth),
        ('ō', 'o', Tone::First),
        ('ó', 'o', Tone::Second),
        ('ǒ', 'o', Tone::Third),
        ('ò', 'o', Tone::Fourth),
        ('ū', 'u', Tone::First),
        ('ú', 'u', Tone::Second),
        ('ǔ', 'u', Tone::Third),
        ('ù', 'u', Tone::Fourth),
        ('ǖ', 'ü', Tone::First),
        ('ǘ', 'ü', Tone::Second),
        ('ǚ', 'ü', Tone::Third),
        ('ǜ', 'ü', Tone::Fourth),
    ];

    /// Upper-case counterpart of `c` (or `c` itself when there is none).
    fn upper(c: char) -> char {
        c.to_uppercase().next().unwrap_or(c)
    }

    let mut tone = Tone::Neutral;

    // Table order decides which mark wins, exactly as before; the only
    // addition is that the upper-case form of each mark is accepted too.
    let hit = TONE_MARKS
        .iter()
        .copied()
        .find(|&(marked, _, _)| pinyin.contains(marked) || pinyin.contains(upper(marked)));

    let mut result: String = match hit {
        Some((marked, plain, marked_tone)) => {
            tone = marked_tone;
            let (marked_upper, plain_upper) = (upper(marked), upper(plain));
            pinyin
                .chars()
                .map(|c| {
                    if c == marked {
                        plain
                    } else if c == marked_upper {
                        plain_upper
                    } else {
                        c
                    }
                })
                .collect()
        }
        None => pinyin.to_string(),
    };

    // Numeric notation ("hao3"): the trailing digit is stripped and
    // overrides whatever a diacritic said.
    let trailing_digit = result
        .chars()
        .next_back()
        .filter(|c| c.is_ascii_digit())
        .and_then(|c| c.to_digit(10));

    if let Some(digit) = trailing_digit {
        tone = match digit {
            1 => Tone::First,
            2 => Tone::Second,
            3 => Tone::Third,
            4 => Tone::Fourth,
            _ => Tone::Neutral,
        };
        result.pop();
    }

    (result, tone)
}
|
|
| |
| pub fn pinyin_to_phones(pinyin: &str) -> Vec<String> { |
| let (base, tone) = extract_tone(pinyin); |
| let lower = base.to_lowercase(); |
|
|
| let mut phones = Vec::new(); |
|
|
| if let Some(&(initial, final_part)) = PINYIN_MAP.get(lower.as_str()) { |
| if !initial.is_empty() { |
| phones.push(initial.to_string()); |
| } |
| phones.push(final_part.to_string()); |
| } else { |
| |
| phones.push(lower); |
| } |
|
|
| |
| let tone_str = match tone { |
| Tone::First => "1", |
| Tone::Second => "2", |
| Tone::Third => "3", |
| Tone::Fourth => "4", |
| Tone::Neutral => "5", |
| }; |
| phones.push(tone_str.to_string()); |
|
|
| phones |
| } |
|
|
| |
/// Looks up the numbered-tone pinyin reading of a single Chinese character.
///
/// Only a small built-in set of characters is covered. Returns
/// `Some("ni3")`-style strings (syllable followed by a tone digit, `5`
/// meaning neutral tone) for known characters and `None` for everything
/// else.
///
/// The table is a `match`, so a lookup performs no allocation besides the
/// returned `String`.
pub fn char_to_pinyin(ch: char) -> Option<String> {
    let reading = match ch {
        '你' => "ni3",
        '好' => "hao3",
        '世' => "shi4",
        '界' => "jie4",
        '中' => "zhong1",
        '国' => "guo2",
        '人' => "ren2",
        '我' => "wo3",
        '是' => "shi4",
        '的' => "de5",
        '了' => "le5",
        '在' => "zai4",
        '有' => "you3",
        '个' => "ge4",
        '这' => "zhe4",
        '他' => "ta1",
        '说' => "shuo1",
        '来' => "lai2",
        '要' => "yao4",
        '就' => "jiu4",
        '出' => "chu1",
        '会' => "hui4",
        '可' => "ke3",
        '以' => "yi3",
        '时' => "shi2",
        '大' => "da4",
        '看' => "kan4",
        '地' => "di4",
        '不' => "bu4",
        '对' => "dui4",
        _ => return None,
    };

    Some(reading.to_string())
}
|
|
| |
| pub fn segment_chinese(text: &str) -> Vec<String> { |
| use jieba_rs::Jieba; |
|
|
| let jieba = Jieba::new(); |
| let words = jieba.cut(text, false); |
| words.into_iter().map(|s| s.to_string()).collect() |
| } |
|
|
| |
| pub fn chinese_to_pinyin(text: &str) -> Vec<String> { |
| let mut pinyin_seq = Vec::new(); |
|
|
| for ch in text.chars() { |
| if super::is_chinese_char(ch) { |
| if let Some(py) = char_to_pinyin(ch) { |
| pinyin_seq.push(py); |
| } else { |
| |
| pinyin_seq.push(format!("_{}_", ch)); |
| } |
| } else if !ch.is_whitespace() { |
| pinyin_seq.push(ch.to_string()); |
| } |
| } |
|
|
| pinyin_seq |
| } |
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A dictionary word is returned with its stored phones.
    #[test]
    fn test_g2p_english() {
        assert_eq!(g2p_english("hello"), vec!["HH", "AH0", "L", "OW1"]);
    }

    /// A word missing from the dictionary is spelled out in upper case.
    #[test]
    fn test_g2p_unknown() {
        assert_eq!(g2p_english("xyz"), vec!["X", "Y", "Z"]);
    }

    /// Both the diacritic and the trailing-digit notation are understood.
    #[test]
    fn test_extract_tone() {
        let (marked_base, marked_tone) = extract_tone("nǐ");
        assert_eq!(marked_base, "ni");
        assert_eq!(marked_tone, Tone::Third);

        let (numbered_base, numbered_tone) = extract_tone("hao3");
        assert_eq!(numbered_base, "hao");
        assert_eq!(numbered_tone, Tone::Third);
    }

    /// A known syllable yields its initial, its final and the tone digit.
    #[test]
    fn test_pinyin_to_phones() {
        let phones = pinyin_to_phones("hao3");
        for expected in ["h", "ao", "3"].iter() {
            assert!(phones.contains(&expected.to_string()));
        }
    }

    /// Characters from the built-in table resolve to numbered pinyin.
    #[test]
    fn test_char_to_pinyin() {
        let cases = [('你', "ni3"), ('好', "hao3")];
        for &(ch, reading) in cases.iter() {
            assert_eq!(char_to_pinyin(ch), Some(reading.to_string()));
        }
    }

    /// A four-character phrase is cut into at least two segments.
    #[test]
    fn test_segment_chinese() {
        assert!(segment_chinese("你好世界").len() >= 2);
    }
}
|
|