Spaces:
No application file
No application file
| import re | |
| import gradio | |
| import tqdm | |
| from bark.api import * | |
| from .bark_generation import generate_text_semantic_new, generate_coarse_new, generate_fine_new, codec_decode_new, SAMPLE_RATE | |
| from typing import Union | |
def text_to_semantic_new(
    text: str,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    allow_early_stop: bool = True,
    min_eos_p: float = 0.2,
    progress=gradio.Progress()
):
    """Generate a semantic token array from text.

    Thin wrapper over `generate_text_semantic_new` that always enables
    KV caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        allow_early_stop: (Added in new) set to False to generate until the limit
        min_eos_p: (Added in new) generation stopping likeliness; lower means
            more likely to stop
        progress: (Added in new) Gradio progress bar
    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic_new(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        allow_early_stop=allow_early_stop,
        min_eos_p=min_eos_p,
        progress=progress,
    )
def semantic_to_waveform_new(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    skip_fine: bool = False,
    decode_on_cpu: bool = False,
    progress=gradio.Progress()
):
    """Generate an audio array from semantic input.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature for the coarse stage
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
        skip_fine: (Added in new) skip converting coarse to fine; the coarse
            tokens are then decoded directly
        decode_on_cpu: (Added in new) move everything to cpu when decoding,
            useful for decoding huge audio files on medium vram
        progress: (Added in new) Gradio progress bar
    Returns:
        numpy audio array at sample frequency 24khz (preceded by the
        full-generation dict when `output_full` is set)
    """
    coarse_tokens = generate_coarse_new(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        progress=progress,
    )
    # The fine stage runs at a fixed temperature of 0.5; when skipped,
    # the coarse tokens stand in for the fine tokens everywhere below.
    fine_tokens = coarse_tokens if skip_fine else generate_fine_new(
        coarse_tokens,
        history_prompt=history_prompt,
        temp=0.5,
        progress=progress,
    )
    audio_arr = codec_decode_new(fine_tokens, decode_on_cpu)
    if not output_full:
        return audio_arr
    full_generation = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }
    return full_generation, audio_arr
def strict_split(string: str, regex='([.,:;!?\\n])'):
    """Split *string* into chunks, keeping each delimiter attached to the
    text that precedes it.

    Args:
        string: the text to split.
        regex: a pattern with ONE capturing group matching the delimiter,
            so `re.split` alternates text and delimiter in its result.

    Returns:
        List of chunks, each ending with its delimiter except possibly the
        last. Fixes the previous behavior of emitting an empty final chunk
        when the string ends with a delimiter (e.g. "end." used to yield
        ['end.', '']).
    """
    parts = re.split(regex, string)
    # With a capturing group, re.split alternates text (even index) and
    # delimiter (odd index); glue each delimiter back onto its text.
    chunks = [text + delim for text, delim in zip(parts[0::2], parts[1::2])]
    # re.split always ends on a text piece; keep it only when non-empty,
    # so strings ending in a delimiter don't produce an empty chunk.
    if parts and parts[-1]:
        chunks.append(parts[-1])
    return chunks
def non_strict_split(string: str):
    """Split only on periods, keeping each '.' attached to its sentence."""
    period_pattern = '(\\.)'
    return strict_split(string, period_pattern)
def long_merge(splits: list[str]):
    """Greedily merge consecutive chunks into pieces of at most ~220
    characters.

    Args:
        splits: ordered text chunks to merge.

    Returns:
        List of merged strings; chunk order and total text are preserved.
        A single chunk longer than the limit is passed through unsplit.
    """
    limit = 220  # Estimated for normal speaking speed
    merged = []
    buffer = ''
    for chunk in splits:
        if len(buffer) + len(chunk) <= limit:
            buffer += chunk
            continue
        # Chunk would overflow the current piece: flush and start anew.
        if buffer:
            merged.append(buffer)
        buffer = chunk
    if buffer:
        merged.append(buffer)
    return merged
def strict_short(string):
    """Split on every sentence delimiter without merging (short chunks)."""
    return strict_split(string)
def strict_long(string):
    """Split on every sentence delimiter, then merge into ~220-char pieces."""
    short_chunks = strict_split(string)
    return long_merge(short_chunks)
def non_strict_short(string):
    """Split only on periods without merging (short chunks)."""
    return non_strict_split(string)
def non_strict_long(string):
    """Split only on periods, then merge into ~220-char pieces."""
    period_chunks = non_strict_split(string)
    return long_merge(period_chunks)
| def generate_audio_new( | |
| text: str, | |
| history_prompt: Optional[Union[str, dict]] = None, | |
| text_temp: float = 0.7, | |
| waveform_temp: float = 0.7, | |
| silent: bool = False, | |
| output_full: bool = False, | |
| skip_fine: bool = False, | |
| decode_on_cpu: bool = False, | |
| allow_early_stop: bool = True, | |
| min_eos_p: float = 0.2, | |
| long_gen_silence_secs: float = 0, | |
| long_gen_re_feed: bool = True, | |
| gen_prefix: str = '', | |
| split_type: str = 'Manual', | |
| progress=gradio.Progress() | |
| ): | |
| """Generate audio array from input text. | |
| Args: | |
| text: text to be turned into audio | |
| history_prompt: history choice for audio cloning | |
| text_temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| silent: disable progress bar | |
| output_full: return full generation to be used as a history prompt | |
| skip_fine: (Added in new) Skip converting from coarse to fine | |
| decode_on_cpu: (Added in new) Decode on cpu | |
| allow_early_stop: (Added in new) Set to false to continue until the limit is reached | |
| min_eos_p: (Added in new) Lower values stop the generation earlier. | |
| long_gen_silence_secs: (Added in new) The amount of silence between clips for long form generations. | |
| long_gen_re_feed: (Added in new) For longer generations (\n) use the last generated chunk as the prompt for the next. Better continuation at risk of changing voice. | |
| gen_prefix: (Added in new) A prefix to add to every single generated chunk. | |
| split_type: (Added in new) The way to split the clips. | |
| progress: (Added in new) Gradio progress bar. | |
| Returns: | |
| numpy audio array at sample frequency 24khz | |
| """ | |
| if gen_prefix: | |
| gen_prefix = gen_prefix + ' ' | |
| silence = np.zeros(int(long_gen_silence_secs * SAMPLE_RATE)) | |
| gen_audio = [] | |
| if text: | |
| match split_type.casefold(): | |
| case 'manual': | |
| gen_sections = text.strip().split('\n') | |
| case 'strict short': | |
| gen_sections = strict_short(text) | |
| case 'strict long': | |
| gen_sections = strict_long(text) | |
| case 'non-strict short': | |
| gen_sections = non_strict_short(text) | |
| case 'non-strict long': | |
| gen_sections = non_strict_long(text) | |
| case _: | |
| print('??? Unknown split method selected. Not splitting.') | |
| gen_sections = [text] | |
| else: | |
| gen_sections = [text] | |
| print('Generation split into sections:', gen_sections) | |
| for input_text in tqdm.tqdm(gen_sections, desc='Generation section'): | |
| input_text = gen_prefix + input_text | |
| semantic_tokens = text_to_semantic_new( | |
| input_text, | |
| history_prompt=history_prompt, | |
| temp=text_temp, | |
| silent=silent, | |
| allow_early_stop=allow_early_stop, | |
| min_eos_p=min_eos_p, | |
| progress=progress | |
| ) | |
| out = semantic_to_waveform_new( | |
| semantic_tokens, | |
| history_prompt=history_prompt, | |
| temp=waveform_temp, | |
| silent=silent, | |
| output_full=True, | |
| skip_fine=skip_fine, | |
| decode_on_cpu=decode_on_cpu, | |
| progress=progress | |
| ) | |
| full_generation, gen_audio_new = out | |
| if long_gen_re_feed: | |
| history_prompt = full_generation | |
| gen_audio += [gen_audio_new, silence.copy()] | |
| gen_audio = np.concatenate(gen_audio) | |
| if output_full: | |
| return full_generation, gen_audio | |
| return gen_audio | |