Spaces:
No application file
No application file
| import re | |
| import gradio | |
| import tqdm | |
| from bark.api import * | |
| from .bark_generation import generate_text_semantic_new, generate_coarse_new, generate_fine_new, codec_decode_new, SAMPLE_RATE | |
| from typing import Union | |
def text_to_semantic_new(
    text: str,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    allow_early_stop: bool = True,
    min_eos_p: float = 0.2,
    progress=gradio.Progress()
):
    """Generate a semantic token array from text.

    Thin wrapper over `generate_text_semantic_new` that always enables
    KV caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        allow_early_stop: (Added in new) set to False to generate until the limit
        min_eos_p: (Added in new) generation stopping likeliness; lower means
            more likely to stop
        progress: (Added in new) Gradio progress bar
    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic_new(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        allow_early_stop=allow_early_stop,
        min_eos_p=min_eos_p,
        progress=progress,
    )
def semantic_to_waveform_new(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    skip_fine: bool = False,
    decode_on_cpu: bool = False,
    progress=gradio.Progress()
):
    """Generate an audio array from semantic input.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature for the coarse stage
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
        skip_fine: (Added in new) skip converting coarse to fine; the coarse
            tokens are then decoded directly
        decode_on_cpu: (Added in new) move everything to cpu when decoding,
            useful for decoding huge audio files on medium vram
        progress: (Added in new) Gradio progress bar
    Returns:
        numpy audio array at sample frequency 24khz (preceded by the
        full-generation dict when `output_full` is set)
    """
    coarse_tokens = generate_coarse_new(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        progress=progress,
    )
    # The fine stage runs at a fixed temperature of 0.5; when skipped,
    # the coarse tokens stand in for the fine tokens everywhere below.
    fine_tokens = coarse_tokens if skip_fine else generate_fine_new(
        coarse_tokens,
        history_prompt=history_prompt,
        temp=0.5,
        progress=progress,
    )
    audio_arr = codec_decode_new(fine_tokens, decode_on_cpu)
    if not output_full:
        return audio_arr
    full_generation = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }
    return full_generation, audio_arr
def strict_split(string: str, regex='([.,:;!?\\n])'):
    """Split *string* into chunks, keeping each delimiter attached to the
    text that precedes it.

    Args:
        string: the text to split.
        regex: a pattern with ONE capturing group matching the delimiter,
            so `re.split` alternates text and delimiter in its result.

    Returns:
        List of chunks, each ending with its delimiter except possibly the
        last. Fixes the previous behavior of emitting an empty final chunk
        when the string ends with a delimiter (e.g. "end." used to yield
        ['end.', '']).
    """
    parts = re.split(regex, string)
    # With a capturing group, re.split alternates text (even index) and
    # delimiter (odd index); glue each delimiter back onto its text.
    chunks = [text + delim for text, delim in zip(parts[0::2], parts[1::2])]
    # re.split always ends on a text piece; keep it only when non-empty,
    # so strings ending in a delimiter don't produce an empty chunk.
    if parts and parts[-1]:
        chunks.append(parts[-1])
    return chunks
def non_strict_split(string: str):
    """Split only on periods, keeping each '.' attached to its sentence."""
    period_pattern = '(\\.)'
    return strict_split(string, period_pattern)
def long_merge(splits: list[str]):
    """Greedily merge consecutive chunks into pieces of at most ~220
    characters.

    Args:
        splits: ordered text chunks to merge.

    Returns:
        List of merged strings; chunk order and total text are preserved.
        A single chunk longer than the limit is passed through unsplit.
    """
    limit = 220  # Estimated for normal speaking speed
    merged = []
    buffer = ''
    for chunk in splits:
        if len(buffer) + len(chunk) <= limit:
            buffer += chunk
            continue
        # Chunk would overflow the current piece: flush and start anew.
        if buffer:
            merged.append(buffer)
        buffer = chunk
    if buffer:
        merged.append(buffer)
    return merged
def strict_short(string):
    """Split on every sentence delimiter without merging (short chunks)."""
    return strict_split(string)
def strict_long(string):
    """Split on every sentence delimiter, then merge into ~220-char pieces."""
    short_chunks = strict_split(string)
    return long_merge(short_chunks)
def non_strict_short(string):
    """Split only on periods without merging (short chunks)."""
    return non_strict_split(string)
def non_strict_long(string):
    """Split only on periods, then merge into ~220-char pieces."""
    period_chunks = non_strict_split(string)
    return long_merge(period_chunks)
| def generate_audio_new( | |
| text: str, | |
| history_prompt: Optional[Union[str, dict]] = None, | |
| text_temp: float = 0.7, | |
| waveform_temp: float = 0.7, | |
| silent: bool = False, | |
| output_full: bool = False, | |
| skip_fine: bool = False, | |
| decode_on_cpu: bool = False, | |
| allow_early_stop: bool = True, | |
| min_eos_p: float = 0.2, | |
| long_gen_silence_secs: float = 0, | |
| long_gen_re_feed: bool = True, | |
| gen_prefix: str = '', | |
| split_type: str = 'Manual', | |
| progress=gradio.Progress() | |
| ): | |
| """Generate audio array from input text. | |
| Args: | |
| text: text to be turned into audio | |
| history_prompt: history choice for audio cloning | |
| text_temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| silent: disable progress bar | |
| output_full: return full generation to be used as a history prompt | |
| skip_fine: (Added in new) Skip converting from coarse to fine | |
| decode_on_cpu: (Added in new) Decode on cpu | |
| allow_early_stop: (Added in new) Set to false to continue until the limit is reached | |
| min_eos_p: (Added in new) Lower values stop the generation earlier. | |
| long_gen_silence_secs: (Added in new) The amount of silence between clips for long form generations. | |
| long_gen_re_feed: (Added in new) For longer generations (\n) use the last generated chunk as the prompt for the next. Better continuation at risk of changing voice. | |
| gen_prefix: (Added in new) A prefix to add to every single generated chunk. | |
| split_type: (Added in new) The way to split the clips. | |
| progress: (Added in new) Gradio progress bar. | |
| Returns: | |
| numpy audio array at sample frequency 24khz | |
| """ | |
| if gen_prefix: | |
| gen_prefix = gen_prefix + ' ' | |
| silence = np.zeros(int(long_gen_silence_secs * SAMPLE_RATE)) | |
| gen_audio = [] | |
| if text: | |
| match split_type.casefold(): | |
| case 'manual': | |
| gen_sections = text.strip().split('\n') | |
| case 'strict short': | |
| gen_sections = strict_short(text) | |
| case 'strict long': | |
| gen_sections = strict_long(text) | |
| case 'non-strict short': | |
| gen_sections = non_strict_short(text) | |
| case 'non-strict long': | |
| gen_sections = non_strict_long(text) | |
| case _: | |
| print('??? Unknown split method selected. Not splitting.') | |
| gen_sections = [text] | |
| else: | |
| gen_sections = [text] | |
| print('Generation split into sections:', gen_sections) | |
| for input_text in tqdm.tqdm(gen_sections, desc='Generation section'): | |
| input_text = gen_prefix + input_text | |
| semantic_tokens = text_to_semantic_new( | |
| input_text, | |
| history_prompt=history_prompt, | |
| temp=text_temp, | |
| silent=silent, | |
| allow_early_stop=allow_early_stop, | |
| min_eos_p=min_eos_p, | |
| progress=progress | |
| ) | |
| out = semantic_to_waveform_new( | |
| semantic_tokens, | |
| history_prompt=history_prompt, | |
| temp=waveform_temp, | |
| silent=silent, | |
| output_full=True, | |
| skip_fine=skip_fine, | |
| decode_on_cpu=decode_on_cpu, | |
| progress=progress | |
| ) | |
| full_generation, gen_audio_new = out | |
| if long_gen_re_feed: | |
| history_prompt = full_generation | |
| gen_audio += [gen_audio_new, silence.copy()] | |
| gen_audio = np.concatenate(gen_audio) | |
| if output_full: | |
| return full_generation, gen_audio | |
| return gen_audio | |