Spaces: Running on Zero

Upload app.py

app.py CHANGED
@@ -156,16 +156,6 @@ CHOICES = {
 }
 VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
 
-np_log_99 = np.log(99)
-def s_curve(p):
-    if p <= 0:
-        return 0
-    elif p >= 1:
-        return 1
-    s = 1 / (1 + np.exp((1-p*2)*np_log_99))
-    s = (s-0.01) * 50/49
-    return s
-
 SAMPLE_RATE = 24000
 
 @torch.no_grad()
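For reference, the deleted s_curve is a logistic ease on [0, 1]: np.log(99) pins the raw sigmoid at 0.01 for p=0 and 0.99 for p=1, and the (s-0.01) * 50/49 rescale stretches that to exactly 0 and 1, so a fade starts and ends cleanly. A standalone sketch of the removed helper, runnable as-is:

    import numpy as np

    NP_LOG_99 = np.log(99)  # raw sigmoid evaluates to 0.01 at p=0 and 0.99 at p=1

    def s_curve(p):
        """Logistic ease mapping [0, 1] -> [0, 1], as in the removed code."""
        if p <= 0:
            return 0
        elif p >= 1:
            return 1
        s = 1 / (1 + np.exp((1 - p * 2) * NP_LOG_99))
        return (s - 0.01) * 50 / 49  # rescale [0.01, 0.99] onto [0, 1]

    assert s_curve(0) == 0 and s_curve(1) == 1
    assert abs(s_curve(0.5) - 0.5) < 1e-9  # symmetric about the midpoint

It goes away because the ease-in/ease-out fades that consumed it are replaced by a plain symmetric trim in the hunks below.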
@@ -198,10 +188,10 @@ def forward_gpu(tokens, voice, speed):
     return forward(tokens, voice, speed, device='cuda')
 
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
-def generate(text, voice, ps, speed, _reduce_noise, opening_cut, closing_cut, ease_in, ease_out, _pad_before, _pad_after, use_gpu):
-    return _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu)
+def generate(text, voice, ps, speed, _reduce_noise, trim, _closing_cut, _ease_in, _ease_out, _pad_before, _pad_after, use_gpu):
+    return _generate(text, voice, ps, speed, trim, use_gpu)
 
-def _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu):
+def _generate(text, voice, ps, speed, trim, use_gpu):
     if voice not in VOICES['cpu']:
         voice = 'af'
     ps = ps or phonemize(text, voice)
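The comment above generate pins its public signature for the TTS-Spaces-Arena space: rather than deleting the retired knobs, the new version keeps the twelve-parameter arity and underscore-prefixes the ones it ignores, with trim taking over the slot of the old opening cut (per the reconstruction above). A toy sketch with stand-in bodies and hypothetical values, showing why positional callers written against the old signature keep working:

    # Stand-in for the Space's private worker, just echoing what it receives.
    def _generate(text, voice, ps, speed, trim, use_gpu):
        return (text, voice, ps, speed, trim, use_gpu)

    # Same arity as before; ignored parameters are kept but underscored.
    def generate(text, voice, ps, speed, _reduce_noise, trim,
                 _closing_cut, _ease_in, _ease_out, _pad_before, _pad_after, use_gpu):
        return _generate(text, voice, ps, speed, trim, use_gpu)

    # An old-style caller passing all twelve arguments positionally still binds.
    print(generate('Hello world', 'af', None, 1.0, False, 4000, 2000, 3000, 1000, 0, 0, False))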
@@ -219,18 +209,11 @@ def _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu):
     except gr.exceptions.Error as e:
         raise gr.Error(e)
         return (None, '')
-    opening_cut = int(opening_cut / speed)
-    if opening_cut > 0:
-        out = out[opening_cut:]
-    closing_cut = int(closing_cut / speed)
-    if closing_cut > 0:
-        out = out[:-closing_cut]
-    ease_in = min(int(ease_in / speed), len(out)//2)
-    for i in range(ease_in):
-        out[i] *= s_curve(i / ease_in)
-    ease_out = min(int(ease_out / speed), len(out)//2)
-    for i in range(ease_out):
-        out[-i-1] *= s_curve(i / ease_out)
+    trim = int(trim / speed)
+    if trim > 0:
+        if trim * 2 >= len(out):
+            return (None, '')
+        out = out[trim:-trim]
     return ((SAMPLE_RATE, out), ps)
 
 def toggle_autoplay(autoplay):
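The replacement trim is given in samples at 1x speed, so it is first divided by speed (faster speech yields proportionally fewer output samples), then cut from both ends, with a guard that returns nothing when the two cuts would meet. Since SAMPLE_RATE is 24000, the slider's 0 to 24000 range spans up to one second per end. A minimal sketch of the same guard on a dummy array:

    import numpy as np

    SAMPLE_RATE = 24000  # 24000 samples = 1 second, so trim=4000 is ~167 ms per end

    def trim_ends(out, trim, speed):
        """Symmetric end trim, mirroring the new _generate logic."""
        trim = int(trim / speed)      # scale slider samples into output samples
        if trim > 0:
            if trim * 2 >= len(out):  # refuse to trim the clip away entirely
                return None
            out = out[trim:-trim]
        return out

    audio = np.ones(SAMPLE_RATE)                  # one second of dummy audio
    print(len(trim_ends(audio, 4000, 1.0)))       # 16000 samples remain
    print(trim_ends(audio, 12000, 1.0) is None)   # True: 12000 * 2 >= 24000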
@@ -271,25 +254,15 @@ with gr.Blocks() as basic_tts:
             phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
-            autoplay = gr.Checkbox(value=True, label='Autoplay')
-            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+            with gr.Accordion('Audio Settings', open=False):
+                autoplay = gr.Checkbox(value=True, label='Autoplay')
+                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
+                trim = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Trim', info='Cut from both ends')
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
-
-
-            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
-            with gr.Row():
-                with gr.Column():
-                    opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut samples from the start')
-                with gr.Column():
-                    closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut samples from the end')
-            with gr.Row():
-                with gr.Column():
-                    ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in samples, after opening cut')
-                with gr.Column():
-                    ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out samples, before closing cut')
-            text.submit(_generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
-            generate_btn.click(_generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
+            text.submit(_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
+            generate_btn.click(_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
 
 @torch.no_grad()
 def lf_forward(token_lists, voice, speed, device='cpu'):
@@ -376,11 +349,10 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu):
+def lf_generate(segments, voice, speed, trim, pad_between, use_gpu):
     token_lists = list(map(tokenize, segments['Tokens']))
     wavs = []
-    opening_cut = int(opening_cut / speed)
-    closing_cut = int(closing_cut / speed)
+    trim = int(trim / speed)
     pad_between = int(pad_between / speed)
     batch_size = 100
     for i in range(0, len(token_lists), batch_size):
@@ -396,16 +368,10 @@ def lf_generate(segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu):
             raise gr.Error(e)
             break
         for out in outs:
-            if opening_cut > 0:
-                out = out[opening_cut:]
-            if closing_cut > 0:
-                out = out[:-closing_cut]
-            ease_in = min(int(ease_in / speed), len(out)//2)
-            for i in range(ease_in):
-                out[i] *= s_curve(i / ease_in)
-            ease_out = min(int(ease_out / speed), len(out)//2)
-            for i in range(ease_out):
-                out[-i-1] *= s_curve(i / ease_out)
+            if trim > 0:
+                if trim * 2 >= len(out):
+                    continue
+                out = out[trim:-trim]
             if wavs and pad_between > 0:
                 wavs.append(np.zeros(pad_between))
             wavs.append(out)
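In the long-form path the same trim runs per segment, except that a segment the trim would consume is skipped with continue instead of aborting the whole job, and pad_between inserts that many speed-scaled samples of silence before every segment after the first. A sketch of that assembly, with join_segments as a hypothetical name:

    import numpy as np

    def join_segments(outs, trim, pad_between, speed):
        """Trim each segment, then concatenate with silence, as in lf_generate."""
        trim = int(trim / speed)
        pad_between = int(pad_between / speed)
        wavs = []
        for out in outs:
            if trim > 0:
                if trim * 2 >= len(out):
                    continue              # segment too short: drop it, keep going
                out = out[trim:-trim]
            if wavs and pad_between > 0:
                wavs.append(np.zeros(pad_between))  # silence between segments
            wavs.append(out)
        return np.concatenate(wavs) if wavs else np.zeros(0)

    segs = [np.ones(24000), np.ones(12000)]
    # (24000 - 2*1000) + 10000 + (12000 - 2*1000) = 42000 samples
    print(len(join_segments(segs, 1000, 10000, 1.0)))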
@@ -451,26 +417,15 @@ with gr.Blocks() as lf_tts:
             generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
-            with gr.Accordion('Audio Settings', open=True):
-                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
-
-                with gr.Row():
-                    with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut samples from the start')
-                    with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut samples from the end')
-                with gr.Row():
-                    with gr.Column():
-                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in samples, after opening cut')
-                    with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out samples, before closing cut')
-                with gr.Row():
-                    pad_between = gr.Slider(minimum=0, maximum=24000, value=10000, step=1000, label='🔇 Pad Between', info='How many samples of silence to insert between segments')
+            with gr.Accordion('Audio Settings', open=True):
+                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
+                trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
+                pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
     with gr.Row():
         segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
     segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
     segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu], outputs=[audio])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio])
 
 with gr.Blocks() as about:
     gr.Markdown("""
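In Gradio, the inputs list of an event wiring must match the target function's positional parameters one-to-one, which is why each click and submit wiring above shrinks in lockstep with the new signatures. A self-contained toy, with a hypothetical handler and components, showing that contract:

    import gradio as gr

    def handler(text, speed, trim):
        return f'{text} @ {speed}x, trim {trim} samples'

    with gr.Blocks() as demo:
        text = gr.Textbox(label='Text')
        speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
        trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Trim')
        out = gr.Textbox(label='Out')
        btn = gr.Button('Generate')
        # Component values arrive positionally, in inputs-list order.
        btn.click(handler, inputs=[text, speed, trim], outputs=[out])

    # demo.launch()  # uncomment to serve locally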