Spaces:

Teapack1
/

Assistant-Audio-Intent-Classification

Sleeping

App Files Files Community

Teapack1 committed on Nov 24, 2023

Commit

22ba507

1 Parent(s): 51a2f53

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -27

app.py CHANGED Viewed

@@ -1,35 +1,69 @@
 from transformers import pipeline
-model_id = "sanchit-gandhi/whisper-small-dv"  # update with your model id
-pipe = pipeline("automatic-speech-recognition", model=model_id)
-def transcribe_speech(filepath):
-    output = pipe(
-        filepath,
-        max_new_tokens=256,
-        generate_kwargs={
-            "task": "transcribe",
-            "language": "sinhalese",
-        },  # update with the language you've fine-tuned on
-        chunk_length_s=30,
-        batch_size=8,
-    )
-    return output["text"]
-import gradio as gr
-demo = gr.Blocks()
-mic_transcribe = gr.Interface(
-    fn=transcribe_speech,
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.outputs.Textbox(),
-)
-with demo:
-    gr.TabbedInterface(
-        [mic_transcribe],
-        ["Transcribe Microphone"],
-    )
 demo.launch(debug=True)

+import gradio as gr
 from transformers import pipeline
+import numpy as np
+import time
+# Initialize the pipelines
+# Both pipelines are built once at import time and reused by every call to
+# transcribe_and_classify.
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
+classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
+# Labels the zero-shot classifier chooses between for each transcription.
+candidate_labels = ["dim the light", "turn on light fully", "turn off light fully", "raise the light", "not about lighting"]
+last_update_time = time.time() - 5  # Initialize with a value to ensure immediate first update
+# Buffer to hold the last updated values
+# These are returned unchanged on calls that fall inside the 5-second throttle window.
+last_transcription = ""
+last_classification = ""
+# Streaming callback: append the new audio chunk to a rolling buffer and, at
+# most once every 5 seconds, transcribe the buffer and classify the text.
+#
+# Parameters:
+#   stream    -- the audio buffer returned by the previous call, or None on
+#                the first call.
+#   new_chunk -- a (sampling_rate, samples) pair; samples must support
+#                .astype (presumably a NumPy array -- confirm with caller).
+# Returns:
+#   (stream, last_transcription, last_classification) -- the updated buffer
+#   plus the most recently computed transcription and classification strings.
+#
+# NOTE(review): the throttle timestamp and the two result strings are
+# module-level globals, so they are not kept per buffer -- presumably shared
+# by every caller of this function; confirm that is intended.
+def transcribe_and_classify(stream, new_chunk):
+    global last_update_time, last_transcription, last_classification
+    sr, y = new_chunk
+    y = y.astype(np.float32)
+    # Scale the chunk by its own peak absolute value.
+    # NOTE(review): an all-zero (silent) chunk makes the divisor 0, which
+    # would yield NaN samples -- consider guarding against that.
+    y /= np.max(np.abs(y))
+    # Concatenate new audio chunk to the stream
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+    # Keep only the most recent 5 * sr samples (5 seconds of audio).
+    # NOTE(review): the variable name says "10 seconds" but the value is
+    # 5 * sr -- confirm which window length is intended.
+    num_samples_last_10_seconds = 5 * sr
+    if len(stream) > num_samples_last_10_seconds:
+        stream = stream[-num_samples_last_10_seconds:]
+    current_time = time.time()
+    # Update every 5 seconds
+    if current_time - last_update_time >= 5:
+        last_update_time = current_time
+        # Transcribe the buffered audio (at most 5 * sr samples, see above)
+        transcription = transcriber({"sampling_rate": sr, "task": "transcribe", "language": "english", "raw": stream})["text"]
+        last_transcription = transcription  # Update the buffer
+        # Classify the transcribed text
+        # A blank transcription skips classification, so the previous
+        # classification string is kept.
+        if transcription.strip():
+            output = classifier(transcription, candidate_labels, multi_label=False)
+            # Index 0 is taken as the best match -- assumes the pipeline
+            # returns labels/scores sorted best-first; TODO confirm.
+            top_label = output['labels'][0]
+            top_score = output['scores'][0]
+            last_classification = f"{top_label.upper()}, score: {top_score:.2f}"
+    # Return the last updated transcription and classification
+    return stream, last_transcription, last_classification
+# Define the Gradio interface
+# The two inputs line up positionally with transcribe_and_classify's
+# parameters (stream, new_chunk); the three outputs line up with its returned
+# (stream, last_transcription, last_classification) tuple.
+demo = gr.Interface(
+    fn=transcribe_and_classify,
+    inputs=[
+        "state",
+        gr.Audio(sources=["microphone"], streaming=True)
+    ],
+    outputs=[
+        "state",
+        "text",
+        "text"
+    ],
+    live=True
+)
+# Launch the demo
 demo.launch(debug=True)