import whisper as openai_whisper
from transformers import AutoModelForCausalLM, AutoTokenizer
from TTS.api import TTS
import gradio as gr
import torch
import os
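# Assumed dependencies (the original does not list them; these are the standard
# PyPI names for the imported libraries):
#   pip install openai-whisper transformers TTS gradio torch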
# 1. Speech-to-Text (STT) Implementation

def setup_stt():
    model = openai_whisper.load_model("base")  # Explicit OpenAI Whisper
    return model

def transcribe_audio(model, audio_file):
    result = model.transcribe(audio_file)
    print("Transcription:", result['text'])
    return result['text']
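# Note: Whisper decodes input files through ffmpeg, so ffmpeg must be installed
# on the host for transcribe_audio() to handle arbitrary audio formats.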
# 2. Natural Language Processing (NLP) Implementation

def setup_nlp():
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model

def generate_response(tokenizer, model, input_text):
    prompt = f"User: {input_text}\nAssistant:"
    # Keep inputs on the same device as the model (it may have been moved to GPU)
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    response = model.generate(
        input_ids,
        max_length=150,
        num_return_sequences=1,
        do_sample=True,  # sampling must be enabled for temperature/top_p to apply
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
        no_repeat_ngram_size=2
    )
    full_text = tokenizer.decode(response[0], skip_special_tokens=True)
    # generate() returns the prompt plus the completion; strip the echoed prompt
    return full_text[len(prompt):].strip()
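# Design note: base GPT-2 is not instruction-tuned, so replies can drift off
# topic; any chat-tuned causal LM that loads via AutoModelForCausalLM could be
# swapped in through model_name above without changing the surrounding code.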
# 3. Text-to-Speech (TTS) Implementation

def setup_tts():
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
    return tts

def generate_speech(tts, text, file_path="output.wav"):
    tts.tts_to_file(text=text, file_path=file_path)
    return file_path
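# Note: tacotron2-DDC trained on LJSpeech is a single-speaker English voice;
# Coqui TTS downloads the model weights on first use, so the first call is slow.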
# 4. Voice AI System Class

class VoiceAISystem:
    def __init__(self):
        print("Initializing Voice AI System...")
        print("Loading STT model...")
        self.stt_model = setup_stt()
        print("Loading NLP model...")
        self.tokenizer, self.nlp_model = setup_nlp()
        print("Loading TTS model...")
        self.tts_model = setup_tts()
        # GPU optimization: move the language model to the GPU when available
        # (Whisper already picks its own device at load time)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        self.nlp_model = self.nlp_model.to(self.device)
        print("System initialization complete!")
    def process_audio(self, audio_file):
        try:
            os.makedirs("tmp", exist_ok=True)
            print("Transcribing audio...")
            text = transcribe_audio(self.stt_model, audio_file)
            print("Generating response...")
            # Run generation under autocast for mixed-precision inference on GPU
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                response = generate_response(self.tokenizer, self.nlp_model, text)
            print("Converting response to speech...")
            output_path = os.path.join("tmp", "response.wav")
            audio_response = generate_speech(self.tts_model, response, output_path)
            return audio_response, text, response
        except Exception as e:
            print(f"Error during processing: {e}")
            return None, f"Error: {e}", "Error processing request"
# 5. Gradio UI Integration

def create_voice_ai_interface():
    system = VoiceAISystem()

    def chat(audio):
        if audio is None:
            return None, "No audio provided", "No response generated"
        return system.process_audio(audio)

    interface = gr.Interface(
        fn=chat,
        inputs=[
            gr.Audio(
                type="filepath",
                label="Speak here"
            )
        ],
        outputs=[
            gr.Audio(label="AI Response"),
            gr.Textbox(label="Transcribed Text"),
            gr.Textbox(label="AI Response Text")
        ],
        title="Voice AI System",
        description="Click to record your voice and interact with the AI"
    )
    return interface
# Launch the interface
if __name__ == "__main__":
    iface = create_voice_ai_interface()
    iface.launch(share=True)
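# Quick headless smoke test, kept commented out so the module still launches the
# UI by default ("sample.wav" is an illustrative path, not part of the original):
#
#     system = VoiceAISystem()
#     audio_out, transcript, reply = system.process_audio("sample.wav")
#     print(transcript, reply, audio_out)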