FFomy committed on
Commit
cb8606e
·
verified ·
1 Parent(s): 75467e4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +673 -0
  2. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# from huggingface_hub import snapshot_download
from modelscope.hub.snapshot_download import snapshot_download


# 1. Local paths and remote repository IDs for the two ASR models.
FUN_ASR_NANO_LOCAL_PATH = "./Fun-ASR/model"
FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
SENSE_VOICE_SMALL_LOCAL_PATH = "./Fun-ASR/model/SenseVoiceSmall"
# SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
# REPO_TYPE = "hf"  # "hf" for Hugging Face, "ms" for ModelScope
SENSE_VOICE_SMALL_REPO_ID = "iic/SenseVoiceSmall"
REPO_TYPE = "ms"


def _ensure_model_downloaded(repo_id, local_path, login=False):
    """Download *repo_id* into *local_path* unless the directory already exists.

    Args:
        repo_id (str): ModelScope repository ID.
        local_path (str): Target directory for the model snapshot.
        login (bool): When True, authenticate with the MODELSCOPE_TOKEN
            environment variable first (needed for gated repositories).
    """
    # 2. Check whether the model exists locally; download it if not.
    if os.path.exists(local_path):
        print("检测到本地模型文件,跳过下载。")
        return
    if login:
        token = os.getenv("MODELSCOPE_TOKEN")
        # Only attempt the login when a token is actually configured;
        # calling HubApi.login(None) would fail outright.
        if token:
            from modelscope import HubApi
            HubApi().login(token)
    print(f"正在下载模型 {repo_id} 到 {local_path} ...")
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_path,
        ignore_patterns=["*.onnx"],  # skip ONNX weights to save time and space
    )
    print("模型下载完毕!")


_ensure_model_downloaded(FUN_ASR_NANO_REPO_ID, FUN_ASR_NANO_LOCAL_PATH, login=True)
_ensure_model_downloaded(SENSE_VOICE_SMALL_REPO_ID, SENSE_VOICE_SMALL_LOCAL_PATH)
41
+
42
+
43
+
44
+
45
+ import gradio as gr
46
+ import time
47
+ import sys
48
+ import io
49
+ import tempfile
50
+ import subprocess
51
+ import requests
52
+ from urllib.parse import urlparse
53
+ from pydub import AudioSegment
54
+ import logging
55
+ import torch
56
+ import importlib
57
+ from funasr import AutoModel
58
+ from funasr.utils.postprocess_utils import rich_transcription_postprocess
59
+
60
# Model configurations for Hugging Face deployment.
# Each list holds the candidate sources for one pipeline: a local checkout
# bundled with this Space, a Hugging Face repo, and a ModelScope repo.
# Which remote source is actually usable depends on REPO_TYPE above.
FUN_ASR_NANO_MODEL_PATH_LIST = [
    "Fun-ASR/model",  # local path, ms
    "FunAudioLLM/fun-asr-nano",  # huggingface model repo, hf
    "FunAudioLLM/fun-asr-nano"  # ModelScope model repo, ms
]

SENSEVOICE_MODEL_PATH_LIST = [
    "Fun-ASR/model/SenseVoiceSmall",  # local path together with this hf space
    "FunAudioLLM/SenseVoiceSmall",  # huggingface model repo
    "iic/SenseVoiceSmall"  # ModelScope model repo
]
72
+
73
class LogCapture(io.StringIO):
    """An in-memory text stream that mirrors every write to a callback."""

    def __init__(self, callback):
        """Store *callback*, which is invoked with each chunk written."""
        super().__init__()
        self.callback = callback

    def write(self, s):
        # Keep the buffered copy first, then forward the chunk to the observer.
        super().write(s)
        self.callback(s)
81
+
82
# Set up logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Check for CUDA availability and pick the inference device accordingly.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")
93
+
94
def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
    """
    Fetch audio from *url*, dispatching on the URL's scheme/host.

    Args:
        url (str): The URL of the audio.
        method_choice (str): The method to use for downloading audio.
        proxy_url (str): Proxy URL if needed.
        proxy_username (str): Proxy username.
        proxy_password (str): Proxy password.

    Returns:
        tuple: (path to the downloaded audio file, is_temp_file), or (None, False) if failed.
    """
    parsed = urlparse(url)
    logging.info(f"Downloading audio from URL: {url} using method: {method_choice}")
    try:
        # YouTube pages are not direct media files; refuse them up front.
        if 'youtube.com' in parsed.netloc or 'youtu.be' in parsed.netloc:
            logging.error("YouTube download is not supported. Please use direct audio URLs instead.")
            return None, False

        if parsed.scheme == 'rtsp':
            audio_file = download_rtsp_audio(url, proxy_url)
            failure_msg = f"Failed to download RTSP audio from {url}"
        else:
            audio_file = download_direct_audio(url, method_choice, proxy_url, proxy_username, proxy_password)
            failure_msg = f"Failed to download audio from {url} using method {method_choice}"

        if not audio_file:
            logging.error(failure_msg)
            return None, False
        # Downloads always land in a temporary file the caller must clean up.
        return audio_file, True
    except Exception as e:
        logging.error(f"Error downloading audio from {url} using method {method_choice}: {str(e)}")
        return None, False
132
+
133
+
134
+
135
+
136
def download_rtsp_audio(url, proxy_url):
    """
    Downloads audio from an RTSP URL using FFmpeg.

    Args:
        url (str): The RTSP URL.
        proxy_url (str): Proxy URL if needed (exported as http(s)_proxy).

    Returns:
        str: Path to the downloaded audio file, or None if failed.
    """
    logging.info("Using FFmpeg to download RTSP stream")
    # tempfile.mktemp() is deprecated and race-prone; create the file securely.
    # ffmpeg already has '-y' below, so it will overwrite the empty placeholder.
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
        output_file = tmp.name
    command = ['ffmpeg', '-i', url, '-acodec', 'libmp3lame', '-ab', '192k', '-y', output_file]
    env = os.environ.copy()
    if proxy_url and len(proxy_url.strip()) > 0:
        env['http_proxy'] = proxy_url
        env['https_proxy'] = proxy_url
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
        logging.info(f"Downloaded RTSP audio to: {output_file}")
        return output_file
    except subprocess.CalledProcessError as e:
        logging.error(f"FFmpeg error: {e.stderr.decode()}")
    except Exception as e:
        logging.error(f"Error downloading RTSP audio: {str(e)}")
    # On failure, don't leave an empty/partial temp file behind.
    if os.path.exists(output_file):
        os.remove(output_file)
    return None
164
+
165
def download_direct_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
    """
    Downloads audio from a direct URL via one of the registered downloaders.

    Args:
        url (str): The direct URL of the audio file.
        method_choice (str): The method to use for downloading.
        proxy_url (str): Proxy URL if needed.
        proxy_username (str): Proxy username.
        proxy_password (str): Proxy password.

    Returns:
        str: Path to the downloaded audio file, or None if failed.
    """
    logging.info(f"Downloading direct audio from: {url} using method: {method_choice}")
    dispatch = {
        'wget': wget_method,
        'requests': requests_method,
        'ffmpeg': ffmpeg_method,
        'aria2': aria2_method,
    }
    # Unknown method names silently fall back to the requests downloader.
    downloader = dispatch.get(method_choice, requests_method)
    try:
        path = downloader(url, proxy_url, proxy_username, proxy_password)
        if path and os.path.exists(path):
            return path
        logging.error(f"Failed to download direct audio from {url} using method {method_choice}")
        return None
    except Exception as e:
        logging.error(f"Error downloading direct audio with {method_choice}: {str(e)}")
        return None
197
+
198
def requests_method(url, proxy_url, proxy_username, proxy_password):
    """
    Downloads audio using the requests library.

    Args:
        url (str): The URL of the audio file.
        proxy_url (str): Proxy URL if needed.
        proxy_username (str): Proxy username.
        proxy_password (str): Proxy password.

    Returns:
        str: Path to the downloaded audio file, or None if failed.
    """
    try:
        proxies = None
        auth = None
        if proxy_url and len(proxy_url.strip()) > 0:
            proxies = {
                "http": proxy_url,
                "https": proxy_url
            }
            if proxy_username and proxy_password:
                auth = (proxy_username, proxy_password)
        # A timeout keeps the UI from hanging forever on a stalled server
        # (requests has no default timeout).
        response = requests.get(url, stream=True, proxies=proxies, auth=auth, timeout=30)
        if response.status_code == 200:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
                # Stream the body in chunks so large files never sit in memory.
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        temp_file.write(chunk)
            logging.info(f"Downloaded direct audio to: {temp_file.name}")
            return temp_file.name
        else:
            logging.error(f"Failed to download audio from {url} with status code {response.status_code}")
            return None
    except Exception as e:
        logging.error(f"Error in requests_method: {str(e)}")
        return None
235
+
236
def wget_method(url, proxy_url, proxy_username, proxy_password):
    """
    Downloads audio using the wget command-line tool.

    Args:
        url (str): The URL of the audio file.
        proxy_url (str): Proxy URL if needed (exported as http(s)_proxy).
        proxy_username (str): Proxy username (currently unused by this method).
        proxy_password (str): Proxy password (currently unused by this method).

    Returns:
        str: Path to the downloaded audio file, or None if failed.
    """
    logging.info("Using wget method")
    # tempfile.mktemp() is deprecated and race-prone; create the file securely.
    # wget -O happily overwrites the pre-created placeholder.
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
        output_file = tmp.name
    command = ['wget', '-O', output_file, url]
    env = os.environ.copy()
    if proxy_url and len(proxy_url.strip()) > 0:
        env['http_proxy'] = proxy_url
        env['https_proxy'] = proxy_url
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
        logging.info(f"Downloaded audio to: {output_file}")
        return output_file
    except subprocess.CalledProcessError as e:
        logging.error(f"Wget error: {e.stderr.decode()}")
    except Exception as e:
        logging.error(f"Error in wget_method: {str(e)}")
    # On failure, don't leave an empty/partial temp file behind.
    if os.path.exists(output_file):
        os.remove(output_file)
    return None
266
+
267
+
268
def ffmpeg_method(url, proxy_url, proxy_username, proxy_password):
    """
    Downloads audio using FFmpeg, transcoding it to MP3.

    Args:
        url (str): The URL of the audio file.
        proxy_url (str): Proxy URL if needed (exported as http(s)_proxy).
        proxy_username (str): Proxy username (currently unused by this method).
        proxy_password (str): Proxy password (currently unused by this method).

    Returns:
        str: Path to the downloaded audio file, or None if failed.
    """
    logging.info("Using ffmpeg method")
    # tempfile.mktemp() is deprecated and race-prone; create the file securely.
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
        output_file = tmp.name
    # '-y' is required: the output file now already exists, and without it
    # ffmpeg would stop and prompt for confirmation (hanging the server).
    command = ['ffmpeg', '-y', '-i', url, '-vn', '-acodec', 'libmp3lame', '-q:a', '2', output_file]
    env = os.environ.copy()
    if proxy_url and len(proxy_url.strip()) > 0:
        env['http_proxy'] = proxy_url
        env['https_proxy'] = proxy_url
    try:
        subprocess.run(command, check=True, capture_output=True, text=True, env=env)
        logging.info(f"Downloaded and converted audio to: {output_file}")
        return output_file
    except subprocess.CalledProcessError as e:
        logging.error(f"FFmpeg error: {e.stderr}")
    except Exception as e:
        logging.error(f"Error in ffmpeg_method: {str(e)}")
    # On failure, don't leave an empty/partial temp file behind.
    if os.path.exists(output_file):
        os.remove(output_file)
    return None
298
+
299
def aria2_method(url, proxy_url, proxy_username, proxy_password):
    """
    Downloads audio using aria2.

    Args:
        url (str): The URL of the audio file.
        proxy_url (str): Proxy URL if needed (passed via --all-proxy).
        proxy_username (str): Proxy username (currently unused by this method).
        proxy_password (str): Proxy password (currently unused by this method).

    Returns:
        str: Path to the downloaded audio file, or None if failed.
    """
    logging.info("Using aria2 method")
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp().
    fd, output_file = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    # aria2c's --out must be a bare filename relative to --dir; passing the
    # absolute temp path as --out (as before) makes aria2c reject the option.
    # --allow-overwrite is needed because mkstemp pre-created the file.
    command = [
        'aria2c', '--split=4', '--max-connection-per-server=4',
        '--allow-overwrite=true',
        '--dir', os.path.dirname(output_file),
        '--out', os.path.basename(output_file),
        url,
    ]
    if proxy_url and len(proxy_url.strip()) > 0:
        command.extend(['--all-proxy', proxy_url])
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        logging.info(f"Downloaded audio to: {output_file}")
        return output_file
    except subprocess.CalledProcessError as e:
        logging.error(f"Aria2 error: {e.stderr}")
    except Exception as e:
        logging.error(f"Error in aria2_method: {str(e)}")
    # On failure, don't leave an empty/partial temp file behind.
    if os.path.exists(output_file):
        os.remove(output_file)
    return None
327
+
328
def trim_audio(audio_path, start_time, end_time):
    """
    Cut the [start_time, end_time] window (in seconds) out of an audio file.

    Args:
        audio_path (str): Path to the audio file.
        start_time (float): Start time in seconds (None means the beginning).
        end_time (float): End time in seconds (None means the end of the clip).

    Returns:
        str: Path to a temporary WAV file holding the trimmed excerpt.

    Raises:
        gr.Error: If the time window is invalid or trimming fails.
    """
    try:
        logging.info(f"Trimming audio from {start_time} to {end_time}")
        segment = AudioSegment.from_file(audio_path)
        total_seconds = len(segment) / 1000  # pydub lengths are in milliseconds

        # Clamp missing or out-of-range bounds to the clip's duration.
        begin = max(0, start_time) if start_time is not None else 0
        finish = min(total_seconds, end_time) if end_time is not None else total_seconds

        if begin >= finish:
            raise gr.Error("End time must be greater than start time.")

        excerpt = segment[int(begin * 1000):int(finish * 1000)]
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as out_file:
            excerpt.export(out_file.name, format="wav")
            logging.info(f"Trimmed audio saved to: {out_file.name}")
            return out_file.name
    except Exception as e:
        # Any failure (including the gr.Error above) is re-raised with context.
        logging.error(f"Error trimming audio: {str(e)}")
        raise gr.Error(f"Error trimming audio: {str(e)}")
364
+
365
def save_transcription(transcription):
    """
    Write the transcription text to a temporary UTF-8 text file.

    Args:
        transcription (str): The transcription text.

    Returns:
        str: The path to the transcription file.
    """
    handle = tempfile.NamedTemporaryFile(delete=False, suffix='.txt', mode='w', encoding='utf-8')
    with handle as out:
        out.write(transcription)
    logging.info(f"Transcription saved to: {handle.name}")
    return handle.name
379
+
380
def get_model_options(pipeline_type):
    """
    Map a pipeline type to the list of model IDs it can run.

    Args:
        pipeline_type (str): The type of pipeline ("fun-asr-nano" or "sensevoice").

    Returns:
        list: Model IDs for that pipeline, or an empty list for unknown types.
    """
    if pipeline_type == "fun-asr-nano":
        return FUN_ASR_NANO_MODEL_PATH_LIST
    if pipeline_type == "sensevoice":
        return SENSEVOICE_MODEL_PATH_LIST
    return []
400
+
401
# Cache of loaded AutoModel instances keyed by (pipeline_type, model_id),
# so repeated transcriptions reuse a model instead of reloading it.
loaded_models = {}
403
+
404
def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, download_method, start_time=None, end_time=None, verbose=False):
    """
    Transcribes audio from a given source using Fun-ASR-Nano or SenseVoice.

    This is a generator: when *verbose* is set it yields intermediate
    progress tuples, and it always ends with either a final
    (metrics, transcription, transcription_file) tuple or an error tuple.

    Args:
        audio_input (str): Path to uploaded audio file or recorded audio.
        audio_url (str): URL of audio (used only when audio_input is empty).
        proxy_url (str): Proxy URL if needed.
        proxy_username (str): Proxy username.
        proxy_password (str): Proxy password.
        pipeline_type (str): Type of pipeline to use ('fun-asr-nano' or 'sensevoice').
        model_id (str): The ID of the model to use.
        download_method (str): Method to use for downloading audio.
        start_time (float, optional): Start time in seconds for trimming audio.
        end_time (float, optional): End time in seconds for trimming audio.
        verbose (bool, optional): Whether to output verbose logging.

    Yields:
        Tuple[str, str, str or None]: Metrics and messages, transcription text, path to transcription file.
    """
    try:
        # Verbose mode raises the root logger level so per-step INFO lines appear.
        if verbose:
            logging.getLogger().setLevel(logging.INFO)
        else:
            logging.getLogger().setLevel(logging.WARNING)

        logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, download_method={download_method}")
        verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nDownload Method: {download_method}\n"

        if verbose:
            yield verbose_messages, "", None

        # Determine the audio source: an uploaded/recorded file wins over a URL.
        audio_path = None
        is_temp_file = False

        if audio_input is not None and len(audio_input) > 0:
            # audio_input is a filepath to uploaded or recorded audio
            audio_path = audio_input
            is_temp_file = False
        elif audio_url is not None and len(audio_url.strip()) > 0:
            # audio_url is provided
            audio_path, is_temp_file = download_audio(audio_url, download_method, proxy_url, proxy_username, proxy_password)
            if not audio_path:
                error_msg = f"Error downloading audio from {audio_url} using method {download_method}. Check logs for details."
                logging.error(error_msg)
                yield verbose_messages + error_msg, "", None
                return
            else:
                verbose_messages += f"Successfully downloaded audio from {audio_url}\n"
                if verbose:
                    yield verbose_messages, "", None
        else:
            error_msg = "No audio source provided. Please upload an audio file, record audio, or enter a URL."
            logging.error(error_msg)
            yield verbose_messages + error_msg, "", None
            return

        # Convert start_time and end_time to float or None.
        # NOTE(review): a value of 0 is falsy and is therefore treated as "not set".
        start_time = float(start_time) if start_time else None
        end_time = float(end_time) if end_time else None

        if start_time is not None or end_time is not None:
            audio_path = trim_audio(audio_path, start_time, end_time)
            is_temp_file = True  # The trimmed audio is a temporary file
            verbose_messages += f"Audio trimmed from {start_time} to {end_time}\n"
            if verbose:
                yield verbose_messages, "", None

        # Model caching: reuse a previously constructed AutoModel when possible.
        model_key = (pipeline_type, model_id)
        if model_key in loaded_models:
            model = loaded_models[model_key]
            logging.info("Loaded model from cache")
        else:
            if pipeline_type == "fun-asr-nano":
                model = AutoModel(
                    model=model_id,
                    trust_remote_code=True,
                    remote_code=f"./Fun-ASR/model.py",
                    vad_model="fsmn-vad",
                    vad_kwargs={"max_single_segment_time": 30000},
                    device=device,
                    disable_update=True,
                    hub=REPO_TYPE,
                )
            elif pipeline_type == "sensevoice":
                model = AutoModel(
                    model=model_id,
                    trust_remote_code=False,
                    vad_model="fsmn-vad",
                    vad_kwargs={"max_single_segment_time": 30000},
                    device=device,
                    disable_update=True,
                    hub=REPO_TYPE,
                )
            else:
                # NOTE(review): message is stale — 'fun-asr-nano' is also supported.
                error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
                logging.error(error_msg)
                yield verbose_messages + error_msg, "", None
                return
            loaded_models[model_key] = model

        # Perform the transcription
        start_time_perf = time.time()

        if pipeline_type == "fun-asr-nano":
            # NOTE(review): the chat-style prompt below is built but never
            # passed to model.generate() — presumably leftover code; confirm.
            system_prompt = "You are a helpful assistant."
            user_prompt = f"语音转写:<|startofspeech|>!{audio_path}<|endofspeech|>"
            contents_i = []
            contents_i.append({"role": "system", "content": system_prompt})
            contents_i.append({"role": "user", "content": user_prompt})
            contents_i.append({"role": "assistant", "content": "null"})
            print(audio_path)
            res = model.generate(
                input=[audio_path],
                use_itn=True,
                batch_size=1,
            )
        elif pipeline_type == "sensevoice":
            res = model.generate(
                input=audio_path,
                cache={},
                language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
                use_itn=True,
                batch_size_s=60,
                merge_vad=True,
                merge_length_s=15,
            )

        transcription = rich_transcription_postprocess(res[0]["text"])
        end_time_perf = time.time()

        # Calculate metrics
        transcription_time = end_time_perf - start_time_perf
        audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)

        metrics_output = (
            f"Transcription time: {transcription_time:.2f} seconds\n"
            f"Audio file size: {audio_file_size:.2f} MB\n"
        )

        # Save the transcription to a file
        transcription_file = save_transcription(transcription)

        # Always yield the final result, regardless of verbose setting
        final_metrics = verbose_messages + metrics_output
        yield final_metrics, transcription, transcription_file

    except Exception as e:
        error_msg = f"An error occurred during transcription: {str(e)}"
        logging.error(error_msg)
        yield verbose_messages + error_msg, "", None

    finally:
        # Clean up temporary audio files (downloaded or trimmed) on every exit path.
        if audio_path and is_temp_file and os.path.exists(audio_path):
            os.remove(audio_path)
562
+
563
+
564
# Gradio UI: wires the inputs (file/URL, proxy, pipeline/model choice,
# trim window) to the transcribe_audio generator and displays its output.
with gr.Blocks() as iface:
    gr.Markdown("# Audio Transcription")
    gr.Markdown("Transcribe audio using SenseVoice model with multilingual support.")

    # Primary inputs: an uploaded/recorded file, or a direct audio URL.
    with gr.Row():
        audio_input = gr.Audio(label="Upload or Record Audio", sources=["upload", "microphone"], type="filepath")
        audio_url = gr.Textbox(label="Or Enter URL of audio file (direct link only, no YouTube)")

    transcribe_button = gr.Button("Transcribe")

    with gr.Accordion("Advanced Options", open=False):
        # Optional proxy settings used only for URL downloads.
        with gr.Row():
            proxy_url = gr.Textbox(label="Proxy URL", placeholder="Enter proxy URL if needed", value="", lines=1)
            proxy_username = gr.Textbox(label="Proxy Username", placeholder="Proxy username (optional)", value="", lines=1)
            proxy_password = gr.Textbox(label="Proxy Password", placeholder="Proxy password (optional)", value="", lines=1, type="password")

        with gr.Row():
            pipeline_type = gr.Dropdown(
                choices=["sensevoice","fun-asr-nano"],
                label="Pipeline Type",
                value="fun-asr-nano"
            )
            model_id = gr.Dropdown(
                label="Model",
                choices=get_model_options("fun-asr-nano"),
                value=FUN_ASR_NANO_MODEL_PATH_LIST[0]  # Default to official Local Model
            )
        with gr.Row():
            download_method = gr.Dropdown(
                choices=["requests", "ffmpeg", "aria2", "wget"],
                label="Download Method",
                value="requests"
            )

        # Optional trim window and verbosity toggle.
        with gr.Row():
            start_time = gr.Number(label="Start Time (seconds)", value=None, minimum=0)
            end_time = gr.Number(label="End Time (seconds)", value=None, minimum=0)
            verbose = gr.Checkbox(label="Verbose Output", value=False)

    # Outputs: progress/metrics text, the transcription itself, and a download link.
    with gr.Row():
        metrics_output = gr.Textbox(label="Transcription Metrics and Verbose Messages", lines=10)
        transcription_output = gr.Textbox(label="Transcription", lines=10)
        transcription_file = gr.File(label="Download Transcription")

    def update_model_dropdown(pipeline_type):
        """
        Updates the model dropdown choices based on the selected pipeline type.

        Args:
            pipeline_type (str): The selected pipeline type.

        Returns:
            gr.update: Updated model dropdown component.
        """
        try:
            model_choices = get_model_options(pipeline_type)
            logging.info(f"Model choices for {pipeline_type}: {model_choices}")
            if model_choices:
                return gr.update(choices=model_choices, value=model_choices[0], visible=True)
            else:
                return gr.update(choices=["No models available"], value=None, visible=False)
        except Exception as e:
            logging.error(f"Error in update_model_dropdown: {str(e)}")
            return gr.update(choices=["Error"], value="Error", visible=True)

    # Event handler for pipeline_type change
    pipeline_type.change(update_model_dropdown, inputs=[pipeline_type], outputs=[model_id])

    def transcribe_with_progress(*args):
        # The audio_input is now the first argument; forward every yielded
        # progress tuple from the transcribe_audio generator to the UI.
        for result in transcribe_audio(*args):
            yield result

    transcribe_button.click(
        transcribe_with_progress,
        inputs=[audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, download_method, start_time, end_time, verbose],
        outputs=[metrics_output, transcription_output, transcription_file]
    )

    # Note: For examples, users should use local audio files or upload their own files
    # Examples with specific paths may not work for all users

    gr.Markdown(f"""
    ### Usage Examples:
    1. **Upload Audio**: Click the "Upload or Record Audio" button to select your audio file
    2. **Select Pipeline Type**: Choose from available pipelines:
    - **Fun-ASR-Nano** (default) - Large language model based ASR model
    - **SenseVoice** - CTC-based based ASR model with VAD

    3. **Available Model Options**:

    **For Fun-ASR-Nano:**
    - `Fun-ASR/model` (local path, default)
    - `FunAudioLLM/fun-asr-nano` (HuggingFace)
    - `FunAudioLLM/fun-asr-nano` (ModelScope)

    **For SenseVoice:**
    - `Fun-ASR/model/SenseVoiceSmall` (local path, default for this pipeline)
    - `FunAudioLLM/SenseVoiceSmall` (HuggingFace)
    - `iic/SenseVoiceSmall` (ModelScope)

    4. **Local Testing**: For development, you can use local paths as shown above

    Supported languages:
    - Fun-ASR-Nano: more than 50 languages and Chinese dialects.
    - SenseVoiceSmall:Chinese (zh), English (en), Cantonese (yue), Japanese (ja), Korean (ko).
    """)

# queue() enables generator streaming; debug=True keeps the process attached.
iface.queue().launch(share=False, debug=True)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ requests
3
+ ffmpeg-python
4
+ pydub
5
+ torch
6
+ transformers
7
+ funasr>=1.1.3
8
+ torchaudio
9
+ modelscope
10
+ huggingface_hub
11
+ pydantic>=2.12.4
12
+ python-dotenv