Redesign interface based on AC-Foley paper
Browse files
- Add comprehensive model description based on AC-Foley paper
- Implement mask_away_clip parameter for better control
- Redesign UI with clear sections: required/optional inputs
- Add four generation modes with detailed explanations
- Include advanced options in collapsible accordion
- Add usage guide with practical examples
- Improve layout with better information hierarchy
- Reference original paper: https://openreview.net/forum?id=URPXhnWdBF
app.py
CHANGED
|
@@ -30,14 +30,21 @@ EXAMPLE_PROMPTS = [
|
|
| 30 |
USAGE_TIPS = """
|
| 31 |
### 💡 使用技巧
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"""
|
| 42 |
|
| 43 |
# Check and install missing dependencies
|
|
@@ -209,7 +216,7 @@ class AudioFoleyModel:
|
|
| 209 |
|
| 210 |
def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
|
| 211 |
duration: float = 8.0, cfg_strength: float = 4.5,
|
| 212 |
-
seed: int = 42, reference_audio: str = None) -> Tuple[Optional[str], str]:
|
| 213 |
"""Generate audio from video and text prompt"""
|
| 214 |
try:
|
| 215 |
# Validation checks
|
|
@@ -262,7 +269,11 @@ class AudioFoleyModel:
|
|
| 262 |
return None, f"❌ Failed to load video: {str(e)}"
|
| 263 |
|
| 264 |
# Prepare frames
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
sync_frames = sync_frames.unsqueeze(0)
|
| 267 |
|
| 268 |
# Update model sequence configuration
|
|
@@ -378,7 +389,7 @@ def initialize_model():
|
|
| 378 |
else:
|
| 379 |
return "✅ 模型已加载"
|
| 380 |
|
| 381 |
-
def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength):
|
| 382 |
"""Interface function for generating audio"""
|
| 383 |
global audio_model, model_loading_status
|
| 384 |
|
|
@@ -391,7 +402,7 @@ def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_stren
|
|
| 391 |
negative_prompt = "" # Simplified interface
|
| 392 |
|
| 393 |
audio_path, message = audio_model.generate_audio(
|
| 394 |
-
video_file, prompt, negative_prompt, duration, cfg_strength, seed, audio_file
|
| 395 |
)
|
| 396 |
return audio_path, message
|
| 397 |
|
|
@@ -403,15 +414,18 @@ def get_model_status():
|
|
| 403 |
# Create Gradio interface
|
| 404 |
with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as demo:
|
| 405 |
gr.Markdown("""
|
| 406 |
-
# 🎵
|
| 407 |
|
| 408 |
-
|
|
|
|
| 409 |
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
|
|
|
|
|
|
| 413 |
|
| 414 |
-
|
| 415 |
""")
|
| 416 |
|
| 417 |
# Model status display - will be updated automatically
|
|
@@ -429,47 +443,81 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 429 |
)
|
| 430 |
|
| 431 |
with gr.Row():
|
| 432 |
-
with gr.Column():
|
|
|
|
|
|
|
| 433 |
video_input = gr.Video(
|
| 434 |
-
label="
|
| 435 |
-
format="mp4"
|
|
|
|
| 436 |
)
|
| 437 |
|
|
|
|
|
|
|
| 438 |
audio_input = gr.Audio(
|
| 439 |
-
label="参考音频
|
| 440 |
type="filepath",
|
| 441 |
-
sources=["upload"]
|
|
|
|
| 442 |
)
|
| 443 |
|
| 444 |
prompt_input = gr.Textbox(
|
| 445 |
-
label="
|
| 446 |
-
placeholder="
|
| 447 |
lines=2,
|
| 448 |
-
|
| 449 |
)
|
| 450 |
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
step=0.1,
|
| 465 |
-
label="CFG强度"
|
| 466 |
)
|
| 467 |
|
| 468 |
-
with gr.Column():
|
| 469 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
gr.Markdown("### 🎯 示例提示词")
|
| 471 |
example_buttons = []
|
| 472 |
-
for prompt in EXAMPLE_PROMPTS[:
|
| 473 |
btn = gr.Button(prompt, size="sm")
|
| 474 |
example_buttons.append(btn)
|
| 475 |
btn.click(
|
|
@@ -477,37 +525,54 @@ with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as d
|
|
| 477 |
outputs=prompt_input
|
| 478 |
)
|
| 479 |
|
| 480 |
-
|
|
|
|
| 481 |
|
|
|
|
|
|
|
| 482 |
audio_output = gr.Audio(
|
| 483 |
label="生成的音频",
|
| 484 |
type="filepath"
|
| 485 |
)
|
| 486 |
|
| 487 |
-
generation_status = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
|
|
|
| 489 |
generate_btn.click(
|
| 490 |
fn=generate_audio_interface,
|
| 491 |
inputs=[
|
| 492 |
-
video_input, audio_input, prompt_input,
|
|
|
|
| 493 |
],
|
| 494 |
outputs=[audio_output, generation_status]
|
| 495 |
)
|
| 496 |
|
| 497 |
-
with gr.Accordion("💡
|
| 498 |
gr.Markdown(USAGE_TIPS)
|
| 499 |
|
| 500 |
gr.Markdown("""
|
| 501 |
-
### 🎬
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
|
| 504 |
-
-
|
| 505 |
-
-
|
| 506 |
-
-
|
| 507 |
-
- "安静办公室里的键盘敲击声"
|
| 508 |
-
- "厨房里炒菜和切菜的声音"
|
| 509 |
-
- "雨滴打在金属屋顶上"
|
| 510 |
-
- "木地板上轻柔的脚步声"
|
| 511 |
""")
|
| 512 |
|
| 513 |
# Auto-initialize model on startup
|
|
|
|
| 30 |
USAGE_TIPS = """
|
| 31 |
### 💡 使用技巧
|
| 32 |
|
| 33 |
+
**基础设置:**
|
| 34 |
+
- **视频质量**: 使用清晰、光线良好的视频,建议1-15秒
|
| 35 |
+
- **参考音频**: 提供清晰的音频片段作为音色参考
|
| 36 |
+
- **CFG强度**: 1-8之间,数值越高越贴合描述
|
| 37 |
+
|
| 38 |
+
**高级功能:**
|
| 39 |
+
- **mask_away_clip**: 当视频内容与期望音频差异很大时启用
|
| 40 |
+
- **细粒度控制**: 使用参考音频实现精确的音色和风格控制
|
| 41 |
+
- **零样本生成**: 无需训练即可生成新颖的音效组合
|
| 42 |
+
|
| 43 |
+
**应用场景:**
|
| 44 |
+
- 影视后期配音
|
| 45 |
+
- 游戏音效制作
|
| 46 |
+
- 音乐创作辅助
|
| 47 |
+
- 声音设计实验
|
| 48 |
"""
|
| 49 |
|
| 50 |
# Check and install missing dependencies
|
|
|
|
| 216 |
|
| 217 |
def generate_audio(self, video_file, prompt: str, negative_prompt: str = "",
|
| 218 |
duration: float = 8.0, cfg_strength: float = 4.5,
|
| 219 |
+
seed: int = 42, reference_audio: str = None, mask_away_clip: bool = False) -> Tuple[Optional[str], str]:
|
| 220 |
"""Generate audio from video and text prompt"""
|
| 221 |
try:
|
| 222 |
# Validation checks
|
|
|
|
| 269 |
return None, f"❌ Failed to load video: {str(e)}"
|
| 270 |
|
| 271 |
# Prepare frames
|
| 272 |
+
if mask_away_clip:
|
| 273 |
+
clip_frames = None # Mask away clip frames when video and audio don't match well
|
| 274 |
+
log.info("🎭 Using mask_away_clip: ignoring visual features")
|
| 275 |
+
else:
|
| 276 |
+
clip_frames = clip_frames.unsqueeze(0) if clip_frames is not None else None
|
| 277 |
sync_frames = sync_frames.unsqueeze(0)
|
| 278 |
|
| 279 |
# Update model sequence configuration
|
|
|
|
| 389 |
else:
|
| 390 |
return "✅ 模型已加载"
|
| 391 |
|
| 392 |
+
def generate_audio_interface(video_file, audio_file, prompt, duration, cfg_strength, mask_away_clip):
|
| 393 |
"""Interface function for generating audio"""
|
| 394 |
global audio_model, model_loading_status
|
| 395 |
|
|
|
|
| 402 |
negative_prompt = "" # Simplified interface
|
| 403 |
|
| 404 |
audio_path, message = audio_model.generate_audio(
|
| 405 |
+
video_file, prompt, negative_prompt, duration, cfg_strength, seed, audio_file, mask_away_clip
|
| 406 |
)
|
| 407 |
return audio_path, message
|
| 408 |
|
|
|
|
| 414 |
# Create Gradio interface
|
| 415 |
with gr.Blocks(title="hf_AC Audio Foley Generator", theme=gr.themes.Soft()) as demo:
|
| 416 |
gr.Markdown("""
|
| 417 |
+
# 🎵 AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis
|
| 418 |
|
| 419 |
+
## 📖 模型简介
|
| 420 |
+
AC-Foley是一个基于参考音频引导的视频到音频合成模型,能够实现精确的细粒度声音合成。与传统依赖文本描述的方法不同,AC-Foley直接利用参考音频来实现对生成声音的精确控制,解决了文本描述在微观声学特征方面的模糊性问题。
|
| 421 |
|
| 422 |
+
## ✨ 功能要点
|
| 423 |
+
- **细粒度声音合成**: 生成具有特定音色的脚步声(木板、大理石、砾石等)
|
| 424 |
+
- **音色转换**: 将小提琴的旋律转换为唢呐的明亮刺耳音色
|
| 425 |
+
- **零样本生成**: 创建独特的音效而无需专门训练
|
| 426 |
+
- **视觉-音频对齐**: 根据视频内容自动生成匹配的音频
|
| 427 |
|
| 428 |
+
*基于论文: [AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer](https://openreview.net/forum?id=URPXhnWdBF)*
|
| 429 |
""")
|
| 430 |
|
| 431 |
# Model status display - will be updated automatically
|
|
|
|
| 443 |
)
|
| 444 |
|
| 445 |
with gr.Row():
|
| 446 |
+
with gr.Column(scale=2):
|
| 447 |
+
# 必需输入
|
| 448 |
+
gr.Markdown("### 📹 必需输入")
|
| 449 |
video_input = gr.Video(
|
| 450 |
+
label="视频文件",
|
| 451 |
+
format="mp4",
|
| 452 |
+
info="上传需要生成音频的视频文件"
|
| 453 |
)
|
| 454 |
|
| 455 |
+
# 可选输入
|
| 456 |
+
gr.Markdown("### 🎛️ 可选输入")
|
| 457 |
audio_input = gr.Audio(
|
| 458 |
+
label="参考音频",
|
| 459 |
type="filepath",
|
| 460 |
+
sources=["upload"],
|
| 461 |
+
info="提供音色、风格、节奏参考(支持细粒度控制)"
|
| 462 |
)
|
| 463 |
|
| 464 |
prompt_input = gr.Textbox(
|
| 465 |
+
label="文本提示",
|
| 466 |
+
placeholder="例如: '脚步声', '金属碰撞声', '鸟叫声'",
|
| 467 |
lines=2,
|
| 468 |
+
info="描述想要的音频类型(留空则根据视频自动生成)"
|
| 469 |
)
|
| 470 |
|
| 471 |
+
# 高级选项
|
| 472 |
+
with gr.Accordion("🔧 高级选项", open=False):
|
| 473 |
+
with gr.Row():
|
| 474 |
+
duration_slider = gr.Slider(
|
| 475 |
+
minimum=1.0,
|
| 476 |
+
maximum=15.0,
|
| 477 |
+
value=8.0,
|
| 478 |
+
step=0.5,
|
| 479 |
+
label="时长 (秒)"
|
| 480 |
+
)
|
| 481 |
+
|
| 482 |
+
cfg_strength_slider = gr.Slider(
|
| 483 |
+
minimum=1.0,
|
| 484 |
+
maximum=8.0,
|
| 485 |
+
value=4.5,
|
| 486 |
+
step=0.1,
|
| 487 |
+
label="CFG强度"
|
| 488 |
+
)
|
| 489 |
|
| 490 |
+
mask_away_clip = gr.Checkbox(
|
| 491 |
+
label="忽略视觉特征 (mask_away_clip)",
|
| 492 |
+
value=False,
|
| 493 |
+
info="当视频和参考音频差异较大且生成效果不佳时启用"
|
|
|
|
|
|
|
| 494 |
)
|
| 495 |
|
| 496 |
+
with gr.Column(scale=1):
|
| 497 |
+
# 使用指南
|
| 498 |
+
gr.Markdown("### 📋 使用指南")
|
| 499 |
+
gr.Markdown("""
|
| 500 |
+
**四种生成模式:**
|
| 501 |
+
|
| 502 |
+
1️⃣ **纯视频**: 仅上传视频
|
| 503 |
+
- 根据视觉内容自动生成音频
|
| 504 |
+
|
| 505 |
+
2️⃣ **视频+参考音频**: 上传视频+音频
|
| 506 |
+
- 使用参考音频的音色和风格
|
| 507 |
+
- 实现细粒度音色控制
|
| 508 |
+
|
| 509 |
+
3️⃣ **视频+文本**: 上传视频+文本
|
| 510 |
+
- 根据文本描述生成指定类型音频
|
| 511 |
+
|
| 512 |
+
4️⃣ **完整模式**: 视频+音频+文本
|
| 513 |
+
- 最精确的控制方式
|
| 514 |
+
- 结合视觉、音色和语义指导
|
| 515 |
+
""")
|
| 516 |
+
|
| 517 |
+
# 示例提示词
|
| 518 |
gr.Markdown("### 🎯 示例提示词")
|
| 519 |
example_buttons = []
|
| 520 |
+
for prompt in EXAMPLE_PROMPTS[:4]:
|
| 521 |
btn = gr.Button(prompt, size="sm")
|
| 522 |
example_buttons.append(btn)
|
| 523 |
btn.click(
|
|
|
|
| 525 |
outputs=prompt_input
|
| 526 |
)
|
| 527 |
|
| 528 |
+
# 生成按钮
|
| 529 |
+
generate_btn = gr.Button("🎵 开始生成音频", variant="primary", size="lg")
|
| 530 |
|
| 531 |
+
# 输出区域
|
| 532 |
+
gr.Markdown("### 🎧 生成结果")
|
| 533 |
audio_output = gr.Audio(
|
| 534 |
label="生成的音频",
|
| 535 |
type="filepath"
|
| 536 |
)
|
| 537 |
|
| 538 |
+
generation_status = gr.Textbox(
|
| 539 |
+
label="生成状态",
|
| 540 |
+
interactive=False,
|
| 541 |
+
lines=2
|
| 542 |
+
)
|
| 543 |
|
| 544 |
+
# 绑定生成事件
|
| 545 |
generate_btn.click(
|
| 546 |
fn=generate_audio_interface,
|
| 547 |
inputs=[
|
| 548 |
+
video_input, audio_input, prompt_input,
|
| 549 |
+
duration_slider, cfg_strength_slider, mask_away_clip
|
| 550 |
],
|
| 551 |
outputs=[audio_output, generation_status]
|
| 552 |
)
|
| 553 |
|
| 554 |
+
with gr.Accordion("💡 详细说明", open=False):
|
| 555 |
gr.Markdown(USAGE_TIPS)
|
| 556 |
|
| 557 |
gr.Markdown("""
|
| 558 |
+
### 🎬 应用示例
|
| 559 |
+
|
| 560 |
+
**细粒度声音合成:**
|
| 561 |
+
- "木地板上的脚步声" + 参考音频 → 特定音色的脚步声
|
| 562 |
+
- "金属碰撞" + 不同参考音频 → 铁器vs铜器的区别
|
| 563 |
+
|
| 564 |
+
**音色转换:**
|
| 565 |
+
- 钢琴旋律视频 + 小提琴参考音频 → 小提琴演奏同样旋律
|
| 566 |
+
- 人声哼唱 + 乐器参考 → 乐器演奏版本
|
| 567 |
+
|
| 568 |
+
**创意音效:**
|
| 569 |
+
- 科幻场景视频 + 现实音效参考 → 独特的科幻音效
|
| 570 |
+
- 动画视频 + 真实音效 → 卡通与现实结合的音效
|
| 571 |
|
| 572 |
+
### 📚 技术细节
|
| 573 |
+
- 模型基于扩散模型和音频条件机制
|
| 574 |
+
- 支持44.1kHz高质量音频生成
|
| 575 |
+
- 实现了视觉-音频-文本的多模态对齐
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
""")
|
| 577 |
|
| 578 |
# Auto-initialize model on startup
|