Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import json | |
| import requests | |
| import random | |
# Display names for the two classes, in confusion-matrix order
# (index 0 = real, index 1 = cloned). The emoji were mis-encoded in the
# source text and are restored here.
labels = ["Real Audio 🗣️", "Cloned Audio 🤖"]

# Passed as ``duration=`` to the gr.Info / gr.Warning toasts shown after a guess.
DURATION = 2

# Markdown rendered at the top of the page.
MD_DESCRIPTION = """
# Clone Guesser
Try to guess if the voice is real 🗣️ or cloned 🤖.
Each cloned sample was generated from a real voice recording, so the spoken text alone won't help identify if it's real or cloned.
For more information about the data and models used, see the [dataset page](https://huggingface.co/datasets/jerpint/vox-cloned-data)
"""
def get_accuracy(score_matrix) -> str:
    """Format the share of diagonal entries of a 2x2 score matrix as a percentage.

    Args:
        score_matrix: Two rows of counts; the entries at ``[0][0]`` and
            ``[1][1]`` are treated as the correct guesses.

    Returns:
        The accuracy with two decimals followed by ``%``, or an empty string
        when the matrix holds no counts yet.
    """
    diagonal_hits = score_matrix[0][0] + score_matrix[1][1]
    guess_count = sum(score_matrix[0]) + sum(score_matrix[1])
    if not guess_count:
        # Nothing scored yet: avoid dividing by zero and show nothing.
        return ""
    percentage = diagonal_hits / guess_count * 100
    return format(percentage, ".2f") + "%"
def audio_link(path: str, model: str):
    """Build the download URL of an audio file in the vox-cloned-data dataset.

    Args:
        path: File path of the sample, placed after the model segment.
        model: Name of the model directory the sample is stored under.

    Returns:
        The ``resolve/main`` URL for ``{model}/{path}`` with ``download=true``.
    """
    dataset_root = "https://huggingface.co/datasets/jerpint/vox-cloned-data/resolve/main"
    return f"{dataset_root}/{model}/{path}?download=true"
def confusion_matrix_to_markdown(matrix, labels=None):
    """Render a confusion matrix as a Markdown scoreboard.

    Args:
        matrix: Square list of rows of counts.
        labels: Optional list of row/column names; when missing or empty,
            ``Class 0``, ``Class 1``, ... are generated.

    Returns:
        A Markdown string: a ``Scoreboard`` line, a table with one header
        row and one row per matrix row, then the accuracy line.
    """
    size = len(matrix)
    if not labels:
        labels = [f"Class {idx}" for idx in range(size)]
    accuracy = get_accuracy(matrix)

    lines = [
        "Scoreboard ",
        # Header row: the empty first cell sits above the row-label column.
        f"| {' | '.join([''] + labels)} |",
        f"| {' | '.join(['---'] * (size + 1))} |",
    ]
    # Data rows: label first, then the counts.
    for idx, counts in enumerate(matrix):
        cells = " | ".join(str(count) for count in counts)
        lines.append(f"| {labels[idx]} | {cells} |")
    # The blank entry yields the empty line that ends the table.
    lines.extend(["", f"Accuracy %: {accuracy}"])
    return "\n".join(lines) + "\n"
def load_and_cache_data():
    """Load the dataset index, downloading it to ``files.json`` on first use.

    If ``files.json`` does not exist in the working directory, it is fetched
    from the dataset repository and written to disk; the local copy is then
    parsed and returned.

    Returns:
        The parsed JSON content of the index file.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    json_link = "https://huggingface.co/datasets/jerpint/vox-cloned-data/resolve/main/files.json?download=true"
    local_file = "files.json"
    if not os.path.exists(local_file):
        # Without a timeout, requests can wait on an unresponsive server forever.
        response = requests.get(json_link, timeout=30)
        if response.status_code != 200:
            raise Exception(f"Failed to load data from {json_link}")
        # Cache the file. An explicit encoding keeps the write and the read
        # below independent of the platform's default locale encoding.
        with open(local_file, "w", encoding="utf-8") as f:
            f.write(response.text)
    with open(local_file, "r", encoding="utf-8") as f:
        return json.load(f)
def load_data():
    """Download and parse the dataset index (``files.json``).

    Returns:
        The parsed JSON content of the index file.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    json_link = "https://huggingface.co/datasets/jerpint/vox-cloned-data/resolve/main/files.json?download=true"
    # Without a timeout, requests can wait on an unresponsive server forever,
    # which would hang the app at startup since this is called at import time.
    response = requests.get(json_link, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to load data from {json_link}")
    print("Loaded data")
    return json.loads(response.text)
def select_random_model(path):
    """Pick which model's rendition of ``path`` to play.

    Half of the time this returns ``"commonvoice"``; otherwise it returns a
    uniformly chosen entry of ``data[path]`` other than ``"commonvoice"``.

    Args:
        path: Key into the module-level ``data`` mapping.

    Returns:
        The selected model name.
    """
    if random.random() < 0.5:
        return "commonvoice"
    # Every entry listed for this path except the commonvoice one.
    candidates = [name for name in data[path] if name != "commonvoice"]
    return random.choice(candidates)
def get_random_audio():
    """Draw a random sample from ``paths`` and a model for it.

    Returns:
        A ``(path, model)`` tuple.
    """
    chosen_path = random.choice(paths)
    return chosen_path, select_random_model(chosen_path)
def next_audio():
    """Draw a new random sample and build an audio component pointing at it.

    Returns:
        A pair ``(gr.Audio, sample)`` where ``sample`` is the
        ``(path, model)`` tuple returned by ``get_random_audio``.
    """
    sample = get_random_audio()
    path, model = sample
    player = gr.Audio(audio_link(path, model))
    return player, sample
# Fetch the index and keep only samples with at least 2 sources.
data = {
    path: sources for path, sources in load_data().items() if len(sources) >= 2
}
# Every remaining path is available for random selection.
paths = list(data)
with gr.Blocks() as demo:
    # State: the (path, model) sample currently being played, and the 2x2
    # score matrix indexed as [guess][truth] with 0 = real, 1 = cloned.
    # NOTE(review): `current_audio` is given a callable while the player
    # below is built from its `.value`; confirm the two stay in sync when
    # the page is loaded.
    current_audio = gr.State(get_random_audio)
    score_matrix = gr.State([[0, 0], [0, 0]])

    gr.Markdown(MD_DESCRIPTION)

    with gr.Column():
        with gr.Row():
            audio_cmp = gr.Audio(
                audio_link(current_audio.value[0], current_audio.value[1])
            )
    with gr.Column():
        with gr.Row():
            # Captions come from `labels` so buttons and scoreboard agree.
            button1 = gr.Button(labels[0])
            button2 = gr.Button(labels[1])
        score_md = gr.Markdown(confusion_matrix_to_markdown(score_matrix.value, labels))

    def _score_guess(guessed_real, x, score_matrix):
        """Score one guess, notify the user, and move on to the next sample.

        Args:
            guessed_real: True when the user pressed the "real" button.
            x: The ``(path, model)`` sample that was being played.
            score_matrix: The 2x2 score matrix, updated in place.

        Returns:
            ``(audio component, new sample, score matrix, scoreboard markdown)``,
            matching the ``outputs`` lists wired to the buttons below.
        """
        is_real = x[1] == "commonvoice"
        audio_cmp, current_audio = next_audio()

        guess_idx = 0 if guessed_real else 1
        truth_idx = 0 if is_real else 1
        truth_name = "Real Audio" if is_real else "Cloned Audio"
        if guess_idx == truth_idx:
            gr.Info(f"Correct! {truth_name}", duration=DURATION)
        else:
            gr.Warning(f"Incorrect! {truth_name}", duration=DURATION)
        score_matrix[guess_idx][truth_idx] += 1

        score_md = confusion_matrix_to_markdown(score_matrix, labels)
        return audio_cmp, current_audio, score_matrix, score_md

    def check_real_guess(x, score_matrix):
        """Handle a click on the "real" button."""
        return _score_guess(True, x, score_matrix)

    def check_cloned_guess(x, score_matrix):
        """Handle a click on the "cloned" button."""
        return _score_guess(False, x, score_matrix)

    # Each handler needs its own name and must be attached to its button;
    # two functions sharing one name and no `.click` leave the buttons inert.
    button1.click(
        check_real_guess,
        inputs=[current_audio, score_matrix],
        outputs=[audio_cmp, current_audio, score_matrix, score_md],
    )
    button2.click(
        check_cloned_guess,
        inputs=[current_audio, score_matrix],
        outputs=[audio_cmp, current_audio, score_matrix, score_md],
    )

demo.launch()