from pprint import pprint

from datasets import load_dataset
from transformers.pipelines import pipeline

model_alias = "kotoba-tech/kotoba-whisper-v1.1"

# Run kotoba-whisper-v1.1 on the long interview sample with every combination
# of the two post-processing flags: P = punctuator, S = stable-ts.
# The first four runs request chunk-level timestamps; the last four do not.

print("""### P + S ###""")
pipe = pipeline(model=model_alias,
                punctuator=True,
                stable_ts=True,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]  # keep only the first 7,938,000 samples
        prediction = pipe(
            i["audio"],
            return_timestamps=True,
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
print("""### P ###""")
pipe = pipeline(model=model_alias,
                punctuator=True,
                stable_ts=False,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            return_timestamps=True,
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
print("""### S ###""")
pipe = pipeline(model=model_alias,
                punctuator=False,
                stable_ts=True,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            return_timestamps=True,
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
print("""### RAW ###""")
pipe = pipeline(model=model_alias,
                punctuator=False,
                stable_ts=False,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            return_timestamps=True,
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
# Repeat the four configurations without requesting timestamps.
print("""### P + S ###""")
pipe = pipeline(model=model_alias,
                punctuator=True,
                stable_ts=True,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
print("""### P ###""")
pipe = pipeline(model=model_alias,
                punctuator=True,
                stable_ts=False,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
print("""### S ###""")
pipe = pipeline(model=model_alias,
                punctuator=False,
                stable_ts=True,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
print("""### RAW ###""")
pipe = pipeline(model=model_alias,
                punctuator=False,
                stable_ts=False,
                chunk_length_s=15,
                batch_size=16,
                trust_remote_code=True)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
for i in dataset:
    if i["audio"]["path"] == "long_interview_1.mp3":
        i["audio"]["array"] = i["audio"]["array"][:7938000]
        prediction = pipe(
            i["audio"],
            generate_kwargs={"language": "japanese", "task": "transcribe"}
        )
        pprint(prediction)
        break
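

# The same eight runs can be expressed more compactly. This is only a sketch of
# an equivalent sweep, not part of the original script: it assumes that loading
# the dataset once and reusing the truncated sample is equivalent to reloading
# it per run, and that passing return_timestamps=False behaves the same as
# omitting the argument.
from itertools import product
from pprint import pprint

from datasets import load_dataset
from transformers.pipelines import pipeline

model_alias = "kotoba-tech/kotoba-whisper-v1.1"

# Load the evaluation split once and truncate the long interview as above.
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
sample = next(i for i in dataset if i["audio"]["path"] == "long_interview_1.mp3")
sample["audio"]["array"] = sample["audio"]["array"][:7938000]

labels = {(True, True): "P + S", (True, False): "P",
          (False, True): "S", (False, False): "RAW"}
for return_timestamps, (punctuator, stable_ts) in product([True, False], labels):
    print(f"### {labels[(punctuator, stable_ts)]} ### (return_timestamps={return_timestamps})")
    pipe = pipeline(model=model_alias,
                    punctuator=punctuator,
                    stable_ts=stable_ts,
                    chunk_length_s=15,
                    batch_size=16,
                    trust_remote_code=True)
    prediction = pipe(
        sample["audio"],
        return_timestamps=return_timestamps,
        generate_kwargs={"language": "japanese", "task": "transcribe"}
    )
    pprint(prediction)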