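"""Story-to-video generator.

Splits a story into sentences, illustrates each one with Stable Diffusion XL
(accelerated by a Hyper-SD LoRA), captions the image with the sentence,
narrates it with gTTS, and concatenates the per-sentence clips into a single
video served through a Gradio interface.
"""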
import re
import numpy as np
import random
import sys
import torch
from PIL import Image, ImageDraw, ImageFont
from diffusers import DiffusionPipeline, TCDScheduler
from huggingface_hub import hf_hub_download
from gtts import gTTS
from moviepy.editor import ImageSequenceClip, VideoFileClip, concatenate_videoclips, AudioFileClip
import gradio as gr
# Choose among 1, 2, 4 and 8:
num_inference_steps = 8
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
repo_name = "ByteDance/Hyper-SD"
plural = "s" if num_inference_steps > 1 else ""
ckpt_name = f"Hyper-SDXL-{num_inference_steps}step{plural}-lora.safetensors"
device = "cpu"
pipe = DiffusionPipeline.from_pretrained(base_model_id).to(device)
pipe.load_lora_weights(hf_hub_download(repo_name, ckpt_name))
pipe.fuse_lora()
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
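# Note: Hyper-SD checkpoints are distilled for the exact step count in the
# file name, so inference should use that same num_inference_steps together
# with guidance_scale=0.0. TCDScheduler with a nonzero eta is one of the
# scheduler setups the Hyper-SD authors pair with these LoRAs.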
def generate_image(prompt, step_count=num_inference_steps, seed=None):
    if seed is None:
        seed = random.randint(0, sys.maxsize)
    generator = torch.Generator(device).manual_seed(seed)
    eta = 0.5
    images = pipe(
        prompt=prompt,
        num_inference_steps=step_count,
        guidance_scale=0.0,
        eta=eta,
        generator=generator,
    ).images
    return images[0]
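# A quick standalone check (hypothetical prompt), bypassing the Gradio app:
#   generate_image("a lighthouse at dawn, watercolor", seed=42).save("preview.png")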
def draw_text_on_image(image, text, font_path="arial.ttf", font_size=24):
    image_with_text = image.copy()
    draw = ImageDraw.Draw(image_with_text)
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        print(f"Font {font_path} not found. Using default font.")
        font = ImageFont.load_default()
    # Greedily wrap the text into lines that fit within the image width
    lines = []
    max_width = image.width - 20  # 10 pixels of padding on each side
    words = text.split()
    while words:
        line = ''
        while words and draw.textlength(f"{line} {words[0]}" if line else words[0], font=font) <= max_width:
            line = f"{line} {words.pop(0)}" if line else words.pop(0)
        if not line:
            # A single word wider than max_width: take it anyway to avoid an infinite loop
            line = words.pop(0)
        lines.append(line)
    # Total height of the text block, including 5 px of spacing between lines
    text_height = sum(draw.textbbox((0, 0), line, font=font)[3] for line in lines) + 5 * (len(lines) - 1)
    # Position the block near the bottom of the image, 20 pixels up
    text_y = image.height - text_height - 20
    for line in lines:
        text_bbox = draw.textbbox((0, 0), line, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        text_x = (image.width - text_width) // 2  # centered horizontally
        # Draw a black background rectangle behind the line for legibility
        draw.rectangle([(text_x - 5, text_y - 5), (text_x + text_width + 5, text_y + text_height + 5)], fill="black")
        # Draw the line on top of the rectangle
        draw.text((text_x, text_y), line, font=font, fill="white")
        text_y += text_height + 5  # move down to the next line
    return image_with_text
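# A minimal sanity check of the wrapping logic (hypothetical example):
#   canvas = Image.new("RGB", (256, 256), "gray")
#   draw_text_on_image(canvas, "A quick check of the word-wrapping logic.").save("caption_test.png")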
def process_story(story):
    # Split the story into sentences on end-of-sentence punctuation
    sentences = re.split(r'(?<=[.!?]) +', story.strip())
    video_clips = []
    fps = 24  # frames per second
    # For each sentence: generate an image, overlay the text, and narrate it
    for i, sentence in enumerate(sentences):
        print(f"Sentence {i+1}: {sentence}\n")
        seed = random.randint(0, sys.maxsize)
        image = generate_image(sentence, seed=seed)
        resized_image = image.resize((256, 256))
        image_with_text = draw_text_on_image(resized_image, sentence)
        # Save the captioned image
        image_path = f"sentence_{i+1}.png"
        image_with_text.save(image_path)
        frame = np.array(image_with_text)  # convert to a NumPy array for moviepy
        # Generate narration audio for the sentence
        tts = gTTS(sentence, lang='en')
        audio_path = f"sentence_{i+1}.mp3"
        tts.save(audio_path)
        audio_clip = AudioFileClip(audio_path)
        # Build a still-image clip and stretch it to the narration's duration
        video_clip = ImageSequenceClip([frame], fps=fps)
        video_clip = video_clip.set_duration(audio_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)
        # Save the individual per-sentence clip
        clip_path = f"sentence_{i+1}.mp4"
        video_clip.write_videofile(clip_path, codec="libx264", audio_codec="aac")
        video_clips.append(video_clip)
        # Free the intermediate images
        del resized_image, image_with_text
    # Concatenate all per-sentence clips into the final video
    final_video = concatenate_videoclips(video_clips)
    final_video_path = "story_video.mp4"
    final_video.write_videofile(final_video_path, codec="libx264", audio_codec="aac")
    return final_video_path
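# Note: the intermediate sentence_*.png / sentence_*.mp3 / sentence_*.mp4
# files are left on disk; a cleanup pass (e.g., os.remove on each path)
# could be added if storage is a concern.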
def generate_story_video(story):
    final_video_path = process_story(story)
    return final_video_path
iface = gr.Interface(
    fn=generate_story_video,
    inputs="text",
    outputs="video",
    title="Story to Video Generator",
    description="Enter a story and get a video with images and narrated text.",
)
if __name__ == "__main__":
    iface.launch(share=True)