import re
import random
import sys

import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
from diffusers import DiffusionPipeline, TCDScheduler
from huggingface_hub import hf_hub_download
from gtts import gTTS
from moviepy.editor import ImageSequenceClip, concatenate_videoclips, AudioFileClip
import gradio as gr

# Choose among 1, 2, 4 and 8:
num_inference_steps = 8

base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
repo_name = "ByteDance/Hyper-SD"
plural = "s" if num_inference_steps > 1 else ""
ckpt_name = f"Hyper-SDXL-{num_inference_steps}step{plural}-lora.safetensors"

# Use a GPU when available; SDXL inference on CPU is extremely slow.
device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = DiffusionPipeline.from_pretrained(base_model_id).to(device)
pipe.load_lora_weights(hf_hub_download(repo_name, ckpt_name))
pipe.fuse_lora()
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)


def generate_image(prompt, step_count=num_inference_steps, seed=None):
    if seed is None:
        seed = random.randint(0, sys.maxsize)
    generator = torch.Generator(device).manual_seed(seed)
    eta = 0.5
    images = pipe(
        prompt=prompt,
        num_inference_steps=step_count,
        guidance_scale=0.0,
        eta=eta,
        generator=generator,
    ).images
    return images[0]


def draw_text_on_image(image, text, font_path="arial.ttf", font_size=24):
    image_with_text = image.copy()
    draw = ImageDraw.Draw(image_with_text)
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        print(f"Font {font_path} not found. Using default font.")
        font = ImageFont.load_default()

    # Split text into multiple lines to fit within the image
    lines = []
    max_width = image.width - 20  # Padding of 10 pixels on each side
    words = text.split()
    while words:
        line = ""
        # Greedily pack words into the current line while it still fits
        while words:
            candidate = f"{line} {words[0]}" if line else words[0]
            if draw.textlength(candidate, font=font) > max_width:
                break
            line = candidate
            words.pop(0)
        if not line:
            # A single word is wider than the image; put it on its own line
            line = words.pop(0)
        lines.append(line)

    # Calculate total text height
    text_height = sum(draw.textbbox((0, 0), line, font=font)[3] for line in lines)

    # Position text at the bottom of the image
    text_y = image.height - text_height - 20  # Padding of 20 pixels from the bottom

    for line in lines:
        text_bbox = draw.textbbox((0, 0), line, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        text_x = (image.width - text_width) // 2  # Centered horizontally

        # Draw background rectangle for text
        draw.rectangle(
            [(text_x - 5, text_y - 5), (text_x + text_width + 5, text_y + text_height + 5)],
            fill="black",
        )

        # Draw text on top of the rectangle
        draw.text((text_x, text_y), line, font=font, fill="white")

        text_y += text_height + 5  # Move to the next line with some padding

    return image_with_text


def process_story(story):
    # Use regular expressions to split the text into sentences
    sentences = re.split(r'(?<=[.!?]) +', story.strip())

    # Collect one video clip per sentence
    video_clips = []
    fps = 24  # Frames per second

    # Generate images, overlay text, and create audio
    for i, sentence in enumerate(sentences):
        print(f"Sentence {i+1}: {sentence}\n")
        seed = random.randint(0, sys.maxsize)
        # Use the step count the Hyper-SD LoRA was distilled for
        image = generate_image(sentence, step_count=num_inference_steps, seed=seed)
        resized_image = image.resize((256, 256))
        image_with_text = draw_text_on_image(resized_image, sentence)

        # Save the image with text
        image_path = f"sentence_{i+1}.png"
        image_with_text.save(image_path)
        frame = np.array(image_with_text)  # Convert to NumPy array

        # Generate audio for the sentence
        tts = gTTS(sentence, lang='en')
        audio_path = f"sentence_{i+1}.mp3"
        tts.save(audio_path)
        audio_clip = AudioFileClip(audio_path)

        # Create a video clip from the image and set the duration to the audio duration
        video_clip = ImageSequenceClip([frame], fps=fps)
        video_clip = video_clip.set_duration(audio_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)

        # Save the individual video clip
        clip_path = f"sentence_{i+1}.mp4"
        video_clip.write_videofile(clip_path, codec="libx264", audio_codec="aac")

        video_clips.append(video_clip)

        # Clear memory
        del resized_image, image_with_text

    # Concatenate all video clips into a final video
    final_video = concatenate_videoclips(video_clips)
    final_video_path = "story_video.mp4"
    final_video.write_videofile(final_video_path, codec="libx264", audio_codec="aac")

    return final_video_path


def generate_story_video(story):
    final_video_path = process_story(story)
    return final_video_path


iface = gr.Interface(
    fn=generate_story_video,
    inputs="text",
    outputs="video",
    title="Story to Video Generator",
    description="Enter a story and get a video with images and narrated text.",
)

if __name__ == "__main__":
    iface.launch(share=True)
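

# --- Optional usage sketch (assumption: you want to render a video from a script
# or notebook without launching the Gradio UI; the sample story is illustrative) ---
#
#   story = "A fox found a golden key. The key opened a tiny door in an old oak."
#   video_path = process_story(story)   # writes story_video.mp4 plus per-sentence clips
#   print(f"Final video saved to {video_path}")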