# Advanced_Video / app.py
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import math
import dlib
import tempfile
import requests
import os
from transformers import pipeline
import cv2
import io
import json
import time
import bz2
import imageio_ffmpeg
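
# dlib face detector plus the 68-point landmark predictor; the predictor model
# is downloaded from dlib.net on first run if it is not already on disk.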
detector = dlib.get_frontal_face_detector()
try:
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
except RuntimeError:
print("Downloading shape_predictor_68_face_landmarks.dat...")
landmarks_url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    landmarks_compressed = requests.get(landmarks_url).content
    landmarks_data = bz2.decompress(landmarks_compressed)
with open("shape_predictor_68_face_landmarks.dat", "wb") as f:
f.write(landmarks_data)
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
IMAGE_GEN_API = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
HF_TOKEN = os.getenv("HF_TOKEN")
LLM_API = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"
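
# NOTE: the chat template built in query_llm() follows the Zephyr format
# (<|system|> ... <|user|> ... <|assistant|>), which does not match
# fastchat-t5's expected prompt format; keep LLM_API and the template in sync.

# Build the BLIP captioning pipeline once at import time instead of
# reconstructing it on every call, as the original code did.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")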
def query_hf_image_generation(prompt):
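    """Generate an image for `prompt` via the HF Inference API, retrying up to three times."""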
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
payload = {"inputs": prompt}
for _ in range(3):
try:
response = requests.post(IMAGE_GEN_API, headers=headers, json=payload)
response.raise_for_status()
image_bytes = response.content
image = Image.open(io.BytesIO(image_bytes))
return image
except requests.exceptions.RequestException as e:
print(f"Image generation attempt failed: {e}")
time.sleep(2)
raise Exception("Image generation failed after multiple attempts")
def query_llm(prompt, system_prompt):
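    """Send a system + user prompt to the hosted LLM and return the generated text, with retries."""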
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"
payload = {"inputs": prompt_template, "max_new_tokens": 200}
for _ in range(3):
try:
response = requests.post(LLM_API, headers=headers, json=payload)
response.raise_for_status()
return response.json()[0]['generated_text']
except requests.exceptions.RequestException as e:
print(f"LLM query attempt failed: {e}")
time.sleep(2)
raise Exception("LLM query failed after multiple attempts")
def segment_script(script):
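    """Ask the LLM to split the script into segments; one segment per non-empty line of its reply."""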
system_prompt = "You are a helpful assistant. Given a script, divide it into segments suitable for generating images, ensuring each segment is less than 500 characters."
llm_response = query_llm(script, system_prompt)
segments = llm_response.split('\n')
segments = [seg.strip() for seg in segments if seg.strip()]
return segments
def generate_image_prompts(script_segments):
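    """Turn each script segment into a concise image-generation prompt via the LLM."""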
image_prompts = []
for segment in script_segments:
system_prompt = "You are a helpful assistant. Create a concise image prompt based on the following script segment:"
prompt = f"Script Segment: {segment}"
image_prompt = query_llm(prompt, system_prompt)
image_prompts.append(image_prompt)
return image_prompts
def extract_motion_params(llm_output):
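    """Extract the first JSON object from the LLM reply, falling back to safe defaults."""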
    defaults = {
        "motion_type": "none",
        "intensity": 0.25,
        "text_overlay": "",
        "text_color": "white",
        "start_time": 0,
        "end_time": 5
    }
    try:
        json_string = llm_output[llm_output.find('{'):llm_output.rfind('}') + 1]
        params = json.loads(json_string)
        # Merge onto the defaults so missing keys never raise KeyError downstream.
        return {**defaults, **params}
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return defaults
def detect_face_landmarks(image):
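    """Return the 68 dlib facial landmarks as an (x, y) array, or None when no face is detected."""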
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
rects = detector(gray, 1)
if len(rects) > 0:
shape = predictor(gray, rects[0])
shape = np.array([(shape.part(i).x, shape.part(i).y) for i in range(68)])
return shape
else:
return None
def apply_color_grading(frame, color_preset, intensity):
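    """Apply one of several color presets to an RGB uint8 frame, blended by intensity."""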
if color_preset == "sepia":
sepia_matrix = np.array([[0.393, 0.769, 0.189],
[0.349, 0.686, 0.168],
[0.272, 0.534, 0.131]])
frame_float = frame.astype(np.float32) / 255.0
sepia_effect = cv2.transform(frame_float, sepia_matrix)
blended_frame = (1 - intensity) * frame_float + intensity * sepia_effect
return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
elif color_preset == "vintage":
frame_float = frame.astype(np.float32) / 255.0
frame_float[:, :, 0] *= (1 - intensity * 0.6)
frame_float[:, :, 2] *= (1 + intensity * 0.3)
grayscale = cv2.cvtColor(frame_float, cv2.COLOR_RGB2GRAY)
grayscale_rgb = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
blended_frame = (1 - intensity * 0.5) * frame_float + intensity * 0.5 * grayscale_rgb
return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
elif color_preset == "black_and_white":
gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
return cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)
elif color_preset == "cold":
frame_float = frame.astype(np.float32) / 255.0
frame_float[:, :, 0] *= (1 + intensity * 0.7)
frame_float[:, :, 2] *= (1 - intensity * 0.2)
return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
elif color_preset == "warm":
frame_float = frame.astype(np.float32) / 255.0
frame_float[:, :, 2] *= (1 + intensity * 0.7)
frame_float[:, :, 0] *= (1 - intensity * 0.2)
return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
elif color_preset == "neon":
frame_float = frame.astype(np.float32) / 255.0
lab = cv2.cvtColor(frame_float, cv2.COLOR_RGB2LAB)
l, a, b = cv2.split(lab)
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
l = clahe.apply(l)
lab = cv2.merge((l, a, b))
frame_float = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
frame_float[:, :, 0] *= (1 - intensity * 0.4)
frame_float[:, :, 1] *= (1 + intensity * 0.8)
frame_float[:, :, 2] *= (1 - intensity * 0.4)
return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
return frame
def apply_vignette(frame, intensity):
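    """Darken the frame toward the corners with a quadratic radial falloff scaled by intensity."""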
width, height = frame.shape[1], frame.shape[0]
x = np.linspace(-1, 1, width)
y = np.linspace(-1, 1, height)
X, Y = np.meshgrid(x, y)
radius = np.sqrt(X**2 + Y**2)
vignette = 1 - intensity * radius**2
vignette = np.clip(vignette, 0, 1)
vignette = np.stack([vignette] * 3, axis=-1)
frame_float = frame.astype(np.float32) / 255.0
result = frame_float * vignette
return (np.clip(result, 0, 1) * 255).astype(np.uint8)
def apply_bokeh(frame, intensity, t):
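    """Overlay randomly placed, pulsing bokeh circles, colored by sampling the frame itself."""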
    frame_float = frame.astype(np.float32) / 255.0
circles = []
for _ in range(int(intensity * 30)):
radius = np.random.randint(5, 30)
x = np.random.randint(radius, frame.shape[1] - radius)
y = np.random.randint(radius, frame.shape[0] - radius)
color = frame_float[y, x]
brightness = np.random.uniform(0.5, 1.0)
circles.append((x, y, radius, color, brightness))
bokeh_effect = np.zeros_like(frame_float)
for x, y, radius, color, brightness in circles:
y_grid, x_grid = np.ogrid[-y:frame.shape[0]-y, -x:frame.shape[1]-x]
mask = x_grid*x_grid + y_grid*y_grid <= radius*radius
bokeh_effect[mask] += np.array(color) * brightness * (0.5 + 0.5 * np.sin(t * 2 * math.pi))
blended_frame = frame_float + intensity * bokeh_effect
return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_overlay, text_color, font_size, start_time, end_time, color_preset, vignette_intensity):
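    """Render duration * fps frames animating `image` with the requested motion,
    then apply color grading, a vignette, and a timed text overlay."""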
frames = []
    image = image.convert("RGB")  # downstream paste/blend and cv2 calls assume 3-channel RGB
    width, height = image.size
    landmarks = detect_face_landmarks(image)
for i in range(int(duration * fps)):
t = i / (duration * fps)
frame = image.copy()
if landmarks is not None:
if motion_type == "head_nod":
top_head = landmarks[27]
bottom_head = landmarks[8]
angle = math.sin(t * 2 * math.pi) * intensity * 8
center_x = (top_head[0] + bottom_head[0]) // 2
center_y = (top_head[1] + bottom_head[1]) // 2
M = cv2.getRotationMatrix2D((center_x, center_y), angle, 1)
rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
frame = Image.fromarray(rotated_image)
elif motion_type == "head_shake":
top_head = landmarks[27]
left_head = landmarks[0]
right_head = landmarks[16]
angle = math.sin(t * 3 * math.pi) * intensity * 6
center_x = top_head[0]
center_y = top_head[1]
M = cv2.getRotationMatrix2D((center_x, center_y), angle, 1)
rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
frame = Image.fromarray(rotated_image)
elif motion_type == "eye_blink":
left_eye_top = landmarks[37]
left_eye_bottom = landmarks[41]
right_eye_top = landmarks[43]
right_eye_bottom = landmarks[47]
blink_progress = abs(math.sin(t * 2 * math.pi))
if blink_progress > 0.9:
draw = ImageDraw.Draw(frame)
draw.line([tuple(landmarks[36]), tuple(landmarks[39])], fill=text_color, width=2)
draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
else:
frame = image.copy()
elif motion_type == "smile":
mouth_left = landmarks[48]
mouth_right = landmarks[54]
mouth_top = landmarks[51]
mouth_bottom = landmarks[57]
smile_progress = intensity * t
draw = ImageDraw.Draw(frame)
curve_points = [
tuple(mouth_left),
(mouth_left[0] + (mouth_right[0] - mouth_left[0]) // 4, mouth_left[1] + int(20 * smile_progress)),
(mouth_left[0] + 3 * (mouth_right[0] - mouth_left[0]) // 4, mouth_right[1] + int(20 * smile_progress)),
tuple(mouth_right)
]
draw.line(curve_points, fill=text_color, width=4)
if motion_type == "zoom":
scale = 1 + intensity * t
new_size = (int(width * scale), int(height * scale))
resized_image = image.resize(new_size, Image.Resampling.LANCZOS)
x_offset = (new_size[0] - width) // 2
y_offset = (new_size[1] - height) // 2
frame = resized_image.crop((x_offset, y_offset, x_offset + width, y_offset + height))
elif motion_type == "pan":
x_offset = int(intensity * t * (width - width))
y_offset = int(intensity * t * (height - height))
frame = Image.new("RGB", (width, height))
frame.paste(image, (-x_offset, -y_offset))
elif motion_type == "rotate":
angle = intensity * t * 360
rotated_image = image.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC)
x_offset = (rotated_image.width - width) // 2
y_offset = (rotated_image.height - height) // 2
frame = Image.new("RGB", (width, height))
frame.paste(rotated_image, (-x_offset, -y_offset))
elif motion_type == "move_right":
x_offset = int(intensity * t * width)
frame = Image.new("RGB", (width, height), "black")
frame.paste(image, (x_offset, 0))
elif motion_type == "move_left":
x_offset = -int(intensity * t * width)
frame = Image.new("RGB", (width, height), "black")
frame.paste(image, (x_offset, 0))
elif motion_type == "move_up":
y_offset = -int(intensity * t * height)
frame = Image.new("RGB", (width, height), "black")
frame.paste(image, (0, y_offset))
elif motion_type == "move_down":
y_offset = int(intensity * t * height)
frame = Image.new("RGB", (width, height), "black")
frame.paste(image, (0, y_offset))
elif motion_type == "shake":
shake_intensity = intensity * 10
x_offset = int(shake_intensity * math.sin(t * 2 * math.pi * 5))
y_offset = int(shake_intensity * math.cos(t * 2 * math.pi * 3))
frame = Image.new("RGB", (width, height))
frame.paste(image, (x_offset, y_offset))
elif motion_type == "fade_in":
alpha = t
frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)
elif motion_type == "fade_out":
alpha = 1 - t
frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)
elif motion_type == "rain":
draw = ImageDraw.Draw(frame)
for _ in range(int(intensity * 5)):
x = np.random.randint(0, width)
y = np.random.randint(0, height)
length = np.random.randint(5, 15)
speed = intensity * 3
y_end = y + length + i * speed
draw.line([(x, y), (x, y_end)], fill="lightblue", width=1)
elif motion_type == "bokeh":
frame_np = np.array(frame)
frame_np = apply_bokeh(frame_np, intensity, t)
frame = Image.fromarray(frame_np)
frame_np = np.array(frame)
if color_preset:
frame_np = apply_color_grading(frame_np, color_preset, intensity)
if vignette_intensity > 0:
frame_np = apply_vignette(frame_np, vignette_intensity)
frame = Image.fromarray(frame_np)
draw = ImageDraw.Draw(frame)
        current_time = i / fps  # seconds, matching the units of start_time / end_time
        if text_overlay and start_time <= current_time <= end_time:
            try:
                font = ImageFont.truetype("arial.ttf", font_size)
            except IOError:
                font = ImageFont.load_default()
            # Pillow 10 removed textsize(); textbbox() is the replacement.
            bbox = draw.textbbox((0, 0), text_overlay, font=font)
            text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
            x = (width - text_width) // 2
            y = (height - text_height) // 2
            draw.text((x, y), text_overlay, font=font, fill=text_color)
frames.append(np.array(frame))
return frames
def create_video_from_frames(frames, output_filename=None, fps=30):
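    """Encode RGB uint8 frames to an H.264 MP4 and return the file path.

    The original passed preset="veryslow", which is not a write_frames()
    keyword; ffmpeg's default preset is used instead. A temporary file is
    created when no filename is given, so callers that rely on the return
    value (generate_and_animate) get a usable path.
    """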
    if output_filename is None:
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
            output_filename = tmpfile.name
    # write_frames() wants the size as (width, height); ndarray shape is (height, width, 3).
    size = (frames[0].shape[1], frames[0].shape[0])
    writer = imageio_ffmpeg.write_frames(output_filename, size, pix_fmt_out='yuv420p', fps=fps, codec='libx264')
    writer.send(None)  # seed the generator before sending frames
    for frame in frames:
        writer.send(np.ascontiguousarray(frame))
    writer.close()
    return output_filename
def generate_video_from_script(script, duration_per_segment=5):
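    """Segment the script, generate and animate one image per segment, and stitch a single video."""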
script_segments = segment_script(script)
image_prompts = generate_image_prompts(script_segments)
all_frames = []
for i, (segment, image_prompt) in enumerate(zip(script_segments, image_prompts)):
print(f"Processing segment {i + 1} of {len(script_segments)}")
print(f" Segment: {segment}")
print(f" Image Prompt: {image_prompt}")
image = query_hf_image_generation(image_prompt)
        image_description = captioner(image)[0]['generated_text']
system_prompt = "You are an expert in image to video creation. Provide the motion type, intensity, text overlay, text color, text start and end times, color preset, and vignette intensity for the following image description and user prompt. Give the response in a JSON format."
prompt = f"Image Description: {image_description}\nUser Prompt: {segment}"
llm_response = query_llm(prompt, system_prompt)
print(f" LLM Response: {llm_response}")
motion_params = extract_motion_params(llm_response)
print(f" Motion Parameters: {motion_params}")
frames = apply_advanced_motion(
image,
motion_params["motion_type"],
motion_params["intensity"],
duration=duration_per_segment,
fps=30,
text_overlay=motion_params["text_overlay"],
text_color=motion_params["text_color"],
font_size=50,
start_time=motion_params["start_time"],
end_time=motion_params["end_time"],
color_preset=motion_params.get("color_preset", None),
vignette_intensity=motion_params.get("vignette_intensity", 0)
)
all_frames.extend(frames)
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
output_filename = tmpfile.name
create_video_from_frames(all_frames, output_filename)
return output_filename
def generate_and_animate(prompt):
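    """Generate an image from the prompt, infer motion parameters with the LLM, and animate it."""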
try:
image = query_hf_image_generation(prompt)
        image_description = captioner(image)[0]['generated_text']
        # Request the same JSON motion spec used by generate_video_from_script;
        # the original mistakenly passed the caption in as the system prompt.
        system_prompt = "You are an expert in image to video creation. Provide the motion type, intensity, text overlay, text color, text start and end times, color preset, and vignette intensity for the following image description and user prompt. Give the response in a JSON format."
        llm_response = query_llm(f"Image Description: {image_description}\nUser Prompt: {prompt}", system_prompt)
motion_params = extract_motion_params(llm_response)
frames = apply_advanced_motion(
image,
motion_params["motion_type"],
motion_params["intensity"],
duration=5,
fps=30,
text_overlay=motion_params["text_overlay"],
text_color=motion_params["text_color"],
font_size=50,
start_time=motion_params["start_time"],
end_time=motion_params["end_time"],
color_preset=motion_params.get("color_preset", None),
vignette_intensity=motion_params.get("vignette_intensity", 0)
)
        video_file = create_video_from_frames(frames)
        # Return the PIL image directly; gr.Image.update() was removed in Gradio 4.
        return video_file, image
    except Exception as e:
        # Surface the failure in the UI instead of feeding a string to gr.Video.
        raise gr.Error(f"Generation failed: {e}")
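
# Choice lists for the UI (currently unused by the interfaces; motion
# parameters come from the LLM's JSON response instead).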
motion_types = [
"zoom", "pan", "rotate", "move_right", "move_left", "move_up", "move_down",
"shake", "fade_in", "fade_out", "head_nod", "head_shake", "eye_blink", "smile", "rain", "bokeh", "none"
]
text_colors = ["white", "black", "red", "green", "blue", "yellow"]
color_presets = ["sepia", "vintage", "black_and_white", "cold", "warm", "neon", "none"]
iface = gr.Interface(
fn=generate_and_animate,
inputs=[
gr.Textbox(label="Prompt"),
],
outputs=[
gr.Video(label="Generated Video"),
gr.Image(label="Generated Image")
],
title="AI Video Generator",
description="Enter a prompt to generate an image and animate it. Uses Flux 1, an LLM, and advanced video processing techniques."
)
video_iface = gr.Interface(
fn=generate_video_from_script,
inputs=[
gr.Textbox(label="Script (max 1 minute video)", lines=5),
gr.Slider(label="Duration per Segment (seconds)", minimum=1, maximum=10, step=1, value=5)
],
outputs=gr.Video(label="Generated Video from Script"),
title="Story Visualizer",
description="Enter a short story script, and this will generate a video visualizing it using multiple images and animations."
)
demo = gr.TabbedInterface([iface, video_iface], ["Generate and Animate", "Story to Video"])
if __name__ == "__main__":
demo.launch(share=True, debug=True)