Update app.py
app.py
CHANGED
@@ -16,6 +16,7 @@ import os
 
 DEVICE = Accelerator().device
 MODEL_NAME = "qihoo360/fg-clip2-so400m"
+BATCH_SIZE = 64
 
 
 model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True).to(
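The BATCH_SIZE = 64 constant added above drives the chunked loops in the next two hunks. A minimal sketch of the stepping pattern (the item count of 150 is illustrative, not from app.py):

BATCH_SIZE = 64

# range(0, n, BATCH_SIZE) yields each chunk's start index; slicing past the
# end is safe in Python, so the final chunk is simply shorter.
n = 150
starts = list(range(0, n, BATCH_SIZE))            # [0, 64, 128]
sizes = [min(BATCH_SIZE, n - s) for s in starts]  # [64, 64, 22]
assert sum(sizes) == n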
@@ -69,21 +70,28 @@ def generate_image_embeddings(zip_file):
     if len(images) == 0:
         return None, "❌ No valid images found in the zip file"
 
-    # Generate embeddings
+    # Generate embeddings with batching
     embeddings = []
     print(f"Generating embeddings for {len(images)} images...")
     with torch.no_grad():
-        for i …
-            …
+        for i in range(0, len(images), BATCH_SIZE):
+            batch = images[i : i + BATCH_SIZE]
+            print(
+                f"Processing batch {i // BATCH_SIZE + 1}/{(len(images) + BATCH_SIZE - 1) // BATCH_SIZE} ({len(batch)} images)"
+            )
+
+            # Use the same max_num_patches for all images in batch
+            max_patches = max(determine_max_value(img) for img in batch)
+
             image_input = image_processor(
-                images=…
-                max_num_patches=…
+                images=batch,
+                max_num_patches=max_patches,
                 return_tensors="pt",
             ).to(DEVICE)
-            …
+            image_features = model.get_image_features(**image_input)
 
-            # Normalize the …
-            normalized_features = …
+            # Normalize the embeddings
+            normalized_features = image_features / image_features.norm(
                 dim=-1, keepdim=True
             )
             embeddings.append(normalized_features.cpu().numpy())
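The normalization in this hunk divides each feature vector by its L2 norm, so every embedding has unit length and cosine similarity reduces to a dot product. A self-contained check of the exact pattern used above (the 4×8 tensor is a stand-in for image_features):

import torch

x = torch.randn(4, 8)  # stand-in for image_features
normalized = x / x.norm(dim=-1, keepdim=True)
assert torch.allclose(normalized.norm(dim=-1), torch.ones(4))  # unit-length rows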
@@ -147,12 +155,12 @@ def extract_frames(video_path: str, fps: int = 4):
 
 
 @spaces.GPU
-def generate_video_embeddings(video_file, fps):
+def generate_video_embeddings(video_path, fps):
     """
     Generate embeddings from video frames.
 
     Args:
-        video_file: …
+        video_path: Path to video file (str)
         fps: Frames per second to extract
 
     Returns:
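The hunk header shows extract_frames(video_path: str, fps: int = 4), but the diff does not include its body. For context, a minimal sketch of what such a helper could look like, assuming OpenCV and PIL; the actual implementation in app.py may differ:

import cv2
from PIL import Image

def extract_frames_sketch(video_path: str, fps: int = 4):
    # Keep roughly `fps` frames per second of video (illustrative only).
    frames = []
    cap = cv2.VideoCapture(video_path)
    native_fps = cap.get(cv2.CAP_PROP_FPS) or float(fps)
    step = max(int(round(native_fps / fps)), 1)  # keep every step-th frame
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % step == 0:
            # OpenCV decodes to BGR; convert to RGB for the image processor
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        idx += 1
    cap.release()
    return frames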
@@ -160,28 +168,35 @@ def generate_video_embeddings(video_file, fps):
     """
     try:
         # Extract frames
-        print(f"Extracting frames from video: {…
-        frames = extract_frames(…
+        print(f"Extracting frames from video: {video_path} at {fps} fps")
+        frames = extract_frames(video_path, fps)
         print(f"Extracted {len(frames)} frames from video")
 
         if len(frames) == 0:
             return None, "❌ No frames could be extracted from the video"
 
-        # Generate embeddings
+        # Generate embeddings with batching
         embeddings = []
         print(f"Generating embeddings for {len(frames)} frames...")
         with torch.no_grad():
-            for i …
-                …
+            for i in range(0, len(frames), BATCH_SIZE):
+                batch = frames[i : i + BATCH_SIZE]
+                print(
+                    f"Processing batch {i // BATCH_SIZE + 1}/{(len(frames) + BATCH_SIZE - 1) // BATCH_SIZE} ({len(batch)} frames)"
+                )
+
+                # Use the same max_num_patches for all frames in batch
+                max_patches = max(determine_max_value(frame) for frame in batch)
+
                 image_input = image_processor(
-                    images=…
-                    max_num_patches=…
+                    images=batch,
+                    max_num_patches=max_patches,
                     return_tensors="pt",
                 ).to(DEVICE)
-                …
+                image_features = model.get_image_features(**image_input)
 
-                # Normalize the …
-                normalized_features = …
+                # Normalize the embeddings
+                normalized_features = image_features / image_features.norm(
                     dim=-1, keepdim=True
                 )
                 embeddings.append(normalized_features.cpu().numpy())
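Each pass through the batched loop appends one (batch, dim) array to embeddings. The diff does not show how that list is consumed downstream, but the usual next step is a concatenation; a hypothetical illustration (the 1152-dim width is a guess at the model's embedding size, not confirmed by this diff):

import numpy as np

chunks = [
    np.zeros((64, 1152), dtype=np.float32),  # a full batch
    np.zeros((22, 1152), dtype=np.float32),  # the final, shorter batch
]
matrix = np.vstack(chunks)  # concatenate per-batch chunks row-wise
assert matrix.shape == (86, 1152)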
@@ -250,8 +265,15 @@ with gr.Blocks(title="Video & Image Embedding Generator") as demo:
     vid_output = gr.JSON(label="Embeddings (JSON)")
     vid_status = gr.Textbox(label="Status", lines=3)
 
+    def handle_video_upload(video_file, fps):
+        if video_file is None:
+            return None, "❌ Please upload a video file"
+        return generate_video_embeddings(
+            video_file.name if hasattr(video_file, "name") else video_file, fps
+        )
+
     vid_submit_btn.click(
-        fn=…
+        fn=handle_video_upload,
         inputs=[video_input, fps_input],
         outputs=[vid_output, vid_status],
     )
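The handle_video_upload wrapper added in this hunk guards against a missing upload and normalizes whatever Gradio passes the callback, which can be a plain filepath string or a tempfile wrapper with a .name attribute, depending on component settings and Gradio version. An illustrative check of that normalization; _FakeUpload is a stand-in, not part of app.py:

class _FakeUpload:
    # Stand-in for Gradio's tempfile wrapper (illustrative only)
    name = "/tmp/example.mp4"

def to_path(video_file):
    return video_file.name if hasattr(video_file, "name") else video_file

assert to_path(_FakeUpload()) == "/tmp/example.mp4"
assert to_path("/tmp/example.mp4") == "/tmp/example.mp4"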