zzl committed
Commit 2bbc3ee · Parent: 9fd9429

release

Files changed:
- app.py +6 -7
- demo_img.py +34 -10
- demo_vid.py +20 -2
- utils.py +18 -0
app.py
CHANGED
@@ -14,19 +14,18 @@ with gr.Blocks(css='style.css') as demo:
         <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
             <a href="https://paper99.github.io" style="color:blue;">Zhen Li</a><sup>1*</sup>,
             <a href="https://github.com/NK-CS-ZZL" style="color:blue;">Zuo-Liang Zhu</a><sup>1*</sup>,
-            <a href="https://github.com/hlh981029" style="color:blue;">Ling-Hao Han</a><sup>1
-            <a href="https://houqb.github.io" style="color:blue;">Qibin Hou</a><sup>1
-            <a href="https://github.com" style="color:blue;">Chun-Le Guo</a><sup>1
-            <a href="https://mmcheng.net" style="color:blue;">Ming-Ming Cheng</a><sup>1
+            <a href="https://github.com/hlh981029" style="color:blue;">Ling-Hao Han</a><sup>1</sup>,
+            <a href="https://houqb.github.io" style="color:blue;">Qibin Hou</a><sup>1</sup>,
+            <a href="https://github.com" style="color:blue;">Chun-Le Guo</a><sup>1</sup>,
+            <a href="https://mmcheng.net" style="color:blue;">Ming-Ming Cheng</a><sup>1</sup>,
         </h2>
         <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
-            <sup>1</sup>Nankai University <sup>*</sup> represents the equal contribution
+            <sup>1</sup>Nankai University <sup>*</sup> represents the equal contribution.
         </h2>
         <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
             [<a href="https://arxiv.org/abs/2303.13439" style="color:blue;">arXiv</a>]
             [<a href="https://github.com/MCG-NKU/AMT" style="color:blue;">GitHub</a>]
-            [<a href="https://
-            [<a href="https://github.com/MCG-NKU/AMT" style="color:blue;">Replicate</a>]
+            [<a href="https://colab.research.google.com/drive/1IeVO5BmLouhRh6fL2z_y18kgubotoaBq?usp=sharing" style="color:blue;">Colab</a>]
         </h2>
         <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
     """
demo_img.py
CHANGED
@@ -7,8 +7,11 @@ from huggingface_hub import hf_hub_download
 from networks.amts import Model as AMTS
 from networks.amtl import Model as AMTL
 from networks.amtg import Model as AMTG
-from utils import
-
+from utils import (
+    img2tensor, tensor2img,
+    InputPadder,
+    check_dim_and_resize
+)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model_dict = {
     'AMT-S': AMTS, 'AMT-L': AMTL, 'AMT-G': AMTG
@@ -23,22 +26,43 @@ def img2vid(model_type, img0, img1, frame_ratio, iters):
     model.eval()
     img0_t = img2tensor(img0).to(device)
     img1_t = img2tensor(img1).to(device)
-    padder = InputPadder(img0_t.shape, 16)
-    img0_t, img1_t = padder.pad(img0_t, img1_t)
     inputs = [img0_t, img1_t]
+
+    if device == 'cpu':
+        # Do not resize in cpu mode
+        anchor_resolution = 8192*8192
+        anchor_memory = 1
+        anchor_memory_bias = 0
+        vram_avail = 1
+    elif device == 'cuda':
+        anchor_resolution = 1024 * 512
+        anchor_memory = 1500 * 1024**2
+        anchor_memory_bias = 2500 * 1024**2
+        vram_avail = torch.cuda.get_device_properties(device).total_memory
     embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)
 
+    inputs = check_dim_and_resize(inputs)
+    h, w = inputs[0].shape[-2:]
+    scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
+    scale = 1 if scale > 1 else scale
+    scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
+    if scale < 1:
+        print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
+    padding = int(16 / scale)
+    padder = InputPadder(inputs[0].shape, padding)
+    inputs = padder.pad(*inputs)
+
     for i in range(iters):
         print(f'Iter {i+1}. input_frames={len(inputs)} output_frames={2*len(inputs)-1}')
-        outputs = [
+        outputs = [inputs[0]]
         for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
+            in_0 = in_0.to(device)
+            in_1 = in_1.to(device)
             with torch.no_grad():
-                imgt_pred = model(in_0, in_1, embt, eval=True)['imgt_pred']
-
-            in_1 = padder.unpad(in_1)
-            outputs += [imgt_pred, in_1]
+                imgt_pred = model(in_0, in_1, embt, scale_factor=scale, eval=True)['imgt_pred']
+            outputs += [imgt_pred.cpu(), in_1.cpu()]
         inputs = outputs
-
+    outputs = padder.unpad(*outputs)
     out_path = 'results'
     size = outputs[0].shape[2:][::-1]
     writer = cv2.VideoWriter(f'{out_path}/demo.mp4', cv2.VideoWriter_fourcc(*'mp4v'), frame_ratio, size)
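
Note (not part of the commit): a minimal sketch of the VRAM-based auto-scaling arithmetic that this hunk adds to img2vid. The 16 GB GPU and the 1920x1080 frame size below are assumptions chosen only to make the numbers concrete.

import numpy as np

# Anchor constants copied from the cuda branch of the commit.
anchor_resolution = 1024 * 512
anchor_memory = 1500 * 1024**2
anchor_memory_bias = 2500 * 1024**2
vram_avail = 16 * 1024**3      # assumption: a 16 GB GPU
h, w = 1080, 1920              # assumption: a 1080p input frame

# Larger inputs or less free VRAM push the scale below 1.
scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
scale = 1 if scale > 1 else scale
# Snap scale to the form 16/k so that padding = int(16 / scale) is an exact integer k.
scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
padding = int(16 / scale)
print(f"scale = {scale:.3f}, padding = {padding}")   # with these assumptions: scale = 0.889, padding = 18
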
demo_vid.py
CHANGED
@@ -27,7 +27,25 @@ def vid2vid(model_type, video, iters):
     inputs = []
     h = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
     w = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
+    if device == 'cpu':
+        # Do not resize in cpu mode
+        anchor_resolution = 8192*8192
+        anchor_memory = 1
+        anchor_memory_bias = 0
+        vram_avail = 1
+    elif device == 'cuda':
+        anchor_resolution = 1024 * 512
+        anchor_memory = 1500 * 1024**2
+        anchor_memory_bias = 2500 * 1024**2
+        vram_avail = torch.cuda.get_device_properties(device).total_memory
+
+    scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
+    scale = 1 if scale > 1 else scale
+    scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
+    if scale < 1:
+        print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
+    padding = int(16 / scale)
+    padder = InputPadder(inputs[0].shape, padding)
     while True:
         ret, frame = vcap.read()
         if ret is False:
@@ -43,7 +61,7 @@ def vid2vid(model_type, video, iters):
     outputs = [inputs[0]]
     for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
         with torch.no_grad():
-            imgt_pred = model(in_0, in_1, embt, eval=True)['imgt_pred']
+            imgt_pred = model(in_0, in_1, embt, scale_factor=scale, eval=True)['imgt_pred']
         imgt_pred = padder.unpad(imgt_pred)
         in_1 = padder.unpad(in_1)
         outputs += [imgt_pred, in_1]
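
Side note on the loop that both demos share: one pass over zip(inputs[:-1], inputs[1:]) turns n frames into 2n - 1 by inserting one interpolated frame between every adjacent pair. A toy sketch follows, with placeholder strings instead of tensors; interpolate_once and mid() are illustrative names, not part of the commit.

# Toy sketch of the pairwise interpolation pass; mid() stands in for the model call.
def interpolate_once(frames):
    outputs = [frames[0]]
    for in_0, in_1 in zip(frames[:-1], frames[1:]):
        mid = f"mid({in_0},{in_1})"   # placeholder for model(in_0, in_1, embt, ...)
        outputs += [mid, in_1]
    return outputs

print(interpolate_once(["f0", "f1", "f2"]))
# ['f0', 'mid(f0,f1)', 'f1', 'mid(f1,f2)', 'f2']  -> n frames become 2*n - 1
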
utils.py
CHANGED
@@ -227,3 +227,21 @@ def warp(img, flow):
     grid_ = (grid + flow_).permute(0, 2, 3, 1)
     output = F.grid_sample(input=img, grid=grid_, mode='bilinear', padding_mode='border', align_corners=True)
     return output
+
+def check_dim_and_resize(tensor_list):
+    shape_list = []
+    for t in tensor_list:
+        shape_list.append(t.shape[2:])
+
+    if len(set(shape_list)) > 1:
+        desired_shape = shape_list[0]
+        print(f'Inconsistent size of input video frames. All frames will be resized to {desired_shape}')
+
+        resize_tensor_list = []
+        for t in tensor_list:
+            resize_tensor_list.append(torch.nn.functional.interpolate(t, size=tuple(desired_shape), mode='bilinear'))
+
+        tensor_list = resize_tensor_list
+
+    return tensor_list
+
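
A minimal usage sketch for the new helper (the dummy tensor shapes are chosen only for illustration):

import torch
from utils import check_dim_and_resize

# Two dummy frames with mismatched spatial sizes; the helper resizes every
# frame to the shape of the first one (360x640 here) and prints a notice.
frames = [torch.rand(1, 3, 360, 640), torch.rand(1, 3, 352, 640)]
frames = check_dim_and_resize(frames)
print([tuple(f.shape[-2:]) for f in frames])   # [(360, 640), (360, 640)]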