Spaces:

MohmedAnik
/

ObjectOrientmodel

Sleeping

App Files Files Community

MohmedAnik committed 23 days ago

Commit

8f4a471

verified ·

1 Parent(s): 21d3dc9

Update vision_tower.py

Browse files

Files changed (1) hide show

vision_tower.py +6 -12

vision_tower.py CHANGED Viewed

@@ -53,9 +53,7 @@ class MLP_dim(nn.Module):
         return self.net2(self.net1(x))
 class FLIP_Dinov2Embeddings(Dinov2Embeddings):
-    """
-    Construct the CLS token, mask token, position and patch embeddings.
-    """
     def __init__(self, config: Dinov2Config) -> None:
         super().__init__(config)
@@ -65,17 +63,15 @@ class FLIP_Dinov2Embeddings(Dinov2Embeddings):
         target_dtype = self.patch_embeddings.projection.weight.dtype
         embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
-        # add the [CLS] token to the embedded patch tokens
         cls_tokens = self.cls_token.expand(batch_size, -1, -1)
         embeddings = torch.cat((cls_tokens, embeddings), dim=1)
-        # add positional encoding to each token
         embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
         if bool_masked_pos is not None:
-            # embeddings = torch.where(
-            #     bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
-            # )
             B,S,D = embeddings.shape
             batch_indices = torch.arange(B).unsqueeze(1)
             embeddings = embeddings[batch_indices, bool_masked_pos]
@@ -140,13 +136,11 @@ class DINOv2_MLP(nn.Module):
             dino_outputs = self.dinov2(**img_inputs)
             dino_seq = dino_outputs.last_hidden_state
-            # B,S,_ = dino_seq.shape
-            # dino_seq = dino_seq.view(B*S,-1)
             dino_seq = dino_seq[:,0,:]
         down_sample_out = self.down_sampler(dino_seq)
-        # down_sample_out = down_sample_out.view(B,S,-1)
-        # down_sample_out = down_sample_out[:,0,:]
         return down_sample_out

         return self.net2(self.net1(x))
 class FLIP_Dinov2Embeddings(Dinov2Embeddings):
     def __init__(self, config: Dinov2Config) -> None:
         super().__init__(config)
         target_dtype = self.patch_embeddings.projection.weight.dtype
         embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
         cls_tokens = self.cls_token.expand(batch_size, -1, -1)
         embeddings = torch.cat((cls_tokens, embeddings), dim=1)
         embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
         if bool_masked_pos is not None:
             B,S,D = embeddings.shape
             batch_indices = torch.arange(B).unsqueeze(1)
             embeddings = embeddings[batch_indices, bool_masked_pos]
             dino_outputs = self.dinov2(**img_inputs)
             dino_seq = dino_outputs.last_hidden_state
             dino_seq = dino_seq[:,0,:]
         down_sample_out = self.down_sampler(dino_seq)
         return down_sample_out