| vae: | |
| target: direct3d.models.vae.D3D_VAE | |
| params: | |
| triplane_res: 32 | |
| triplane_dim: 32 | |
| latent_dim: 16 | |
| num_freqs: 8 | |
| num_attention_heads: 12 | |
| attention_head_dim: 64 | |
| num_encoder_layers: 8 | |
| num_geodecoder_layers: 5 | |
| latents_scale: 2.45 | |
| dit: | |
| target: direct3d.models.dit.D3D_DiT | |
| params: | |
| attention_bias: true | |
| attention_head_dim: 72 | |
| num_attention_heads: 16 | |
| semantic_channels: 1024 | |
| pixel_channels: 1024 | |
| in_channels: 16 | |
| out_channels: 16 | |
| num_layers: 44 | |
| patch_size: 2 | |
| sample_size: [32, 96] | |
| semantic_encoder: | |
| target: direct3d.models.condition.ClipImageEncoder | |
| params: | |
| version: openai/clip-vit-large-patch14 | |
| pixel_encoder: | |
| target: direct3d.models.condition.DinoEncoder | |
| params: | |
| version: facebook/dinov2-large | |
| scheduler: | |
| target: diffusers.schedulers.EulerAncestralDiscreteScheduler | |
| params: | |
| num_train_timesteps: 1000 | |
| beta_start: 0.0001 | |
| beta_end: 0.02 | |
| beta_schedule: "linear" | |
| prediction_type: "epsilon" | |