---
# Component configuration. Each top-level section pairs a `target` with a
# `params` mapping.
# NOTE(review): presumably `target` is the dotted path of the class to
# instantiate and `params` are its constructor keyword arguments - confirm
# against the code that loads this file.
# NOTE(review): the nesting below was reconstructed from a file that had been
# collapsed onto one line; five top-level sections are assumed - verify.

vae:
  target: direct3d.models.vae.D3D_VAE
  params:
    triplane_res: 32
    triplane_dim: 32
    latent_dim: 16
    num_freqs: 8
    num_attention_heads: 12
    attention_head_dim: 64
    num_encoder_layers: 8
    num_geodecoder_layers: 5
    # NOTE(review): assumed to be a `params` entry rather than a sibling of
    # `params` - confirm against the D3D_VAE constructor.
    latents_scale: 2.45

dit:
  target: direct3d.models.dit.D3D_DiT
  params:
    attention_bias: true
    attention_head_dim: 72
    num_attention_heads: 16
    semantic_channels: 1024
    pixel_channels: 1024
    # Same value as vae.params.latent_dim (16).
    # NOTE(review): presumably these must stay in sync - confirm.
    in_channels: 16
    out_channels: 16
    num_layers: 44
    patch_size: 2
    sample_size: [32, 96]

semantic_encoder:
  target: direct3d.models.condition.ClipImageEncoder
  params:
    version: openai/clip-vit-large-patch14

pixel_encoder:
  target: direct3d.models.condition.DinoEncoder
  params:
    version: facebook/dinov2-large

scheduler:
  target: diffusers.schedulers.EulerAncestralDiscreteScheduler
  params:
    num_train_timesteps: 1000
    beta_start: 0.0001
    beta_end: 0.02
    beta_schedule: "linear"
    prediction_type: "epsilon"