# Copyright 2021 AlQuraishi Laboratory # Copyright 2021 DeepMind Technologies Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Modified code of Openfold's IPA.""" import numpy as np import torch import math from scipy.stats import truncnorm import torch.nn as nn from typing import Optional, Callable, List, Sequence from openfold.utils.rigid_utils import Rigid from data import all_atom def permute_final_dims(tensor: torch.Tensor, inds: List[int]): zero_index = -1 * len(inds) first_inds = list(range(len(tensor.shape[:zero_index]))) return tensor.permute(first_inds + [zero_index + i for i in inds]) def flatten_final_dims(t: torch.Tensor, no_dims: int): return t.reshape(t.shape[:-no_dims] + (-1,)) def ipa_point_weights_init_(weights): with torch.no_grad(): softplus_inverse_1 = 0.541324854612918 weights.fill_(softplus_inverse_1) def _prod(nums): out = 1 for n in nums: out = out * n return out def _calculate_fan(linear_weight_shape, fan="fan_in"): fan_out, fan_in = linear_weight_shape if fan == "fan_in": f = fan_in elif fan == "fan_out": f = fan_out elif fan == "fan_avg": f = (fan_in + fan_out) / 2 else: raise ValueError("Invalid fan option") return f def trunc_normal_init_(weights, scale=1.0, fan="fan_in"): shape = weights.shape f = _calculate_fan(shape, fan) scale = scale / max(1, f) a = -2 b = 2 std = math.sqrt(scale) / truncnorm.std(a=a, b=b, loc=0, scale=1) size = _prod(shape) samples = truncnorm.rvs(a=a, b=b, loc=0, scale=std, size=size) samples = np.reshape(samples, shape) with torch.no_grad(): weights.copy_(torch.tensor(samples, device=weights.device)) def lecun_normal_init_(weights): trunc_normal_init_(weights, scale=1.0) def he_normal_init_(weights): trunc_normal_init_(weights, scale=2.0) def glorot_uniform_init_(weights): nn.init.xavier_uniform_(weights, gain=1) def final_init_(weights): with torch.no_grad(): weights.fill_(0.0) def gating_init_(weights): with torch.no_grad(): weights.fill_(0.0) def normal_init_(weights): torch.nn.init.kaiming_normal_(weights, nonlinearity="linear") def compute_angles(ca_pos, pts): batch_size, num_res, num_heads, num_pts, _ = pts.shape calpha_vecs = (ca_pos[:, :, None, :] - ca_pos[:, None, :, :]) + 1e-10 calpha_vecs = torch.tile(calpha_vecs[:, :, :, None, None, :], (1, 1, 1, num_heads, num_pts, 1)) ipa_pts = pts[:, :, None, :, :, :] - torch.tile(ca_pos[:, :, None, None, None, :], (1, 1, num_res, num_heads, num_pts, 1)) phi_angles = all_atom.calculate_neighbor_angles( calpha_vecs.reshape(-1, 3), ipa_pts.reshape(-1, 3) ).reshape(batch_size, num_res, num_res, num_heads, num_pts) return phi_angles class Linear(nn.Linear): """ A Linear layer with built-in nonstandard initializations. Called just like torch.nn.Linear. Implements the initializers in 1.11.4, plus some additional ones found in the code. """ def __init__( self, in_dim: int, out_dim: int, bias: bool = True, init: str = "default", init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None, ): """ Args: in_dim: The final dimension of inputs to the layer out_dim: The final dimension of layer outputs bias: Whether to learn an additive bias. True by default init: The initializer to use. Choose from: "default": LeCun fan-in truncated normal initialization "relu": He initialization w/ truncated normal distribution "glorot": Fan-average Glorot uniform initialization "gating": Weights=0, Bias=1 "normal": Normal initialization with std=1/sqrt(fan_in) "final": Weights=0, Bias=0 Overridden by init_fn if the latter is not None. init_fn: A custom initializer taking weight and bias as inputs. Overrides init if not None. """ super(Linear, self).__init__(in_dim, out_dim, bias=bias) if bias: with torch.no_grad(): self.bias.fill_(0) if init_fn is not None: init_fn(self.weight, self.bias) else: if init == "default": lecun_normal_init_(self.weight) elif init == "relu": he_normal_init_(self.weight) elif init == "glorot": glorot_uniform_init_(self.weight) elif init == "gating": gating_init_(self.weight) if bias: with torch.no_grad(): self.bias.fill_(1.0) elif init == "normal": normal_init_(self.weight) elif init == "final": final_init_(self.weight) else: raise ValueError("Invalid init string.") class StructureModuleTransition(nn.Module): def __init__(self, c): super(StructureModuleTransition, self).__init__() self.c = c self.linear_1 = Linear(self.c, self.c, init="relu") self.linear_2 = Linear(self.c, self.c, init="relu") self.linear_3 = Linear(self.c, self.c, init="final") self.relu = nn.ReLU() self.ln = nn.LayerNorm(self.c) def forward(self, s): s_initial = s s = self.linear_1(s) s = self.relu(s) s = self.linear_2(s) s = self.relu(s) s = self.linear_3(s) s = s + s_initial s = self.ln(s) return s class EdgeTransition(nn.Module): def __init__( self, *, node_embed_size, edge_embed_in, edge_embed_out, num_layers=2, node_dilation=2 ): super(EdgeTransition, self).__init__() bias_embed_size = node_embed_size // node_dilation self.initial_embed = Linear( node_embed_size, bias_embed_size, init="relu") hidden_size = bias_embed_size * 2 + edge_embed_in trunk_layers = [] for _ in range(num_layers): trunk_layers.append(Linear(hidden_size, hidden_size, init="relu")) trunk_layers.append(nn.ReLU()) self.trunk = nn.Sequential(*trunk_layers) self.final_layer = Linear(hidden_size, edge_embed_out, init="final") self.layer_norm = nn.LayerNorm(edge_embed_out) def forward(self, node_embed, edge_embed): node_embed = self.initial_embed(node_embed) batch_size, num_res, _ = node_embed.shape edge_bias = torch.cat([ torch.tile(node_embed[:, :, None, :], (1, 1, num_res, 1)), torch.tile(node_embed[:, None, :, :], (1, num_res, 1, 1)), ], axis=-1) edge_embed = torch.cat( [edge_embed, edge_bias], axis=-1).reshape( batch_size * num_res**2, -1) edge_embed = self.final_layer(self.trunk(edge_embed) + edge_embed) edge_embed = self.layer_norm(edge_embed) edge_embed = edge_embed.reshape( batch_size, num_res, num_res, -1 ) return edge_embed class InvariantPointAttention(nn.Module): """ Implements Algorithm 22. """ def __init__( self, ipa_conf, inf: float = 1e5, eps: float = 1e-8, ): """ Args: c_s: Single representation channel dimension c_z: Pair representation channel dimension c_hidden: Hidden channel dimension no_heads: Number of attention heads no_qk_points: Number of query/key points to generate no_v_points: Number of value points to generate """ super(InvariantPointAttention, self).__init__() self._ipa_conf = ipa_conf self.c_s = ipa_conf.c_s self.c_z = ipa_conf.c_z self.c_hidden = ipa_conf.c_hidden self.no_heads = ipa_conf.no_heads self.no_qk_points = ipa_conf.no_qk_points self.no_v_points = ipa_conf.no_v_points self.inf = inf self.eps = eps # These linear layers differ from their specifications in the # supplement. There, they lack bias and use Glorot initialization. # Here as in the official source, they have bias and use the default # Lecun initialization. hc = self.c_hidden * self.no_heads self.linear_q = Linear(self.c_s, hc) self.linear_kv = Linear(self.c_s, 2 * hc) hpq = self.no_heads * self.no_qk_points * 3 self.linear_q_points = Linear(self.c_s, hpq) hpkv = self.no_heads * (self.no_qk_points + self.no_v_points) * 3 self.linear_kv_points = Linear(self.c_s, hpkv) self.linear_b = Linear(self.c_z, self.no_heads) self.down_z = Linear(self.c_z, self.c_z // 4) self.head_weights = nn.Parameter(torch.zeros((ipa_conf.no_heads))) ipa_point_weights_init_(self.head_weights) concat_out_dim = ( self.c_z // 4 + self.c_hidden + self.no_v_points * 4 ) self.linear_out = Linear(self.no_heads * concat_out_dim, self.c_s, init="final") self.softmax = nn.Softmax(dim=-1) self.softplus = nn.Softplus() def forward( self, s: torch.Tensor, z: Optional[torch.Tensor], r: Rigid, mask: torch.Tensor, _offload_inference: bool = False, _z_reference_list: Optional[Sequence[torch.Tensor]] = None, ) -> torch.Tensor: """ Args: s: [*, N_res, C_s] single representation z: [*, N_res, N_res, C_z] pair representation r: [*, N_res] transformation object mask: [*, N_res] mask Returns: [*, N_res, C_s] single representation update """ if _offload_inference: z = _z_reference_list else: z = [z] ####################################### # Generate scalar and point activations ####################################### # [*, N_res, H * C_hidden] q = self.linear_q(s) kv = self.linear_kv(s) # [*, N_res, H, C_hidden] q = q.view(q.shape[:-1] + (self.no_heads, -1)) # [*, N_res, H, 2 * C_hidden] kv = kv.view(kv.shape[:-1] + (self.no_heads, -1)) # [*, N_res, H, C_hidden] k, v = torch.split(kv, self.c_hidden, dim=-1) # [*, N_res, H * P_q * 3] q_pts = self.linear_q_points(s) # This is kind of clunky, but it's how the original does it # [*, N_res, H * P_q, 3] q_pts = torch.split(q_pts, q_pts.shape[-1] // 3, dim=-1) q_pts = torch.stack(q_pts, dim=-1) q_pts = r[..., None].apply(q_pts) # [*, N_res, H, P_q, 3] q_pts = q_pts.view( q_pts.shape[:-2] + (self.no_heads, self.no_qk_points, 3) ) # [*, N_res, H * (P_q + P_v) * 3] kv_pts = self.linear_kv_points(s) # [*, N_res, H * (P_q + P_v), 3] kv_pts = torch.split(kv_pts, kv_pts.shape[-1] // 3, dim=-1) kv_pts = torch.stack(kv_pts, dim=-1) kv_pts = r[..., None].apply(kv_pts) # [*, N_res, H, (P_q + P_v), 3] kv_pts = kv_pts.view(kv_pts.shape[:-2] + (self.no_heads, -1, 3)) # [*, N_res, H, P_q/P_v, 3] k_pts, v_pts = torch.split( kv_pts, [self.no_qk_points, self.no_v_points], dim=-2 ) ########################## # Compute attention scores ########################## # [*, N_res, N_res, H] b = self.linear_b(z[0]) if(_offload_inference): z[0] = z[0].cpu() # [*, H, N_res, N_res] a = torch.matmul( math.sqrt(1.0 / (3 * self.c_hidden)) * permute_final_dims(q, (1, 0, 2)), # [*, H, N_res, C_hidden] permute_final_dims(k, (1, 2, 0)), # [*, H, C_hidden, N_res] ) # a *= math.sqrt(1.0 / (3 * self.c_hidden)) a += (math.sqrt(1.0 / 3) * permute_final_dims(b, (2, 0, 1))) # [*, N_res, N_res, H, P_q, 3] pt_displacement = q_pts.unsqueeze(-4) - k_pts.unsqueeze(-5) pt_att = pt_displacement ** 2 # [*, N_res, N_res, H, P_q] pt_att = sum(torch.unbind(pt_att, dim=-1)) head_weights = self.softplus(self.head_weights).view( *((1,) * len(pt_att.shape[:-2]) + (-1, 1)) ) head_weights = head_weights * math.sqrt( 1.0 / (3 * (self.no_qk_points * 9.0 / 2)) ) pt_att = pt_att * head_weights # [*, N_res, N_res, H] pt_att = torch.sum(pt_att, dim=-1) * (-0.5) # [*, N_res, N_res] square_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2) square_mask = self.inf * (square_mask - 1) # [*, H, N_res, N_res] pt_att = permute_final_dims(pt_att, (2, 0, 1)) a = a + pt_att a = a + square_mask.unsqueeze(-3) a = self.softmax(a) ################ # Compute output ################ # [*, N_res, H, C_hidden] o = torch.matmul( a, v.transpose(-2, -3) ).transpose(-2, -3) # [*, N_res, H * C_hidden] o = flatten_final_dims(o, 2) # [*, H, 3, N_res, P_v] o_pt = torch.sum( ( a[..., None, :, :, None] * permute_final_dims(v_pts, (1, 3, 0, 2))[..., None, :, :] ), dim=-2, ) # [*, N_res, H, P_v, 3] o_pt = permute_final_dims(o_pt, (2, 0, 3, 1)) o_pt = r[..., None, None].invert_apply(o_pt) # [*, N_res, H * P_v] o_pt_dists = torch.sqrt(torch.sum(o_pt ** 2, dim=-1) + self.eps) o_pt_norm_feats = flatten_final_dims( o_pt_dists, 2) # [*, N_res, H * P_v, 3] o_pt = o_pt.reshape(*o_pt.shape[:-3], -1, 3) if(_offload_inference): z[0] = z[0].to(o_pt.device) # [*, N_res, H, C_z // 4] pair_z = self.down_z(z[0]) o_pair = torch.matmul(a.transpose(-2, -3), pair_z) # [*, N_res, H * C_z // 4] o_pair = flatten_final_dims(o_pair, 2) o_feats = [o, *torch.unbind(o_pt, dim=-1), o_pt_norm_feats, o_pair] # [*, N_res, C_s] s = self.linear_out( torch.cat( o_feats, dim=-1 ) ) return s class TorsionAngles(nn.Module): def __init__(self, c, num_torsions, eps=1e-8): super(TorsionAngles, self).__init__() self.c = c self.eps = eps self.num_torsions = num_torsions self.linear_1 = Linear(self.c, self.c, init="relu") self.linear_2 = Linear(self.c, self.c, init="relu") # TODO: Remove after published checkpoint is updated without these weights. self.linear_3 = Linear(self.c, self.c, init="final") self.linear_final = Linear( self.c, self.num_torsions * 2, init="final") self.relu = nn.ReLU() def forward(self, s): s_initial = s s = self.linear_1(s) s = self.relu(s) s = self.linear_2(s) s = s + s_initial unnormalized_s = self.linear_final(s) norm_denom = torch.sqrt( torch.clamp( torch.sum(unnormalized_s ** 2, dim=-1, keepdim=True), min=self.eps, ) ) normalized_s = unnormalized_s / norm_denom return unnormalized_s, normalized_s class RotationVFLayer(nn.Module): def __init__(self, dim): super(RotationVFLayer, self).__init__() self.linear_1 = Linear(dim, dim, init="relu") self.linear_2 = Linear(dim, dim, init="relu") self.linear_3 = Linear(dim, dim) self.final_linear = Linear(dim, 6, init="final") self.relu = nn.ReLU() def forward(self, s): s_initial = s s = self.linear_1(s) s = self.relu(s) s = self.linear_2(s) s = self.relu(s) s = self.linear_3(s) s = s + s_initial return self.final_linear(s) class BackboneUpdate(nn.Module): """ Implements part of Algorithm 23. """ def __init__(self, c_s, use_rot_updates, dropout=0.0): """ Args: c_s: Single representation channel dimension """ super(BackboneUpdate, self).__init__() self.c_s = c_s self.noise_schedule = 1.0 update_dim = 6 if use_rot_updates else 3 self.linear = nn.Sequential( Linear(self.c_s, update_dim, init="final"), # nn.Tanh(), ) def forward(self, s: torch.Tensor): """ Args: [*, N_res, C_s] single representation Returns: [*, N_res, 6] update vector """ # [*, 6] update = self.noise_schedule * self.linear(s) return update class IpaScore(nn.Module): def __init__(self, model_conf, diffuser): super(IpaScore, self).__init__() self._model_conf = model_conf ipa_conf = model_conf.ipa self._ipa_conf = ipa_conf self.diffuser = diffuser self.scale_pos = lambda x: x * ipa_conf.coordinate_scaling self.scale_rigids = lambda x: x.apply_trans_fn(self.scale_pos) self.unscale_pos = lambda x: x / ipa_conf.coordinate_scaling self.unscale_rigids = lambda x: x.apply_trans_fn(self.unscale_pos) self.trunk = nn.ModuleDict() for b in range(ipa_conf.num_blocks): self.trunk[f'ipa_{b}'] = InvariantPointAttention(ipa_conf) self.trunk[f'ipa_ln_{b}'] = nn.LayerNorm(ipa_conf.c_s) self.trunk[f'skip_embed_{b}'] = Linear( self._model_conf.node_embed_size, self._ipa_conf.c_skip, init="final" ) tfmr_in = ipa_conf.c_s + self._ipa_conf.c_skip tfmr_layer = torch.nn.TransformerEncoderLayer( d_model=tfmr_in, nhead=ipa_conf.seq_tfmr_num_heads, dim_feedforward=tfmr_in, batch_first=True, dropout=0.0, norm_first=False ) self.trunk[f'seq_tfmr_{b}'] = torch.nn.TransformerEncoder( tfmr_layer, ipa_conf.seq_tfmr_num_layers) self.trunk[f'post_tfmr_{b}'] = Linear( tfmr_in, ipa_conf.c_s, init="final") self.trunk[f'node_transition_{b}'] = StructureModuleTransition( c=ipa_conf.c_s) self.trunk[f'bb_update_{b}'] = BackboneUpdate(ipa_conf.c_s) if b < ipa_conf.num_blocks-1: # No edge update on the last block. edge_in = self._model_conf.edge_embed_size self.trunk[f'edge_transition_{b}'] = EdgeTransition( node_embed_size=ipa_conf.c_s, edge_embed_in=edge_in, edge_embed_out=self._model_conf.edge_embed_size, ) self.torsion_pred = TorsionAngles(ipa_conf.c_s, 1) def forward(self, init_node_embed, edge_embed, input_feats): node_mask = input_feats['res_mask'].type(torch.float32) diffuse_mask = (1 - input_feats['fixed_mask'].type(torch.float32)) * node_mask edge_mask = node_mask[..., None] * node_mask[..., None, :] init_frames = input_feats['rigids_t'].type(torch.float32) curr_rigids = Rigid.from_tensor_7(torch.clone(init_frames)) init_rigids = Rigid.from_tensor_7(init_frames) init_rots = init_rigids.get_rots() # Main trunk curr_rigids = self.scale_rigids(curr_rigids) init_node_embed = init_node_embed * node_mask[..., None] node_embed = init_node_embed * node_mask[..., None] for b in range(self._ipa_conf.num_blocks): ipa_embed = self.trunk[f'ipa_{b}']( node_embed, edge_embed, curr_rigids, node_mask) ipa_embed *= node_mask[..., None] node_embed = self.trunk[f'ipa_ln_{b}'](node_embed + ipa_embed) seq_tfmr_in = torch.cat([ node_embed, self.trunk[f'skip_embed_{b}'](init_node_embed) ], dim=-1) seq_tfmr_out = self.trunk[f'seq_tfmr_{b}']( seq_tfmr_in, src_key_padding_mask=1 - node_mask) node_embed = node_embed + self.trunk[f'post_tfmr_{b}'](seq_tfmr_out) node_embed = self.trunk[f'node_transition_{b}'](node_embed) node_embed = node_embed * node_mask[..., None] rigid_update = self.trunk[f'bb_update_{b}']( node_embed * diffuse_mask[..., None]) curr_rigids = curr_rigids.compose_q_update_vec( rigid_update, diffuse_mask[..., None]) if b < self._ipa_conf.num_blocks-1: edge_embed = self.trunk[f'edge_transition_{b}']( node_embed, edge_embed) edge_embed *= edge_mask[..., None] rot_score = self.diffuser.calc_rot_score( init_rigids.get_rots(), curr_rigids.get_rots(), input_feats['t'] ) rot_score = rot_score * node_mask[..., None] curr_rigids = self.unscale_rigids(curr_rigids) trans_score = self.diffuser.calc_trans_score( init_rigids.get_trans(), curr_rigids.get_trans(), input_feats['t'][:, None, None], use_torch=True, ) trans_score = trans_score * node_mask[..., None] _, psi_pred = self.torsion_pred(node_embed) model_out = { 'psi': psi_pred, 'rot_score': rot_score, 'trans_score': trans_score, 'final_rigids': curr_rigids, } return model_out