# P2DFlow/models/ipa_pytorch.py
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Modified code of Openfold's IPA."""
import numpy as np
import torch
import math
from scipy.stats import truncnorm
import torch.nn as nn
from typing import Optional, Callable, List, Sequence
from openfold.utils.rigid_utils import Rigid
from data import all_atom
def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
zero_index = -1 * len(inds)
first_inds = list(range(len(tensor.shape[:zero_index])))
return tensor.permute(first_inds + [zero_index + i for i in inds])
def flatten_final_dims(t: torch.Tensor, no_dims: int):
return t.reshape(t.shape[:-no_dims] + (-1,))
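# A quick comment-only illustration of the two helpers above; the tensor `x`
# is hypothetical and not part of this module:
#   x = torch.zeros(2, 5, 7, 3)
#   permute_final_dims(x, (2, 0, 1)).shape  # torch.Size([2, 3, 5, 7])
#   flatten_final_dims(x, 2).shape          # torch.Size([2, 5, 21])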
def ipa_point_weights_init_(weights):
with torch.no_grad():
softplus_inverse_1 = 0.541324854612918
weights.fill_(softplus_inverse_1)
def _prod(nums):
out = 1
for n in nums:
out = out * n
return out
def _calculate_fan(linear_weight_shape, fan="fan_in"):
fan_out, fan_in = linear_weight_shape
if fan == "fan_in":
f = fan_in
elif fan == "fan_out":
f = fan_out
elif fan == "fan_avg":
f = (fan_in + fan_out) / 2
else:
raise ValueError("Invalid fan option")
return f
def trunc_normal_init_(weights, scale=1.0, fan="fan_in"):
shape = weights.shape
f = _calculate_fan(shape, fan)
scale = scale / max(1, f)
a = -2
b = 2
std = math.sqrt(scale) / truncnorm.std(a=a, b=b, loc=0, scale=1)
size = _prod(shape)
samples = truncnorm.rvs(a=a, b=b, loc=0, scale=std, size=size)
samples = np.reshape(samples, shape)
with torch.no_grad():
weights.copy_(torch.tensor(samples, device=weights.device))
def lecun_normal_init_(weights):
trunc_normal_init_(weights, scale=1.0)
def he_normal_init_(weights):
trunc_normal_init_(weights, scale=2.0)
def glorot_uniform_init_(weights):
nn.init.xavier_uniform_(weights, gain=1)
def final_init_(weights):
with torch.no_grad():
weights.fill_(0.0)
def gating_init_(weights):
with torch.no_grad():
weights.fill_(0.0)
def normal_init_(weights):
torch.nn.init.kaiming_normal_(weights, nonlinearity="linear")
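# compute_angles (below): for every residue pair (i, j), head h and point p,
# this measures the angle between the Calpha(j) -> Calpha(i) vector and the
# Calpha(i) -> pts[i, h, p] vector via all_atom.calculate_neighbor_angles.
# Shapes: ca_pos is [B, N, 3], pts is [B, N, H, P, 3], and the returned
# phi_angles tensor is [B, N, N, H, P].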
def compute_angles(ca_pos, pts):
batch_size, num_res, num_heads, num_pts, _ = pts.shape
calpha_vecs = (ca_pos[:, :, None, :] - ca_pos[:, None, :, :]) + 1e-10
calpha_vecs = torch.tile(calpha_vecs[:, :, :, None, None, :], (1, 1, 1, num_heads, num_pts, 1))
ipa_pts = pts[:, :, None, :, :, :] - torch.tile(ca_pos[:, :, None, None, None, :], (1, 1, num_res, num_heads, num_pts, 1))
phi_angles = all_atom.calculate_neighbor_angles(
calpha_vecs.reshape(-1, 3),
ipa_pts.reshape(-1, 3)
).reshape(batch_size, num_res, num_res, num_heads, num_pts)
return phi_angles
class Linear(nn.Linear):
"""
A Linear layer with built-in nonstandard initializations. Called just
like torch.nn.Linear.
Implements the initializers in 1.11.4, plus some additional ones found
in the code.
"""
def __init__(
self,
in_dim: int,
out_dim: int,
bias: bool = True,
init: str = "default",
init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
):
"""
Args:
in_dim:
The final dimension of inputs to the layer
out_dim:
The final dimension of layer outputs
bias:
Whether to learn an additive bias. True by default
init:
The initializer to use. Choose from:
"default": LeCun fan-in truncated normal initialization
"relu": He initialization w/ truncated normal distribution
"glorot": Fan-average Glorot uniform initialization
"gating": Weights=0, Bias=1
"normal": Normal initialization with std=1/sqrt(fan_in)
"final": Weights=0, Bias=0
Overridden by init_fn if the latter is not None.
init_fn:
A custom initializer taking weight and bias as inputs.
Overrides init if not None.
"""
super(Linear, self).__init__(in_dim, out_dim, bias=bias)
if bias:
with torch.no_grad():
self.bias.fill_(0)
if init_fn is not None:
init_fn(self.weight, self.bias)
else:
if init == "default":
lecun_normal_init_(self.weight)
elif init == "relu":
he_normal_init_(self.weight)
elif init == "glorot":
glorot_uniform_init_(self.weight)
elif init == "gating":
gating_init_(self.weight)
if bias:
with torch.no_grad():
self.bias.fill_(1.0)
elif init == "normal":
normal_init_(self.weight)
elif init == "final":
final_init_(self.weight)
else:
raise ValueError("Invalid init string.")
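# Comment-only usage sketch for the init modes documented above; the names
# c_in, c_out, proj, gate and out are hypothetical:
#   proj = Linear(c_in, c_out)                 # default LeCun trunc. normal
#   gate = Linear(c_in, c_out, init="gating")  # weights = 0, bias = 1
#   out = Linear(c_in, c_out, init="final")    # weights = 0, bias = 0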
class StructureModuleTransition(nn.Module):
def __init__(self, c):
super(StructureModuleTransition, self).__init__()
self.c = c
self.linear_1 = Linear(self.c, self.c, init="relu")
self.linear_2 = Linear(self.c, self.c, init="relu")
self.linear_3 = Linear(self.c, self.c, init="final")
self.relu = nn.ReLU()
self.ln = nn.LayerNorm(self.c)
def forward(self, s):
s_initial = s
s = self.linear_1(s)
s = self.relu(s)
s = self.linear_2(s)
s = self.relu(s)
s = self.linear_3(s)
s = s + s_initial
s = self.ln(s)
return s
class EdgeTransition(nn.Module):
def __init__(
self,
*,
node_embed_size,
edge_embed_in,
edge_embed_out,
num_layers=2,
node_dilation=2
):
super(EdgeTransition, self).__init__()
bias_embed_size = node_embed_size // node_dilation
self.initial_embed = Linear(
node_embed_size, bias_embed_size, init="relu")
hidden_size = bias_embed_size * 2 + edge_embed_in
trunk_layers = []
for _ in range(num_layers):
trunk_layers.append(Linear(hidden_size, hidden_size, init="relu"))
trunk_layers.append(nn.ReLU())
self.trunk = nn.Sequential(*trunk_layers)
self.final_layer = Linear(hidden_size, edge_embed_out, init="final")
self.layer_norm = nn.LayerNorm(edge_embed_out)
def forward(self, node_embed, edge_embed):
node_embed = self.initial_embed(node_embed)
batch_size, num_res, _ = node_embed.shape
edge_bias = torch.cat([
torch.tile(node_embed[:, :, None, :], (1, 1, num_res, 1)),
torch.tile(node_embed[:, None, :, :], (1, num_res, 1, 1)),
], axis=-1)
edge_embed = torch.cat(
[edge_embed, edge_bias], axis=-1).reshape(
batch_size * num_res**2, -1)
edge_embed = self.final_layer(self.trunk(edge_embed) + edge_embed)
edge_embed = self.layer_norm(edge_embed)
edge_embed = edge_embed.reshape(
batch_size, num_res, num_res, -1
)
return edge_embed
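# Shape flow through EdgeTransition.forward, read off the code above:
# node_embed [B, N, node_embed_size] is projected down to
# [B, N, node_embed_size // node_dilation], broadcast into pairwise features
# [B, N, N, 2 * bias_embed_size], concatenated with edge_embed
# [B, N, N, edge_embed_in], passed through the residual trunk, and returned
# as [B, N, N, edge_embed_out].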
class InvariantPointAttention(nn.Module):
"""
Implements Algorithm 22.
"""
def __init__(
self,
ipa_conf,
inf: float = 1e5,
eps: float = 1e-8,
):
"""
Args:
c_s:
Single representation channel dimension
c_z:
Pair representation channel dimension
c_hidden:
Hidden channel dimension
no_heads:
Number of attention heads
no_qk_points:
Number of query/key points to generate
no_v_points:
Number of value points to generate
"""
super(InvariantPointAttention, self).__init__()
self._ipa_conf = ipa_conf
self.c_s = ipa_conf.c_s
self.c_z = ipa_conf.c_z
self.c_hidden = ipa_conf.c_hidden
self.no_heads = ipa_conf.no_heads
self.no_qk_points = ipa_conf.no_qk_points
self.no_v_points = ipa_conf.no_v_points
self.inf = inf
self.eps = eps
# These linear layers differ from their specifications in the
# supplement. There, they lack bias and use Glorot initialization.
# Here as in the official source, they have bias and use the default
# Lecun initialization.
hc = self.c_hidden * self.no_heads
self.linear_q = Linear(self.c_s, hc)
self.linear_kv = Linear(self.c_s, 2 * hc)
hpq = self.no_heads * self.no_qk_points * 3
self.linear_q_points = Linear(self.c_s, hpq)
hpkv = self.no_heads * (self.no_qk_points + self.no_v_points) * 3
self.linear_kv_points = Linear(self.c_s, hpkv)
self.linear_b = Linear(self.c_z, self.no_heads)
self.down_z = Linear(self.c_z, self.c_z // 4)
self.head_weights = nn.Parameter(torch.zeros((ipa_conf.no_heads)))
ipa_point_weights_init_(self.head_weights)
concat_out_dim = (
self.c_z // 4 + self.c_hidden + self.no_v_points * 4
)
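        # Per head, the concatenated output consists of the attended pair
        # features (c_z // 4), the scalar attention output (c_hidden), and
        # the value points as 3 coordinates plus 1 norm each (no_v_points * 4).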
self.linear_out = Linear(self.no_heads * concat_out_dim, self.c_s, init="final")
self.softmax = nn.Softmax(dim=-1)
self.softplus = nn.Softplus()
def forward(
self,
s: torch.Tensor,
z: Optional[torch.Tensor],
r: Rigid,
mask: torch.Tensor,
_offload_inference: bool = False,
_z_reference_list: Optional[Sequence[torch.Tensor]] = None,
) -> torch.Tensor:
"""
Args:
s:
[*, N_res, C_s] single representation
z:
[*, N_res, N_res, C_z] pair representation
r:
[*, N_res] transformation object
mask:
[*, N_res] mask
Returns:
[*, N_res, C_s] single representation update
"""
if _offload_inference:
z = _z_reference_list
else:
z = [z]
#######################################
# Generate scalar and point activations
#######################################
# [*, N_res, H * C_hidden]
q = self.linear_q(s)
kv = self.linear_kv(s)
# [*, N_res, H, C_hidden]
q = q.view(q.shape[:-1] + (self.no_heads, -1))
# [*, N_res, H, 2 * C_hidden]
kv = kv.view(kv.shape[:-1] + (self.no_heads, -1))
# [*, N_res, H, C_hidden]
k, v = torch.split(kv, self.c_hidden, dim=-1)
# [*, N_res, H * P_q * 3]
q_pts = self.linear_q_points(s)
# This is kind of clunky, but it's how the original does it
# [*, N_res, H * P_q, 3]
q_pts = torch.split(q_pts, q_pts.shape[-1] // 3, dim=-1)
q_pts = torch.stack(q_pts, dim=-1)
q_pts = r[..., None].apply(q_pts)
# [*, N_res, H, P_q, 3]
q_pts = q_pts.view(
q_pts.shape[:-2] + (self.no_heads, self.no_qk_points, 3)
)
# [*, N_res, H * (P_q + P_v) * 3]
kv_pts = self.linear_kv_points(s)
# [*, N_res, H * (P_q + P_v), 3]
kv_pts = torch.split(kv_pts, kv_pts.shape[-1] // 3, dim=-1)
kv_pts = torch.stack(kv_pts, dim=-1)
kv_pts = r[..., None].apply(kv_pts)
# [*, N_res, H, (P_q + P_v), 3]
kv_pts = kv_pts.view(kv_pts.shape[:-2] + (self.no_heads, -1, 3))
# [*, N_res, H, P_q/P_v, 3]
k_pts, v_pts = torch.split(
kv_pts, [self.no_qk_points, self.no_v_points], dim=-2
)
##########################
# Compute attention scores
##########################
# [*, N_res, N_res, H]
b = self.linear_b(z[0])
if(_offload_inference):
z[0] = z[0].cpu()
# [*, H, N_res, N_res]
a = torch.matmul(
math.sqrt(1.0 / (3 * self.c_hidden)) *
permute_final_dims(q, (1, 0, 2)), # [*, H, N_res, C_hidden]
permute_final_dims(k, (1, 2, 0)), # [*, H, C_hidden, N_res]
)
# a *= math.sqrt(1.0 / (3 * self.c_hidden))
a += (math.sqrt(1.0 / 3) * permute_final_dims(b, (2, 0, 1)))
# [*, N_res, N_res, H, P_q, 3]
pt_displacement = q_pts.unsqueeze(-4) - k_pts.unsqueeze(-5)
pt_att = pt_displacement ** 2
# [*, N_res, N_res, H, P_q]
pt_att = sum(torch.unbind(pt_att, dim=-1))
head_weights = self.softplus(self.head_weights).view(
*((1,) * len(pt_att.shape[:-2]) + (-1, 1))
)
head_weights = head_weights * math.sqrt(
1.0 / (3 * (self.no_qk_points * 9.0 / 2))
)
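        # The factor sqrt(1 / (3 * (no_qk_points * 9 / 2))) matches the
        # AlphaFold 2 weighting: w_L = sqrt(1/3) for the three attention
        # terms times w_C = sqrt(2 / (9 * no_qk_points)) for the point term.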
pt_att = pt_att * head_weights
# [*, N_res, N_res, H]
pt_att = torch.sum(pt_att, dim=-1) * (-0.5)
# [*, N_res, N_res]
square_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2)
square_mask = self.inf * (square_mask - 1)
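        # Valid pairs map to 0 and masked pairs to -self.inf, so masked
        # entries vanish after the softmax below.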
# [*, H, N_res, N_res]
pt_att = permute_final_dims(pt_att, (2, 0, 1))
a = a + pt_att
a = a + square_mask.unsqueeze(-3)
a = self.softmax(a)
################
# Compute output
################
# [*, N_res, H, C_hidden]
o = torch.matmul(
a, v.transpose(-2, -3)
).transpose(-2, -3)
# [*, N_res, H * C_hidden]
o = flatten_final_dims(o, 2)
# [*, H, 3, N_res, P_v]
o_pt = torch.sum(
(
a[..., None, :, :, None]
* permute_final_dims(v_pts, (1, 3, 0, 2))[..., None, :, :]
),
dim=-2,
)
# [*, N_res, H, P_v, 3]
o_pt = permute_final_dims(o_pt, (2, 0, 3, 1))
o_pt = r[..., None, None].invert_apply(o_pt)
# [*, N_res, H * P_v]
o_pt_dists = torch.sqrt(torch.sum(o_pt ** 2, dim=-1) + self.eps)
o_pt_norm_feats = flatten_final_dims(
o_pt_dists, 2)
# [*, N_res, H * P_v, 3]
o_pt = o_pt.reshape(*o_pt.shape[:-3], -1, 3)
if(_offload_inference):
z[0] = z[0].to(o_pt.device)
# [*, N_res, H, C_z // 4]
pair_z = self.down_z(z[0])
o_pair = torch.matmul(a.transpose(-2, -3), pair_z)
# [*, N_res, H * C_z // 4]
o_pair = flatten_final_dims(o_pair, 2)
o_feats = [o, *torch.unbind(o_pt, dim=-1), o_pt_norm_feats, o_pair]
# [*, N_res, C_s]
s = self.linear_out(
torch.cat(
o_feats, dim=-1
)
)
return s
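# Comment-only usage sketch for InvariantPointAttention, assuming an ipa_conf
# namespace carrying the fields read in __init__; the variable names are
# hypothetical:
#   ipa = InvariantPointAttention(ipa_conf)
#   s_update = ipa(s, z, rigids, mask)
#   # s: [B, N, c_s], z: [B, N, N, c_z], rigids: Rigid with batch shape
#   # [B, N], mask: [B, N]; the returned update has shape [B, N, c_s].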
class TorsionAngles(nn.Module):
def __init__(self, c, num_torsions, eps=1e-8):
super(TorsionAngles, self).__init__()
self.c = c
self.eps = eps
self.num_torsions = num_torsions
self.linear_1 = Linear(self.c, self.c, init="relu")
self.linear_2 = Linear(self.c, self.c, init="relu")
# TODO: Remove after published checkpoint is updated without these weights.
self.linear_3 = Linear(self.c, self.c, init="final")
self.linear_final = Linear(
self.c, self.num_torsions * 2, init="final")
self.relu = nn.ReLU()
def forward(self, s):
s_initial = s
s = self.linear_1(s)
s = self.relu(s)
s = self.linear_2(s)
s = s + s_initial
unnormalized_s = self.linear_final(s)
norm_denom = torch.sqrt(
torch.clamp(
torch.sum(unnormalized_s ** 2, dim=-1, keepdim=True),
min=self.eps,
)
)
normalized_s = unnormalized_s / norm_denom
return unnormalized_s, normalized_s
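# TorsionAngles returns both the raw 2 * num_torsions logits and an
# L2-normalized copy. With num_torsions=1, as used by IpaScore below, the
# normalized output is a unit 2-vector encoding the predicted psi torsion
# angle as a point on the unit circle.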
class RotationVFLayer(nn.Module):
def __init__(self, dim):
super(RotationVFLayer, self).__init__()
self.linear_1 = Linear(dim, dim, init="relu")
self.linear_2 = Linear(dim, dim, init="relu")
self.linear_3 = Linear(dim, dim)
self.final_linear = Linear(dim, 6, init="final")
self.relu = nn.ReLU()
def forward(self, s):
s_initial = s
s = self.linear_1(s)
s = self.relu(s)
s = self.linear_2(s)
s = self.relu(s)
s = self.linear_3(s)
s = s + s_initial
return self.final_linear(s)
class BackboneUpdate(nn.Module):
"""
Implements part of Algorithm 23.
"""
def __init__(self, c_s, use_rot_updates, dropout=0.0):
"""
Args:
c_s:
Single representation channel dimension
"""
super(BackboneUpdate, self).__init__()
self.c_s = c_s
self.noise_schedule = 1.0
update_dim = 6 if use_rot_updates else 3
self.linear = nn.Sequential(
Linear(self.c_s, update_dim, init="final"),
# nn.Tanh(),
)
def forward(self, s: torch.Tensor):
"""
Args:
[*, N_res, C_s] single representation
Returns:
[*, N_res, 6] update vector
"""
# [*, 6]
update = self.noise_schedule * self.linear(s)
return update
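# Note: downstream, IpaScore feeds this update into Rigid.compose_q_update_vec,
# which expects a 6-D vector per residue (a 3-component quaternion update
# followed by a 3-D translation), which is why the blocks below are built
# with use_rot_updates=True.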
class IpaScore(nn.Module):
def __init__(self, model_conf, diffuser):
super(IpaScore, self).__init__()
self._model_conf = model_conf
ipa_conf = model_conf.ipa
self._ipa_conf = ipa_conf
self.diffuser = diffuser
self.scale_pos = lambda x: x * ipa_conf.coordinate_scaling
self.scale_rigids = lambda x: x.apply_trans_fn(self.scale_pos)
self.unscale_pos = lambda x: x / ipa_conf.coordinate_scaling
self.unscale_rigids = lambda x: x.apply_trans_fn(self.unscale_pos)
self.trunk = nn.ModuleDict()
for b in range(ipa_conf.num_blocks):
self.trunk[f'ipa_{b}'] = InvariantPointAttention(ipa_conf)
self.trunk[f'ipa_ln_{b}'] = nn.LayerNorm(ipa_conf.c_s)
self.trunk[f'skip_embed_{b}'] = Linear(
self._model_conf.node_embed_size,
self._ipa_conf.c_skip,
init="final"
)
tfmr_in = ipa_conf.c_s + self._ipa_conf.c_skip
tfmr_layer = torch.nn.TransformerEncoderLayer(
d_model=tfmr_in,
nhead=ipa_conf.seq_tfmr_num_heads,
dim_feedforward=tfmr_in,
batch_first=True,
dropout=0.0,
norm_first=False
)
self.trunk[f'seq_tfmr_{b}'] = torch.nn.TransformerEncoder(
tfmr_layer, ipa_conf.seq_tfmr_num_layers)
self.trunk[f'post_tfmr_{b}'] = Linear(
tfmr_in, ipa_conf.c_s, init="final")
self.trunk[f'node_transition_{b}'] = StructureModuleTransition(
c=ipa_conf.c_s)
            # use_rot_updates=True is assumed here: compose_q_update_vec in
            # the forward pass consumes a 6-D (rotation + translation) update.
            self.trunk[f'bb_update_{b}'] = BackboneUpdate(
                ipa_conf.c_s, use_rot_updates=True)
if b < ipa_conf.num_blocks-1:
# No edge update on the last block.
edge_in = self._model_conf.edge_embed_size
self.trunk[f'edge_transition_{b}'] = EdgeTransition(
node_embed_size=ipa_conf.c_s,
edge_embed_in=edge_in,
edge_embed_out=self._model_conf.edge_embed_size,
)
self.torsion_pred = TorsionAngles(ipa_conf.c_s, 1)
def forward(self, init_node_embed, edge_embed, input_feats):
node_mask = input_feats['res_mask'].type(torch.float32)
diffuse_mask = (1 - input_feats['fixed_mask'].type(torch.float32)) * node_mask
edge_mask = node_mask[..., None] * node_mask[..., None, :]
init_frames = input_feats['rigids_t'].type(torch.float32)
curr_rigids = Rigid.from_tensor_7(torch.clone(init_frames))
init_rigids = Rigid.from_tensor_7(init_frames)
init_rots = init_rigids.get_rots()
# Main trunk
curr_rigids = self.scale_rigids(curr_rigids)
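        # Translations are handled in scaled (reduced) coordinates inside the
        # trunk and unscaled again before the translation score is computed.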
init_node_embed = init_node_embed * node_mask[..., None]
node_embed = init_node_embed * node_mask[..., None]
for b in range(self._ipa_conf.num_blocks):
ipa_embed = self.trunk[f'ipa_{b}'](
node_embed,
edge_embed,
curr_rigids,
node_mask)
ipa_embed *= node_mask[..., None]
node_embed = self.trunk[f'ipa_ln_{b}'](node_embed + ipa_embed)
seq_tfmr_in = torch.cat([
node_embed, self.trunk[f'skip_embed_{b}'](init_node_embed)
], dim=-1)
            seq_tfmr_out = self.trunk[f'seq_tfmr_{b}'](
                seq_tfmr_in,
                src_key_padding_mask=(1 - node_mask).to(torch.bool))
node_embed = node_embed + self.trunk[f'post_tfmr_{b}'](seq_tfmr_out)
node_embed = self.trunk[f'node_transition_{b}'](node_embed)
node_embed = node_embed * node_mask[..., None]
rigid_update = self.trunk[f'bb_update_{b}'](
node_embed * diffuse_mask[..., None])
curr_rigids = curr_rigids.compose_q_update_vec(
rigid_update, diffuse_mask[..., None])
if b < self._ipa_conf.num_blocks-1:
edge_embed = self.trunk[f'edge_transition_{b}'](
node_embed, edge_embed)
edge_embed *= edge_mask[..., None]
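        # Both scores compare the input (noised) frames against the frames
        # predicted by the trunk at the provided diffusion time t.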
rot_score = self.diffuser.calc_rot_score(
init_rigids.get_rots(),
curr_rigids.get_rots(),
input_feats['t']
)
rot_score = rot_score * node_mask[..., None]
curr_rigids = self.unscale_rigids(curr_rigids)
trans_score = self.diffuser.calc_trans_score(
init_rigids.get_trans(),
curr_rigids.get_trans(),
input_feats['t'][:, None, None],
use_torch=True,
)
trans_score = trans_score * node_mask[..., None]
_, psi_pred = self.torsion_pred(node_embed)
model_out = {
'psi': psi_pred,
'rot_score': rot_score,
'trans_score': trans_score,
'final_rigids': curr_rigids,
}
return model_out
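# Comment-only sketch of how IpaScore is driven, inferred from the code above
# (only names actually read in this file are listed; anything else would be
# hypothetical):
#   model = IpaScore(model_conf, diffuser)
#   out = model(init_node_embed, edge_embed, input_feats)
# input_feats must provide 'res_mask' and 'fixed_mask' ([B, N]), 'rigids_t'
# ([B, N, 7] quaternion-plus-translation frames), and the diffusion time 't'.
# model_conf must provide node_embed_size, edge_embed_size, and an `ipa`
# block with c_s, c_z, c_hidden, c_skip, no_heads, no_qk_points, no_v_points,
# num_blocks, seq_tfmr_num_heads, seq_tfmr_num_layers, and
# coordinate_scaling. The returned dict contains 'psi', 'rot_score',
# 'trans_score', and 'final_rigids'.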