import torch
import pandas as pd
import numpy as np
from peptdeep.model.featurize import get_batch_aa_indices, get_batch_mod_feature
from peptdeep.settings import model_const
import peptdeep.model.model_interface as model_interface
import peptdeep.model.building_block as building_block
from peptdeep.utils import evaluate_linear_regression
# Number of entries in `model_const["mod_elements"]`.
# NOTE(review): presumably the width of the per-site modification feature
# vector consumed by the models below — confirm against the featurizer.
mod_feature_size = len(model_const["mod_elements"])
# Reference iRT peptides: one row per peptide with its sequence, label and
# iRT value. The `mods` and `mod_sites` columns are empty strings for every
# row (these reference peptides carry no modification annotation).
IRT_PEPTIDE_DF = pd.DataFrame(
    [
        (sequence, pep_name, irt, "", "")
        for sequence, pep_name, irt in (
            ("LGGNEQVTR", "RT-pep a", -24.92),
            ("GAGSSEPVTGLDAK", "RT-pep b", 0.00),
            ("VEATFGVDESNAK", "RT-pep c", 12.39),
            ("YILAGVENSK", "RT-pep d", 19.79),
            ("TPVISGGPYEYR", "RT-pep e", 28.71),
            ("TPVITGAPYEYR", "RT-pep f", 33.38),
            ("DGLDAASYYAPVR", "RT-pep g", 42.26),
            ("ADVTPADFSEWSK", "RT-pep h", 54.62),
            ("GTFIIDPGGVIR", "RT-pep i", 70.52),
            ("GTFIIDPAAVIR", "RT-pep k", 87.23),
            ("LFLQFGAQGSPFLK", "RT-pep l", 100.00),
        )
    ],
    columns=["sequence", "pep_name", "irt", "mods", "mod_sites"],
)
# Peptide length in residues, derived from the sequence string.
IRT_PEPTIDE_DF["nAA"] = IRT_PEPTIDE_DF["sequence"].str.len()

# legacy
irt_pep = IRT_PEPTIDE_DF
class Model_RT_Bert(torch.nn.Module):
    """Transformer model for RT prediction.

    The network is: amino-acid/modification encoding -> transformer stack ->
    attention-sum over the sequence -> PReLU -> dropout -> linear layer with a
    single output.
    """

    def __init__(
        self,
        dropout=0.1,
        nlayers=4,
        hidden=128,
        output_attentions=False,
        **kwargs,
    ):
        """Build the sub-modules.

        Parameters
        ----------
        dropout : float
            Dropout probability, used for the shared dropout layer and passed
            to the transformer stack.
        nlayers : int
            Number of layers passed to the transformer stack.
        hidden : int
            Feature width shared by the encoding, the transformer stack, the
            attention-sum layer and the input of the final linear layer.
        output_attentions : bool
            Whether the transformer stack should return attention values;
            when True they are stored on ``self.attentions`` by ``forward``.
        **kwargs
            Accepted for call compatibility and ignored.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)
        self.input_nn = building_block.AATransformerEncoding(hidden)
        self._output_attentions = output_attentions
        # Define the attribute up front so that reading `attentions` before
        # the first forward pass yields None instead of AttributeError.
        self.attentions = None
        self.hidden_nn = building_block.Hidden_HFace_Transformer(
            hidden,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )
        # The same Dropout instance is reused inside the output head.
        self.output_nn = torch.nn.Sequential(
            building_block.SeqAttentionSum(hidden),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden, 1),
        )

    @property
    def output_attentions(self):
        """Whether ``forward`` stores attention values on ``self.attentions``."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val: bool):
        # Keep the flag on the transformer stack in sync with this model's flag.
        self._output_attentions = val
        self.hidden_nn.output_attentions = val

    def forward(
        self,
        aa_indices,
        mod_x,
    ):
        """Predict one RT value per input peptide.

        Parameters
        ----------
        aa_indices
            Amino-acid index input, passed to the encoding layer.
        mod_x
            Modification feature input, passed to the encoding layer.

        Returns
        -------
        The output of the final linear layer with dimension 1 squeezed out
        (presumably a 1-D tensor of length batch — TODO confirm).

        Side effects
        ------------
        Sets ``self.attentions`` to the second element of the transformer
        output when ``output_attentions`` is True, otherwise to None.
        """
        x = self.dropout(self.input_nn(aa_indices, mod_x))
        hidden_x = self.hidden_nn(x)
        if self.output_attentions:
            self.attentions = hidden_x[1]
        else:
            self.attentions = None
        # Scaled residual connection around the transformer stack.
        x = self.dropout(hidden_x[0] + x * 0.2)
        return self.output_nn(x).squeeze(1)
class Model_RT_LSTM_CNN(torch.nn.Module):
    """CNN+LSTM model for RT prediction.

    An encoder turns the amino-acid and modification inputs into a
    fixed-width representation; a linear decoder maps it to a single value.
    """

    def __init__(self, dropout=0.2):
        """Create the dropout layer, the encoder and the linear decoder.

        ``dropout`` is the probability used by the dropout layer applied
        between encoder and decoder.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)
        # Width shared by the encoder output and the decoder input.
        encoding_width = 256
        self.rt_encoder = building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum(
            encoding_width
        )
        self.rt_decoder = building_block.Decoder_Linear(encoding_width, 1)

    def forward(self, aa_indices, mod_x):
        """Encode the inputs, apply dropout, decode, and squeeze dimension 1."""
        encoded = self.dropout(self.rt_encoder(aa_indices, mod_x))
        prediction = self.rt_decoder(encoded)
        return prediction.squeeze(1)


# legacy
Model_RT_LSTM = Model_RT_LSTM_CNN
class AlphaRTModel(model_interface.ModelInterface):
    """
    `ModelInterface` for RT models
    """

    def __init__(
        self,
        dropout=0.1,
        model_class: torch.nn.Module = Model_RT_LSTM_CNN,  # model defined above
        device: str = "gpu",
        **kwargs,
    ):
        """Build the RT model and set the prediction/training column names.

        Parameters
        ----------
        dropout : float
            Forwarded to ``self.build`` as the ``dropout`` keyword.
        model_class : torch.nn.Module
            Model class forwarded to ``self.build``.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Extra keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)
        self.model: Model_RT_LSTM_CNN = None
        self.build(model_class, dropout=dropout, **kwargs)
        self.target_column_to_predict = "rt_pred"
        self.target_column_to_train = "rt_norm"

    def test(
        self,
        precursor_df: pd.DataFrame,
        *,
        batch_size: int = 1024,
    ):
        """Predict RT for ``precursor_df`` and regress `rt_pred` on `rt_norm`.

        Returns whatever ``evaluate_linear_regression`` returns for
        ``x="rt_pred"``, ``y="rt_norm"`` on the predicted frame.
        """
        return evaluate_linear_regression(
            self.predict(precursor_df, batch_size=batch_size), x="rt_pred", y="rt_norm"
        )

    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
    ):
        """Return the ``(aa index features, modification features)`` pair for a batch.

        Both helpers are not defined in this class; presumably they are
        inherited from ``ModelInterface``.
        """
        return (
            self._get_26aa_indice_features(batch_df),
            self._get_mod_features(batch_df),
        )

    def add_irt_column_to_precursor_df(
        self,
        precursor_df: pd.DataFrame,
        irt_pep_df: pd.DataFrame = None,
    ):
        """Add an `irt_pred` column to ``precursor_df``.

        RT is predicted for a set of iRT reference peptides, a linear
        regression of their `rt_pred` against their `irt` values is computed,
        and the resulting slope and intercept are applied to
        ``precursor_df.rt_pred``.

        Parameters
        ----------
        precursor_df : pd.DataFrame
            Must already contain an `rt_pred` column. It is modified in place
            (the `irt_pred` column is added).
        irt_pep_df : pd.DataFrame, optional
            iRT reference peptides. If it has no `irt` column, its `rt` column
            is copied into `irt`. A caller-supplied frame is passed to
            ``predict`` as is and may therefore gain columns. When None, a
            copy of the module-level ``IRT_PEPTIDE_DF`` is used so that the
            shared constant is never modified.

        Returns
        -------
        pd.DataFrame
            ``precursor_df`` with the `irt_pred` column.
        """
        if irt_pep_df is None:
            # `predict` writes its output column into the frame it is given;
            # copy so the shared module-level table stays pristine.
            irt_pep_df = IRT_PEPTIDE_DF.copy()
        print(f"Predict RT for {len(irt_pep_df)} iRT precursors.")
        # Use the returned frame, as `test` does, rather than relying only on
        # in-place mutation of the argument.
        irt_pep_df = self.predict(irt_pep_df)
        if "irt" not in irt_pep_df.columns:
            irt_pep_df["irt"] = irt_pep_df["rt"]
        eval_df = evaluate_linear_regression(irt_pep_df, "rt_pred", y="irt")
        print("Linear regression of `rt_pred` to `irt`:")
        print(eval_df)
        # The regression result carries the fitted `slope` and `intercept`
        # of irt ~ rt_pred; apply that line to the precursor predictions.
        slope = eval_df.slope.values[0]
        intercept = eval_df.intercept.values[0]
        precursor_df["irt_pred"] = precursor_df.rt_pred * slope + intercept
        return precursor_df