import torch
import pandas as pd
import numpy as np
import peptdeep.model.building_block as building_block
from peptdeep.model.model_interface import ModelInterface
from alphabase.peptide.precursor import is_precursor_refined
# 128 is the number of code points in 7-bit ASCII.
# NOTE(review): this constant is not referenced anywhere in the visible part
# of this file; presumably it relates to the ASCII-based sequence embedding --
# confirm against other modules before changing or removing it.
ASCII_NUM = 128
class Model_for_Generic_AASeq_Regression_LSTM(torch.nn.Module):
    """Generic LSTM regression model for an (unmodified) AA sequence.

    The whole network is one ``torch.nn.Sequential`` stored in ``self.nn``:
    ASCII embedding -> SeqCNN -> dropout -> SeqLSTM -> SeqAttentionSum ->
    dropout -> Linear(hidden_dim, 64) -> GELU -> Linear(64, output_dim).
    A single ``Dropout`` instance (``self.dropout``) is used at both dropout
    positions.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Build the layer stack.

        Parameters
        ----------
        hidden_dim : int
            Width of the LSTM / attention stage; the embedding and CNN
            building blocks are constructed with ``hidden_dim // 4``.
        output_dim : int
            Number of values produced by the final linear layer.
        nlayers : int
            Passed to ``SeqLSTM`` as ``rnn_layer``.
        dropout : float
            Probability given to ``torch.nn.Dropout``.
        **kwargs
            Accepted but not used by this constructor.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        embedding_dim = hidden_dim // 4
        # Keep this order: the position of each layer defines its index in
        # the Sequential container (and hence its state-dict key).
        pipeline = [
            building_block.ascii_embedding(embedding_dim),
            building_block.SeqCNN(embedding_dim),
            self.dropout,
            building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=nlayers),
            building_block.SeqAttentionSum(hidden_dim),
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, output_dim),
        ]
        self.nn = torch.nn.Sequential(*pipeline)

    def forward(self, aa_x):
        """Run ``aa_x`` through ``self.nn`` and squeeze the last dimension.

        ``squeeze(-1)`` only removes the trailing dimension when its size is
        1, i.e. for the default ``output_dim=1``.
        """
        prediction = self.nn(aa_x)
        return prediction.squeeze(-1)
class ModelInterface_for_Generic_AASeq_Regression(ModelInterface):
    """`ModelInterface` wrapper for Generic_AASeq_Regression models.

    Trains with an L1 loss, reading targets from the ``"detected_property"``
    column and writing predictions to ``"predicted_property"``.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_Regression_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Create the interface and build the underlying model.

        Parameters
        ----------
        model_class
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Hyperparameters forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Additional keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)

        hyperparams = {
            "dropout": dropout,
            "hidden_dim": hidden_dim,
            "output_dim": output_dim,
            "nlayers": nlayers,
        }
        self.build(model_class, **hyperparams, **kwargs)

        # Regression objective: mean absolute error.
        self.loss_func = torch.nn.L1Loss()
        self.target_column_to_predict = "predicted_property"
        self.target_column_to_train = "detected_property"
class Model_for_Generic_ModAASeq_Regression_LSTM(torch.nn.Module):
    """Generic LSTM regression model for a modified AA sequence.

    An ``Encoder_AA_Mod_CNN_LSTM_AttnSum`` building block encodes the
    sequence together with its modification features; a small head
    (dropout -> Linear(hidden_dim, 64) -> GELU -> Linear(64, output_dim))
    maps the encoding to the output.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Build encoder and output head.

        Parameters
        ----------
        hidden_dim : int
            Passed to the encoder and used as input width of the head.
        output_dim : int
            Number of values produced by the final linear layer.
        nlayers : int
            Passed to the encoder as ``n_lstm_layers``.
        dropout : float
            Probability given to ``torch.nn.Dropout``.
        **kwargs
            Accepted but not used by this constructor.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)
        self.encoder_nn = building_block.Encoder_AA_Mod_CNN_LSTM_AttnSum(
            hidden_dim,
            n_lstm_layers=nlayers,
        )

        # Order matters: it fixes each layer's index inside ``output_nn``.
        head = (
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, output_dim),
        )
        self.output_nn = torch.nn.Sequential(*head)

    def forward(self, aa_x, mod_x):
        """Encode ``(aa_x, mod_x)``, apply the head, squeeze the last dim.

        ``squeeze(-1)`` only removes the trailing dimension when its size is
        1, i.e. for the default ``output_dim=1``.
        """
        encoded = self.encoder_nn(aa_x, mod_x)
        prediction = self.output_nn(encoded)
        return prediction.squeeze(-1)
class ModelInterface_for_Generic_ModAASeq_Regression(ModelInterface):
    """`ModelInterface` wrapper for all Generic_ModAASeq_Regression models.

    Trains with an L1 loss, reading targets from the ``"detected_property"``
    column and writing predictions to ``"predicted_property"``. Features are
    obtained through ``self._get_aa_mod_features``.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_Regression_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Create the interface and build the underlying model.

        Parameters
        ----------
        model_class
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Hyperparameters forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Additional keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)

        hyperparams = {
            "dropout": dropout,
            "hidden_dim": hidden_dim,
            "output_dim": output_dim,
            "nlayers": nlayers,
        }
        self.build(model_class, **hyperparams, **kwargs)

        # Regression objective: mean absolute error.
        self.loss_func = torch.nn.L1Loss()
        self.target_column_to_predict = "predicted_property"
        self.target_column_to_train = "detected_property"

    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """Return the features for ``batch_df``.

        Delegates to ``self._get_aa_mod_features``; ``**kwargs`` are accepted
        and ignored.
        """
        features = self._get_aa_mod_features(batch_df)
        return features
class Model_for_Generic_AASeq_BinaryClassification_LSTM(
    Model_for_Generic_AASeq_Regression_LSTM
):
    """Generic LSTM classification model for an (unmodified) AA sequence.

    Reuses the regression network unchanged and applies a sigmoid to its
    output in ``forward``.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Build the parent regression network.

        Only ``hidden_dim``, ``output_dim``, ``nlayers`` and ``dropout`` are
        handed to the parent constructor; ``**kwargs`` are accepted here but
        not forwarded.
        """
        parent_params = {
            "hidden_dim": hidden_dim,
            "output_dim": output_dim,
            "nlayers": nlayers,
            "dropout": dropout,
        }
        super().__init__(**parent_params)

    def forward(self, aa_x):
        """Return the sigmoid of the parent's forward output for ``aa_x``."""
        return torch.sigmoid(super().forward(aa_x))
class ModelInterface_for_Generic_AASeq_BinaryClassification(ModelInterface):
    """`ModelInterface` wrapper for all Generic_AASeq_BinaryClassification models.

    Trains with binary cross-entropy (``torch.nn.BCELoss``), reading targets
    from the ``"detected_prob"`` column and writing predictions to
    ``"predicted_prob"``.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Create the interface and build the underlying classifier.

        Parameters
        ----------
        model_class
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Hyperparameters forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Additional keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)

        hyperparams = {
            "dropout": dropout,
            "hidden_dim": hidden_dim,
            "output_dim": output_dim,
            "nlayers": nlayers,
        }
        self.build(model_class, **hyperparams, **kwargs)

        # Binary classification objective.
        self.loss_func = torch.nn.BCELoss()
        self.target_column_to_predict = "predicted_prob"
        self.target_column_to_train = "detected_prob"
class Model_for_Generic_ModAASeq_BinaryClassification_LSTM(
    Model_for_Generic_ModAASeq_Regression_LSTM
):
    """Generic LSTM classification model for a modified AA sequence.

    Reuses the regression network unchanged and applies a sigmoid to its
    output in ``forward``.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Build the parent regression network.

        All named hyperparameters and any extra ``**kwargs`` are forwarded to
        the parent constructor.
        """
        parent_params = {
            "hidden_dim": hidden_dim,
            "output_dim": output_dim,
            "nlayers": nlayers,
            "dropout": dropout,
        }
        super().__init__(**parent_params, **kwargs)

    def forward(self, aa_x, mod_x):
        """Return the sigmoid of the parent's forward output."""
        return torch.sigmoid(super().forward(aa_x, mod_x))
class ModelInterface_for_Generic_ModAASeq_BinaryClassification(ModelInterface):
    """`ModelInterface` wrapper for Generic_ModAASeq_BinaryClassification.

    Trains with binary cross-entropy (``torch.nn.BCELoss``), reading targets
    from the ``"detected_prob"`` column and writing predictions to
    ``"predicted_prob"``. Features are obtained through
    ``self._get_aa_mod_features``.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_BinaryClassification_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Create the interface and build the underlying classifier.

        Parameters
        ----------
        model_class
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Hyperparameters forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Additional keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
            dropout=dropout,
            **kwargs,
        )
        self.loss_func = torch.nn.BCELoss()  # for classification
        self.target_column_to_predict = "predicted_prob"
        self.target_column_to_train = "detected_prob"

    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """Return the features for ``batch_df``.

        Delegates to ``self._get_aa_mod_features``.

        ``**kwargs`` are accepted and ignored so that this override has the
        same signature as the corresponding override in
        ``ModelInterface_for_Generic_ModAASeq_Regression``; without it, a
        caller that forwards keyword arguments to this hook would raise
        ``TypeError``.
        """
        return self._get_aa_mod_features(batch_df)
class ModelInterface_for_Generic_AASeq_MultiLabelClassification(
    ModelInterface_for_Generic_AASeq_BinaryClassification
):
    """Multi-label variant of the AA-sequence binary-classification interface.

    The model is built with ``output_dim=num_target_values``. Targets are
    read from the ``"target_probs"`` column (one vector per row) and
    predictions are written to ``"target_probs_pred"``.
    """

    # NOTE(review): the default ``model_class`` below is not defined in the
    # visible part of this file -- confirm it is in scope at import time.
    def __init__(
        self,
        num_target_values: int = 6,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_Transformer,
        nlayers=4,
        hidden_dim=256,
        device="gpu",
        dropout=0.1,
        **kwargs,
    ):
        """Create the interface.

        Parameters
        ----------
        num_target_values : int
            Number of labels per sequence; stored on the instance and passed
            to the parent constructor as ``output_dim``.
        model_class
            Model class passed to the parent constructor.
        nlayers, hidden_dim, device, dropout
            Passed to the parent constructor unchanged.
        **kwargs
            Additional keyword arguments passed to the parent constructor.
        """
        self.num_target_values = num_target_values
        super().__init__(
            model_class=model_class,
            output_dim=self.num_target_values,
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            device=device,
            dropout=dropout,
            **kwargs,
        )
        # Override the column names set by the binary-classification parent.
        self.target_column_to_train = "target_probs"
        self.target_column_to_predict = "target_probs_pred"

    def _get_targets_from_batch_df(self, batch_df, **kwargs):
        """Stack the per-row target vectors into one float32 tensor.

        Assumes every cell of the training column holds a sequence of the
        same length -- ``np.stack`` raises otherwise.
        """
        return self._as_tensor(
            np.stack(batch_df[self.target_column_to_train].values), dtype=torch.float32
        )

    def _check_predict_in_order(self, precursor_df: pd.DataFrame):
        """Sort ``precursor_df`` by ``"nAA"`` in place if it is not refined.

        The frame is modified in place (sorted, then re-indexed from 0).
        """
        if not is_precursor_refined(precursor_df):
            # multilabel prediction can only predict in order
            precursor_df.sort_values("nAA", inplace=True)
            precursor_df.reset_index(drop=True, inplace=True)

    def _prepare_predict_data_df(self, precursor_df, **kwargs):
        """Add a zero-filled prediction column and remember the frame.

        Each row gets its own list of ``num_target_values`` zeros. A fresh
        list per row is built on purpose: ``[[0] * n] * len(df)`` would put
        the *same* list object into every row, so mutating one placeholder
        would silently change all of them.
        """
        precursor_df[self.target_column_to_predict] = [
            [0] * self.num_target_values for _ in range(len(precursor_df))
        ]
        self.predict_df = precursor_df

    def _set_batch_predict_data(self, batch_df, predict_values, **kwargs):
        """Write one batch of predictions into ``self.predict_df``.

        The rows are addressed positionally, from the first to the last index
        value of ``batch_df`` (inclusive).
        """
        # NOTE(review): this assumes batch_df's index is contiguous and equal
        # to row position, and that ``.values`` is a writable view of the
        # column -- confirm against the pandas version in use.
        self.predict_df.loc[:, self.target_column_to_predict].values[
            batch_df.index.values[0] : batch_df.index.values[-1] + 1
        ] = list(predict_values)
class ModelInterface_for_Generic_ModAASeq_MultiLabelClassification(
    ModelInterface_for_Generic_ModAASeq_BinaryClassification
):
    """Multi-label variant of the modified-sequence binary-classification interface.

    The model is built with ``output_dim=num_target_values``. Targets are
    read from the ``"target_probs"`` column (one vector per row) and
    predictions are written to ``"target_probs_pred"``.
    """

    # NOTE(review): the default ``model_class`` below is not defined in the
    # visible part of this file -- confirm it is in scope at import time.
    def __init__(
        self,
        num_target_values: int = 6,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_BinaryClassification_Transformer,
        nlayers=4,
        hidden_dim=256,
        device="gpu",
        dropout=0.1,
        **kwargs,
    ):
        """Create the interface.

        Parameters
        ----------
        num_target_values : int
            Number of labels per sequence; stored on the instance and passed
            to the parent constructor as ``output_dim``.
        model_class
            Model class passed to the parent constructor.
        nlayers, hidden_dim, device, dropout
            Passed to the parent constructor unchanged.
        **kwargs
            Additional keyword arguments passed to the parent constructor.
        """
        self.num_target_values = num_target_values
        super().__init__(
            model_class=model_class,
            output_dim=self.num_target_values,
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            device=device,
            dropout=dropout,
            **kwargs,
        )
        # Override the column names set by the binary-classification parent.
        self.target_column_to_train = "target_probs"
        self.target_column_to_predict = "target_probs_pred"

    def _get_targets_from_batch_df(self, batch_df, **kwargs):
        """Stack the per-row target vectors into one float32 tensor.

        Assumes every cell of the training column holds a sequence of the
        same length -- ``np.stack`` raises otherwise.
        """
        return self._as_tensor(
            np.stack(batch_df[self.target_column_to_train].values), dtype=torch.float32
        )

    def _check_predict_in_order(self, precursor_df: pd.DataFrame):
        """Sort ``precursor_df`` by ``"nAA"`` in place if it is not refined.

        The frame is modified in place (sorted, then re-indexed from 0).
        """
        if not is_precursor_refined(precursor_df):
            # multilabel prediction can only predict in order
            precursor_df.sort_values("nAA", inplace=True)
            precursor_df.reset_index(drop=True, inplace=True)

    def _prepare_predict_data_df(self, precursor_df, **kwargs):
        """Add a zero-filled prediction column and remember the frame.

        Each row gets its own list of ``num_target_values`` zeros. A fresh
        list per row is built on purpose: ``[[0] * n] * len(df)`` would put
        the *same* list object into every row, so mutating one placeholder
        would silently change all of them.
        """
        precursor_df[self.target_column_to_predict] = [
            [0] * self.num_target_values for _ in range(len(precursor_df))
        ]
        self.predict_df = precursor_df

    def _set_batch_predict_data(self, batch_df, predict_values, **kwargs):
        """Write one batch of predictions into ``self.predict_df``.

        The rows are addressed positionally, from the first to the last index
        value of ``batch_df`` (inclusive).
        """
        # NOTE(review): this assumes batch_df's index is contiguous and equal
        # to row position, and that ``.values`` is a writable view of the
        # column -- confirm against the pandas version in use.
        self.predict_df.loc[:, self.target_column_to_predict].values[
            batch_df.index.values[0] : batch_df.index.values[-1] + 1
        ] = list(predict_values)
# Alternative names: each "MultiTarget" identifier is bound to the very same
# class object as the corresponding "MultiLabel" interface.
ModelInterface_for_Generic_ModAASeq_MultiTargetClassification = (
    ModelInterface_for_Generic_ModAASeq_MultiLabelClassification
)
ModelInterface_for_Generic_AASeq_MultiTargetClassification = (
    ModelInterface_for_Generic_AASeq_MultiLabelClassification
)