# Source code for peptdeep.model.generic_property_prediction

import torch
import pandas as pd
import numpy as np

import peptdeep.model.building_block as building_block
from peptdeep.model.model_interface import ModelInterface
from alphabase.peptide.precursor import is_precursor_refined

# Number of code points in the ASCII character set (0-127).
# NOTE(review): not referenced by any code visible in this module; presumably
# kept for importers of this module -- verify before removing.
ASCII_NUM = 128


class Model_for_Generic_AASeq_Regression_LSTM(torch.nn.Module):
    """Generic LSTM regression model for AA sequence.

    The whole network is one ``torch.nn.Sequential`` stored in ``self.nn``,
    chaining (in order) ``building_block.ascii_embedding``,
    ``building_block.SeqCNN``, dropout, ``building_block.SeqLSTM``,
    ``building_block.SeqAttentionSum``, dropout and a
    ``Linear -> GELU -> Linear`` head.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Build the sequential network.

        Parameters
        ----------
        hidden_dim : int
            Size given to the LSTM, the attention-sum block and the first
            linear layer; the embedding and CNN receive ``hidden_dim // 4``.
        output_dim : int
            Number of output features of the last linear layer.
        nlayers : int
            Passed to ``building_block.SeqLSTM`` as ``rnn_layer``.
        dropout : float
            Probability used for the shared ``torch.nn.Dropout`` module.
        **kwargs
            Accepted and ignored.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        embed_dim = hidden_dim // 4
        # The same Dropout instance is intentionally placed twice in the
        # pipeline; layer order (and thus construction order) is preserved.
        pipeline = [
            building_block.ascii_embedding(embed_dim),
            building_block.SeqCNN(embed_dim),
            self.dropout,
            building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=nlayers),
            building_block.SeqAttentionSum(hidden_dim),
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, output_dim),
        ]
        self.nn = torch.nn.Sequential(*pipeline)

    def forward(self, aa_x):
        """Run ``aa_x`` through ``self.nn`` and squeeze the last dimension."""
        prediction = self.nn(aa_x)
        return prediction.squeeze(-1)
class Model_for_Generic_AASeq_Regression_Transformer(torch.nn.Module):
    """Generic transformer regression model for AA sequence.

    Composed of an ASCII embedding (``input_nn``), a
    ``building_block.HFace_Transformer_with_PositionalEncoder``
    (``hidden_nn``) and an output head (``output_nn``) made of
    ``SeqAttentionSum -> PReLU -> Dropout -> Linear``.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        output_attentions=False,
        dropout=0.1,
        **kwargs,
    ):
        """Create the embedding, transformer and output head.

        Parameters
        ----------
        hidden_dim : int
            Size used by the embedding, the transformer and the head.
        output_dim : int
            Number of output features of the final linear layer.
        nlayers : int
            Passed to the transformer block as ``nlayers``.
        output_attentions : bool
            Stored on this model and also passed to the transformer block.
        dropout : float
            Probability for the shared ``torch.nn.Dropout`` and the
            transformer block.
        **kwargs
            Accepted and ignored.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        self.input_nn = building_block.ascii_embedding(hidden_dim)

        # Goes through the property setter below (stores `_output_attentions`).
        self.output_attentions = output_attentions

        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )

        head_layers = (
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, output_dim),
        )
        self.output_nn = torch.nn.Sequential(*head_layers)

    @property
    def output_attentions(self) -> bool:
        """Flag read by ``forward`` to decide whether to keep attentions."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val: bool):
        # NOTE(review): only the flag on this model is updated; `hidden_nn`
        # received its own `output_attentions` at construction time and is
        # not touched here -- confirm whether it needs to be kept in sync.
        self._output_attentions = val

    def forward(self, aa_x):
        """Embed, encode and reduce ``aa_x``; dimension 1 is squeezed.

        As a side effect ``self.attentions`` is set to the second element of
        the transformer output when ``output_attentions`` is true, otherwise
        to ``None``.
        """
        embedded = self.dropout(self.input_nn(aa_x))
        encoded = self.hidden_nn(embedded)

        self.attentions = encoded[1] if self.output_attentions else None

        features = self.dropout(encoded[0])
        return self.output_nn(features).squeeze(1)
class ModelInterface_for_Generic_AASeq_Regression(ModelInterface):
    """
    `ModelInterface` for Generic_AASeq_Regression models.

    Builds the given model class and configures an L1 loss together with the
    names of the training and prediction target columns.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_Regression_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Initialise the interface and build the model.

        Parameters
        ----------
        model_class : torch.nn.Module
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Extra keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)

        model_params = dict(
            dropout=dropout,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
        )
        self.build(model_class, **model_params, **kwargs)

        # L1 loss for regression targets.
        self.loss_func = torch.nn.L1Loss()
        self.target_column_to_predict = "predicted_property"
        self.target_column_to_train = "detected_property"
class Model_for_Generic_ModAASeq_Regression_LSTM(torch.nn.Module):
    """Generic LSTM regression model for modified sequence.

    Uses ``building_block.Encoder_AA_Mod_CNN_LSTM_AttnSum`` as the encoder
    and a ``Dropout -> Linear -> GELU -> Linear`` output head.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Create the encoder and the output head.

        Parameters
        ----------
        hidden_dim : int
            Size given to the encoder and the first linear layer.
        output_dim : int
            Number of output features of the last linear layer.
        nlayers : int
            Passed to the encoder as ``n_lstm_layers``.
        dropout : float
            Probability used for ``torch.nn.Dropout``.
        **kwargs
            Accepted and ignored.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        self.encoder_nn = building_block.Encoder_AA_Mod_CNN_LSTM_AttnSum(
            hidden_dim, n_lstm_layers=nlayers
        )

        head_layers = (
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, output_dim),
        )
        self.output_nn = torch.nn.Sequential(*head_layers)

    def forward(self, aa_x, mod_x):
        """Encode ``(aa_x, mod_x)``, apply the head and squeeze the last dim."""
        encoded = self.encoder_nn(aa_x, mod_x)
        prediction = self.output_nn(encoded)
        return prediction.squeeze(-1)
class Model_for_Generic_ModAASeq_Regression_Transformer(torch.nn.Module):
    """Generic transformer regression model for modified sequence.

    Composed of ``building_block.AA_Mod_Embedding`` (``input_nn``), a
    ``building_block.HFace_Transformer_with_PositionalEncoder``
    (``hidden_nn``) and an output head (``output_nn``) made of
    ``SeqAttentionSum -> PReLU -> Dropout -> Linear``.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        output_attentions=False,
        dropout=0.1,
        **kwargs,
    ):
        """Create the embedding, transformer and output head.

        Parameters
        ----------
        hidden_dim : int
            Size used by the embedding, the transformer and the head.
        output_dim : int
            Number of output features of the final linear layer.
        nlayers : int
            Passed to the transformer block as ``nlayers``.
        output_attentions : bool
            Stored on this model and also passed to the transformer block.
        dropout : float
            Probability for the shared ``torch.nn.Dropout`` and the
            transformer block.
        **kwargs
            Accepted and ignored.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        self.input_nn = building_block.AA_Mod_Embedding(hidden_dim)

        self._output_attentions = output_attentions

        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )

        head_layers = (
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, output_dim),
        )
        self.output_nn = torch.nn.Sequential(*head_layers)

    @property
    def output_attentions(self) -> bool:
        """Flag read by ``forward`` to decide whether to keep attentions."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val: bool):
        # NOTE(review): only the flag on this model is updated; `hidden_nn`
        # received its own `output_attentions` at construction time and is
        # not touched here -- confirm whether it needs to be kept in sync.
        self._output_attentions = val

    def forward(
        self,
        aa_indices,
        mod_x,
    ):
        """Embed, encode and reduce the inputs; dimension 1 is squeezed.

        The embedded input, scaled by 0.2, is added to the first element of
        the transformer output before the head is applied. As a side effect
        ``self.attentions`` is set to the second element of the transformer
        output when ``output_attentions`` is true, otherwise to ``None``.
        """
        embedded = self.dropout(self.input_nn(aa_indices, mod_x))
        encoded = self.hidden_nn(embedded)

        self.attentions = encoded[1] if self.output_attentions else None

        features = self.dropout(encoded[0] + embedded * 0.2)
        return self.output_nn(features).squeeze(1)
class ModelInterface_for_Generic_ModAASeq_Regression(ModelInterface):
    """
    `ModelInterface` for all Generic_ModAASeq_Regression models.

    Builds the given model class, configures an L1 loss and the target column
    names, and feeds the model with the AA + modification features.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_Regression_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Initialise the interface and build the model.

        Parameters
        ----------
        model_class : torch.nn.Module
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Extra keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)

        model_params = dict(
            dropout=dropout,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
        )
        self.build(model_class, **model_params, **kwargs)

        # L1 loss for regression targets.
        self.loss_func = torch.nn.L1Loss()
        self.target_column_to_predict = "predicted_property"
        self.target_column_to_train = "detected_property"

    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """Return ``self._get_aa_mod_features(batch_df)``; ``kwargs`` unused."""
        features = self._get_aa_mod_features(batch_df)
        return features
class Model_for_Generic_AASeq_BinaryClassification_LSTM(
    Model_for_Generic_AASeq_Regression_LSTM
):
    """Generic LSTM classification model for AA sequence.

    Same network as ``Model_for_Generic_AASeq_Regression_LSTM``; the output
    of the parent ``forward`` is passed through a sigmoid.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Forward every argument to the parent constructor.

        Parameters
        ----------
        hidden_dim, output_dim, nlayers, dropout
            Passed unchanged to the parent constructor.
        **kwargs
            Forwarded to the parent constructor as well, consistent with the
            other classification models in this module.
        """
        super().__init__(
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
            dropout=dropout,
            **kwargs,
        )

    def forward(self, aa_x):
        """Return the sigmoid of the parent model's output for ``aa_x``."""
        x = super().forward(aa_x)
        return torch.sigmoid(x)
class Model_for_Generic_AASeq_BinaryClassification_Transformer(
    Model_for_Generic_AASeq_Regression_Transformer
):
    """Generic transformer classification model for AA sequence.

    Same network as ``Model_for_Generic_AASeq_Regression_Transformer``; the
    output of the parent ``forward`` is passed through a sigmoid.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        output_attentions=False,
        dropout=0.1,
        **kwargs,
    ):
        """
        Model based on a transformer Architecture from
        Huggingface's BertEncoder class.

        All arguments, including ``**kwargs``, are forwarded unchanged to the
        parent constructor.
        """
        parent_params = dict(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            output_attentions=output_attentions,
            dropout=dropout,
        )
        super().__init__(**parent_params, **kwargs)

    def forward(self, aa_x):
        """Return the sigmoid of the parent model's output for ``aa_x``."""
        logits = super().forward(aa_x)
        return torch.sigmoid(logits)
class ModelInterface_for_Generic_AASeq_BinaryClassification(ModelInterface):
    """
    `ModelInterface` for all Generic_AASeq_BinaryClassification models.

    Builds the given model class and configures a binary cross-entropy loss
    together with the names of the training and prediction target columns.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Initialise the interface and build the classification model.

        Parameters
        ----------
        model_class : torch.nn.Module
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Extra keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)

        model_params = dict(
            dropout=dropout,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
        )
        self.build(model_class, **model_params, **kwargs)

        # Binary cross-entropy for binary classification targets.
        self.loss_func = torch.nn.BCELoss()
        self.target_column_to_predict = "predicted_prob"
        self.target_column_to_train = "detected_prob"
class Model_for_Generic_ModAASeq_BinaryClassification_LSTM(
    Model_for_Generic_ModAASeq_Regression_LSTM
):
    """Generic LSTM classification model for modified sequence.

    Same network as ``Model_for_Generic_ModAASeq_Regression_LSTM``; the
    output of the parent ``forward`` is passed through a sigmoid.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        """Forward every argument, including ``**kwargs``, to the parent."""
        parent_params = dict(
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
            dropout=dropout,
        )
        super().__init__(**parent_params, **kwargs)

    def forward(self, aa_x, mod_x):
        """Return the sigmoid of the parent model's output."""
        logits = super().forward(aa_x, mod_x)
        return torch.sigmoid(logits)
class Model_for_Generic_ModAASeq_BinaryClassification_Transformer(
    Model_for_Generic_ModAASeq_Regression_Transformer
):
    """Generic transformer classification model for modified sequence.

    Same network as ``Model_for_Generic_ModAASeq_Regression_Transformer``;
    the output of the parent ``forward`` is passed through a sigmoid.

    The ``output_attentions`` property (getter and setter backed by
    ``_output_attentions``) is inherited from the parent class; this class
    does not need to re-declare it.
    """

    def __init__(
        self,
        *,
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        output_attentions=False,
        dropout=0.1,
        **kwargs,
    ):
        """Forward every argument, including ``**kwargs``, to the parent.

        Parameters
        ----------
        hidden_dim, output_dim, nlayers, output_attentions, dropout
            Passed unchanged to the parent constructor.
        **kwargs
            Forwarded to the parent constructor.
        """
        super().__init__(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            output_attentions=output_attentions,
            dropout=dropout,
            **kwargs,
        )

    def forward(
        self,
        aa_indices,
        mod_x,
    ):
        """Return the sigmoid of the parent model's output."""
        x = super().forward(aa_indices, mod_x)
        return torch.sigmoid(x)
class ModelInterface_for_Generic_ModAASeq_BinaryClassification(ModelInterface):
    """
    `ModelInterface` for Generic_ModAASeq_BinaryClassification.

    Builds the given model class, configures a binary cross-entropy loss and
    the target column names, and feeds the model with the AA + modification
    features.
    """

    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_BinaryClassification_LSTM,
        dropout=0.1,
        device: str = "gpu",
        hidden_dim=256,
        output_dim=1,
        nlayers=4,
        **kwargs,
    ):
        """Initialise the interface and build the classification model.

        Parameters
        ----------
        model_class : torch.nn.Module
            Model class handed to ``self.build``.
        dropout, hidden_dim, output_dim, nlayers
            Forwarded to ``self.build`` as keyword arguments.
        device : str
            Forwarded to ``ModelInterface.__init__``.
        **kwargs
            Extra keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            nlayers=nlayers,
            dropout=dropout,
            **kwargs,
        )
        self.loss_func = torch.nn.BCELoss()  # for classification
        self.target_column_to_predict = "predicted_prob"
        self.target_column_to_train = "detected_prob"

    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """Return ``self._get_aa_mod_features(batch_df)``.

        ``**kwargs`` is accepted (and ignored) so this override has the same
        signature as the regression counterpart in this module and does not
        raise ``TypeError`` when a caller passes extra keyword arguments.
        """
        return self._get_aa_mod_features(batch_df)
class ModelInterface_for_Generic_AASeq_MultiLabelClassification(
    ModelInterface_for_Generic_AASeq_BinaryClassification
):
    """`ModelInterface` for multi-label classification on AA sequences.

    Reuses the binary-classification interface with
    ``output_dim = num_target_values`` and stores one list/array of
    ``num_target_values`` values per row in the target columns.
    """

    def __init__(
        self,
        num_target_values: int = 6,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_Transformer,
        nlayers=4,
        hidden_dim=256,
        device="gpu",
        dropout=0.1,
        **kwargs,
    ):
        """Initialise the interface.

        Parameters
        ----------
        num_target_values : int
            Number of labels; passed to the parent as ``output_dim``.
        model_class : torch.nn.Module
            Model class forwarded to the parent constructor.
        nlayers, hidden_dim, device, dropout
            Forwarded to the parent constructor.
        **kwargs
            Extra keyword arguments forwarded to the parent constructor.
        """
        self.num_target_values = num_target_values
        super().__init__(
            model_class=model_class,
            output_dim=self.num_target_values,
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            device=device,
            dropout=dropout,
            **kwargs,
        )
        self.target_column_to_train = "target_probs"
        self.target_column_to_predict = "target_probs_pred"

    def _get_targets_from_batch_df(self, batch_df, **kwargs):
        """Stack the per-row targets of the training column into a float32 tensor."""
        stacked_targets = np.stack(batch_df[self.target_column_to_train].values)
        return self._as_tensor(stacked_targets, dtype=torch.float32)

    def _check_predict_in_order(self, precursor_df: pd.DataFrame):
        """Sort ``precursor_df`` by ``nAA`` in place unless it is already refined."""
        if is_precursor_refined(precursor_df):
            return
        # multilabel prediction can only predict in order
        precursor_df.sort_values("nAA", inplace=True)
        precursor_df.reset_index(drop=True, inplace=True)

    def _prepare_predict_data_df(self, precursor_df, **kwargs):
        """Create the prediction column filled with zero lists.

        Each row gets its own list; repeating a single list object
        (``[[0] * n] * len(df)``) would make every row share one mutable
        list.
        """
        precursor_df[self.target_column_to_predict] = [
            [0] * self.num_target_values for _ in range(len(precursor_df))
        ]
        self.predict_df = precursor_df

    def _set_batch_predict_data(self, batch_df, predict_values, **kwargs):
        """Write ``predict_values`` into the rows covered by ``batch_df``.

        The rows are addressed positionally from the first to the last index
        label of the batch, which is why ``_check_predict_in_order`` sorts
        and re-indexes unrefined frames.
        """
        first_row = batch_df.index.values[0]
        last_row = batch_df.index.values[-1]
        # NOTE(review): this writes through the array returned by `.values`
        # and therefore relies on it being a writable view of the column;
        # that is not guaranteed under pandas Copy-on-Write -- confirm
        # against the pandas versions that must be supported.
        self.predict_df.loc[:, self.target_column_to_predict].values[
            first_row : last_row + 1
        ] = list(predict_values)
class ModelInterface_for_Generic_ModAASeq_MultiLabelClassification(
    ModelInterface_for_Generic_ModAASeq_BinaryClassification
):
    """`ModelInterface` for multi-label classification on modified sequences.

    Reuses the binary-classification interface with
    ``output_dim = num_target_values`` and stores one list/array of
    ``num_target_values`` values per row in the target columns.
    """

    def __init__(
        self,
        num_target_values: int = 6,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_BinaryClassification_Transformer,
        nlayers=4,
        hidden_dim=256,
        device="gpu",
        dropout=0.1,
        **kwargs,
    ):
        """Initialise the interface.

        Parameters
        ----------
        num_target_values : int
            Number of labels; passed to the parent as ``output_dim``.
        model_class : torch.nn.Module
            Model class forwarded to the parent constructor.
        nlayers, hidden_dim, device, dropout
            Forwarded to the parent constructor.
        **kwargs
            Extra keyword arguments forwarded to the parent constructor.
        """
        self.num_target_values = num_target_values
        super().__init__(
            model_class=model_class,
            output_dim=self.num_target_values,
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            device=device,
            dropout=dropout,
            **kwargs,
        )
        self.target_column_to_train = "target_probs"
        self.target_column_to_predict = "target_probs_pred"

    def _get_targets_from_batch_df(self, batch_df, **kwargs):
        """Stack the per-row targets of the training column into a float32 tensor."""
        stacked_targets = np.stack(batch_df[self.target_column_to_train].values)
        return self._as_tensor(stacked_targets, dtype=torch.float32)

    def _check_predict_in_order(self, precursor_df: pd.DataFrame):
        """Sort ``precursor_df`` by ``nAA`` in place unless it is already refined."""
        if is_precursor_refined(precursor_df):
            return
        # multilabel prediction can only predict in order
        precursor_df.sort_values("nAA", inplace=True)
        precursor_df.reset_index(drop=True, inplace=True)

    def _prepare_predict_data_df(self, precursor_df, **kwargs):
        """Create the prediction column filled with zero lists.

        Each row gets its own list; repeating a single list object
        (``[[0] * n] * len(df)``) would make every row share one mutable
        list.
        """
        precursor_df[self.target_column_to_predict] = [
            [0] * self.num_target_values for _ in range(len(precursor_df))
        ]
        self.predict_df = precursor_df

    def _set_batch_predict_data(self, batch_df, predict_values, **kwargs):
        """Write ``predict_values`` into the rows covered by ``batch_df``.

        The rows are addressed positionally from the first to the last index
        label of the batch, which is why ``_check_predict_in_order`` sorts
        and re-indexes unrefined frames.
        """
        first_row = batch_df.index.values[0]
        last_row = batch_df.index.values[-1]
        # NOTE(review): this writes through the array returned by `.values`
        # and therefore relies on it being a writable view of the column;
        # that is not guaranteed under pandas Copy-on-Write -- confirm
        # against the pandas versions that must be supported.
        self.predict_df.loc[:, self.target_column_to_predict].values[
            first_row : last_row + 1
        ] = list(predict_values)
# alias ModelInterface_for_Generic_AASeq_MultiTargetClassification = ( ModelInterface_for_Generic_AASeq_MultiLabelClassification ) ModelInterface_for_Generic_ModAASeq_MultiTargetClassification = ( ModelInterface_for_Generic_ModAASeq_MultiLabelClassification )