Source code for peptdeep.model.rt

import torch
import pandas as pd
import numpy as np

from peptdeep.model.featurize import get_batch_aa_indices, get_batch_mod_feature

from peptdeep.settings import model_const

import peptdeep.model.model_interface as model_interface
import peptdeep.model.building_block as building_block
from peptdeep.utils import evaluate_linear_regression

# Number of entries in ``model_const["mod_elements"]``.
# NOTE(review): presumably the width of the per-residue modification feature
# vector consumed by the models below -- confirm against the featurizer code.
mod_feature_size = len(model_const["mod_elements"])

# Reference iRT peptide table. Each row is laid out as
# (sequence, pep_name, irt, mods, mod_sites); `mods` and `mod_sites` are empty
# strings for every peptide listed here.
IRT_PEPTIDE_DF = pd.DataFrame(
    data=[
        ("LGGNEQVTR", "RT-pep a", -24.92, "", ""),
        ("GAGSSEPVTGLDAK", "RT-pep b", 0.00, "", ""),
        ("VEATFGVDESNAK", "RT-pep c", 12.39, "", ""),
        ("YILAGVENSK", "RT-pep d", 19.79, "", ""),
        ("TPVISGGPYEYR", "RT-pep e", 28.71, "", ""),
        ("TPVITGAPYEYR", "RT-pep f", 33.38, "", ""),
        ("DGLDAASYYAPVR", "RT-pep g", 42.26, "", ""),
        ("ADVTPADFSEWSK", "RT-pep h", 54.62, "", ""),
        ("GTFIIDPGGVIR", "RT-pep i", 70.52, "", ""),
        ("GTFIIDPAAVIR", "RT-pep k", 87.23, "", ""),
        ("LFLQFGAQGSPFLK", "RT-pep l", 100.00, "", ""),
    ],
    columns=["sequence", "pep_name", "irt", "mods", "mod_sites"],
)
# Peptide length: number of characters in each `sequence` string.
IRT_PEPTIDE_DF["nAA"] = IRT_PEPTIDE_DF["sequence"].str.len()


# legacy
irt_pep = IRT_PEPTIDE_DF



[docs]
class Model_RT_Bert(torch.nn.Module):
    """Transformer model for RT prediction"""


[docs]
    def __init__(
        self,
        dropout=0.1,
        nlayers=4,
        hidden=128,
        output_attentions=False,
        **kwargs,
    ):
        """
        Assemble the input encoding, transformer stack and output head.

        Parameters
        ----------
        dropout : float, optional
            Passed to `torch.nn.Dropout` and, as `dropout`, to
            `building_block.Hidden_HFace_Transformer`. Defaults to 0.1.
        nlayers : int, optional
            Forwarded as `nlayers` to the transformer block. Defaults to 4.
        hidden : int, optional
            Size argument given to the input encoding, the transformer
            block, the attention sum and the final linear layer.
            Defaults to 128.
        output_attentions : bool, optional
            Initial value of the `output_attentions` flag; also forwarded
            to the transformer block. Defaults to False.
        **kwargs
            Accepted but not used by this constructor.
        """
        super().__init__()

        self._output_attentions = output_attentions

        # Sub-modules are created in a fixed order: dropout, input encoding,
        # transformer stack, output head.
        self.dropout = torch.nn.Dropout(dropout)
        self.input_nn = building_block.AATransformerEncoding(hidden)

        transformer_options = dict(
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )
        self.hidden_nn = building_block.Hidden_HFace_Transformer(
            hidden, **transformer_options
        )

        # The head reuses the same dropout module instance as `forward`.
        head_layers = [
            building_block.SeqAttentionSum(hidden),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)


    @property
    def output_attentions(self):
        """Flag read by `forward` to decide whether attentions are kept."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val: bool):
        # Chained assignment binds left to right: the local flag is stored
        # first, then the same value is mirrored onto `hidden_nn`.
        self._output_attentions = self.hidden_nn.output_attentions = val


[docs]
    def forward(
        self,
        aa_indices,
        mod_x,
    ):
        """
        Run the model on one batch.

        Parameters
        ----------
        aa_indices
            First input handed to `self.input_nn`
            (presumably amino-acid index tensors -- confirm with the caller).
        mod_x
            Second input handed to `self.input_nn`
            (presumably modification features -- confirm with the caller).

        Returns
        -------
        The result of `self.output_nn` with dimension 1 squeezed out.

        As a side effect, `self.attentions` is set to the second element of
        the `hidden_nn` output when `output_attentions` is true, else to None.
        """
        embedded = self.dropout(self.input_nn(aa_indices, mod_x))

        transformer_out = self.hidden_nn(embedded)
        # Keep the attention output only when it was asked for.
        self.attentions = transformer_out[1] if self.output_attentions else None

        # Mix the transformer output with 0.2x of the embedded input, then
        # apply dropout again before the output head.
        mixed = self.dropout(transformer_out[0] + embedded * 0.2)

        return self.output_nn(mixed).squeeze(1)





[docs]
class Model_RT_LSTM_CNN(torch.nn.Module):
    """CNN+LSTM model for RT prediction"""


[docs]
    def __init__(
        self,
        dropout=0.2,
    ):
        """
        Create the dropout layer, the encoder and the linear decoder.

        Parameters
        ----------
        dropout : float, optional
            Probability passed to `torch.nn.Dropout`. Defaults to 0.2.
        """
        super().__init__()

        # Fixed size passed to both the encoder and the decoder constructors.
        encoder_width = 256

        self.dropout = torch.nn.Dropout(dropout)
        self.rt_encoder = building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum(
            encoder_width
        )
        self.rt_decoder = building_block.Decoder_Linear(encoder_width, 1)



[docs]
    def forward(
        self,
        aa_indices,
        mod_x,
    ):
        """
        Encode the inputs, apply dropout and decode to one value per item.

        Parameters
        ----------
        aa_indices
            First input handed to `self.rt_encoder`.
        mod_x
            Second input handed to `self.rt_encoder`.

        Returns
        -------
        The result of `self.rt_decoder` with dimension 1 squeezed out.
        """
        encoded = self.rt_encoder(aa_indices, mod_x)
        decoded = self.rt_decoder(self.dropout(encoded))
        return decoded.squeeze(1)




# legacy: older name kept as an alias bound to the same class object
# as `Model_RT_LSTM_CNN`.
Model_RT_LSTM = Model_RT_LSTM_CNN



[docs]
class AlphaRTModel(model_interface.ModelInterface):
    """
    `ModelInterface` for RT models
    """


[docs]
    def __init__(
        self,
        dropout=0.1,
        model_class: torch.nn.Module = Model_RT_LSTM_CNN,  # model defined above
        device: str = "gpu",
        **kwargs,
    ):
        """
        Initialise the interface, build the model and set the RT column names.

        Parameters
        ----------
        dropout : float, optional
            Forwarded to `self.build(...)` as `dropout`. Defaults to 0.1.
        model_class : torch.nn.Module, optional
            Model class handed to `self.build(...)`.
            Defaults to `Model_RT_LSTM_CNN`.
        device : str, optional
            Passed to the `ModelInterface` constructor. Defaults to "gpu".
        **kwargs
            Extra keyword arguments forwarded unchanged to `self.build(...)`.
        """
        super().__init__(device=device)
        # Placeholder with a type hint; presumably replaced by `build` below
        # -- confirm in `ModelInterface.build`.
        self.model: Model_RT_LSTM_CNN = None
        self.build(model_class, dropout=dropout, **kwargs)
        # Column written by prediction and column used as the training target.
        self.target_column_to_predict = "rt_pred"
        self.target_column_to_train = "rt_norm"



[docs]
    def test(
        self,
        precursor_df: pd.DataFrame,
        *,
        batch_size: int = 1024,
    ):
        """
        Predict RT for `precursor_df` and evaluate it against `rt_norm`.

        Parameters
        ----------
        precursor_df : pd.DataFrame
            Frame passed to `self.predict`.
        batch_size : int, optional
            Keyword-only; forwarded to `self.predict`. Defaults to 1024.

        Returns
        -------
        The result of `evaluate_linear_regression` called on the prediction
        output with `x="rt_pred"` and `y="rt_norm"`.
        """
        predicted_df = self.predict(precursor_df, batch_size=batch_size)
        return evaluate_linear_regression(predicted_df, x="rt_pred", y="rt_norm")


    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
    ):
        """
        Collect the two model inputs for one batch.

        Returns a 2-tuple: the result of `self._get_26aa_indice_features`
        followed by the result of `self._get_mod_features`, both called on
        `batch_df`.
        """
        aa_features = self._get_26aa_indice_features(batch_df)
        mod_features = self._get_mod_features(batch_df)
        return aa_features, mod_features


[docs]
    def add_irt_column_to_precursor_df(
        self,
        precursor_df: pd.DataFrame,
        irt_pep_df: pd.DataFrame = None,
    ):
        """
        Add an `irt_pred` column to `precursor_df`.

        RT is predicted for the iRT reference peptides, a linear regression
        of `rt_pred` against `irt` is obtained from
        `evaluate_linear_regression`, and its slope and intercept are applied
        to `precursor_df["rt_pred"]`.

        Parameters
        ----------
        precursor_df : pd.DataFrame
            Must already contain an `rt_pred` column. It is modified in
            place: the `irt_pred` column is added.
        irt_pep_df : pd.DataFrame, optional
            iRT reference peptides. If it has no `irt` column, its `rt`
            column is copied into `irt`. A frame supplied by the caller is
            modified in place, as before. When None, a copy of the
            module-level `IRT_PEPTIDE_DF` is used so that the shared table
            is never modified.

        Returns
        -------
        pd.DataFrame
            The same `precursor_df` object with the `irt_pred` column added.
        """
        if irt_pep_df is None:
            # This method reads `rt_pred` back from the frame it handed to
            # `predict`, i.e. `predict` writes into its argument. Work on a
            # private copy so the module-level table (and its legacy alias)
            # does not accumulate prediction columns across calls or models.
            irt_pep_df = IRT_PEPTIDE_DF.copy()
        print(f"Predict RT for {len(irt_pep_df)} iRT precursors.")
        self.predict(irt_pep_df)
        if "irt" not in irt_pep_df.columns:
            irt_pep_df["irt"] = irt_pep_df["rt"]
        eval_df = evaluate_linear_regression(irt_pep_df, "rt_pred", y="irt")
        print("Linear regression of `rt_pred` to `irt`:")
        print(eval_df)
        # The regression summary supplies the linear map rt_pred -> irt.
        slope = eval_df.slope.values[0]
        intercept = eval_df.intercept.values[0]
        precursor_df["irt_pred"] = precursor_df["rt_pred"] * slope + intercept
        return precursor_df