Building your own models for RT prediction

[ ]:
%reload_ext autoreload
%autoreload 2
[ ]:
from peptdeep.model.featurize import (
    get_batch_aa_indices,
    get_batch_mod_feature
)

from peptdeep.settings import model_const

import peptdeep.model.model_interface as model_base
import peptdeep.model.building_block as building_block

mod_feature_size = len(model_const['mod_elements'])

import torch
import pandas as pd
[ ]:
class RT_LSTM_Module(torch.nn.Module):
    """Retention-time regression network: CNN/LSTM encoder + linear decoder.

    The peptide is encoded by
    ``building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum`` (constructed with
    a width of 128), passed through dropout, and then mapped by
    ``building_block.Decoder_Linear`` (constructed with ``(128, 1)``).
    """

    def __init__(self, dropout=0.2):
        """Create the dropout, encoder and decoder sub-modules.

        Parameters
        ----------
        dropout : float, optional
            Probability handed to ``torch.nn.Dropout``. Defaults to 0.2.
        """
        super().__init__()

        # Single width shared by the encoder and the decoder.
        hidden_dim = 128

        self.dropout = torch.nn.Dropout(dropout)
        self.rt_encoder = building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum(
            hidden_dim
        )
        self.rt_decoder = building_block.Decoder_Linear(hidden_dim, 1)

    def forward(self, aa_indices, mod_x):
        """Predict one value per input peptide.

        Parameters
        ----------
        aa_indices : torch.Tensor
            Amino-acid index tensor, forwarded unchanged to the encoder.
        mod_x : torch.Tensor
            Modification feature tensor, forwarded unchanged to the encoder.

        Returns
        -------
        torch.Tensor
            Decoder output with dimension 1 squeezed away
            (assumes the decoder emits shape (batch, 1) -- TODO confirm).
        """
        encoded = self.rt_encoder(aa_indices, mod_x)
        regularized = self.dropout(encoded)
        decoded = self.rt_decoder(regularized)
        return torch.squeeze(decoded, 1)
[ ]:
class RT_Transformer_Module(torch.nn.Module):
    """Retention-time regression network: Transformer encoder + linear decoder.

    The peptide is encoded by
    ``building_block.Encoder_AA_Mod_Transformer_AttnSum`` (constructed with
    a width of 128), passed through dropout, and then mapped by
    ``building_block.Decoder_Linear`` (constructed with ``(128, 1)``).
    """

    def __init__(self, dropout=0.2):
        """Create the dropout, encoder and decoder sub-modules.

        Parameters
        ----------
        dropout : float, optional
            Probability handed to ``torch.nn.Dropout``. Defaults to 0.2.
        """
        super().__init__()

        # Single width shared by the encoder and the decoder.
        hidden_dim = 128

        self.dropout = torch.nn.Dropout(dropout)
        self.encoder = building_block.Encoder_AA_Mod_Transformer_AttnSum(
            hidden_dim
        )
        self.decoder = building_block.Decoder_Linear(hidden_dim, 1)

    def forward(self, aa_indices, mod_x):
        """Predict one value per input peptide.

        Parameters
        ----------
        aa_indices : torch.Tensor
            Amino-acid index tensor, forwarded unchanged to the encoder.
        mod_x : torch.Tensor
            Modification feature tensor, forwarded unchanged to the encoder.

        Returns
        -------
        torch.Tensor
            Decoder output with dimension 1 squeezed away
            (assumes the decoder emits shape (batch, 1) -- TODO confirm).
        """
        encoded = self.encoder(aa_indices, mod_x)
        regularized = self.dropout(encoded)
        decoded = self.decoder(regularized)
        return torch.squeeze(decoded, 1)
[ ]:
class RT_ModelInterface(model_base.ModelInterface):
    """Model interface that trains and applies a retention-time network.

    It builds the given ``torch.nn.Module`` class through ``self.build``,
    uses an L1 (mean absolute error) loss, reads training targets from the
    column named by ``self.target_column_to_train`` (``'rt_norm'``) and
    names ``'rt_pred'`` as the prediction column.
    """

    def __init__(self,
        model_class:torch.nn.Module=RT_LSTM_Module,
        dropout=0.1,
    ):
        """Build the network and configure loss and column names.

        Parameters
        ----------
        model_class : torch.nn.Module subclass, optional
            The module *class* (not an instance) handed to ``self.build``.
            Defaults to ``RT_LSTM_Module``.
        dropout : float, optional
            Forwarded to ``self.build`` as the ``dropout`` keyword.
            Defaults to 0.1.
        """
        super().__init__()
        self.build(
            model_class,
            dropout=dropout,
        )
        self.loss_func = torch.nn.L1Loss()
        self.target_column_to_train = 'rt_norm'
        self.target_column_to_predict = 'rt_pred'

    def _get_features_from_batch_df(self,
        batch_df: pd.DataFrame,
    ):
        """Turn a batch dataframe into the two input tensors of the network.

        Parameters
        ----------
        batch_df : pd.DataFrame
            Batch of peptides; must contain a ``'sequence'`` column and
            whatever columns ``get_batch_mod_feature`` reads.

        Returns
        -------
        tuple of torch.Tensor
            ``(aa_indices, mod_x)``: a ``LongTensor`` of amino-acid indices
            and a float tensor of modification features.
        """
        aa_indices = torch.LongTensor(
            get_batch_aa_indices(
                # Cast to a numpy unicode array before indexing the letters.
                batch_df['sequence'].values.astype('U')
            )
        )
        mod_x = torch.Tensor(
            get_batch_mod_feature(
                batch_df
            )
        )

        return aa_indices, mod_x

    def _get_targets_from_batch_df(self,
        batch_df: pd.DataFrame,
    ) -> torch.Tensor:
        """Return the training targets of a batch as a float tensor.

        The column is looked up through ``self.target_column_to_train``
        rather than a hard-coded name, so reassigning that attribute
        actually changes which column is used for training.

        Parameters
        ----------
        batch_df : pd.DataFrame
            Batch of peptides containing the target column.

        Returns
        -------
        torch.Tensor
            Values of the target column.
        """
        return torch.Tensor(batch_df[self.target_column_to_train].values)

Testing the RT model

Prepare training data

[ ]:
from peptdeep.model.rt import irt_pep
# Min-max scale the iRT values into [0, 1] and store them as the training
# target column. This adds the column to the imported `irt_pep` dataframe
# in place.
irt_values = irt_pep['irt']
irt_span = irt_values.max() - irt_values.min()
irt_pep['rt_norm'] = (irt_values - irt_values.min()) / irt_span
irt_pep
sequence pep_name irt mods mod_sites nAA rt_norm
0 LGGNEQVTR RT-pep a -24.92 9 0.000000
1 GAGSSEPVTGLDAK RT-pep b 0.00 14 0.199488
2 VEATFGVDESNAK RT-pep c 12.39 13 0.298671
3 YILAGVENSK RT-pep d 19.79 10 0.357909
4 TPVISGGPYEYR RT-pep e 28.71 12 0.429315
5 TPVITGAPYEYR RT-pep f 33.38 12 0.466699
6 DGLDAASYYAPVR RT-pep g 42.26 13 0.537784
7 ADVTPADFSEWSK RT-pep h 54.62 13 0.636728
8 GTFIIDPGGVIR RT-pep i 70.52 12 0.764009
9 GTFIIDPAAVIR RT-pep k 87.23 12 0.897775
10 LFLQFGAQGSPFLK RT-pep l 100.00 14 1.000000
[ ]:
# Wrap the LSTM-based module in the interface; `dropout` is not passed,
# so the interface's own default is used.
rt_model = RT_ModelInterface(model_class=RT_LSTM_Module)
Device `gpu` is not available, set to `cpu`

Test the untrained model

[ ]:
# Predict before any training has happened; the displayed result carries
# the predictions in the `rt_pred` column.
rt_model.predict(irt_pep)
sequence pep_name irt mods mod_sites nAA rt_norm rt_pred
0 LGGNEQVTR RT-pep a -24.92 9 0.000000 0.0
1 GAGSSEPVTGLDAK RT-pep b 0.00 14 0.199488 0.0
2 VEATFGVDESNAK RT-pep c 12.39 13 0.298671 0.0
3 YILAGVENSK RT-pep d 19.79 10 0.357909 0.0
4 TPVISGGPYEYR RT-pep e 28.71 12 0.429315 0.0
5 TPVITGAPYEYR RT-pep f 33.38 12 0.466699 0.0
6 DGLDAASYYAPVR RT-pep g 42.26 13 0.537784 0.0
7 ADVTPADFSEWSK RT-pep h 54.62 13 0.636728 0.0
8 GTFIIDPGGVIR RT-pep i 70.52 12 0.764009 0.0
9 GTFIIDPAAVIR RT-pep k 87.23 12 0.897775 0.0
10 LFLQFGAQGSPFLK RT-pep l 100.00 14 1.000000 0.0

Test if training works

[ ]:
# Fit the model on the iRT peptides with epoch=100 and verbose output off
# (presumably `epoch` is the number of training epochs -- confirm against
# ModelInterface.train).
rt_model.train(irt_pep, epoch=100, verbose=False)

Test if the model fits the irt_pep data

[ ]:
# Predict again on the training peptides so `rt_pred` can be compared
# with `rt_norm` side by side.
rt_model.predict(irt_pep)
sequence pep_name irt mods mod_sites nAA rt_norm rt_pred
0 LGGNEQVTR RT-pep a -24.92 9 0.000000 0.000000
1 GAGSSEPVTGLDAK RT-pep b 0.00 14 0.199488 0.209159
2 VEATFGVDESNAK RT-pep c 12.39 13 0.298671 0.293867
3 YILAGVENSK RT-pep d 19.79 10 0.357909 0.349884
4 TPVISGGPYEYR RT-pep e 28.71 12 0.429315 0.416145
5 TPVITGAPYEYR RT-pep f 33.38 12 0.466699 0.462958
6 DGLDAASYYAPVR RT-pep g 42.26 13 0.537784 0.540334
7 ADVTPADFSEWSK RT-pep h 54.62 13 0.636728 0.638801
8 GTFIIDPGGVIR RT-pep i 70.52 12 0.764009 0.725222
9 GTFIIDPAAVIR RT-pep k 87.23 12 0.897775 0.882472
10 LFLQFGAQGSPFLK RT-pep l 100.00 14 1.000000 0.962103

Get number of model parameters

[ ]:
# Parameter count of the currently wrapped (LSTM-based) model.
rt_model.get_parameter_num()
232448

It is easy to switch the model to a Transformer.

Users can plug in additional `nn.Module`s without redesigning the AA/PTM feature-extraction code.

[ ]:
# Rebind `rt_model` to a fresh interface built around the Transformer module;
# the feature-extraction code of the interface is reused unchanged.
rt_model = RT_ModelInterface(model_class=RT_Transformer_Module)
# Train with epoch=50 and warmup_epoch=20 (presumably a learning-rate
# warm-up phase -- confirm against ModelInterface.train).
rt_model.train(irt_pep, epoch=50, warmup_epoch=20)
# Show the predictions of the trained Transformer model.
rt_model.predict(irt_pep)
Device `gpu` is not available, set to `cpu`
sequence pep_name irt mods mod_sites nAA rt_norm rt_pred
0 LGGNEQVTR RT-pep a -24.92 9 0.000000 0.007334
1 GAGSSEPVTGLDAK RT-pep b 0.00 14 0.199488 0.209777
2 VEATFGVDESNAK RT-pep c 12.39 13 0.298671 0.350849
3 YILAGVENSK RT-pep d 19.79 10 0.357909 0.388612
4 TPVISGGPYEYR RT-pep e 28.71 12 0.429315 0.483431
5 TPVITGAPYEYR RT-pep f 33.38 12 0.466699 0.506625
6 DGLDAASYYAPVR RT-pep g 42.26 13 0.537784 0.578891
7 ADVTPADFSEWSK RT-pep h 54.62 13 0.636728 0.619564
8 GTFIIDPGGVIR RT-pep i 70.52 12 0.764009 0.818625
9 GTFIIDPAAVIR RT-pep k 87.23 12 0.897775 0.936355
10 LFLQFGAQGSPFLK RT-pep l 100.00 14 1.000000 1.094726
[ ]:
# Parameter count of the currently wrapped (Transformer-based) model.
rt_model.get_parameter_num()
817104
[ ]: