Building your own models for RT prediction¶

[ ]:

%reload_ext autoreload
%autoreload 2

[ ]:

from peptdeep.model.featurize import (
    get_batch_aa_indices,
    get_batch_mod_feature
)

from peptdeep.settings import model_const

import peptdeep.model.model_interface as model_base
import peptdeep.model.building_block as building_block

# Number of entries in model_const['mod_elements'].
# NOTE(review): not referenced again in the visible cells of this notebook.
mod_feature_size = len(model_const['mod_elements'])

import torch
import pandas as pd

[ ]:

class RT_LSTM_Module(torch.nn.Module):
    """Retention-time regressor built from peptdeep building blocks.

    The forward pass runs ``Encoder_26AA_Mod_CNN_LSTM_AttnSum`` on the
    amino-acid indices and modification features, applies dropout, and
    feeds the result to ``Decoder_Linear`` with a single output.

    Parameters
    ----------
    dropout : float
        Probability passed to ``torch.nn.Dropout``. Defaults to 0.2.
    """
    def __init__(self, dropout=0.2):
        super().__init__()

        # Width shared by the encoder output and the decoder input.
        hidden_dim = 128

        self.dropout = torch.nn.Dropout(dropout)
        self.rt_encoder = building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum(
            hidden_dim
        )
        self.rt_decoder = building_block.Decoder_Linear(hidden_dim, 1)

    def forward(self, aa_indices, mod_x):
        """Return the decoder output with dimension 1 squeezed away.

        NOTE(review): presumably the decoder yields ``(batch, 1)`` so the
        result is one value per peptide -- confirm against ``Decoder_Linear``.
        """
        encoded = self.dropout(self.rt_encoder(aa_indices, mod_x))
        return self.rt_decoder(encoded).squeeze(1)

[ ]:

class RT_Transformer_Module(torch.nn.Module):
    """Transformer-based counterpart of the LSTM retention-time module.

    The forward pass runs ``Encoder_AA_Mod_Transformer_AttnSum`` on the
    amino-acid indices and modification features, applies dropout, and
    feeds the result to ``Decoder_Linear`` with a single output.

    Parameters
    ----------
    dropout : float
        Probability passed to ``torch.nn.Dropout``. Defaults to 0.2.
    """
    def __init__(self, dropout=0.2):
        super().__init__()

        # Width shared by the encoder output and the decoder input.
        hidden_dim = 128

        self.dropout = torch.nn.Dropout(dropout)
        self.encoder = building_block.Encoder_AA_Mod_Transformer_AttnSum(
            hidden_dim
        )
        self.decoder = building_block.Decoder_Linear(hidden_dim, 1)

    def forward(self, aa_indices, mod_x):
        """Return the decoder output with dimension 1 squeezed away.

        NOTE(review): presumably the decoder yields ``(batch, 1)`` so the
        result is one value per peptide -- confirm against ``Decoder_Linear``.
        """
        encoded = self.dropout(self.encoder(aa_indices, mod_x))
        return self.decoder(encoded).squeeze(1)

[ ]:

class RT_ModelInterface(model_base.ModelInterface):
    """Training/prediction wrapper around a retention-time module.

    Extends ``model_base.ModelInterface`` by building the given module class,
    choosing an L1 loss, naming the target/prediction columns, and providing
    the two hooks that convert a batch DataFrame into tensors.
    """
    def __init__(self,
        model_class:torch.nn.Module=RT_LSTM_Module,
        dropout=0.1,
    ):
        """
        Parameters
        ----------
        model_class : torch.nn.Module subclass
            The module *class* (not an instance) handed to ``self.build``.
            Defaults to ``RT_LSTM_Module``.
        dropout : float
            Forwarded to ``self.build`` as the ``dropout`` keyword.
            Defaults to 0.1.
        """
        super().__init__()
        self.build(
            model_class,
            dropout=dropout,
        )
        self.loss_func = torch.nn.L1Loss()
        # Column read as the training target, and column name used for
        # predictions, respectively.
        self.target_column_to_train = 'rt_norm'
        self.target_column_to_predict = 'rt_pred'

    def _get_features_from_batch_df(self,
        batch_df: pd.DataFrame,
    ):
        """Build the ``(aa_indices, mod_x)`` input tensors for one batch.

        ``aa_indices`` is a ``LongTensor`` made from the ``'sequence'`` column
        (cast to unicode) via ``get_batch_aa_indices``; ``mod_x`` is a float
        tensor made from ``get_batch_mod_feature(batch_df)``. The tuple order
        matches the ``forward(aa_indices, mod_x)`` signature of the modules
        defined in this notebook.
        """
        aa_indices = torch.LongTensor(
            get_batch_aa_indices(
                batch_df['sequence'].values.astype('U')
            )
        )
        mod_x = torch.Tensor(
            get_batch_mod_feature(
                batch_df
            )
        )

        return aa_indices, mod_x

    def _get_targets_from_batch_df(self,
        batch_df: pd.DataFrame,
    ) -> torch.Tensor:
        """Return the training targets of one batch as a float tensor.

        Reads the column named by ``self.target_column_to_train`` rather than
        a hard-coded name, so changing that attribute actually changes the
        values the model is trained against.
        """
        return torch.Tensor(batch_df[self.target_column_to_train].values)

Testing the RT model¶

Prepare training data¶

[ ]:

# Peptide table imported from peptdeep.model.rt (columns shown in the output below).
from peptdeep.model.rt import irt_pep
# Min-max scale the `irt` column into [0, 1] and store it as `rt_norm`,
# the column RT_ModelInterface uses as its training target.
# Note: this adds the column to the imported DataFrame object in place.
irt_pep['rt_norm'] = (irt_pep.irt - irt_pep.irt.min())/(irt_pep.irt.max()-irt_pep.irt.min())
# Last expression of the cell: display the table.
irt_pep

	sequence	pep_name	irt	nAA	rt_norm
0	LGGNEQVTR	RT-pep a	-24.92	9	0.000000
1	GAGSSEPVTGLDAK	RT-pep b	0.00	14	0.199488
2	VEATFGVDESNAK	RT-pep c	12.39	13	0.298671
3	YILAGVENSK	RT-pep d	19.79	10	0.357909
4	TPVISGGPYEYR	RT-pep e	28.71	12	0.429315
5	TPVITGAPYEYR	RT-pep f	33.38	12	0.466699
6	DGLDAASYYAPVR	RT-pep g	42.26	13	0.537784
7	ADVTPADFSEWSK	RT-pep h	54.62	13	0.636728
8	GTFIIDPGGVIR	RT-pep i	70.52	12	0.764009
9	GTFIIDPAAVIR	RT-pep k	87.23	12	0.897775
10	LFLQFGAQGSPFLK	RT-pep l	100.00	14	1.000000

[ ]:

# Build the interface around the LSTM module (dropout left at the interface default of 0.1).
rt_model = RT_ModelInterface(model_class=RT_LSTM_Module)

Device `gpu` is not available, set to `cpu`

Test the untrained model¶

[ ]:

# Call predict before any training, to check that the freshly built model runs on the table.
rt_model.predict(irt_pep)

	sequence	pep_name	irt	nAA	rt_norm
0	LGGNEQVTR	RT-pep a	-24.92	9	0.000000
1	GAGSSEPVTGLDAK	RT-pep b	0.00	14	0.199488
2	VEATFGVDESNAK	RT-pep c	12.39	13	0.298671
3	YILAGVENSK	RT-pep d	19.79	10	0.357909
4	TPVISGGPYEYR	RT-pep e	28.71	12	0.429315
5	TPVITGAPYEYR	RT-pep f	33.38	12	0.466699
6	DGLDAASYYAPVR	RT-pep g	42.26	13	0.537784
7	ADVTPADFSEWSK	RT-pep h	54.62	13	0.636728
8	GTFIIDPGGVIR	RT-pep i	70.52	12	0.764009
9	GTFIIDPAAVIR	RT-pep k	87.23	12	0.897775
10	LFLQFGAQGSPFLK	RT-pep l	100.00	14	1.000000

Test if training works¶

[ ]:

# Train on the 11-row table for 100 epochs.
# NOTE(review): verbose=False presumably suppresses per-epoch logging -- confirm against ModelInterface.train.
rt_model.train(irt_pep, epoch=100, verbose=False)

Test if the model fits the irt_pep data¶

[ ]:

# Predict on the training table itself; compare `rt_pred` with `rt_norm` in the output.
rt_model.predict(irt_pep)

	sequence	pep_name	irt	nAA	rt_norm	rt_pred
0	LGGNEQVTR	RT-pep a	-24.92	9	0.000000	0.000000
1	GAGSSEPVTGLDAK	RT-pep b	0.00	14	0.199488	0.209159
2	VEATFGVDESNAK	RT-pep c	12.39	13	0.298671	0.293867
3	YILAGVENSK	RT-pep d	19.79	10	0.357909	0.349884
4	TPVISGGPYEYR	RT-pep e	28.71	12	0.429315	0.416145
5	TPVITGAPYEYR	RT-pep f	33.38	12	0.466699	0.462958
6	DGLDAASYYAPVR	RT-pep g	42.26	13	0.537784	0.540334
7	ADVTPADFSEWSK	RT-pep h	54.62	13	0.636728	0.638801
8	GTFIIDPGGVIR	RT-pep i	70.52	12	0.764009	0.725222
9	GTFIIDPAAVIR	RT-pep k	87.23	12	0.897775	0.882472
10	LFLQFGAQGSPFLK	RT-pep l	100.00	14	1.000000	0.962103

Get number of model parameters¶

[ ]:

# Query the parameter count of the current (LSTM-based) model.
rt_model.get_parameter_num()

It is easy to switch the model to a Transformer.¶

Users can plug in other nn.Module classes without redesigning the AA/PTM feature-extraction parts.¶

[ ]:

# Rebind rt_model to a new interface built around the Transformer module;
# the LSTM-based interface created earlier is no longer referenced by this name.
rt_model = RT_ModelInterface(model_class=RT_Transformer_Module)
# Train for 50 epochs.
# NOTE(review): warmup_epoch=20 is presumably a learning-rate warmup length -- confirm against ModelInterface.train.
rt_model.train(irt_pep, epoch=50, warmup_epoch=20)
# Predict on the training table; the result is displayed as the cell output.
rt_model.predict(irt_pep)

Device `gpu` is not available, set to `cpu`

	sequence	pep_name	irt	nAA	rt_norm	rt_pred
0	LGGNEQVTR	RT-pep a	-24.92	9	0.000000	0.007334
1	GAGSSEPVTGLDAK	RT-pep b	0.00	14	0.199488	0.209777
2	VEATFGVDESNAK	RT-pep c	12.39	13	0.298671	0.350849
3	YILAGVENSK	RT-pep d	19.79	10	0.357909	0.388612
4	TPVISGGPYEYR	RT-pep e	28.71	12	0.429315	0.483431
5	TPVITGAPYEYR	RT-pep f	33.38	12	0.466699	0.506625
6	DGLDAASYYAPVR	RT-pep g	42.26	13	0.537784	0.578891
7	ADVTPADFSEWSK	RT-pep h	54.62	13	0.636728	0.619564
8	GTFIIDPGGVIR	RT-pep i	70.52	12	0.764009	0.818625
9	GTFIIDPAAVIR	RT-pep k	87.23	12	0.897775	0.936355
10	LFLQFGAQGSPFLK	RT-pep l	100.00	14	1.000000	1.094726

[ ]:

# Query the parameter count of the current (Transformer-based) model.
rt_model.get_parameter_num()

[ ]: