Building your own models for CCS prediction

[ ]:

%reload_ext autoreload
%autoreload 2

[ ]:

from peptdeep.model.featurize import (
    get_batch_aa_indices,
    get_batch_mod_feature
)

from peptdeep.settings import model_const

import peptdeep.model.model_interface as model_base
import peptdeep.model.building_block as building_block

# Number of entries in model_const['mod_elements'].
# NOTE(review): not referenced again in the cells shown here — confirm whether it is still needed.
mod_feature_size = len(model_const['mod_elements'])

import torch
import pandas as pd

[ ]:

class CCS_LSTM_Module(torch.nn.Module):
    """Torch module that maps encoded peptides to one CCS value each.

    The module chains three steps: a
    `building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum` encoder,
    dropout, and a `building_block.Decoder_Linear` head that receives the
    encoder output with the charge column appended.

    Parameters
    ----------
    dropout : float, optional
        Probability passed to `torch.nn.Dropout`. Defaults to 0.1.
    hidden : int, optional
        Size passed to the encoder; the decoder is built with
        `hidden + 1` inputs. Defaults to 128, the value that used to be
        hard-coded here.
    """
    def __init__(self,
        dropout=0.1,
        hidden=128,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self.ccs_encoder = (
            building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum(
                hidden
            )
        )

        # +1 because forward() concatenates the charge column to the
        # encoder output before decoding.
        self.ccs_decoder = building_block.Decoder_Linear(
            hidden+1, 1
        )

    def forward(self,
        aa_indices,
        mod_x,
        charges,
    ):
        """Predict one value per row of the batch.

        Parameters
        ----------
        aa_indices, mod_x
            Passed unchanged to the encoder.
            NOTE(review): expected layouts are defined by the encoder in
            `building_block` — confirm there.
        charges : torch.Tensor
            Passed to the encoder and concatenated to its output along
            dim 1, so it must be 2-D with the same number of rows as the
            encoder output (presumably one column, matching the
            `hidden + 1` decoder input — TODO confirm).

        Returns
        -------
        torch.Tensor
            Decoder output with dim 1 squeezed out.
        """
        x = self.ccs_encoder(aa_indices, mod_x, charges)
        x = self.dropout(x)
        # Give the linear head direct access to the charge feature.
        x = torch.cat((x, charges),1)
        return self.ccs_decoder(x).squeeze(1)

[ ]:

class CCS_ModelInterface(model_base.ModelInterface):
    """`ModelInterface` subclass that trains and predicts CCS values.

    Training targets are read from the column named by
    `self._target_column_to_train` ('ccs'); predictions are written to the
    column named by `self._target_column_to_predict` ('ccs_pred').

    Parameters
    ----------
    model_class : torch.nn.Module subclass, optional
        Class (not instance) handed to `self.build`. Defaults to
        `CCS_LSTM_Module`.
    dropout : float, optional
        Forwarded to `self.build` as a keyword argument. Defaults to 0.1.
    """
    def __init__(self,
        model_class:torch.nn.Module=CCS_LSTM_Module,
        dropout=0.1,
    ):
        super().__init__()
        self.build(
            model_class,
            dropout=dropout,
        )
        # Mean absolute error between predicted and target values.
        self.loss_func = torch.nn.L1Loss()
        self._target_column_to_train = 'ccs'
        self._target_column_to_predict = 'ccs_pred'

    def _prepare_predict_data_df(self,
        precursor_df:pd.DataFrame,
    ):
        """Initialise the prediction column and remember the frame.

        The prediction column is set to 0. on `precursor_df` itself (the
        caller's frame is modified in place) and the same object is kept
        as `self.predict_df`.
        """
        precursor_df[self._target_column_to_predict] = 0.
        self.predict_df = precursor_df

    def _get_features_from_batch_df(self,
        batch_df: pd.DataFrame,
    ):
        """Build the model inputs for one batch.

        Reads the 'sequence' and 'charge' columns of `batch_df`; the whole
        frame is additionally passed to `get_batch_mod_feature`.
        NOTE(review): which columns `get_batch_mod_feature` needs is not
        visible here — confirm against peptdeep.

        Returns
        -------
        tuple
            `(aa_indices, mod_x, charges)` in the argument order of the
            model's `forward`.
        """
        aa_indices = torch.LongTensor(
            get_batch_aa_indices(
                batch_df['sequence'].values.astype('U')
            )
        )

        mod_x = torch.Tensor(
            get_batch_mod_feature(
                batch_df
            )
        )

        # One column per row, scaled by 0.1.
        charges = torch.Tensor(
            batch_df['charge'].values
        ).unsqueeze(1)*0.1

        return aa_indices, mod_x, charges

    def _get_targets_from_batch_df(self,
        batch_df: pd.DataFrame,
    ) -> torch.Tensor:
        """Return the training targets of one batch as a tensor.

        The column is looked up through `self._target_column_to_train`
        rather than a repeated literal, so the name set in `__init__` is
        the single source of truth.
        """
        return torch.Tensor(
            batch_df[self._target_column_to_train].values
        )

Testing the CCS model

Prepare training data

[ ]:

from peptdeep.model.rt import irt_pep
# `irt_pep` is a DataFrame object that belongs to peptdeep.model.rt. Build a
# new frame instead of adding columns to it in place, so the library's
# shared object stays untouched and this cell gives the same result every
# time it is re-run.
# virtual ccs values for training
irt_pep = irt_pep.assign(
    ccs=pd.RangeIndex(0, len(irt_pep)).values.astype(float),
    charge=2,
)
irt_pep

	sequence	pep_name	irt	nAA	ccs	charge
0	LGGNEQVTR	RT-pep a	-24.92	9	0.0	2
1	GAGSSEPVTGLDAK	RT-pep b	0.00	14	1.0	2
2	VEATFGVDESNAK	RT-pep c	12.39	13	2.0	2
3	YILAGVENSK	RT-pep d	19.79	10	3.0	2
4	TPVISGGPYEYR	RT-pep e	28.71	12	4.0	2
5	TPVITGAPYEYR	RT-pep f	33.38	12	5.0	2
6	DGLDAASYYAPVR	RT-pep g	42.26	13	6.0	2
7	ADVTPADFSEWSK	RT-pep h	54.62	13	7.0	2
8	GTFIIDPGGVIR	RT-pep i	70.52	12	8.0	2
9	GTFIIDPAAVIR	RT-pep k	87.23	12	9.0	2
10	LFLQFGAQGSPFLK	RT-pep l	100.00	14	10.0	2

[ ]:

# Instantiate the interface with its defaults (CCS_LSTM_Module, dropout=0.1).
ccs_model = CCS_ModelInterface()

Device `gpu` is not available, set to `cpu`

Test the untrained model

Test if training works

[ ]:

# Fit on the virtual ccs targets for 100 epochs with verbose output turned off.
ccs_model.train(irt_pep, epoch=100, verbose=False)

Check whether the model fits the virtual CCS values

[ ]:

# Display the prediction result; the table includes the new 'ccs_pred' column.
ccs_model.predict(irt_pep)

	sequence	pep_name	irt	nAA	ccs	charge	ccs_pred
0	LGGNEQVTR	RT-pep a	-24.92	9	0.0	2	0.000000
1	GAGSSEPVTGLDAK	RT-pep b	0.00	14	1.0	2	1.068118
2	VEATFGVDESNAK	RT-pep c	12.39	13	2.0	2	2.132658
3	YILAGVENSK	RT-pep d	19.79	10	3.0	2	3.344304
4	TPVISGGPYEYR	RT-pep e	28.71	12	4.0	2	4.412459
5	TPVITGAPYEYR	RT-pep f	33.38	12	5.0	2	5.313079
6	DGLDAASYYAPVR	RT-pep g	42.26	13	6.0	2	7.875914
7	ADVTPADFSEWSK	RT-pep h	54.62	13	7.0	2	7.989055
8	GTFIIDPGGVIR	RT-pep i	70.52	12	8.0	2	8.029768
9	GTFIIDPAAVIR	RT-pep k	87.23	12	9.0	2	8.443041
10	LFLQFGAQGSPFLK	RT-pep l	100.00	14	10.0	2	8.817308