Building your own models for CCS prediction

[ ]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2 so
# that edits to imported modules are picked up live, without a kernel restart.
%reload_ext autoreload
%autoreload 2
[ ]:
from peptdeep.model.featurize import (
    get_batch_aa_indices,
    get_batch_mod_feature
)

from peptdeep.settings import model_const

import peptdeep.model.model_interface as model_base
import peptdeep.model.building_block as building_block

# Number of entries in model_const['mod_elements'].
# NOTE(review): presumably the width of the modification feature vector; no
# other cell visible in this notebook reads it -- confirm it is still needed.
mod_feature_size = len(model_const['mod_elements'])

import torch
import pandas as pd
[ ]:
class CCS_LSTM_Module(torch.nn.Module):
    """Network that maps a peptide precursor to a single CCS value.

    The inputs are encoded with
    ``building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum``, dropout is
    applied to the encoding, the charge feature is concatenated to it, and
    ``building_block.Decoder_Linear`` produces the final output.

    Parameters
    ----------
    dropout : float, optional
        Probability handed to ``torch.nn.Dropout``. Defaults to 0.1.
    hidden : int, optional
        Size handed to the encoder; the decoder is built with ``hidden + 1``
        because ``forward`` appends the charge column. Defaults to 128, the
        value that used to be hard-coded.
    """
    def __init__(self,
        dropout=0.1,
        hidden=128,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self.ccs_encoder = (
            building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum(
                hidden
            )
        )

        # hidden+1: `forward` concatenates `charges` onto the encoder output
        # before decoding.
        self.ccs_decoder = building_block.Decoder_Linear(
            hidden+1, 1
        )

    def forward(self,
        aa_indices,
        mod_x,
        charges,
    ):
        """Predict one value per input row.

        Parameters
        ----------
        aa_indices, mod_x, charges
            Tensors passed straight to the encoder. ``charges`` is also
            concatenated to the encoding along dim 1, so it must be 2-D with
            the same first dimension as the encoder output
            (``CCS_ModelInterface`` supplies it as a ``(batch, 1)`` column).

        Returns
        -------
        torch.Tensor
            Decoder output with dim 1 squeezed away.
        """
        x = self.ccs_encoder(aa_indices, mod_x, charges)
        x = self.dropout(x)
        # Give the decoder direct access to the charge feature.
        x = torch.cat((x, charges), 1)
        return self.ccs_decoder(x).squeeze(1)
[ ]:
class CCS_ModelInterface(model_base.ModelInterface):
    """``ModelInterface`` subclass wiring ``CCS_LSTM_Module`` to DataFrames.

    Training targets are read from the column named by
    ``self._target_column_to_train`` (``'ccs'``) and predictions are stored in
    the column named by ``self._target_column_to_predict`` (``'ccs_pred'``).
    """
    def __init__(self,
        model_class:torch.nn.Module=CCS_LSTM_Module,
        dropout=0.1,
    ):
        """Build the model and configure loss and column names.

        Parameters
        ----------
        model_class : torch.nn.Module subclass, optional
            Class forwarded to ``self.build``. Defaults to ``CCS_LSTM_Module``.
        dropout : float, optional
            Forwarded to ``self.build`` as the ``dropout`` keyword.
            Defaults to 0.1.
        """
        super().__init__()
        self.build(
            model_class,
            dropout=dropout,
        )
        self.loss_func = torch.nn.L1Loss()
        self._target_column_to_train = 'ccs'
        self._target_column_to_predict = 'ccs_pred'

    def _prepare_predict_data_df(self,
        precursor_df:pd.DataFrame,
    ):
        """Initialise the prediction column and remember the DataFrame.

        The column is written into ``precursor_df`` itself (no copy is made),
        and the same object is stored as ``self.predict_df``.
        """
        precursor_df[self._target_column_to_predict] = 0.
        self.predict_df = precursor_df

    def _get_features_from_batch_df(self,
        batch_df: pd.DataFrame,
    ):
        """Turn one batch of rows into the model's three input tensors.

        Returns
        -------
        tuple
            ``(aa_indices, mod_x, charges)``: a ``LongTensor`` built from the
            ``'sequence'`` column, a tensor built by ``get_batch_mod_feature``,
            and the ``'charge'`` column as a ``(batch, 1)`` tensor scaled
            by 0.1.
        """
        aa_indices = torch.LongTensor(
            get_batch_aa_indices(
                batch_df['sequence'].values.astype('U')
            )
        )

        mod_x = torch.Tensor(
            get_batch_mod_feature(
                batch_df
            )
        )

        # unsqueeze(1) makes charges a column so the model can concatenate it
        # along dim 1.
        charges = torch.Tensor(
            batch_df['charge'].values
        ).unsqueeze(1)*0.1

        return aa_indices, mod_x, charges

    def _get_targets_from_batch_df(self,
        batch_df: pd.DataFrame,
    ) -> torch.Tensor:
        """Return the training targets of one batch as a tensor.

        The column is looked up through ``self._target_column_to_train``
        rather than a repeated string literal, so the attribute set in
        ``__init__`` is the single place that names the target column.
        """
        return torch.Tensor(batch_df[self._target_column_to_train].values)

Testing the CCS model

Prepare training data

[ ]:
from peptdeep.model.rt import irt_pep
# Work on a private copy: adding columns to the object exported by
# `peptdeep.model.rt` would change it for every other user of that module.
irt_pep = irt_pep.copy()
# virtual ccs values for training: 0.0, 1.0, 2.0, ... (one per row)
irt_pep['ccs'] = pd.RangeIndex(0, len(irt_pep)).values.astype(float)
# the same charge state for every peptide
irt_pep['charge'] = 2
irt_pep
sequence pep_name irt mods mod_sites nAA ccs charge
0 LGGNEQVTR RT-pep a -24.92 9 0.0 2
1 GAGSSEPVTGLDAK RT-pep b 0.00 14 1.0 2
2 VEATFGVDESNAK RT-pep c 12.39 13 2.0 2
3 YILAGVENSK RT-pep d 19.79 10 3.0 2
4 TPVISGGPYEYR RT-pep e 28.71 12 4.0 2
5 TPVITGAPYEYR RT-pep f 33.38 12 5.0 2
6 DGLDAASYYAPVR RT-pep g 42.26 13 6.0 2
7 ADVTPADFSEWSK RT-pep h 54.62 13 7.0 2
8 GTFIIDPGGVIR RT-pep i 70.52 12 8.0 2
9 GTFIIDPAAVIR RT-pep k 87.23 12 9.0 2
10 LFLQFGAQGSPFLK RT-pep l 100.00 14 10.0 2
[ ]:
# Seed PyTorch's global RNG before the model is created so that torch-driven
# randomness (e.g. weight initialisation, dropout) repeats on a fresh
# "Restart Kernel -> Run All".
# NOTE(review): randomness drawn from other generators inside `train`
# (e.g. batch shuffling) is not covered by this seed -- confirm if exact
# reproducibility is required.
torch.manual_seed(0)
ccs_model = CCS_ModelInterface()
Device `gpu` is not available, set to `cpu`

Test the untrained model

Test if training works

[ ]:
# Fit the model on the small irt_pep table (virtual `ccs` targets) for 100
# epochs, with verbose output switched off.
ccs_model.train(irt_pep, epoch=100, verbose=False)

Test whether the model fits the virtual CCS values (the `ccs_pred` column should track `ccs`)

[ ]:
# Predict on the same peptides used for training; the displayed frame gains a
# `ccs_pred` column that can be compared row by row with `ccs`.
ccs_model.predict(irt_pep)
sequence pep_name irt mods mod_sites nAA ccs charge ccs_pred
0 LGGNEQVTR RT-pep a -24.92 9 0.0 2 0.000000
1 GAGSSEPVTGLDAK RT-pep b 0.00 14 1.0 2 1.068118
2 VEATFGVDESNAK RT-pep c 12.39 13 2.0 2 2.132658
3 YILAGVENSK RT-pep d 19.79 10 3.0 2 3.344304
4 TPVISGGPYEYR RT-pep e 28.71 12 4.0 2 4.412459
5 TPVITGAPYEYR RT-pep f 33.38 12 5.0 2 5.313079
6 DGLDAASYYAPVR RT-pep g 42.26 13 6.0 2 7.875914
7 ADVTPADFSEWSK RT-pep h 54.62 13 7.0 2 7.989055
8 GTFIIDPGGVIR RT-pep i 70.52 12 8.0 2 8.029768
9 GTFIIDPAAVIR RT-pep k 87.23 12 9.0 2 8.443041
10 LFLQFGAQGSPFLK RT-pep l 100.00 14 10.0 2 8.817308