{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Building your own models for CCS prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from peptdeep.model.featurize import (\n",
" get_batch_aa_indices, \n",
" get_batch_mod_feature\n",
")\n",
"\n",
"from peptdeep.settings import model_const\n",
"\n",
"import peptdeep.model.model_interface as model_base\n",
"import peptdeep.model.building_block as building_block\n",
"\n",
"mod_feature_size = len(model_const['mod_elements'])\n",
"\n",
"import torch\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class CCS_LSTM_Module(torch.nn.Module):\n",
"    \"\"\"CCS regression network: sequence/modification/charge encoder plus a linear head.\n",
"\n",
"    The encoder output is passed through dropout, the charge feature is\n",
"    appended along dim 1, and a linear decoder maps the result to one value\n",
"    per sample.\n",
"    \"\"\"\n",
"    def __init__(self, dropout=0.1):\n",
"        \"\"\"\n",
"        Parameters\n",
"        ----------\n",
"        dropout : float\n",
"            Dropout probability applied to the encoder output. Defaults to 0.1.\n",
"        \"\"\"\n",
"        super().__init__()\n",
"\n",
"        # Width of the encoded peptide representation.\n",
"        encoder_dim = 128\n",
"\n",
"        self.dropout = torch.nn.Dropout(dropout)\n",
"\n",
"        self.ccs_encoder = building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum(\n",
"            encoder_dim\n",
"        )\n",
"\n",
"        # `+ 1` because forward() concatenates the charge column onto the\n",
"        # encoder output before decoding.\n",
"        self.ccs_decoder = building_block.Decoder_Linear(encoder_dim + 1, 1)\n",
"\n",
"    def forward(self, aa_indices, mod_x, charges):\n",
"        \"\"\"Predict one CCS value per sample.\n",
"\n",
"        `charges` is also concatenated to the encoder output along dim 1, so it\n",
"        must be 2-D -- assumes shape (batch, 1), as produced by\n",
"        `unsqueeze(1)` in the model interface; TODO confirm for other callers.\n",
"\n",
"        Returns the decoder output with dim 1 squeezed out.\n",
"        \"\"\"\n",
"        encoded = self.ccs_encoder(aa_indices, mod_x, charges)\n",
"        encoded = self.dropout(encoded)\n",
"        decoder_input = torch.cat((encoded, charges), 1)\n",
"        prediction = self.ccs_decoder(decoder_input)\n",
"        return prediction.squeeze(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class CCS_ModelInterface(model_base.ModelInterface):\n",
"    \"\"\"Model interface that trains and predicts CCS values with `CCS_LSTM_Module`.\n",
"\n",
"    Targets are read from the column named by `self._target_column_to_train`\n",
"    ('ccs' by default) and the prediction column is\n",
"    `self._target_column_to_predict` ('ccs_pred' by default).\n",
"    \"\"\"\n",
"    def __init__(self, \n",
"        model_class:torch.nn.Module=CCS_LSTM_Module,\n",
"        dropout=0.1,\n",
"    ):\n",
"        \"\"\"\n",
"        Parameters\n",
"        ----------\n",
"        model_class : torch.nn.Module subclass\n",
"            Model class handed to `self.build`. Defaults to `CCS_LSTM_Module`.\n",
"        dropout : float\n",
"            Forwarded to `self.build` as the `dropout` keyword. Defaults to 0.1.\n",
"        \"\"\"\n",
"        super().__init__()\n",
"        self.build(\n",
"            model_class,\n",
"            dropout=dropout,\n",
"        )\n",
"        self.loss_func = torch.nn.L1Loss()\n",
"        self._target_column_to_train = 'ccs'\n",
"        self._target_column_to_predict = 'ccs_pred'\n",
"\n",
"    def _prepare_predict_data_df(self,\n",
"        precursor_df:pd.DataFrame,\n",
"    ):\n",
"        \"\"\"Initialise the prediction column to 0.0 and keep the frame as `self.predict_df`.\n",
"\n",
"        Note that `precursor_df` is modified in place (the column is added to\n",
"        the caller's DataFrame).\n",
"        \"\"\"\n",
"        precursor_df[self._target_column_to_predict] = 0.\n",
"        self.predict_df = precursor_df\n",
"\n",
"    def _get_features_from_batch_df(self, \n",
"        batch_df: pd.DataFrame,\n",
"    ):\n",
"        \"\"\"Build the model inputs for one batch.\n",
"\n",
"        Returns\n",
"        -------\n",
"        tuple\n",
"            `(aa_indices, mod_x, charges)`: a LongTensor built from the\n",
"            'sequence' column, a float Tensor of modification features, and\n",
"            the 'charge' column as a Tensor with an extra dim 1, scaled by 0.1.\n",
"        \"\"\"\n",
"        aa_indices = torch.LongTensor(\n",
"            get_batch_aa_indices(\n",
"                batch_df['sequence'].values.astype('U')\n",
"            )\n",
"        )\n",
"\n",
"        mod_x = torch.Tensor(\n",
"            get_batch_mod_feature(\n",
"                batch_df\n",
"            )\n",
"        )\n",
"\n",
"        # unsqueeze(1) makes the charge a column so the model can concatenate\n",
"        # it along dim 1. NOTE(review): the 0.1 factor presumably keeps the\n",
"        # charge on a scale similar to the other features -- confirm.\n",
"        charges = torch.Tensor(\n",
"            batch_df['charge'].values\n",
"        ).unsqueeze(1)*0.1\n",
"\n",
"        return aa_indices, mod_x, charges\n",
"\n",
"    def _get_targets_from_batch_df(self, \n",
"        batch_df: pd.DataFrame,\n",
"    ) -> torch.Tensor:\n",
"        \"\"\"Return the training targets of one batch as a 1-D float Tensor.\n",
"\n",
"        The column is looked up through `self._target_column_to_train` rather\n",
"        than a hard-coded name, so it stays consistent with the attribute set\n",
"        in `__init__`.\n",
"        \"\"\"\n",
"        return torch.Tensor(batch_df[self._target_column_to_train].values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Testing the CCS model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare training data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sequence | \n",
" pep_name | \n",
" irt | \n",
" mods | \n",
" mod_sites | \n",
" nAA | \n",
" ccs | \n",
" charge | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" LGGNEQVTR | \n",
" RT-pep a | \n",
" -24.92 | \n",
" | \n",
" | \n",
" 9 | \n",
" 0.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 1 | \n",
" GAGSSEPVTGLDAK | \n",
" RT-pep b | \n",
" 0.00 | \n",
" | \n",
" | \n",
" 14 | \n",
" 1.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" VEATFGVDESNAK | \n",
" RT-pep c | \n",
" 12.39 | \n",
" | \n",
" | \n",
" 13 | \n",
" 2.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" YILAGVENSK | \n",
" RT-pep d | \n",
" 19.79 | \n",
" | \n",
" | \n",
" 10 | \n",
" 3.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 4 | \n",
" TPVISGGPYEYR | \n",
" RT-pep e | \n",
" 28.71 | \n",
" | \n",
" | \n",
" 12 | \n",
" 4.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 5 | \n",
" TPVITGAPYEYR | \n",
" RT-pep f | \n",
" 33.38 | \n",
" | \n",
" | \n",
" 12 | \n",
" 5.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 6 | \n",
" DGLDAASYYAPVR | \n",
" RT-pep g | \n",
" 42.26 | \n",
" | \n",
" | \n",
" 13 | \n",
" 6.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 7 | \n",
" ADVTPADFSEWSK | \n",
" RT-pep h | \n",
" 54.62 | \n",
" | \n",
" | \n",
" 13 | \n",
" 7.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 8 | \n",
" GTFIIDPGGVIR | \n",
" RT-pep i | \n",
" 70.52 | \n",
" | \n",
" | \n",
" 12 | \n",
" 8.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 9 | \n",
" GTFIIDPAAVIR | \n",
" RT-pep k | \n",
" 87.23 | \n",
" | \n",
" | \n",
" 12 | \n",
" 9.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 10 | \n",
" LFLQFGAQGSPFLK | \n",
" RT-pep l | \n",
" 100.00 | \n",
" | \n",
" | \n",
" 14 | \n",
" 10.0 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sequence pep_name irt mods mod_sites nAA ccs charge\n",
"0 LGGNEQVTR RT-pep a -24.92 9 0.0 2\n",
"1 GAGSSEPVTGLDAK RT-pep b 0.00 14 1.0 2\n",
"2 VEATFGVDESNAK RT-pep c 12.39 13 2.0 2\n",
"3 YILAGVENSK RT-pep d 19.79 10 3.0 2\n",
"4 TPVISGGPYEYR RT-pep e 28.71 12 4.0 2\n",
"5 TPVITGAPYEYR RT-pep f 33.38 12 5.0 2\n",
"6 DGLDAASYYAPVR RT-pep g 42.26 13 6.0 2\n",
"7 ADVTPADFSEWSK RT-pep h 54.62 13 7.0 2\n",
"8 GTFIIDPGGVIR RT-pep i 70.52 12 8.0 2\n",
"9 GTFIIDPAAVIR RT-pep k 87.23 12 9.0 2\n",
"10 LFLQFGAQGSPFLK RT-pep l 100.00 14 10.0 2"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from peptdeep.model.rt import irt_pep\n",
"\n",
"# Work on a copy so the DataFrame owned by peptdeep.model.rt is not mutated\n",
"# by the columns added here (and by the prediction column added later).\n",
"irt_pep = irt_pep.copy()\n",
"\n",
"# virtual ccs values for training: 0.0, 1.0, ..., len(irt_pep) - 1\n",
"irt_pep['ccs'] = pd.RangeIndex(0, len(irt_pep)).values.astype(float)\n",
"# every peptide gets the same charge state\n",
"irt_pep['charge'] = 2\n",
"irt_pep"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Device `gpu` is not available, set to `cpu`\n"
]
}
],
"source": [
"ccs_model = CCS_ModelInterface()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the untrained model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test if training works"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ccs_model.train(irt_pep, epoch=100, verbose=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test if the model fits the virtual CCS values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sequence | \n",
" pep_name | \n",
" irt | \n",
" mods | \n",
" mod_sites | \n",
" nAA | \n",
" ccs | \n",
" charge | \n",
" ccs_pred | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" LGGNEQVTR | \n",
" RT-pep a | \n",
" -24.92 | \n",
" | \n",
" | \n",
" 9 | \n",
" 0.0 | \n",
" 2 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 1 | \n",
" GAGSSEPVTGLDAK | \n",
" RT-pep b | \n",
" 0.00 | \n",
" | \n",
" | \n",
" 14 | \n",
" 1.0 | \n",
" 2 | \n",
" 1.068118 | \n",
"
\n",
" \n",
" | 2 | \n",
" VEATFGVDESNAK | \n",
" RT-pep c | \n",
" 12.39 | \n",
" | \n",
" | \n",
" 13 | \n",
" 2.0 | \n",
" 2 | \n",
" 2.132658 | \n",
"
\n",
" \n",
" | 3 | \n",
" YILAGVENSK | \n",
" RT-pep d | \n",
" 19.79 | \n",
" | \n",
" | \n",
" 10 | \n",
" 3.0 | \n",
" 2 | \n",
" 3.344304 | \n",
"
\n",
" \n",
" | 4 | \n",
" TPVISGGPYEYR | \n",
" RT-pep e | \n",
" 28.71 | \n",
" | \n",
" | \n",
" 12 | \n",
" 4.0 | \n",
" 2 | \n",
" 4.412459 | \n",
"
\n",
" \n",
" | 5 | \n",
" TPVITGAPYEYR | \n",
" RT-pep f | \n",
" 33.38 | \n",
" | \n",
" | \n",
" 12 | \n",
" 5.0 | \n",
" 2 | \n",
" 5.313079 | \n",
"
\n",
" \n",
" | 6 | \n",
" DGLDAASYYAPVR | \n",
" RT-pep g | \n",
" 42.26 | \n",
" | \n",
" | \n",
" 13 | \n",
" 6.0 | \n",
" 2 | \n",
" 7.875914 | \n",
"
\n",
" \n",
" | 7 | \n",
" ADVTPADFSEWSK | \n",
" RT-pep h | \n",
" 54.62 | \n",
" | \n",
" | \n",
" 13 | \n",
" 7.0 | \n",
" 2 | \n",
" 7.989055 | \n",
"
\n",
" \n",
" | 8 | \n",
" GTFIIDPGGVIR | \n",
" RT-pep i | \n",
" 70.52 | \n",
" | \n",
" | \n",
" 12 | \n",
" 8.0 | \n",
" 2 | \n",
" 8.029768 | \n",
"
\n",
" \n",
" | 9 | \n",
" GTFIIDPAAVIR | \n",
" RT-pep k | \n",
" 87.23 | \n",
" | \n",
" | \n",
" 12 | \n",
" 9.0 | \n",
" 2 | \n",
" 8.443041 | \n",
"
\n",
" \n",
" | 10 | \n",
" LFLQFGAQGSPFLK | \n",
" RT-pep l | \n",
" 100.00 | \n",
" | \n",
" | \n",
" 14 | \n",
" 10.0 | \n",
" 2 | \n",
" 8.817308 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sequence pep_name irt mods mod_sites nAA ccs charge \\\n",
"0 LGGNEQVTR RT-pep a -24.92 9 0.0 2 \n",
"1 GAGSSEPVTGLDAK RT-pep b 0.00 14 1.0 2 \n",
"2 VEATFGVDESNAK RT-pep c 12.39 13 2.0 2 \n",
"3 YILAGVENSK RT-pep d 19.79 10 3.0 2 \n",
"4 TPVISGGPYEYR RT-pep e 28.71 12 4.0 2 \n",
"5 TPVITGAPYEYR RT-pep f 33.38 12 5.0 2 \n",
"6 DGLDAASYYAPVR RT-pep g 42.26 13 6.0 2 \n",
"7 ADVTPADFSEWSK RT-pep h 54.62 13 7.0 2 \n",
"8 GTFIIDPGGVIR RT-pep i 70.52 12 8.0 2 \n",
"9 GTFIIDPAAVIR RT-pep k 87.23 12 9.0 2 \n",
"10 LFLQFGAQGSPFLK RT-pep l 100.00 14 10.0 2 \n",
"\n",
" ccs_pred \n",
"0 0.000000 \n",
"1 1.068118 \n",
"2 2.132658 \n",
"3 3.344304 \n",
"4 4.412459 \n",
"5 5.313079 \n",
"6 7.875914 \n",
"7 7.989055 \n",
"8 8.029768 \n",
"9 8.443041 \n",
"10 8.817308 "
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ccs_model.predict(irt_pep)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.3 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.3"
},
"vscode": {
"interpreter": {
"hash": "8a3b27e141e49c996c9b863f8707e97aabd49c4a7e8445b9b783b34e4a21a9b2"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}