{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building your own models for CCS prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peptdeep.model.featurize import (\n",
    "    get_batch_aa_indices,\n",
    "    get_batch_mod_feature\n",
    ")\n",
    "\n",
    "from peptdeep.settings import model_const\n",
    "\n",
    "import peptdeep.model.model_interface as model_base\n",
    "import peptdeep.model.building_block as building_block\n",
    "\n",
    "mod_feature_size = len(model_const['mod_elements'])\n",
    "\n",
    "import torch\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CCS_LSTM_Module(torch.nn.Module):\n",
    "    \"\"\"Network that predicts one CCS value per peptide.\n",
    "\n",
    "    The inputs are encoded by peptdeep's\n",
    "    `Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum`, dropout is applied to the\n",
    "    encoding, the charge feature is appended, and a `Decoder_Linear`\n",
    "    reduces the result to a single value per peptide.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dropout : float, optional\n",
    "        Dropout probability applied to the encoder output. Defaults to 0.1.\n",
    "    hidden : int, optional\n",
    "        Size passed to the encoder; the decoder takes `hidden + 1` inputs.\n",
    "        Defaults to 128, the value that was previously hard-coded.\n",
    "    \"\"\"\n",
    "    def __init__(self,\n",
    "        dropout=0.1,\n",
    "        hidden=128,\n",
    "    ):\n",
    "        super().__init__()\n",
    "\n",
    "        self.dropout = torch.nn.Dropout(dropout)\n",
    "\n",
    "        self.ccs_encoder = (\n",
    "            building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum(\n",
    "                hidden\n",
    "            )\n",
    "        )\n",
    "\n",
    "        # +1 because `forward` appends the charge column to the encoding.\n",
    "        self.ccs_decoder = building_block.Decoder_Linear(\n",
    "            hidden+1, 1\n",
    "        )\n",
    "\n",
    "    def forward(self,\n",
    "        aa_indices,\n",
    "        mod_x,\n",
    "        charges,\n",
    "    ):\n",
    "        \"\"\"Run the network on one batch.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        aa_indices, mod_x\n",
    "            Sequence and modification features, passed unchanged to the\n",
    "            encoder.\n",
    "        charges\n",
    "            Charge feature. It is passed to the encoder and also\n",
    "            concatenated to the encoding along dim 1, so it must be 2-D\n",
    "            with the batch on dim 0 (`CCS_ModelInterface` supplies one\n",
    "            column per row).\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        torch.Tensor\n",
    "            Decoder output with dim 1 squeezed out.\n",
    "        \"\"\"\n",
    "        x = self.ccs_encoder(aa_indices, mod_x, charges)\n",
    "        x = self.dropout(x)\n",
    "        x = torch.cat((x, charges),1)\n",
    "        return self.ccs_decoder(x).squeeze(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CCS_ModelInterface(model_base.ModelInterface):\n",
    "    \"\"\"`ModelInterface` subclass that trains and predicts CCS values.\n",
    "\n",
    "    It builds `model_class`, uses an L1 loss, reads training targets from\n",
    "    the `ccs` column and writes predictions to the `ccs_pred` column.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model_class : torch.nn.Module, optional\n",
    "        Network class handed to `build`. Defaults to `CCS_LSTM_Module`.\n",
    "    dropout : float, optional\n",
    "        Forwarded to `build` as the `dropout` keyword. Defaults to 0.1.\n",
    "    \"\"\"\n",
    "\n",
    "    # Charge states are multiplied by this factor before they are fed to\n",
    "    # the network.\n",
    "    charge_factor = 0.1\n",
    "\n",
    "    def __init__(self,\n",
    "        model_class:torch.nn.Module=CCS_LSTM_Module,\n",
    "        dropout=0.1,\n",
    "    ):\n",
    "        super().__init__()\n",
    "        self.build(\n",
    "            model_class,\n",
    "            dropout=dropout,\n",
    "        )\n",
    "        self.loss_func = torch.nn.L1Loss()\n",
    "        self._target_column_to_train = 'ccs'\n",
    "        self._target_column_to_predict = 'ccs_pred'\n",
    "\n",
    "    def _prepare_predict_data_df(self,\n",
    "        precursor_df:pd.DataFrame,\n",
    "    ):\n",
    "        \"\"\"Initialise the prediction column and register the frame.\n",
    "\n",
    "        Sets the prediction column of `precursor_df` to 0. (the frame is\n",
    "        modified in place, not copied) and stores the same frame as\n",
    "        `self.predict_df`.\n",
    "        \"\"\"\n",
    "        precursor_df[self._target_column_to_predict] = 0.\n",
    "        self.predict_df = precursor_df\n",
    "\n",
    "    def _get_features_from_batch_df(self,\n",
    "        batch_df: pd.DataFrame,\n",
    "    ):\n",
    "        \"\"\"Build the network inputs for one batch.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        tuple\n",
    "            `(aa_indices, mod_x, charges)`: a `LongTensor` built from the\n",
    "            `sequence` column, the modification features computed by\n",
    "            `get_batch_mod_feature`, and the `charge` column as a single\n",
    "            column scaled by `charge_factor`.\n",
    "        \"\"\"\n",
    "        aa_indices = torch.LongTensor(\n",
    "            get_batch_aa_indices(\n",
    "                batch_df['sequence'].values.astype('U')\n",
    "            )\n",
    "        )\n",
    "\n",
    "        mod_x = torch.Tensor(\n",
    "            get_batch_mod_feature(\n",
    "                batch_df\n",
    "            )\n",
    "        )\n",
    "\n",
    "        # unsqueeze(1) turns the 1-D charge vector into one column per row,\n",
    "        # which `CCS_LSTM_Module.forward` concatenates along dim 1.\n",
    "        charges = torch.Tensor(\n",
    "            batch_df['charge'].values\n",
    "        ).unsqueeze(1)*self.charge_factor\n",
    "\n",
    "        return aa_indices, mod_x, charges\n",
    "\n",
    "    def _get_targets_from_batch_df(self,\n",
    "        batch_df: pd.DataFrame,\n",
    "    ) -> torch.Tensor:\n",
    "        \"\"\"Return the training targets of one batch as a float tensor.\n",
    "\n",
    "        The column is looked up through `self._target_column_to_train`\n",
    "        (set to 'ccs' in `__init__`) instead of a second hard-coded name,\n",
    "        so the two cannot drift apart.\n",
    "        \"\"\"\n",
    "        return torch.Tensor(\n",
    "            batch_df[self._target_column_to_train].values\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Testing the CCS model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequencepep_nameirtmodsmod_sitesnAAccscharge
0LGGNEQVTRRT-pep a-24.9290.02
1GAGSSEPVTGLDAKRT-pep b0.00141.02
2VEATFGVDESNAKRT-pep c12.39132.02
3YILAGVENSKRT-pep d19.79103.02
4TPVISGGPYEYRRT-pep e28.71124.02
5TPVITGAPYEYRRT-pep f33.38125.02
6DGLDAASYYAPVRRT-pep g42.26136.02
7ADVTPADFSEWSKRT-pep h54.62137.02
8GTFIIDPGGVIRRT-pep i70.52128.02
9GTFIIDPAAVIRRT-pep k87.23129.02
10LFLQFGAQGSPFLKRT-pep l100.001410.02
\n", "
" ],
      "text/plain": [
       " sequence pep_name irt mods mod_sites nAA ccs charge\n",
       "0 LGGNEQVTR RT-pep a -24.92 9 0.0 2\n",
       "1 GAGSSEPVTGLDAK RT-pep b 0.00 14 1.0 2\n",
       "2 VEATFGVDESNAK RT-pep c 12.39 13 2.0 2\n",
       "3 YILAGVENSK RT-pep d 19.79 10 3.0 2\n",
       "4 TPVISGGPYEYR RT-pep e 28.71 12 4.0 2\n",
       "5 TPVITGAPYEYR RT-pep f 33.38 12 5.0 2\n",
       "6 DGLDAASYYAPVR RT-pep g 42.26 13 6.0 2\n",
       "7 ADVTPADFSEWSK RT-pep h 54.62 13 7.0 2\n",
       "8 GTFIIDPGGVIR RT-pep i 70.52 12 8.0 2\n",
       "9 GTFIIDPAAVIR RT-pep k 87.23 12 9.0 2\n",
       "10 LFLQFGAQGSPFLK RT-pep l 100.00 14 10.0 2"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from peptdeep.model.rt import irt_pep\n",
    "\n",
    "# Work on a copy: the imported object is the DataFrame held at module\n",
    "# level in `peptdeep.model.rt`, and the columns added below (plus the\n",
    "# `ccs_pred` column written during prediction) would otherwise modify it\n",
    "# for everything else running in this kernel.\n",
    "irt_pep = irt_pep.copy()\n",
    "\n",
    "# virtual ccs values for training\n",
    "irt_pep['ccs'] = pd.RangeIndex(0, len(irt_pep)).values.astype(float)\n",
    "irt_pep['charge'] = 2\n",
    "irt_pep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Device `gpu` is not available, set to `cpu`\n"
     ]
    }
   ],
   "source": [
    "ccs_model = CCS_ModelInterface()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the untrained model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test if training works"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ccs_model.train(irt_pep, epoch=100, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test if the model fits the virtual ccs values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequencepep_nameirtmodsmod_sitesnAAccschargeccs_pred
0LGGNEQVTRRT-pep a-24.9290.020.000000
1GAGSSEPVTGLDAKRT-pep b0.00141.021.068118
2VEATFGVDESNAKRT-pep c12.39132.022.132658
3YILAGVENSKRT-pep d19.79103.023.344304
4TPVISGGPYEYRRT-pep e28.71124.024.412459
5TPVITGAPYEYRRT-pep f33.38125.025.313079
6DGLDAASYYAPVRRT-pep g42.26136.027.875914
7ADVTPADFSEWSKRT-pep h54.62137.027.989055
8GTFIIDPGGVIRRT-pep i70.52128.028.029768
9GTFIIDPAAVIRRT-pep k87.23129.028.443041
10LFLQFGAQGSPFLKRT-pep l100.001410.028.817308
\n", "
" ], "text/plain": [ " sequence pep_name irt mods mod_sites nAA ccs charge \\\n", "0 LGGNEQVTR RT-pep a -24.92 9 0.0 2 \n", "1 GAGSSEPVTGLDAK RT-pep b 0.00 14 1.0 2 \n", "2 VEATFGVDESNAK RT-pep c 12.39 13 2.0 2 \n", "3 YILAGVENSK RT-pep d 19.79 10 3.0 2 \n", "4 TPVISGGPYEYR RT-pep e 28.71 12 4.0 2 \n", "5 TPVITGAPYEYR RT-pep f 33.38 12 5.0 2 \n", "6 DGLDAASYYAPVR RT-pep g 42.26 13 6.0 2 \n", "7 ADVTPADFSEWSK RT-pep h 54.62 13 7.0 2 \n", "8 GTFIIDPGGVIR RT-pep i 70.52 12 8.0 2 \n", "9 GTFIIDPAAVIR RT-pep k 87.23 12 9.0 2 \n", "10 LFLQFGAQGSPFLK RT-pep l 100.00 14 10.0 2 \n", "\n", " ccs_pred \n", "0 0.000000 \n", "1 1.068118 \n", "2 2.132658 \n", "3 3.344304 \n", "4 4.412459 \n", "5 5.313079 \n", "6 7.875914 \n", "7 7.989055 \n", "8 8.029768 \n", "9 8.443041 \n", "10 8.817308 " ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ccs_model.predict(irt_pep)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.8.3" }, "vscode": { "interpreter": { "hash": "8a3b27e141e49c996c9b863f8707e97aabd49c4a7e8445b9b783b34e4a21a9b2" } } }, "nbformat": 4, "nbformat_minor": 2 }