{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building your own models for CCS prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peptdeep.model.featurize import (\n",
    "    get_batch_aa_indices, \n",
    "    get_batch_mod_feature\n",
    ")\n",
    "\n",
    "from peptdeep.settings import model_const\n",
    "\n",
    "import peptdeep.model.model_interface as model_base\n",
    "import peptdeep.model.building_block as building_block\n",
    "\n",
    "mod_feature_size = len(model_const['mod_elements'])\n",
    "\n",
    "import torch\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CCS_LSTM_Module(torch.nn.Module):\n",
    "    \"\"\"Torch module that maps a batch of precursors to one value each.\n",
    "\n",
    "    The batch is encoded with\n",
    "    ``building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum``, passed\n",
    "    through dropout, joined with the charge tensor and decoded with\n",
    "    ``building_block.Decoder_Linear`` into a single output per row.\n",
    "    \"\"\"\n",
    "    def __init__(self,\n",
    "        dropout=0.1,\n",
    "    ):\n",
    "        \"\"\"Create the dropout, encoder and decoder sub-modules.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        dropout : float, optional\n",
    "            Probability handed to ``torch.nn.Dropout``. Defaults to 0.1.\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "\n",
    "        self.dropout = torch.nn.Dropout(dropout)\n",
    "\n",
    "        # Size passed to the encoder (presumably its output width).\n",
    "        # The decoder takes one more input because `forward` appends\n",
    "        # the charge tensor before decoding.\n",
    "        hidden_size = 128\n",
    "\n",
    "        encoder_class = (\n",
    "            building_block.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum\n",
    "        )\n",
    "        self.ccs_encoder = encoder_class(hidden_size)\n",
    "\n",
    "        self.ccs_decoder = building_block.Decoder_Linear(\n",
    "            hidden_size + 1, 1\n",
    "        )\n",
    "\n",
    "    def forward(self,\n",
    "        aa_indices,\n",
    "        mod_x,\n",
    "        charges,\n",
    "    ):\n",
    "        \"\"\"Run encoder -> dropout -> concat(charges) -> linear decoder.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        aa_indices\n",
    "            Amino-acid index tensor, forwarded unchanged to the encoder.\n",
    "        mod_x\n",
    "            Modification feature tensor, forwarded unchanged to the encoder.\n",
    "        charges\n",
    "            Charge tensor. It is given to the encoder and is also\n",
    "            concatenated to the encoder output along dim 1\n",
    "            (presumably shaped ``(batch, 1)`` -- TODO confirm).\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        torch.Tensor\n",
    "            Decoder output with dim 1 squeezed away.\n",
    "        \"\"\"\n",
    "        encoded = self.ccs_encoder(aa_indices, mod_x, charges)\n",
    "        encoded = self.dropout(encoded)\n",
    "        decoder_input = torch.cat((encoded, charges), dim=1)\n",
    "        prediction = self.ccs_decoder(decoder_input)\n",
    "        return prediction.squeeze(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CCS_ModelInterface(model_base.ModelInterface):\n",
    "    \"\"\"``ModelInterface`` subclass for training and predicting CCS values.\n",
    "\n",
    "    Features are built from the ``sequence`` and ``charge`` columns plus\n",
    "    the output of ``get_batch_mod_feature``. The training target is read\n",
    "    from the column named by ``_target_column_to_train``, and the column\n",
    "    named by ``_target_column_to_predict`` is created for predictions.\n",
    "    \"\"\"\n",
    "    def __init__(self,\n",
    "        model_class:torch.nn.Module=CCS_LSTM_Module,\n",
    "        dropout=0.1,\n",
    "    ):\n",
    "        \"\"\"Build the model, then set the loss and the column names.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        model_class : torch.nn.Module subclass, optional\n",
    "            Class handed to ``self.build``. Defaults to ``CCS_LSTM_Module``.\n",
    "        dropout : float, optional\n",
    "            Forwarded to ``self.build`` as the ``dropout`` keyword.\n",
    "            Defaults to 0.1.\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        self.build(\n",
    "            model_class,\n",
    "            dropout=dropout,\n",
    "        )\n",
    "        self.loss_func = torch.nn.L1Loss()\n",
    "        self._target_column_to_train = 'ccs'\n",
    "        self._target_column_to_predict = 'ccs_pred'\n",
    "\n",
    "    def _prepare_predict_data_df(self,\n",
    "        precursor_df:pd.DataFrame,\n",
    "    ):\n",
    "        \"\"\"Add a zero-filled prediction column and keep the dataframe.\n",
    "\n",
    "        ``precursor_df`` is modified in place and stored as\n",
    "        ``self.predict_df``; no copy is made.\n",
    "        \"\"\"\n",
    "        precursor_df[self._target_column_to_predict] = 0.\n",
    "        self.predict_df = precursor_df\n",
    "\n",
    "    def _get_features_from_batch_df(self,\n",
    "        batch_df: pd.DataFrame,\n",
    "    ):\n",
    "        \"\"\"Convert one batch dataframe into the model's input tensors.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        tuple\n",
    "            ``(aa_indices, mod_x, charges)``: a ``LongTensor`` built from\n",
    "            the ``sequence`` column, a tensor built from\n",
    "            ``get_batch_mod_feature(batch_df)``, and the ``charge``\n",
    "            column scaled by 0.1 with a trailing axis of size 1.\n",
    "        \"\"\"\n",
    "        aa_indices = torch.LongTensor(\n",
    "            get_batch_aa_indices(\n",
    "                batch_df['sequence'].values.astype('U')\n",
    "            )\n",
    "        )\n",
    "\n",
    "        mod_x = torch.Tensor(\n",
    "            get_batch_mod_feature(\n",
    "                batch_df\n",
    "            )\n",
    "        )\n",
    "\n",
    "        # unsqueeze(1) gives shape (n, 1), which lets\n",
    "        # CCS_LSTM_Module.forward concatenate the charge along dim 1.\n",
    "        charges = torch.Tensor(\n",
    "            batch_df['charge'].values\n",
    "        ).unsqueeze(1)*0.1\n",
    "\n",
    "        return aa_indices, mod_x, charges\n",
    "\n",
    "    def _get_targets_from_batch_df(self,\n",
    "        batch_df: pd.DataFrame,\n",
    "    ) -> torch.Tensor:\n",
    "        \"\"\"Return the training targets of one batch as a tensor.\n",
    "\n",
    "        The column is looked up through ``self._target_column_to_train``\n",
    "        ('ccs' by default) rather than a second hard-coded name, so\n",
    "        changing that attribute changes the target that is read.\n",
    "        \"\"\"\n",
    "        return torch.Tensor(\n",
    "            batch_df[self._target_column_to_train].values\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Testing the CCS model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>ccs</th>\n",
       "      <th>charge</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>10.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA   ccs  charge\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9   0.0       2\n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14   1.0       2\n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13   2.0       2\n",
       "3       YILAGVENSK  RT-pep d   19.79                  10   3.0       2\n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12   4.0       2\n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12   5.0       2\n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13   6.0       2\n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13   7.0       2\n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12   8.0       2\n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12   9.0       2\n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14  10.0       2"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from peptdeep.model.rt import irt_pep\n",
    "\n",
    "# Virtual ccs targets for training: row i receives float(i).\n",
    "irt_pep['ccs'] = pd.RangeIndex(len(irt_pep)).values.astype(float)\n",
    "# Every peptide gets charge 2.\n",
    "irt_pep['charge'] = 2\n",
    "irt_pep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Device `gpu` is not available, set to `cpu`\n"
     ]
    }
   ],
   "source": [
    "ccs_model = CCS_ModelInterface()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the untrained model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test if training works"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ccs_model.train(irt_pep, epoch=100, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test if the model fits the virtual ccs values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>ccs</th>\n",
       "      <th>charge</th>\n",
       "      <th>ccs_pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.068118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2.132658</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2</td>\n",
       "      <td>3.344304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2</td>\n",
       "      <td>4.412459</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2</td>\n",
       "      <td>5.313079</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2</td>\n",
       "      <td>7.875914</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2</td>\n",
       "      <td>7.989055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2</td>\n",
       "      <td>8.029768</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2</td>\n",
       "      <td>8.443041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>8.817308</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA   ccs  charge  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9   0.0       2   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14   1.0       2   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13   2.0       2   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10   3.0       2   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12   4.0       2   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12   5.0       2   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13   6.0       2   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13   7.0       2   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12   8.0       2   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12   9.0       2   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14  10.0       2   \n",
       "\n",
       "    ccs_pred  \n",
       "0   0.000000  \n",
       "1   1.068118  \n",
       "2   2.132658  \n",
       "3   3.344304  \n",
       "4   4.412459  \n",
       "5   5.313079  \n",
       "6   7.875914  \n",
       "7   7.989055  \n",
       "8   8.029768  \n",
       "9   8.443041  \n",
       "10  8.817308  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ccs_model.predict(irt_pep)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.3 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.8.3"
  },
  "vscode": {
   "interpreter": {
    "hash": "8a3b27e141e49c996c9b863f8707e97aabd49c4a7e8445b9b783b34e4a21a9b2"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}