{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial: building new models from scratch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**even without experience in deep learning**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to predict or classify novel properties of peptides, the user simply needs to provide peptides with corresponding properties (e.g. 'binding_affinity'). \n",
    "\n",
    "We provides several generic `ModelInterface` and `Model` classes in `peptdeep.model.generic_property_prediction` module for users to easily build models for regression and classification problems. Examples are shown as following:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peptdeep.model.generic_property_prediction import (\n",
    "    ModelInterface_for_Generic_AASeq_BinaryClassification,\n",
    "    ModelInterface_for_Generic_AASeq_Regression,\n",
    "    ModelInterface_for_Generic_ModAASeq_BinaryClassification,\n",
    "    ModelInterface_for_Generic_ModAASeq_Regression,\n",
    ")\n",
    "from peptdeep.model.generic_property_prediction import (\n",
    "    Model_for_Generic_AASeq_BinaryClassification_LSTM,\n",
    "    Model_for_Generic_AASeq_BinaryClassification_Transformer,\n",
    "    Model_for_Generic_AASeq_Regression_LSTM,\n",
    "    Model_for_Generic_AASeq_Regression_Transformer,\n",
    "    Model_for_Generic_ModAASeq_BinaryClassification_LSTM,\n",
    "    Model_for_Generic_ModAASeq_BinaryClassification_Transformer,\n",
    "    Model_for_Generic_ModAASeq_Regression_LSTM,\n",
    "    Model_for_Generic_ModAASeq_Regression_Transformer,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Define example Table/DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peptdeep.model.rt import IRT_PEPTIDE_DF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000\n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488\n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671\n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909\n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315\n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699\n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784\n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728\n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009\n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775\n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def create_example_input_dataframe_normalized_irt():\n",
    "    irt_df=IRT_PEPTIDE_DF.copy()\n",
    "    irt_df['normalized_irt'] = (\n",
    "        irt_df.irt-irt_df.irt.min()\n",
    "    )/(irt_df.irt.max()-irt_df.irt.min()) # 0 to 1 norm\n",
    "    return irt_df\n",
    "create_example_input_dataframe_normalized_irt()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Steps to build a model from scratch\n",
    "\n",
    "In the following examples, we only need 7 steps to build a model.\n",
    "\n",
    "1. Prepare a training dataframe with `sequence` column (and `mods`,`mod_sites` columns if the model also takes modifications into consideration), and a target value column to train.\n",
    "2. Select a `ModelInterface` class based on the prediction problem (classification or regression for sequences or modified sequences). Select a `Model` class when initialzing the `ModelInterface` class.\n",
    "3. Tell the `ModelInterface` object which column in the training dataframe stores the target values, and which column stores the values to be predicted.\n",
    "4. `model.train()` for training.\n",
    "5. `model.predict()` for prediction.\n",
    "\n",
 Save">
    "> Save and load models:\n",
    "\n",
    "6. `model.save(\"/model_folder/model.pth\")` to save the model.\n",
    "7. Use the same `ModelInterface` and `Model` classes, and call `model.load(\"/model_folder/model.pth\")` to load the model for transfer learning and prediction."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Building an simple RT model based on `Model_for_Generic_AASeq_Regression_LSTM`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>predicted_normalized_irt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>0.203671</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>0.312852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>0.365846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>0.434760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>0.465173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0.564576</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0.678894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0.893195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>1.061624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.100713</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    predicted_normalized_irt  \n",
       "0                   0.000000  \n",
       "1                   0.203671  \n",
       "2                   0.312852  \n",
       "3                   0.365846  \n",
       "4                   0.434760  \n",
       "5                   0.465173  \n",
       "6                   0.564576  \n",
       "7                   0.678894  \n",
       "8                   0.893195  \n",
       "9                   1.061624  \n",
       "10                  1.100713  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_df = create_example_input_dataframe_normalized_irt()\n",
    "\n",
    "#initialize the modelinterface, specify which of the models to use\n",
    "model = ModelInterface_for_Generic_AASeq_Regression(\n",
    "    model_class=Model_for_Generic_AASeq_Regression_LSTM\n",
    ")\n",
    "# specify the name of the column you want to use for traning\n",
    "model.target_column_to_train = 'normalized_irt'\n",
    "model.target_column_to_predict = 'predicted_normalized_irt'\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Building an simple RT model for only sequences based on `Model_for_Generic_AASeq_Regression_Transformer`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>predicted_normalized_irt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>0.140912</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>0.142185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>0.210857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>0.277200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0.088854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0.399164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0.596815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0.701862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.877500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    predicted_normalized_irt  \n",
       "0                   0.000000  \n",
       "1                   0.000000  \n",
       "2                   0.140912  \n",
       "3                   0.142185  \n",
       "4                   0.210857  \n",
       "5                   0.277200  \n",
       "6                   0.088854  \n",
       "7                   0.399164  \n",
       "8                   0.596815  \n",
       "9                   0.701862  \n",
       "10                  0.877500  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_df = create_example_input_dataframe_normalized_irt()\n",
    "\n",
    "#initialize the modelinterface, specify which of the models to use\n",
    "model = ModelInterface_for_Generic_AASeq_Regression(\n",
    "    model_class=Model_for_Generic_AASeq_Regression_Transformer\n",
    ")\n",
    "# specify the name of the column you want to use for traning\n",
    "model.target_column_to_train = 'normalized_irt'\n",
    "model.target_column_to_predict = 'predicted_normalized_irt'\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Regression models for predicting a scalar value for a given amino acid sequence and site-specific PTMs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_ModAASeq_Regression_LSTM`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>predicted_normalized_irt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>0.368827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>0.295824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>0.337074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>0.527409</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>0.506031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0.629531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0.708878</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0.798570</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0.856519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.973729</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    predicted_normalized_irt  \n",
       "0                   0.000000  \n",
       "1                   0.368827  \n",
       "2                   0.295824  \n",
       "3                   0.337074  \n",
       "4                   0.527409  \n",
       "5                   0.506031  \n",
       "6                   0.629531  \n",
       "7                   0.708878  \n",
       "8                   0.798570  \n",
       "9                   0.856519  \n",
       "10                  0.973729  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_df = create_example_input_dataframe_normalized_irt()\n",
    "\n",
    "#initialize the modelinterface, specify which of the models to use\n",
    "model = ModelInterface_for_Generic_ModAASeq_Regression(\n",
    "    model_class=Model_for_Generic_ModAASeq_Regression_LSTM\n",
    ")\n",
    "# specify the name of the column you want to use for traning\n",
    "model.target_column_to_train = 'normalized_irt'\n",
    "model.target_column_to_predict = 'predicted_normalized_irt'\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_ModAASeq_Regression_Transformer`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>predicted_normalized_irt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.088521</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Phospho@S</td>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>0.571920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>0.285101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>0.367173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>0.615492</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>0.589607</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0.539454</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0.587029</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0.880274</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0.811531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.084086</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt       mods mod_sites  nAA  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                         9   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00  Phospho@S         4   14   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                        13   \n",
       "3       YILAGVENSK  RT-pep d   19.79                        10   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                        12   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                        12   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                        13   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                        13   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                        12   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                        12   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                        14   \n",
       "\n",
       "    normalized_irt  predicted_normalized_irt  \n",
       "0         0.000000                  0.088521  \n",
       "1         0.199488                  0.571920  \n",
       "2         0.298671                  0.285101  \n",
       "3         0.357909                  0.367173  \n",
       "4         0.429315                  0.615492  \n",
       "5         0.466699                  0.589607  \n",
       "6         0.537784                  0.539454  \n",
       "7         0.636728                  0.587029  \n",
       "8         0.764009                  0.880274  \n",
       "9         0.897775                  0.811531  \n",
       "10        1.000000                  1.084086  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the example dataframe with the normalized iRT values.\n",
    "example_df = create_example_input_dataframe_normalized_irt()\n",
    "\n",
    "# Give the peptide in row 1 a modification: `Phospho@S` at site 4.\n",
    "example_df.loc[1, 'mods'] = 'Phospho@S'\n",
    "example_df.loc[1, 'mod_sites'] = '4'\n",
    "\n",
    "# Create the model interface and pick the network to use via `model_class`.\n",
    "model = ModelInterface_for_Generic_ModAASeq_Regression(\n",
    "    model_class=Model_for_Generic_ModAASeq_Regression_Transformer,\n",
    ")\n",
    "\n",
    "# Name the column to use for training and the column that receives the predictions.\n",
    "model.target_column_to_train = 'normalized_irt'\n",
    "model.target_column_to_predict = 'predicted_normalized_irt'\n",
    "\n",
    "# Train on the example data, then predict on the same dataframe.\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binary classification models for a given amino acid sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A simple classification dataset derived from the normalized iRT example.\n",
    "def create_example_input_dataframe_classification_rt():\n",
    "    \"\"\"Return the normalized iRT example dataframe with a binary label column.\n",
    "\n",
    "    The added column `is_in_first_half_of_column` is 1 for the rows with\n",
    "    index labels 0 through 5 (`.loc` slicing includes the end label) and 0\n",
    "    for all remaining rows.\n",
    "    \"\"\"\n",
    "    labelled_df = create_example_input_dataframe_normalized_irt()\n",
    "    label_column = 'is_in_first_half_of_column'\n",
    "    labelled_df[label_column] = 0\n",
    "    labelled_df.loc[:5, label_column] = 1\n",
    "    return labelled_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_LSTM`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>is_in_first_half_of_column</th>\n",
       "      <th>predicted_will_be_in_first_half_of_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.991829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>1</td>\n",
       "      <td>0.991083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>1</td>\n",
       "      <td>0.991600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>1</td>\n",
       "      <td>0.992202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0</td>\n",
       "      <td>0.351366</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0</td>\n",
       "      <td>0.359982</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0</td>\n",
       "      <td>0.352756</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0</td>\n",
       "      <td>0.351209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.349120</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    is_in_first_half_of_column  predicted_will_be_in_first_half_of_column  \n",
       "0                            1                                   0.991829  \n",
       "1                            1                                   0.990733  \n",
       "2                            1                                   0.991083  \n",
       "3                            1                                   0.991600  \n",
       "4                            1                                   0.992202  \n",
       "5                            1                                   0.990124  \n",
       "6                            0                                   0.351366  \n",
       "7                            0                                   0.359982  \n",
       "8                            0                                   0.352756  \n",
       "9                            0                                   0.351209  \n",
       "10                           0                                   0.349120  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the example dataframe with the binary label column.\n",
    "example_df = create_example_input_dataframe_classification_rt()\n",
    "\n",
    "# Create the model interface and pick the network to use via `model_class`.\n",
    "model = ModelInterface_for_Generic_AASeq_BinaryClassification(\n",
    "    model_class=Model_for_Generic_AASeq_BinaryClassification_LSTM,\n",
    ")\n",
    "\n",
    "# Name the column to use for training and the column that receives the predictions.\n",
    "model.target_column_to_train = 'is_in_first_half_of_column'\n",
    "model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'\n",
    "\n",
    "# Train on the example data, then predict on the same dataframe.\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_Transformer`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>is_in_first_half_of_column</th>\n",
       "      <th>predicted_will_be_in_first_half_of_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.997586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>1</td>\n",
       "      <td>0.997438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>1</td>\n",
       "      <td>0.997642</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996989</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996926</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0</td>\n",
       "      <td>0.004032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0</td>\n",
       "      <td>0.004321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0</td>\n",
       "      <td>0.004137</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0</td>\n",
       "      <td>0.003938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.004114</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    is_in_first_half_of_column  predicted_will_be_in_first_half_of_column  \n",
       "0                            1                                   0.997586  \n",
       "1                            1                                   0.997438  \n",
       "2                            1                                   0.996627  \n",
       "3                            1                                   0.997642  \n",
       "4                            1                                   0.996989  \n",
       "5                            1                                   0.996926  \n",
       "6                            0                                   0.004032  \n",
       "7                            0                                   0.004321  \n",
       "8                            0                                   0.004137  \n",
       "9                            0                                   0.003938  \n",
       "10                           0                                   0.004114  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the example dataframe with the binary label column.\n",
    "example_df = create_example_input_dataframe_classification_rt()\n",
    "\n",
    "# Create the model interface and pick the network to use via `model_class`.\n",
    "model = ModelInterface_for_Generic_AASeq_BinaryClassification(\n",
    "    model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer,\n",
    ")\n",
    "\n",
    "# Name the column to use for training and the column that receives the predictions.\n",
    "model.target_column_to_train = 'is_in_first_half_of_column'\n",
    "model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'\n",
    "\n",
    "# Train on the example data, then predict on the same dataframe.\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binary classification models for a given amino acid sequence and site-specific PTMs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: behaves the same as the helper of the same name defined in the\n",
    "# AA-sequence classification section above; it is redefined in this cell.\n",
    "def create_example_input_dataframe_classification_rt():\n",
    "    \"\"\"Return the normalized iRT example dataframe with a binary label column.\n",
    "\n",
    "    The added column `is_in_first_half_of_column` is 1 for the rows with\n",
    "    index labels 0 through 5 (`.loc` slicing includes the end label) and 0\n",
    "    for all remaining rows.\n",
    "    \"\"\"\n",
    "    labelled_df = create_example_input_dataframe_normalized_irt()\n",
    "    label_column = 'is_in_first_half_of_column'\n",
    "    labelled_df[label_column] = 0\n",
    "    labelled_df.loc[:5, label_column] = 1\n",
    "    return labelled_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### A sequence classification model using `Model_for_Generic_ModAASeq_BinaryClassification_LSTM`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>is_in_first_half_of_column</th>\n",
       "      <th>predicted_will_be_in_first_half_of_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.993120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>1</td>\n",
       "      <td>0.992972</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>1</td>\n",
       "      <td>0.992984</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>1</td>\n",
       "      <td>0.992323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>1</td>\n",
       "      <td>0.988538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0</td>\n",
       "      <td>0.370841</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0</td>\n",
       "      <td>0.368691</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0</td>\n",
       "      <td>0.378124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0</td>\n",
       "      <td>0.367393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.365957</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    is_in_first_half_of_column  predicted_will_be_in_first_half_of_column  \n",
       "0                            1                                   0.993120  \n",
       "1                            1                                   0.990600  \n",
       "2                            1                                   0.992972  \n",
       "3                            1                                   0.992984  \n",
       "4                            1                                   0.992323  \n",
       "5                            1                                   0.988538  \n",
       "6                            0                                   0.370841  \n",
       "7                            0                                   0.368691  \n",
       "8                            0                                   0.378124  \n",
       "9                            0                                   0.367393  \n",
       "10                           0                                   0.365957  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_df = create_example_input_dataframe_classification_rt()\n",
    "\n",
    "# initialize the model interface and specify which model class to use\n",
    "model = ModelInterface_for_Generic_ModAASeq_BinaryClassification(\n",
    "    model_class=Model_for_Generic_ModAASeq_BinaryClassification_LSTM\n",
    ")\n",
    "# specify the column to train on and the column that will receive the predictions\n",
    "model.target_column_to_train = 'is_in_first_half_of_column' \n",
    "model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### A sequence classification model using `Model_for_Generic_ModAASeq_BinaryClassification_Transformer`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sequence</th>\n",
       "      <th>pep_name</th>\n",
       "      <th>irt</th>\n",
       "      <th>mods</th>\n",
       "      <th>mod_sites</th>\n",
       "      <th>nAA</th>\n",
       "      <th>normalized_irt</th>\n",
       "      <th>is_in_first_half_of_column</th>\n",
       "      <th>predicted_will_be_in_first_half_of_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LGGNEQVTR</td>\n",
       "      <td>RT-pep a</td>\n",
       "      <td>-24.92</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.997545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GAGSSEPVTGLDAK</td>\n",
       "      <td>RT-pep b</td>\n",
       "      <td>0.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>0.199488</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VEATFGVDESNAK</td>\n",
       "      <td>RT-pep c</td>\n",
       "      <td>12.39</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.298671</td>\n",
       "      <td>1</td>\n",
       "      <td>0.995498</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>YILAGVENSK</td>\n",
       "      <td>RT-pep d</td>\n",
       "      <td>19.79</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>0.357909</td>\n",
       "      <td>1</td>\n",
       "      <td>0.997241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TPVISGGPYEYR</td>\n",
       "      <td>RT-pep e</td>\n",
       "      <td>28.71</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.429315</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TPVITGAPYEYR</td>\n",
       "      <td>RT-pep f</td>\n",
       "      <td>33.38</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.466699</td>\n",
       "      <td>1</td>\n",
       "      <td>0.995732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DGLDAASYYAPVR</td>\n",
       "      <td>RT-pep g</td>\n",
       "      <td>42.26</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.537784</td>\n",
       "      <td>0</td>\n",
       "      <td>0.004000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ADVTPADFSEWSK</td>\n",
       "      <td>RT-pep h</td>\n",
       "      <td>54.62</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>13</td>\n",
       "      <td>0.636728</td>\n",
       "      <td>0</td>\n",
       "      <td>0.005084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GTFIIDPGGVIR</td>\n",
       "      <td>RT-pep i</td>\n",
       "      <td>70.52</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.764009</td>\n",
       "      <td>0</td>\n",
       "      <td>0.004195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>GTFIIDPAAVIR</td>\n",
       "      <td>RT-pep k</td>\n",
       "      <td>87.23</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>12</td>\n",
       "      <td>0.897775</td>\n",
       "      <td>0</td>\n",
       "      <td>0.003547</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>LFLQFGAQGSPFLK</td>\n",
       "      <td>RT-pep l</td>\n",
       "      <td>100.00</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.003279</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          sequence  pep_name     irt mods mod_sites  nAA  normalized_irt  \\\n",
       "0        LGGNEQVTR  RT-pep a  -24.92                   9        0.000000   \n",
       "1   GAGSSEPVTGLDAK  RT-pep b    0.00                  14        0.199488   \n",
       "2    VEATFGVDESNAK  RT-pep c   12.39                  13        0.298671   \n",
       "3       YILAGVENSK  RT-pep d   19.79                  10        0.357909   \n",
       "4     TPVISGGPYEYR  RT-pep e   28.71                  12        0.429315   \n",
       "5     TPVITGAPYEYR  RT-pep f   33.38                  12        0.466699   \n",
       "6    DGLDAASYYAPVR  RT-pep g   42.26                  13        0.537784   \n",
       "7    ADVTPADFSEWSK  RT-pep h   54.62                  13        0.636728   \n",
       "8     GTFIIDPGGVIR  RT-pep i   70.52                  12        0.764009   \n",
       "9     GTFIIDPAAVIR  RT-pep k   87.23                  12        0.897775   \n",
       "10  LFLQFGAQGSPFLK  RT-pep l  100.00                  14        1.000000   \n",
       "\n",
       "    is_in_first_half_of_column  predicted_will_be_in_first_half_of_column  \n",
       "0                            1                                   0.997545  \n",
       "1                            1                                   0.996575  \n",
       "2                            1                                   0.995498  \n",
       "3                            1                                   0.997241  \n",
       "4                            1                                   0.996784  \n",
       "5                            1                                   0.995732  \n",
       "6                            0                                   0.004000  \n",
       "7                            0                                   0.005084  \n",
       "8                            0                                   0.004195  \n",
       "9                            0                                   0.003547  \n",
       "10                           0                                   0.003279  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_df = create_example_input_dataframe_classification_rt()\n",
    "\n",
    "# initialize the model interface and specify which model class to use\n",
    "model = ModelInterface_for_Generic_ModAASeq_BinaryClassification(\n",
    "    model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer\n",
    ")\n",
    "# specify the column to train on and the column that will receive the predictions\n",
    "model.target_column_to_train = 'is_in_first_half_of_column' \n",
    "model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'\n",
    "model.train(example_df, epoch=20)\n",
    "model.predict(example_df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.3 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "vscode": {
   "interpreter": {
    "hash": "8a3b27e141e49c996c9b863f8707e97aabd49c4a7e8445b9b783b34e4a21a9b2"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}