# Source code for peptdeep.model.featurize

import numpy as np
import pandas as pd
from typing import List, Union

from peptdeep.settings import (
    model_const,
    mod_feature_size,
    MOD_TO_FEATURE,
    mod_elements,
    mod_elem_to_idx,
    _parse_mod_formula,
    update_all_mod_features,
)



# [docs] (Sphinx viewcode link marker, not code)
def parse_mod_feature(
    nAA: int, mod_names: List[str], mod_sites: List[int]
) -> np.ndarray:
    """
    Build the modification feature matrix of a single peptide.

    The matrix has one row per position: row `0` is the peptide N-term,
    rows `1..nAA` are the residues, and the last row (addressable as
    `site=-1`) is the peptide C-term. The feature vectors of all
    modifications located on the same site are summed.

    Parameters
    ----------
    nAA : int
        the length of the peptide sequence

    mod_names : List[str]
        the modification names, used as keys of `MOD_TO_FEATURE`

    mod_sites : List[int]
        the modification sites corresponding
        to `mod_names` on the peptide

    Returns
    -------
    np.ndarray
        2-D feature array with shape `(nAA+2,mod_feature_size)`

    """
    feature_matrix = np.zeros((nAA + 2, mod_feature_size))
    if len(mod_names) == 0:
        # Unmodified peptide: every row stays zero.
        return feature_matrix
    for position, mod_name in zip(mod_sites, mod_names):
        # Accumulate instead of assigning so that several
        # modifications on one site are all represented.
        feature_matrix[position] += MOD_TO_FEATURE[mod_name]
    return feature_matrix




# [docs] (Sphinx viewcode link marker, not code)
def get_batch_mod_feature(batch_df: pd.DataFrame) -> np.ndarray:
    """
    Build the modification feature tensor for a batch of peptides.

    Parameters
    ----------
    batch_df : pd.DataFrame
        dataframe with 'mods', 'mod_sites' and 'nAA' columns.
        'mods' and 'mod_sites' are ';'-separated strings (empty string
        for an unmodified peptide). The second dimension of the output
        is taken from the first 'nAA' value only, so all peptides in
        the batch must have the same length.

    Returns
    -------
    np.ndarray
        3-D tensor with shape (batch_size, nAA+2, mod_feature_size)
    """
    # Feature vectors of every modification, one list per peptide.
    # Empty tokens (produced by splitting an empty string) are skipped.
    feats_per_peptide = [
        [MOD_TO_FEATURE[name] for name in names if len(name) > 0]
        for names in batch_df.mods.str.split(";")
    ]
    # Integer sites of every modification, one list per peptide.
    sites_per_peptide = [
        [int(token) for token in tokens if len(token) > 0]
        for tokens in batch_df.mod_sites.str.split(";")
    ]

    n_positions = batch_df.nAA.values[0] + 2
    batch_features = np.zeros((len(batch_df), n_positions, mod_feature_size))

    for row, (feats, sites) in enumerate(
        zip(feats_per_peptide, sites_per_peptide)
    ):
        for site, feat in zip(sites, feats):
            # Sum so that multiple mods on one site are all kept.
            batch_features[row, site, :] += feat
    return batch_features




# [docs] (Sphinx viewcode link marker, not code)
def get_batch_aa_indices(seq_array: Union[List, np.ndarray]) -> np.ndarray:
    """
    Convert peptide sequences into AA ID array. ID=0 is reserved for masking,
    so ID of 'A' is 1, ID of 'B' is 2, ..., ID of 'Z' is 26 (maximum).
    Zeros are padded into the N- and C-term for each sequence.

    Parameters
    ----------
    seq_array : Union[List,np.ndarray]
        list or 1-D array of sequences with the same length.
        Object-dtype arrays of `str` (e.g. `df["sequence"].values`)
        are accepted as well as fixed-width unicode arrays.

    Returns
    -------
    np.ndarray
        2-D `np.int32` array with the shape
        `(len(seq_array), len(seq_array[0])+2)`. Zeros is padded into the
        N- and C-term of each sequence, so the 1st-D is `len(seq_array[0])+2`.

    """
    # Coerce to a fixed-width unicode array first: an object-dtype array
    # cannot be reinterpreted with `.view`, whereas each character of a
    # `U` array is stored as one 4-byte code point.
    seqs = np.array(seq_array, dtype=np.str_)
    # Reinterpret the character buffer as int32 code points, one row
    # per sequence.
    code_points = seqs.view(np.int32).reshape(len(seq_array), -1)
    aa_ids = code_points - ord("A") + 1
    # padding zeros at the N- and C-term (second axis only)
    return np.pad(aa_ids, [(0, 0), (1, 1)])




# [docs] (Sphinx viewcode link marker, not code)
def get_ascii_indices(seq_array: Union[List, np.ndarray]) -> np.ndarray:
    """
    Convert peptide sequences into an array of character codes.
    Each value is the code point of one character, which equals the
    ASCII code (0 to 127) for ASCII input.
    Zeros are padded into the N- and C-term for each sequence.

    Parameters
    ----------
    seq_array : Union[List,np.ndarray]
        list or 1-D array of sequences.
        Object-dtype arrays of `str` (e.g. `df["sequence"].values`)
        are accepted as well as fixed-width unicode arrays.

    Returns
    -------
    np.ndarray
        2-D `np.int32` array with the shape
        `(len(seq_array), max seq length+2)`.
        For a sequence shorter than the max seq length,
        zeros are padded to the missing values.

    """
    # Coerce to a fixed-width unicode array first: an object-dtype array
    # cannot be reinterpreted with `.view`. Shorter strings are
    # zero-filled up to the longest one, which yields the zero padding
    # described above.
    seqs = np.array(seq_array, dtype=np.str_)
    # Each character is stored as one 4-byte code point, so the buffer
    # can be read directly as int32, one row per sequence.
    code_points = seqs.view(np.int32).reshape(len(seq_array), -1)
    # padding zeros at the N- and C-term (second axis only)
    return np.pad(code_points, [(0, 0), (1, 1)])



# Map each upper-cased instrument name to its position in
# `model_const["instruments"]`.
instrument_dict = {
    name.upper(): index
    for index, name in enumerate(model_const["instruments"])
}
# Fallback index: the last of the `max_instrument_num` slots.
unknown_inst_index = model_const["max_instrument_num"] - 1



# [docs] (Sphinx viewcode link marker, not code)
def parse_instrument_indices(instrument_list):
    """
    Map instrument names to their integer indices.

    Names are upper-cased before the lookup in `instrument_dict`, so the
    match is case-insensitive. Names that are not in `instrument_dict`
    map to `unknown_inst_index`.

    Parameters
    ----------
    instrument_list : iterable of str
        instrument names

    Returns
    -------
    list of int
        one index per entry of `instrument_list`, in the same order
    """
    return [
        instrument_dict.get(inst.upper(), unknown_inst_index)
        for inst in instrument_list
    ]