Source code for peptdeep.model.featurize

import numpy as np
import pandas as pd
from typing import List, Union

from peptdeep.settings import (
    model_const,
    mod_feature_size,
    MOD_TO_FEATURE,
    mod_elements,
    mod_elem_to_idx,
    _parse_mod_formula,
    update_all_mod_features,
)


def parse_mod_feature(
    nAA: int, mod_names: List[str], mod_sites: List[int]
) -> np.ndarray:
    """
    Build the modification feature matrix of one peptide of length `nAA`.

    Row `0` holds the peptide N-term modification, row `-1` (the last row)
    holds the peptide C-term modification, and rows `1..nAA` hold the
    modifications on the corresponding residues.

    Parameters
    ----------
    nAA : int
        Length of the peptide sequence.

    mod_names : List[str]
        Modification names; each one is looked up in `MOD_TO_FEATURE`.

    mod_sites : List[int]
        Modification sites, paired by position with `mod_names`.

    Returns
    -------
    np.ndarray
        2-D feature array with shape `(nAA + 2, mod_feature_size)`.
    """
    features = np.zeros((nAA + 2, mod_feature_size))
    if len(mod_names) == 0:
        return features
    # Accumulate instead of assigning so that several modifications on the
    # same site add up rather than overwrite each other.
    for position, name in zip(mod_sites, mod_names):
        features[position] += MOD_TO_FEATURE[name]
    return features
def get_batch_mod_feature(batch_df: pd.DataFrame) -> np.ndarray:
    """
    Build the modification feature tensor for a batch of peptides.

    Parameters
    ----------
    batch_df : pd.DataFrame
        Dataframe with 'mods', 'mod_sites' and 'nAA' columns. 'mods' and
        'mod_sites' are ';'-separated strings whose non-empty items are
        paired by position. All rows must have the same `nAA`, because the
        `nAA` of the first row sets the second dimension of the output.

    Returns
    -------
    np.ndarray
        3-D tensor with shape `(batch_size, nAA + 2, mod_feature_size)`.
    """
    # Empty items (e.g. from an empty 'mods' string) are skipped.
    feats_per_row = [
        [MOD_TO_FEATURE[name] for name in names if name]
        for names in batch_df.mods.str.split(";")
    ]
    sites_per_row = [
        [int(site) for site in sites if site]
        for sites in batch_df.mod_sites.str.split(";")
    ]

    n_rows = len(batch_df)
    seq_len = batch_df.nAA.values[0]
    mod_x_batch = np.zeros((n_rows, seq_len + 2, mod_feature_size))

    for row, (feats, sites) in enumerate(zip(feats_per_row, sites_per_row)):
        # Add rather than assign: one site may carry multiple modifications.
        for site, feat in zip(sites, feats):
            mod_x_batch[row, site, :] += feat
    return mod_x_batch
def get_batch_aa_indices(seq_array: Union[List, np.ndarray]) -> np.ndarray:
    """
    Convert peptide sequences into an AA ID array.

    ID=0 is reserved for masking, so the ID of 'A' is 1, the ID of 'B' is 2,
    ..., and the ID of 'Z' is 26. Zeros are padded at the N- and C-term of
    each sequence. Characters are not validated: anything other than 'A'-'Z'
    is shifted by the same offset and may fall outside 1..26.

    Parameters
    ----------
    seq_array : Union[List, np.ndarray]
        List or 1-D array of sequences. Object-dtype arrays (for example the
        `.values` of a pandas string column) are accepted. Sequences are
        expected to share the same length; if they do not, the shorter ones
        are right-padded with the mask ID 0.

    Returns
    -------
    np.ndarray
        2-D `np.int32` array with shape `(len(seq_array), max seq length + 2)`.
        An empty `seq_array` gives an array of shape `(0, 2)`.
    """
    if len(seq_array) == 0:
        # `reshape(0, -1)` cannot infer the second dimension for empty input.
        return np.zeros((0, 2), dtype=np.int32)

    # Coerce to fixed-width unicode so every character occupies exactly one
    # 4-byte code point; an object-dtype array cannot be viewed as int32.
    seqs = np.asarray(seq_array).astype("U")
    codes = seqs.view(np.int32).reshape(len(seq_array), -1)

    x = codes - ord("A") + 1
    # NUL code points are numpy's fill for shorter strings; keep them at the
    # mask ID instead of letting the offset turn them into negative IDs.
    x[codes == 0] = 0

    # Pad one zero at the N-term and one at the C-term of each sequence.
    return np.pad(x, [(0, 0), (1, 1)])
def get_ascii_indices(seq_array: Union[List, np.ndarray]) -> np.ndarray:
    """
    Convert peptide sequences into an array of character codes.

    For ASCII input the values are from 0 to 127. Zeros are padded at the
    N- and C-term of each sequence.

    Parameters
    ----------
    seq_array : Union[List, np.ndarray]
        List or 1-D array of sequences. Object-dtype arrays (for example the
        `.values` of a pandas string column) are accepted.

    Returns
    -------
    np.ndarray
        2-D `np.int32` array with shape `(len(seq_array), max seq length + 2)`.
        Sequences shorter than the max seq length are right-padded with
        zeros. An empty `seq_array` gives an array of shape `(0, 2)`.
    """
    if len(seq_array) == 0:
        # `reshape(0, -1)` cannot infer the second dimension for empty input.
        return np.zeros((0, 2), dtype=np.int32)

    # Coerce to fixed-width unicode so every character occupies exactly one
    # 4-byte code point; an object-dtype array cannot be viewed as int32.
    seqs = np.asarray(seq_array).astype("U")
    codes = seqs.view(np.int32).reshape(len(seq_array), -1)

    # Pad one zero at the N-term and one at the C-term of each sequence.
    return np.pad(codes, [(0, 0), (1, 1)])
# Upper-cased instrument name -> its position in `model_const["instruments"]`.
instrument_dict = {
    name.upper(): index
    for index, name in enumerate(model_const["instruments"])
}

# Fallback index: the last slot of the `max_instrument_num` range.
unknown_inst_index = model_const["max_instrument_num"] - 1
def parse_instrument_indices(instrument_list: List[str]) -> List[int]:
    """
    Map instrument names to their integer indices.

    Names are upper-cased before the lookup in `instrument_dict`, so the
    match is case-insensitive. Names that are not in `instrument_dict` map
    to `unknown_inst_index`.

    Parameters
    ----------
    instrument_list : List[str]
        Instrument names.

    Returns
    -------
    List[int]
        One index per input name, in the same order.
    """
    return [
        instrument_dict.get(inst.upper(), unknown_inst_index)
        for inst in instrument_list
    ]