# Source code for peptdeep.spec_lib.predict_lib

import pandas as pd
import numpy as np
import torch
import tqdm

from alphabase.peptide.precursor import (
    calc_precursor_isotope_mp,
    calc_precursor_isotope,
)
from alphabase.spectral_library.base import SpecLibBase
from alphabase.spectral_library.flat import SpecLibFlat
from alphabase.peptide.fragment import (
    flatten_fragments,
    concat_precursor_fragment_dataframes,
)

from peptdeep.pretrained_models import ModelManager
from peptdeep.settings import global_settings
from peptdeep.utils import logging
from peptdeep.utils import process_bar

# "model_mgr" section of the global peptdeep settings. `PredictSpecLib.predict_all`
# reads its ["predict"]["multiprocessing"] entry to decide whether prediction
# may use multiprocessing.
model_mgr_settings = global_settings["model_mgr"]


class PredictSpecLib(SpecLibBase):
    """Spectral library whose RT/IM/MS2 values are predicted by a `ModelManager`."""

    def __init__(
        self,
        model_manager: ModelManager = None,
        charged_frag_types: list = None,
        precursor_mz_min: float = 400.0,
        precursor_mz_max: float = 2000.0,
        decoy: str = "pseudo_reverse",
        rt_to_irt: bool = False,
        generate_precursor_isotope: bool = False,
    ):
        """
        Parameters
        ----------
        model_manager : ModelManager, optional
            `ModelManager`, by default None. It is stored as-is; a manager
            must be assigned before `predict_all()` is called because that
            method reads `model_manager.verbose` and calls
            `model_manager.predict_all()`.

        charged_frag_types : list, optional
            Charged fragment types,
            by default ['b_z1','b_z2','y_z1','y_z2']

        precursor_mz_min : float, optional
            precursor_mz_min, by default 400.0

        precursor_mz_max : float, optional
            precursor_mz_max, by default 2000.0

        decoy : str, optional
            Decoy choice, see `alphabase.spec_lib.decoy_library`,
            by default 'pseudo_reverse'

        rt_to_irt : bool, optional
            Convert predicted RT to iRT values, by default False

        generate_precursor_isotope : bool, optional
            Generate precursor isotopes, defaults to False
        """
        # Build the default per call: a list literal in the signature would be
        # one object shared by every instance constructed without this argument.
        if charged_frag_types is None:
            charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2"]

        SpecLibBase.__init__(
            self,
            charged_frag_types,
            precursor_mz_min=precursor_mz_min,
            precursor_mz_max=precursor_mz_max,
            decoy=decoy,
        )
        self.model_manager = model_manager

        self._precursor_df = pd.DataFrame()
        self._fragment_intensity_df = pd.DataFrame()
        self._fragment_mz_df = pd.DataFrame()

        # Forwarded to `model_manager.predict_all()` as `mp_batch_size`.
        self.mp_predict_batch_size: int = 100000
        self.rt_to_irt = rt_to_irt
        self.generate_precursor_isotope = generate_precursor_isotope

    def _drop_unused_frag_columns(self):
        """Drop, in place, fragment columns not listed in `self.charged_frag_types`.

        Applied to `self._fragment_mz_df` first, then to
        `self._fragment_intensity_df`.
        """
        for frag_df in (self._fragment_mz_df, self._fragment_intensity_df):
            unused_columns = [
                col for col in frag_df.columns if col not in self.charged_frag_types
            ]
            frag_df.drop(columns=unused_columns, inplace=True)

    def set_precursor_and_fragment(
        self,
        *,
        precursor_df: pd.DataFrame,
        fragment_mz_df: pd.DataFrame = None,
        fragment_intensity_df: pd.DataFrame = None,
    ):
        """Replace the precursor dataframe and, optionally, both fragment dataframes.

        Parameters
        ----------
        precursor_df : pd.DataFrame
            New precursor dataframe; always stored.

        fragment_mz_df : pd.DataFrame, optional
            New fragment m/z dataframe, by default None

        fragment_intensity_df : pd.DataFrame, optional
            New fragment intensity dataframe, by default None

        Notes
        -----
        The fragment dataframes are only replaced when BOTH are given; if just
        one is passed it is ignored and the current pair is kept. Unused
        fragment columns are dropped in place afterwards in either case, which
        also modifies the dataframe objects supplied by the caller.
        """
        self._precursor_df = precursor_df
        if fragment_intensity_df is not None and fragment_mz_df is not None:
            self._fragment_intensity_df = fragment_intensity_df
            self._fragment_mz_df = fragment_mz_df

        self._drop_unused_frag_columns()

    def translate_rt_to_irt_pred(self, irt_pep_df: pd.DataFrame = None):
        """Add 'irt_pred' into columns based on 'rt_pred'

        Delegates to `model_manager.rt_model.add_irt_column_to_precursor_df`
        with `self._precursor_df` and returns whatever that call returns.

        Parameters
        ----------
        irt_pep_df : pd.DataFrame, optional
            Forwarded unchanged as `irt_pep_df`, by default None
        """
        return self.model_manager.rt_model.add_irt_column_to_precursor_df(
            self._precursor_df, irt_pep_df=irt_pep_df
        )

    def predict_all(
        self,
        min_required_precursor_num_for_mp: int = 2000,
        predict_items: list = None,
    ):
        """
        1. Predict RT/IM/MS2 for self._precursor_df
        2. Calculate isotope information in self._precursor_df

        Parameters
        ----------
        min_required_precursor_num_for_mp : int, optional
            Below this number of precursors the single-process isotope
            calculation is used; the value is also forwarded to
            `model_manager.predict_all()`. By default 2000

        predict_items : list, optional
            Items forwarded to `model_manager.predict_all()`,
            by default ['rt','mobility','ms2']
        """
        # Fresh default per call instead of a shared list literal in the signature.
        if predict_items is None:
            predict_items = ["rt", "mobility", "ms2"]

        if "precursor_mz" not in self.precursor_df.columns:
            self.calc_precursor_mz()
            # NOTE(review): indentation was lost in the scraped source; the
            # m/z clipping is assumed to belong to this branch -- confirm.
            self.clip_by_precursor_mz_()

        if self.generate_precursor_isotope:
            if self.model_manager.verbose:
                logging.info("Calculating precursor isotope distributions ...")
            if len(self.precursor_df) < min_required_precursor_num_for_mp:
                self._precursor_df = calc_precursor_isotope(self._precursor_df)
            else:
                self._precursor_df = calc_precursor_isotope_mp(
                    self._precursor_df, progress_bar=process_bar
                )

        if self.model_manager.verbose:
            logging.info(
                f"Predicting RT/IM/MS2 for {len(self._precursor_df)} precursors ..."
            )
        res = self.model_manager.predict_all(
            self._precursor_df,
            predict_items=predict_items,
            frag_types=self.charged_frag_types,
            min_required_precursor_num_for_mp=min_required_precursor_num_for_mp,
            multiprocessing=model_mgr_settings["predict"]["multiprocessing"],
            mp_batch_size=self.mp_predict_batch_size,
            process_num=global_settings["thread_num"],
        )

        # `res` is unpacked as keyword arguments, so its keys must match the
        # parameters of `set_precursor_and_fragment`.
        self.set_precursor_and_fragment(**res)

        if self.rt_to_irt and "rt_pred" in self._precursor_df.columns:
            self.translate_rt_to_irt_pred()

        if self.model_manager.verbose:
            logging.info("End predicting RT/IM/MS2")
class PredictSpecLibFlat(SpecLibFlat):
    """
    Flatten the predicted spectral library, the key feature is to
    predict and flatten fragments in batch with `predict_and_parse_lib_in_batch()`

    Parameters
    ----------
    min_fragment_intensity : float, optional
        minimal intensity to keep, by default 0.001

    keep_top_k_fragments : int, optional
        top k highest peaks to keep, by default 1000

    custom_fragment_df_columns : list, optional
        Forwarded to `SpecLibFlat.__init__`,
        by default ['type','number','position','charge','loss_type']

    **kwargs
        Accepted but not used; extra keyword arguments are not forwarded
        to `SpecLibFlat.__init__`.
    """

    def __init__(
        self,
        min_fragment_intensity: float = 0.001,
        keep_top_k_fragments: int = 1000,
        custom_fragment_df_columns: list = None,
        **kwargs,
    ):
        # Build the default per call: a list literal in the signature would be
        # one object shared by every instance constructed without this argument.
        if custom_fragment_df_columns is None:
            custom_fragment_df_columns = [
                "type",
                "number",
                "position",
                "charge",
                "loss_type",
            ]
        super().__init__(
            min_fragment_intensity=min_fragment_intensity,
            keep_top_k_fragments=keep_top_k_fragments,
            custom_fragment_df_columns=custom_fragment_df_columns,
        )

    def predict_and_parse_lib_in_batch(
        self, predict_lib: PredictSpecLib, batch_size: int = 200000
    ):
        """Predict and flatten fragments in batch

        Parameters
        ----------
        predict_lib : PredictSpecLib
            spectral library to be predicted and flatten

        batch_size : int, optional
            the batch size, by default 200000

        Raises
        ------
        ValueError
            If batching is required (more precursors than `batch_size`)
            and `batch_size` is not positive.

        Notes
        -----
        In the batched path `predict_lib.model_manager.verbose` is switched
        off while the batches run and restored afterwards, and
        `predict_lib._precursor_df` is reset to the full (refined) precursor
        dataframe even when a batch fails.
        """
        logging.info(
            f"Flattening {len(predict_lib.precursor_df)} precursors in batch size {batch_size} ..."
        )
        if len(predict_lib.precursor_df) <= batch_size:
            predict_lib.predict_all()
            self.parse_base_library(predict_lib)
            return

        # Fail before touching any state: zero would make `range()` raise an
        # unrelated-looking error and a negative step would yield no batches.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        model_manager = predict_lib.model_manager
        previous_verbose = model_manager.verbose
        # Silence per-batch logging of `predict_all()`.
        model_manager.verbose = False
        precursor_df_list = []
        fragment_df_list = []
        try:
            predict_lib.refine_df()
            full_precursor_df = predict_lib.precursor_df
            try:
                for start in tqdm.tqdm(range(0, len(full_precursor_df), batch_size)):
                    # Copy the slice so per-batch prediction cannot write
                    # into the full dataframe.
                    predict_lib._precursor_df = full_precursor_df.iloc[
                        start : start + batch_size
                    ].copy()
                    predict_lib.predict_all()
                    flat_df, frag_df = flatten_fragments(
                        predict_lib.precursor_df,
                        predict_lib.fragment_mz_df,
                        predict_lib.fragment_intensity_df,
                        min_fragment_intensity=self.min_fragment_intensity,
                        keep_top_k_fragments=self.keep_top_k_fragments,
                        custom_columns=self.custom_fragment_df_columns,
                    )
                    precursor_df_list.append(flat_df)
                    fragment_df_list.append(frag_df)
            finally:
                # Never leave the library holding just one batch slice.
                predict_lib._precursor_df = full_precursor_df
        finally:
            model_manager.verbose = previous_verbose

        self._precursor_df, self._fragment_df = concat_precursor_fragment_dataframes(
            precursor_df_list, fragment_df_list
        )