Source code for peptdeep.rescore.feature_extractor

import pandas as pd
import numpy as np
import os

import torch
import torch.multiprocessing as mp

from alphabase.peptide.fragment import get_charged_frag_types
from alphabase.peptide.precursor import refine_precursor_df
from alphabase.peptide.fragment import concat_precursor_fragment_dataframes

from peptdeep.pretrained_models import ModelManager
from peptdeep.model.ms2 import calc_ms2_similarity
from peptdeep.mass_spec.match import PepSpecMatch

from peptdeep.rescore.fdr import calc_fdr_for_df
from peptdeep.utils import process_bar, logging
from peptdeep.settings import global_settings
# perc_settings = global_settings['percolator']


[docs] def match_one_raw( psm_df_one_raw, ms2_file, ms2_file_type, frag_types_to_match, ms2_ppm, ms2_tol, calibrate_frag_mass_error, ): """Internal function""" match = PepSpecMatch(charged_frag_types=frag_types_to_match) (psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df) = ( match.match_ms2_one_raw( refine_precursor_df(psm_df_one_raw), ms2_file=ms2_file, ms2_file_type=ms2_file_type, ppm=ms2_ppm, tol=ms2_tol, ) ) if calibrate_frag_mass_error: from peptdeep.mass_spec.mass_calibration import MassCalibratorForRT_KNN frag_mass_calibrator = MassCalibratorForRT_KNN() _df_fdr = psm_df.query("fdr<0.01") frag_mass_calibrator.fit(_df_fdr, matched_mz_err_df) matched_mz_err_df = frag_mass_calibrator.calibrate(psm_df, matched_mz_err_df) return (psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df)
[docs] def get_psm_scores( psm_df: pd.DataFrame, predict_intensity_df: pd.DataFrame, matched_intensity_df: pd.DataFrame, matched_mass_err_df: pd.DataFrame, ) -> pd.DataFrame: """ AlphaPeptDeep has a built-in score for PSMs, it works much better than other scores such as X!Tandem Parameters ---------- psm_df : pd.DataFrame PSM DataFrame predict_intensity_df : pd.DataFrame Predict intensity DataFrame matched_intensity_df : pd.DataFrame Matched intensity DataFrame matched_mass_err_df : pd.DataFrame Matched mass error DataFrame Returns ------- DataFrame `psm_df` with "*_score" columns appended inplace """ matched_norm_intensity_df = pd.DataFrame( np.log(matched_intensity_df.values + 1), columns=matched_intensity_df.columns.values, ) matched_merr_weight_df = matched_mass_err_df.mask( matched_mass_err_df > 1000000, 0 ).abs() max_merr = matched_merr_weight_df.values.max() if max_merr > 0: matched_merr_weight_df /= max_merr matched_merr_weight_df = 1 - matched_merr_weight_df.pow(4) peak_score_df = matched_norm_intensity_df * matched_merr_weight_df pred_weighted_score_df = peak_score_df * predict_intensity_df def _get_one_score( frag_start_end, peak_score_values, pred_weighted_score_values, ): frag_start, frag_end = frag_start_end frag_ratio = (peak_score_values[frag_start:frag_end] > 0).mean() ** 0.5 return ( peak_score_values[frag_start:frag_end].sum() * frag_ratio, pred_weighted_score_values[frag_start:frag_end].sum() * frag_ratio, ) ( psm_df["merr_weighted_score"], psm_df["pred_weighted_score"], ) = zip( *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( _get_one_score, axis=1, peak_score_values=peak_score_df.values, pred_weighted_score_values=pred_weighted_score_df.values, ) ) return psm_df
[docs] def get_ms2_features( psm_df, frag_types, predict_intensity_df, matched_intensity_df, matched_mass_err_df, ) -> pd.DataFrame: """Extract ms2 features from the given predict_intensity_df and matched_intensity_df. It will add columns into psm_df: - cos: cosine similarity between predicted and matched fragments - pcc: pearson correlation between predicted and matched fragments - sa: spectral angle between predicted and matched fragments - spc: Spearman's rank correlation between predicted and matched fragments. - cos_bion: ... - cos_yion: ... - pcc_bion: ... - pcc_yion: ... - sa_bion: ... - sa_yion: ... - spc_bion: ... - spc_yion: ... - matched_frag_ratio: # matched fragments / # total b+y fragments - matched_bion_ratio: # matched b fragments / # total b fragments - matched_yion_ratio: # matched y fragments / # total y fragments - and more ... """ used_frag_types = frag_types predict_intensity_df = predict_intensity_df[used_frag_types] def _get_frag_features( frag_start_end, matched_inten_values, predicted_inten_values, has_matched_intens, has_predicted_intens, has_both_matched_predicted, ): frag_start, frag_end = frag_start_end matched_frag_num = has_matched_intens[frag_start:frag_end].sum(dtype=np.float32) pred_frag_num = has_predicted_intens[frag_start:frag_end].sum(dtype=np.float32) matched_frag_ratio = matched_frag_num / ( matched_inten_values.shape[1] * (frag_end - frag_start) ) both_matched_pred_frag_num = has_both_matched_predicted[ frag_start:frag_end ].sum(dtype=np.float32) matched_not_pred_frag_num = ( has_matched_intens[frag_start:frag_end] & ~has_both_matched_predicted[frag_start:frag_end] ).sum(dtype=np.float32) pred_not_matched_frag_num = ( has_predicted_intens[frag_start:frag_end] & ~has_both_matched_predicted[frag_start:frag_end] ).sum(dtype=np.float32) if matched_frag_num > 0: both_matched_pred_frag_to_matched = ( both_matched_pred_frag_num / matched_frag_num ) matched_not_pred_frag_ratio = matched_not_pred_frag_num / matched_frag_num else: both_matched_pred_frag_to_matched = 0 matched_not_pred_frag_ratio = 0 if pred_frag_num > 0: both_matched_pred_frag_to_pred = both_matched_pred_frag_num / pred_frag_num pred_not_matched_frag_ratio = pred_not_matched_frag_num / pred_frag_num else: both_matched_pred_frag_to_pred = 0 pred_not_matched_frag_ratio = 0 matched_frag_rel_to_pred = matched_inten_values[frag_start:frag_end][ has_predicted_intens[frag_start:frag_end] ].sum() if matched_frag_rel_to_pred > 0: matched_frag_rel_to_pred /= matched_inten_values[frag_start:frag_end].sum() pred_frag_rel_to_matched = predicted_inten_values[frag_start:frag_end][ has_matched_intens[frag_start:frag_end] ].sum() if pred_frag_rel_to_matched > 0: pred_frag_rel_to_matched /= predicted_inten_values[ frag_start:frag_end ].sum() return ( matched_frag_num, matched_frag_ratio, both_matched_pred_frag_num, both_matched_pred_frag_to_matched, both_matched_pred_frag_to_pred, matched_not_pred_frag_num, matched_not_pred_frag_ratio, pred_not_matched_frag_num, pred_not_matched_frag_ratio, matched_frag_rel_to_pred, pred_frag_rel_to_matched, ) psm_df, ms2_metrics_df = calc_ms2_similarity( psm_df, predict_intensity_df, matched_intensity_df, charged_frag_types=used_frag_types, metrics=["COS", "SA", "SPC", "PCC"], spc_top_k=perc_settings["top_k_frags_to_calc_spc"], ) psm_df.rename( columns={ "COS": "cos", "SA": "sa", "SPC": "spc", "PCC": "pcc", }, inplace=True, ) psm_df = get_psm_scores( psm_df, predict_intensity_df=predict_intensity_df[used_frag_types], matched_intensity_df=matched_intensity_df[used_frag_types], matched_mass_err_df=matched_mass_err_df[used_frag_types], ) psm_df.rename( columns={ "merr_weighted_score": "merr_weighted_frag_score", "pred_weighted_score": "pred_weighted_frag_score", }, inplace=True, ) has_matched_intens = matched_intensity_df[used_frag_types].values > 0 has_predicted_intens = predict_intensity_df[used_frag_types].values > 0.001 has_both_matched_predicted = has_matched_intens & has_predicted_intens ( psm_df["matched_frag_num"], psm_df["matched_frag_ratio"], psm_df["both_matched_pred_frag_num"], psm_df["both_matched_pred_frag_to_matched"], psm_df["both_matched_pred_frag_to_pred"], psm_df["matched_not_pred_frag_num"], psm_df["matched_not_pred_frag_ratio"], psm_df["pred_not_matched_frag_num"], psm_df["pred_not_matched_frag_ratio"], psm_df["matched_frag_rel_to_pred"], psm_df["pred_frag_rel_to_matched"], ) = zip( *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( _get_frag_features, axis=1, matched_inten_values=matched_intensity_df[used_frag_types].values, predicted_inten_values=predict_intensity_df[used_frag_types].values, has_matched_intens=has_matched_intens, has_predicted_intens=has_predicted_intens, has_both_matched_predicted=has_both_matched_predicted, ) ) b_frag_types = [_t for _t in used_frag_types if _t.startswith("b")] if len(b_frag_types) > 0: psm_df, ms2_metrics_df = calc_ms2_similarity( psm_df, predict_intensity_df, matched_intensity_df, charged_frag_types=b_frag_types, metrics=["COS", "SA", "SPC", "PCC"], ) psm_df.rename( columns={ "COS": "cos_bion", "SA": "sa_bion", "SPC": "spc_bion", "PCC": "pcc_bion", }, inplace=True, ) psm_df = get_psm_scores( psm_df, predict_intensity_df=predict_intensity_df[b_frag_types], matched_intensity_df=matched_intensity_df[b_frag_types], matched_mass_err_df=matched_mass_err_df[b_frag_types], ) psm_df.rename( columns={ "merr_weighted_score": "merr_weighted_bion_score", "pred_weighted_score": "pred_weighted_bion_score", }, inplace=True, ) has_matched_intens = matched_intensity_df[b_frag_types].values > 0 has_predicted_intens = predict_intensity_df[b_frag_types].values > 0 has_both_matched_predicted = has_matched_intens & has_predicted_intens ( psm_df["matched_bion_num"], psm_df["matched_bion_ratio"], psm_df["both_matched_pred_bion_num"], psm_df["both_matched_pred_bion_to_matched"], psm_df["both_matched_pred_bion_to_pred"], psm_df["matched_not_pred_bion_num"], psm_df["matched_not_pred_bion_ratio"], psm_df["pred_not_matched_bion_num"], psm_df["pred_not_matched_bion_ratio"], psm_df["matched_bion_rel_to_pred"], psm_df["pred_bion_rel_to_matched"], ) = zip( *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( _get_frag_features, axis=1, matched_inten_values=matched_intensity_df[b_frag_types].values, predicted_inten_values=predict_intensity_df[b_frag_types].values, has_matched_intens=has_matched_intens, has_predicted_intens=has_predicted_intens, has_both_matched_predicted=has_both_matched_predicted, ) ) else: psm_df[ [ "matched_bion_num", "matched_bion_ratio", "both_matched_pred_bion_num", "both_matched_pred_bion_to_matched", "both_matched_pred_bion_to_pred", "matched_not_pred_bion_num", "matched_not_pred_bion_ratio", "pred_not_matched_bion_num", "pred_not_matched_bion_ratio", "matched_bion_rel_to_pred", "pred_bion_rel_to_matched", ] ] = 0 y_frag_types = [_t for _t in used_frag_types if _t.startswith("y")] if len(y_frag_types) > 0: psm_df, ms2_metrics_df = calc_ms2_similarity( psm_df, predict_intensity_df, matched_intensity_df, charged_frag_types=y_frag_types, metrics=["COS", "SA", "SPC", "PCC"], ) psm_df.rename( columns={ "COS": "cos_yion", "SA": "sa_yion", "SPC": "spc_yion", "PCC": "pcc_yion", }, inplace=True, ) psm_df = get_psm_scores( psm_df, predict_intensity_df=predict_intensity_df[b_frag_types], matched_intensity_df=matched_intensity_df[b_frag_types], matched_mass_err_df=matched_mass_err_df[b_frag_types], ) psm_df.rename( columns={ "merr_weighted_score": "merr_weighted_yion_score", "pred_weighted_score": "pred_weighted_yion_score", }, inplace=True, ) has_matched_intens = matched_intensity_df[y_frag_types].values > 0 has_predicted_intens = predict_intensity_df[y_frag_types].values > 0 has_both_matched_predicted = has_matched_intens & has_predicted_intens ( psm_df["matched_yion_num"], psm_df["matched_yion_ratio"], psm_df["both_matched_pred_yion_num"], psm_df["both_matched_pred_yion_to_matched"], psm_df["both_matched_pred_yion_to_pred"], psm_df["matched_not_pred_yion_num"], psm_df["matched_not_pred_yion_ratio"], psm_df["pred_not_matched_yion_num"], psm_df["pred_not_matched_yion_ratio"], psm_df["matched_yion_rel_to_pred"], psm_df["pred_yion_rel_to_matched"], ) = zip( *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( _get_frag_features, axis=1, matched_inten_values=matched_intensity_df[y_frag_types].values, predicted_inten_values=predict_intensity_df[y_frag_types].values, has_matched_intens=has_matched_intens, has_predicted_intens=has_predicted_intens, has_both_matched_predicted=has_both_matched_predicted, ) ) else: psm_df[ [ "matched_yion_num", "matched_yion_ratio", "both_matched_pred_yion_num", "both_matched_pred_yion_to_matched", "both_matched_pred_yion_to_pred", "matched_not_pred_yion_num", "matched_not_pred_yion_ratio", "pred_not_matched_yion_num", "pred_not_matched_yion_ratio", "matched_yion_rel_to_pred", "pred_yion_rel_to_matched", ] ] = 0 def _charge_one_hot(ch): x = [0] * 7 if ch > 6: x[-1] = 1 else: x[ch - 1] = 1 return tuple(x) ( psm_df["pep_z1"], psm_df["pep_z2"], psm_df["pep_z3"], psm_df["pep_z4"], psm_df["pep_z5"], psm_df["pep_z6"], psm_df["pep_z_gt_6"], ) = zip(*psm_df.charge.astype(np.int8).apply(_charge_one_hot)) def _mod_count(mods): if not mods: return 0 mod_count = 0 for mod in mods.split(";"): if mod != "Carbamidomethyl@C": mod_count += 1 return mod_count psm_df["mod_num"] = psm_df.mods.apply(_mod_count) return psm_df
# for imap/imap_unordered with multiprocessing.Pool()
[docs] def match_one_raw_mp(args): return match_one_raw(*args)
# for imap/imap_unordered with multiprocessing.Pool()
[docs] def get_ms2_features_mp(args): return get_ms2_features(*args)
[docs] class ScoreFeatureExtractor: """ScoreFeatureExtractor: Feature extractor for percolator with a single process. Parameters ---------- model_mgr : ModelManager The ModelManager in peptdeep.pretrained_models. """
[docs] def __init__(self, model_mgr: ModelManager): self.model_mgr = model_mgr self.model_mgr.verbose = False self.raw_num_to_tune = perc_settings["raw_num_to_tune"] self.score_feature_list = [ "sa", "spc", "pcc", "sa_bion", "spc_bion", "pcc_bion", "sa_yion", "spc_yion", "pcc_yion", "rt_delta_abs", "mobility_delta_abs", "merr_weighted_frag_score", "pred_weighted_frag_score", "merr_weighted_bion_score", "pred_weighted_bion_score", "merr_weighted_yion_score", "pred_weighted_yion_score", "matched_frag_num", "matched_frag_ratio", "both_matched_pred_frag_num", "both_matched_pred_frag_to_matched", "both_matched_pred_frag_to_pred", "matched_not_pred_frag_num", "matched_not_pred_frag_ratio", "pred_not_matched_frag_num", "pred_not_matched_frag_ratio", "matched_frag_rel_to_pred", "pred_frag_rel_to_matched", "matched_bion_num", "matched_bion_ratio", "both_matched_pred_bion_num", "both_matched_pred_bion_to_matched", "both_matched_pred_bion_to_pred", "matched_not_pred_bion_num", "matched_not_pred_bion_ratio", "pred_not_matched_bion_num", "pred_not_matched_bion_ratio", "matched_bion_rel_to_pred", "pred_bion_rel_to_matched", "matched_yion_num", "matched_yion_ratio", "both_matched_pred_yion_num", "both_matched_pred_yion_to_matched", "both_matched_pred_yion_to_pred", "matched_not_pred_yion_num", "matched_not_pred_yion_ratio", "pred_not_matched_yion_num", "pred_not_matched_yion_ratio", "matched_yion_rel_to_pred", "pred_yion_rel_to_matched", "pep_z1", "pep_z2", "pep_z3", "pep_z4", "pep_z5", "pep_z6", "pep_z_gt_6", "mod_num", ] self.reset_by_global_settings()
[docs] def reset_by_global_settings(self): self.require_model_tuning = perc_settings["require_model_tuning"] self.require_raw_specific_tuning = perc_settings["require_raw_specific_tuning"] self.raw_specific_ms2_tuning = perc_settings["raw_specific_ms2_tuning"] self.calibrate_frag_mass_error = perc_settings["calibrate_frag_mass_error"]
def _select_raw_to_tune( self, psm_df: pd.DataFrame, ) -> tuple: """Randomly select `self.raw_num_to_tune` raw files to tune the models. If # raw files is less than `self.raw_num_to_tune`, all raw files will be used to tune the model. Parameters ---------- psm_df : pd.DataFrame dataframe contains PSMs of all raw files. Returns ------- df_groupby_raw psm_df.groupby('raw_name') list selected raw_name list """ if "fdr" not in psm_df.columns: psm_df = calc_fdr_for_df(psm_df, "score") df_fdr = psm_df[(psm_df.fdr < 0.01) & (psm_df.decoy == 0)] df_groupby_raw = df_fdr.groupby("raw_name") if df_groupby_raw.ngroups < self.raw_num_to_tune: tune_raw_num = df_groupby_raw.ngroups else: tune_raw_num = self.raw_num_to_tune raw_list = list( df_groupby_raw["score"] .count() .rank(ascending=True) .nlargest(tune_raw_num) .index ) return df_groupby_raw, raw_list
[docs] def fine_tune_models( self, psm_df: pd.DataFrame, ms2_file_dict: dict, ms2_file_type: str, frag_types_to_match: str, ms2_ppm: bool, ms2_tol: float, ): """Sample some (n=`self.raw_num_to_tune`) from ms2 files, and extract spectrum/peak information, and then fine-tune the models. Parameters ---------- psm_df : pd.DataFrame psm_df ms2_file_dict : dict {raw_name: ms2_file_path} ms2_file_type : str ms2_file_type, could be 'alphapept', 'mgf', 'thermo_raw' frag_types_to_match : str ['b_z1','b_z2','y_z1'...] ms2_ppm : bool is ppm tolerance for ms2 matching ms2_tol : float tolerance value for ms2 matching """ logging.info("Preparing for fine-tuning ...") (df_groupby_raw, raw_list) = self._select_raw_to_tune(psm_df) psm_df_list = [] matched_intensity_df_list = [] for raw_name, df in process_bar(df_groupby_raw, df_groupby_raw.ngroups): if raw_name not in raw_list or raw_name not in ms2_file_dict: continue (df, _, inten_df, _) = match_one_raw( df, ms2_file_dict[raw_name], ms2_file_type, frag_types_to_match, ms2_ppm, ms2_tol, self.calibrate_frag_mass_error, ) psm_df_list.append(df) matched_intensity_df_list.append(inten_df) logging.info("Fine-tuning ...") if len(psm_df_list) == 0: return self._tune( *concat_precursor_fragment_dataframes( psm_df_list, matched_intensity_df_list ) ) logging.info("Fine-tuning done")
def _save_models(self): # save the model for future uses model_folder = os.path.join(perc_settings["output_folder"], "tuned_models") self.model_mgr.save_models(model_folder) with open( os.path.join(model_folder, "grid_instrument_nce_search.txt"), "w" ) as f: f.write( f"# The ms2 model is tuned for following instrument and nce, after grid instrument and nce search.\n" ) f.write(f"instrument={self.model_mgr.instrument}\n") f.write(f"nce={self.model_mgr.nce}\n") def _tune(self, psm_df, matched_intensity_df): self.model_mgr.train_ccs_model(psm_df) self.model_mgr.train_rt_model(psm_df) _grid_nce = self.model_mgr.use_grid_nce_search if self.model_mgr.ms2_model.device_type == "cpu": self.model_mgr.use_grid_nce_search = False self.model_mgr.train_ms2_model(psm_df, matched_intensity_df) self.model_mgr.use_grid_nce_search = _grid_nce self._save_models()
[docs] def extract_rt_features(self, psm_df): if ( self.require_raw_specific_tuning and self.model_mgr.ms2_model.device_type != "cpu" ): ( psm_num_to_train_rt_ccs, psm_num_per_mod_to_train_rt_ccs, epoch_to_train_rt_ccs, ) = ( self.model_mgr.psm_num_to_train_rt_ccs, self.model_mgr.psm_num_per_mod_to_train_rt_ccs, self.model_mgr.epoch_to_train_rt_ccs, ) (self.model_mgr.psm_num_to_train_rt_ccs) = perc_settings[ "psm_num_per_raw_to_tune" ] self.model_mgr.psm_num_per_mod_to_train_rt_ccs = 0 (self.model_mgr.epoch_to_train_rt_ccs) = perc_settings[ "epoch_per_raw_to_tune" ] self.model_mgr.train_rt_model( psm_df[(psm_df.fdr < 0.01) & (psm_df.decoy == 0)] ) ( self.model_mgr.psm_num_to_train_rt_ccs, self.model_mgr.psm_num_per_mod_to_train_rt_ccs, self.model_mgr.epoch_to_train_rt_ccs, ) = ( psm_num_to_train_rt_ccs, psm_num_per_mod_to_train_rt_ccs, epoch_to_train_rt_ccs, ) if "rt_norm" in psm_df.columns: psm_df = self.model_mgr.predict_rt(psm_df) psm_df["rt_delta"] = psm_df.rt_pred - psm_df.rt_norm mean_delta = psm_df.loc[ (psm_df.fdr < 0.01) & (psm_df.decoy == 0), "rt_delta" ].mean() if np.isnan(mean_delta): mean_delta = 0 psm_df["rt_delta_abs"] = (psm_df.rt_delta - mean_delta).abs() else: psm_df["rt_delta"] = 0 psm_df["rt_delta_abs"] = 0
[docs] def extract_mobility_features(self, psm_df): if "mobility" in psm_df.columns: psm_df = self.model_mgr.predict_mobility(psm_df) psm_df["mobility_delta"] = psm_df.mobility_pred - psm_df.mobility mean_delta = psm_df.loc[ (psm_df.fdr < 0.01) & (psm_df.decoy == 0), "mobility_delta" ].mean() if np.isnan(mean_delta): mean_delta = 0 psm_df["mobility_delta_abs"] = (psm_df.mobility_delta - mean_delta).abs() else: psm_df["mobility_delta"] = 0 psm_df["mobility_delta_abs"] = 0
[docs] def match_ms2( self, psm_df: pd.DataFrame, ms2_file_dict, # raw_name: ms2_file_path or ms_reader object ms2_file_type: str, frag_types_to_match: list = get_charged_frag_types(["b", "y"], 2), ms2_ppm=True, ms2_tol=20, ): self.match = PepSpecMatch(charged_frag_types=frag_types_to_match) self.match.match_ms2_centroid( refine_precursor_df(psm_df), ms2_file_dict=ms2_file_dict, ms2_file_type=ms2_file_type, ppm=ms2_ppm, tol=ms2_tol, )
def _get_model_frag_types(self, frag_types): used_frag_types = [] for frag_type in frag_types: if frag_type in (self.model_mgr.ms2_model.charged_frag_types): used_frag_types.append(frag_type) return used_frag_types
[docs] def extract_features( self, psm_df: pd.DataFrame, ms2_file_dict, ms2_file_type, frag_types: list = get_charged_frag_types(["b", "y"], 2), ms2_ppm=global_settings["peak_matching"]["ms2_ppm"], ms2_tol=global_settings["peak_matching"]["ms2_tol_value"], ) -> pd.DataFrame: """Extract features and add columns (`self.score_feature_list`) into psm_df Parameters ---------- psm_df : pd.DataFrame psm dataframe to extract features ms2_file_dict : [type] MS2 file path dict: {raw_name: ms2_path} ms2_file_type : str, optional MS2 file type, coult be 'alphapept', 'mgf', or 'raw'. frag_types : list, optional fragment types. Defaults to `alphabase.fragment.get_charged_frag_types(['b','y'], 2)`. ms2_ppm : bool, optional Matching MS2 mass tolerance unit. Defaults to True. ms2_tol : int, optional Matching mass tolerance. Defaults to 20. Returns ------- pd.DataFrame psm_df with feature columns added """ frag_types = self._get_model_frag_types(frag_types) if self.require_model_tuning: logging.info("Fine-tuning models ...") self.fine_tune_models( psm_df, ms2_file_dict, ms2_file_type, frag_types, ms2_ppm, ms2_tol ) logging.info(f"Extracting peptdeep features for {len(psm_df)} PSMs ...") result_psm_list = [] groupby = psm_df.groupby("raw_name") for raw_name, df in process_bar(groupby, groupby.ngroups): if raw_name not in ms2_file_dict: continue (df, frag_mz_df, frag_inten_df, frag_merr_df) = match_one_raw( df, ms2_file_dict[raw_name], ms2_file_type, frag_types, ms2_ppm, ms2_tol, self.calibrate_frag_mass_error, ) self.extract_rt_features(df) self.extract_mobility_features(df) predict_inten_df = self.model_mgr.predict_ms2(df) result_psm_list.append( get_ms2_features( df, frag_types, predict_inten_df, frag_inten_df, frag_merr_df, ) ) self.psm_df = pd.concat(result_psm_list, ignore_index=True) logging.info("Finish extracting features") return self.psm_df
[docs] class ScoreFeatureExtractorMP(ScoreFeatureExtractor):
[docs] def __init__(self, model_mgr: ModelManager): """ScoreFeatureExtractorMP: Feature extractor for percolator with multiprocessing. Parameters ---------- model_mgr : ModelManager The ModelManager in peptdeep.pretrained_models. """ super().__init__(model_mgr=model_mgr) # share_memory to save memory self.model_mgr.ms2_model.model.share_memory() self.model_mgr.rt_model.model.share_memory() self.model_mgr.ccs_model.model.share_memory()
[docs] def fine_tune_models( self, psm_df, ms2_file_dict, ms2_file_type, frag_types_to_match, ms2_ppm, ms2_tol, ): """Sample some (n=`self.raw_num_to_tune`) from ms2 files, and extract (MP) spectrum/peak information, and then fine-tune the models. Parameters ---------- psm_df : pd.DataFrame psm_df ms2_file_dict : dict {raw_name: ms2_file_path} ms2_file_type : str ms2_file_type, could be 'alphapept', 'mgf', 'thermo_raw' frag_types_to_match : str ['b_z1','b_z2','y_z1'...] ms2_ppm : bool is ppm tolerance for ms2 matching ms2_tol : float tolerance value for ms2 matching """ (df_groupby_raw, raw_list) = self._select_raw_to_tune(psm_df) def one_raw_param_generator(df_groupby_raw): for raw_name, df in df_groupby_raw: if raw_name not in raw_list or raw_name not in ms2_file_dict: continue yield ( df, ms2_file_dict[raw_name], ms2_file_type, frag_types_to_match, ms2_ppm, ms2_tol, self.calibrate_frag_mass_error, ) logging.info("Preparing for fine-tuning ...") psm_df_list = [] matched_intensity_df_list = [] with mp.get_context("spawn").Pool(global_settings["thread_num"]) as p: for df, _, inten_df, _ in process_bar( p.imap_unordered( match_one_raw_mp, one_raw_param_generator(df_groupby_raw) ), df_groupby_raw.ngroups, ): psm_df_list.append(df) matched_intensity_df_list.append(inten_df) logging.info("Fine-tuning ...") if len(psm_df_list) == 0: return self._tune( *concat_precursor_fragment_dataframes( psm_df_list, matched_intensity_df_list ) )
[docs] def extract_features_one_raw_mp(self, args): return self.extract_features_one_raw(*args)
[docs] def extract_features_one_raw( self, df_one_raw: pd.DataFrame, ms2_file, ms2_file_type, frag_types, ms2_ppm, ms2_tol, calibrate_frag_mass_error, ): (df, frag_mz_df, frag_inten_df, frag_merr_df) = match_one_raw( df_one_raw, ms2_file, ms2_file_type, frag_types, ms2_ppm, ms2_tol, calibrate_frag_mass_error, ) self.extract_rt_features(df) self.extract_mobility_features(df) predict_inten_df = self.model_mgr.predict_ms2(df) return get_ms2_features( df, frag_types, predict_inten_df, frag_inten_df, frag_merr_df, )
[docs] def extract_features( self, psm_df: pd.DataFrame, ms2_file_dict, ms2_file_type, frag_types: list = get_charged_frag_types(["b", "y"], 2), ms2_ppm=global_settings["peak_matching"]["ms2_ppm"], ms2_tol=global_settings["peak_matching"]["ms2_tol_value"], ) -> pd.DataFrame: """Extract (multiprocessing) features and add columns (self.score_feature_list) into psm_df. Parameters ---------- psm_df : pd.DataFrame psm dataframe to extract features ms2_file_dict : [type] MS2 file path dict: {raw_name: ms2_path} ms2_file_type : str, optional MS2 file type, coult be 'alphapept', 'mgf', or 'thermo'. frag_types : list, optional fragment types. Defaults to `alphabase.fragment.get_charged_frag_types(['b','y'], 2)`. ms2_ppm : bool, optional Matching MS2 mass tolerance unit. Defaults to True. ms2_tol : int, optional Matching mass tolerance. Defaults to 20. Returns ------- pd.DataFrame psm_df with feature columns added """ used_frag_types = self._get_model_frag_types(frag_types) if self.require_model_tuning: logging.info("Require fine-tuning models ...") self.fine_tune_models( psm_df, ms2_file_dict, ms2_file_type, used_frag_types, ms2_ppm, ms2_tol ) self.model_mgr._train_psm_logging = False def one_raw_param_generator(df_groupby_raw): for raw_name, df in df_groupby_raw: if raw_name not in ms2_file_dict: continue yield ( df, ms2_file_dict[raw_name], ms2_file_type, used_frag_types, ms2_ppm, ms2_tol, self.calibrate_frag_mass_error, ) logging.info( f"Extracting peptdeep features for {len(psm_df)} PSMs with multiprocessing ..." ) df_groupby_raw = psm_df.groupby("raw_name") result_psm_list = [] if ( self.require_raw_specific_tuning or self.model_mgr.ms2_model.device_type != "cpu" ): # multiprocessing is only used for ms2 matching def prediction_gen(df_groupby_raw): with mp.get_context("spawn").Pool(global_settings["thread_num"]) as _p: for ( df, frag_mz_df, frag_inten_df, frag_merr_df, ) in _p.imap_unordered( match_one_raw_mp, one_raw_param_generator(df_groupby_raw) ): # outsite multiprocessing region self.extract_rt_features(df) self.extract_mobility_features(df) if ( self.require_raw_specific_tuning and self.raw_specific_ms2_tuning ): ( psm_num_to_train_ms2, psm_num_per_mod_to_train_ms2, epoch_to_train_ms2, use_grid_nce_search, ) = ( self.model_mgr.psm_num_to_train_ms2, self.model_mgr.psm_num_per_mod_to_train_ms2, self.model_mgr.epoch_to_train_ms2, self.model_mgr.use_grid_nce_search, ) (self.model_mgr.psm_num_to_train_ms2) = perc_settings[ "psm_num_per_raw_to_tune" ] self.model_mgr.psm_num_per_mod_to_train_ms2 = 0 self.model_mgr.epoch_to_train_ms2 = 3 self.model_mgr.use_grid_nce_search = False if "nce" not in df.columns: self.model_mgr.set_default_nce(df) self.model_mgr.train_ms2_model( df[(df.fdr < 0.01) & (df.decoy == 0)], frag_inten_df ) ( self.model_mgr.psm_num_to_train_ms2, self.model_mgr.psm_num_per_mod_to_train_ms2, self.model_mgr.epoch_to_train_ms2, self.model_mgr.use_grid_nce_search, ) = ( psm_num_to_train_ms2, psm_num_per_mod_to_train_ms2, epoch_to_train_ms2, use_grid_nce_search, ) predict_inten_df = self.model_mgr.predict_ms2(df) yield ( df, used_frag_types, predict_inten_df, frag_inten_df, frag_merr_df, ) with mp.get_context("spawn").Pool(global_settings["thread_num"]) as p: for df in process_bar( p.imap_unordered( get_ms2_features_mp, prediction_gen(df_groupby_raw) ), df_groupby_raw.ngroups, ): result_psm_list.append(df) else: # use multiprocessing for prediction # only when no GPUs are available with mp.get_context("spawn").Pool(global_settings["thread_num"]) as p: for _df in process_bar( p.imap_unordered( self.extract_features_one_raw_mp, one_raw_param_generator(df_groupby_raw), ), df_groupby_raw.ngroups, ): result_psm_list.append(_df) self.psm_df = pd.concat(result_psm_list, ignore_index=True) logging.info("Finished feature extraction with multiprocessing") self.model_mgr._train_psm_logging = True return self.psm_df