import os
import psutil
import pandas as pd
import numpy as np
from typing import Union, Tuple
from alphabase.peptide.fragment import get_charged_frag_types
from alphabase.psm_reader import psm_reader_provider
from peptdeep.settings import global_settings
from peptdeep.protein.fasta import PredictSpecLibFasta
from peptdeep.spec_lib.translate import (
speclib_to_single_df,
mod_to_unimod_dict,
translate_to_tsv,
)
from peptdeep.pretrained_models import ModelManager
from peptdeep.utils import logging, read_peptide_table
[docs]
class PredictLibraryMakerBase(object):
    """
    Base class to predict libraries.

    The constructor builds ``self.spec_lib`` (a `PredictSpecLibFasta`) from
    the ``"library"`` section of `global_settings`. Sub-classes re-implement
    `_input()` to fill the precursor table; `make_library()` then runs the
    input, check and prediction steps in order.
    """

    def __init__(
        self,
        model_manager: ModelManager = None,
    ):
        """
        Parameters
        ----------
        model_manager : ModelManager, optional
            Forwarded unchanged to `PredictSpecLibFasta`. Defaults to None.
        """
        lib_settings = global_settings["library"]
        fasta_settings = lib_settings["fasta"]
        self.spec_lib = PredictSpecLibFasta(
            model_manager=model_manager,
            charged_frag_types=get_charged_frag_types(
                lib_settings["frag_types"],
                lib_settings["max_frag_charge"],
            ),
            protease=fasta_settings["protease"],
            max_missed_cleavages=fasta_settings["max_miss_cleave"],
            peptide_length_min=lib_settings["min_peptide_len"],
            peptide_length_max=lib_settings["max_peptide_len"],
            precursor_charge_min=lib_settings["min_precursor_charge"],
            precursor_charge_max=lib_settings["max_precursor_charge"],
            precursor_mz_min=lib_settings["min_precursor_mz"],
            precursor_mz_max=lib_settings["max_precursor_mz"],
            var_mods=lib_settings["var_mods"],
            min_var_mod_num=lib_settings["min_var_mod_num"],
            max_var_mod_num=lib_settings["max_var_mod_num"],
            fix_mods=lib_settings["fix_mods"],
            labeling_channels=lib_settings["labeling_channels"],
            special_mods=lib_settings["special_mods"],
            min_special_mod_num=lib_settings["min_special_mod_num"],
            max_special_mod_num=lib_settings["max_special_mod_num"],
            special_mods_cannot_modify_pep_n_term=lib_settings[
                "special_mods_cannot_modify_pep_n_term"
            ],
            special_mods_cannot_modify_pep_c_term=lib_settings[
                "special_mods_cannot_modify_pep_c_term"
            ],
            decoy=lib_settings["decoy"],
            include_contaminants=fasta_settings["add_contaminants"],
            # Hard-coded here; not read from the settings.
            I_to_L=False,
            generate_precursor_isotope=lib_settings["generate_precursor_isotope"],
            rt_to_irt=lib_settings["rt_to_irt"],
        )

    def _check_df(self) -> None:
        """Hook called by `make_library()` after `_input()`.

        The base implementation does nothing; sub-classes may override it.
        """

    def _input(self, infiles):
        """Virtual method to be re-implemented by sub-classes"""
        raise NotImplementedError("All sub-classes must re-implement '_input()' method")

    def _predict(self):
        """Run `predict_all()` on the underlying spectral library."""
        self.spec_lib.predict_all()

    @property
    def precursor_df(self) -> pd.DataFrame:
        """The `precursor_df` of the underlying spectral library."""
        return self.spec_lib.precursor_df

    @property
    def fragment_intensity_df(self) -> pd.DataFrame:
        """The `fragment_intensity_df` of the underlying spectral library."""
        return self.spec_lib.fragment_intensity_df

    @property
    def fragment_mz_df(self) -> pd.DataFrame:
        """The `fragment_mz_df` of the underlying spectral library."""
        return self.spec_lib.fragment_mz_df

    def make_library(self, infiles: Union[str, list, pd.DataFrame]):
        """Predict a library for the `infiles`,
        this function runs the following methods.

        - self._input(infiles)
        - self._check_df()
        - self._predict()

        Parameters
        ----------
        infiles
            Input file(s) or source, handed as-is to `self._input()`;
            the accepted form depends on the sub-class.

        Raises
        ------
        NotImplementedError
            If the sub-class does not re-implement `_input()`.

        Any exception raised by the three steps propagates unchanged.
        """
        logging.info("Generating the spectral library ...")
        self._input(infiles)
        logging.info(f"Loaded {len(self.spec_lib.precursor_df)} precursors.")
        self._check_df()
        self._predict()

        n_precursors = len(self.precursor_df)
        # Number of cells in the fragment m/z table, reported in millions.
        n_fragments_m = np.prod(self.fragment_mz_df.values.shape, dtype=float) * 1e-6
        # Resident memory of the current process, in GB.
        rss_gb = psutil.Process(os.getpid()).memory_info().rss / 1024**3
        logging.info(
            "Predicting the spectral library with "
            f"{n_precursors} precursors "
            f"and {n_fragments_m:.2f}M fragments "
            f"used {rss_gb:.4f} GB memory"
        )

    def translate_to_tsv(self, tsv_path: str, translate_mod_dict: dict = None):
        """Translate the predicted DataFrames into a TSV file

        Parameters
        ----------
        tsv_path : str
            Output path handed to the module-level `translate_to_tsv()`.
        translate_mod_dict : dict, optional
            Forwarded unchanged to `translate_to_tsv()`. Defaults to None.
        """
        logging.info(f"Translating to {tsv_path} for DiaNN/Spectronaut...")
        tsv_settings = global_settings["library"]["output_tsv"]
        if "proteins" not in self.spec_lib._precursor_df.columns:
            self.spec_lib.append_protein_name()
        # Inside a method body this name resolves to the imported module-level
        # function, not to this method.
        translate_to_tsv(
            self.spec_lib,
            tsv_path,
            keep_k_highest_fragments=tsv_settings["keep_higest_k_peaks"],
            min_frag_intensity=tsv_settings["min_relative_intensity"],
            min_frag_mz=tsv_settings["min_fragment_mz"],
            max_frag_mz=tsv_settings["max_fragment_mz"],
            batch_size=tsv_settings["translate_batch_size"],
            translate_mod_dict=translate_mod_dict,
        )

    def translate_library(self, translate_mod_dict: dict = None) -> pd.DataFrame:
        """Translate predicted DataFrames into
        a single DataFrame in SWATH library format

        Parameters
        ----------
        translate_mod_dict : dict, optional
            Forwarded unchanged to `speclib_to_single_df()`. Defaults to None.

        Returns
        -------
        pd.DataFrame
            The value returned by `speclib_to_single_df()`.
        """
        logging.info("Translating library for DiaNN/Spectronaut...")
        tsv_settings = global_settings["library"]["output_tsv"]
        if "proteins" not in self.spec_lib._precursor_df.columns:
            self.spec_lib.append_protein_name()
        return speclib_to_single_df(
            self.spec_lib,
            translate_mod_dict=translate_mod_dict,
            keep_k_highest_fragments=tsv_settings["keep_higest_k_peaks"],
            min_frag_intensity=tsv_settings["min_relative_intensity"],
            min_frag_mz=tsv_settings["min_fragment_mz"],
            max_frag_mz=tsv_settings["max_fragment_mz"],
        )
[docs]
def load_dfs(infiles):
    """Read one or more peptide tables and stack them into one DataFrame.

    Parameters
    ----------
    infiles
        A single file path (str) or an iterable of file paths.

    Returns
    -------
    pd.DataFrame
        All tables read with `read_peptide_table()`, concatenated in input
        order with a freshly numbered index.
    """
    # A lone path is treated as a one-element list.
    paths = [infiles] if isinstance(infiles, str) else infiles
    tables = [read_peptide_table(path) for path in paths]
    return pd.concat(tables, ignore_index=True)
[docs]
class PSMReaderLibraryMaker(PredictLibraryMakerBase):
    """Library maker whose precursor table is imported through a PSM reader."""

    def _input(self, psm_type_infiles: Tuple[str, Union[str, list]]):
        """Import PSM files and install the de-duplicated precursor table.

        Parameters
        ----------
        psm_type_infiles
            A ``(psm_type, infiles)`` pair: the reader type passed to
            `psm_reader_provider.get_reader()` and one path or a list of paths.
        """
        psm_type, infiles = psm_type_infiles
        if isinstance(infiles, str):
            infiles = [infiles]

        reader = psm_reader_provider.get_reader(psm_type)
        psm_df = reader.import_files(infiles)
        psm_df.drop_duplicates(
            subset=["sequence", "mods", "mod_sites", "charge"], inplace=True
        )

        # Only these columns survive; everything else the reader produced is dropped.
        kept_columns = (
            "sequence",
            "mods",
            "mod_sites",
            "charge",
            "proteins",
            "genes",
            "nAA",
        )
        unwanted = [col for col in psm_df.columns.values if col not in kept_columns]
        psm_df.drop(columns=unwanted, inplace=True)

        # Cast the text columns with astype("U"); the last two are optional.
        for col in ("sequence", "mods", "mod_sites"):
            psm_df[col] = psm_df[col].astype("U")
        for col in ("proteins", "genes"):
            if col in psm_df.columns:
                psm_df[col] = psm_df[col].astype("U")

        spec_lib = self.spec_lib
        spec_lib._precursor_df = psm_df
        spec_lib.append_decoy_sequence()
        spec_lib.add_peptide_labeling()
[docs]
class PrecursorLibraryMaker(PredictLibraryMakerBase):
    """For input dataframe of charged modified sequences"""

    def _input(self, infiles: Union[str, list, pd.DataFrame]):
        """Load the precursor table and install it on the spectral library.

        A DataFrame argument is used directly (and is de-duplicated in place);
        anything else is read through `load_dfs()`.

        Raises
        ------
        KeyError
            If the table has no ``"charge"`` column.
        """
        precursor_df = (
            infiles if isinstance(infiles, pd.DataFrame) else load_dfs(infiles)
        )
        if "charge" not in precursor_df.columns:
            raise KeyError('`precursor_table` must contain the "charge" column.')
        precursor_df.drop_duplicates(
            subset=["sequence", "mods", "mod_sites", "charge"], inplace=True
        )

        spec_lib = self.spec_lib
        spec_lib._precursor_df = precursor_df
        spec_lib.add_peptide_labeling()
        spec_lib.append_decoy_sequence()

    def _check_df(self):
        """Normalize column dtypes of the precursor table and reset `protein_df`.

        ``charge`` is cast to int8. When both ``mods`` and ``mod_sites`` exist
        they are cast with ``astype("U")``; if either is missing, both are set
        to empty strings.
        """
        spec_lib = self.spec_lib
        spec_lib.precursor_df["charge"] = spec_lib.precursor_df["charge"].astype(
            np.int8
        )

        columns = spec_lib.precursor_df.columns
        if "mods" in columns and "mod_sites" in columns:
            for col in ("mods", "mod_sites"):
                spec_lib.precursor_df[col] = spec_lib.precursor_df[col].astype("U")
        else:
            spec_lib.precursor_df["mods"] = ""
            spec_lib.precursor_df["mod_sites"] = ""

        # NOTE(review): applied on both branches — confirm against upstream layout.
        spec_lib.protein_df = pd.DataFrame()
[docs]
class PeptideLibraryMaker(PrecursorLibraryMaker):
    """For input dataframe of modified sequences"""

    def _input(self, infiles: Union[str, list, pd.DataFrame]):
        """Load the peptide table, then run decoy, labeling and charge steps.

        A DataFrame argument is used directly (and is de-duplicated in place);
        anything else is read through `load_dfs()`.
        """
        peptide_df = (
            infiles if isinstance(infiles, pd.DataFrame) else load_dfs(infiles)
        )
        peptide_df.drop_duplicates(
            subset=["sequence", "mods", "mod_sites"], inplace=True
        )

        spec_lib = self.spec_lib
        spec_lib._precursor_df = peptide_df
        spec_lib.append_decoy_sequence()
        spec_lib.add_peptide_labeling()
        spec_lib.add_charge()
[docs]
class SequenceLibraryMaker(PeptideLibraryMaker):
    """For input dataframe of AA sequences"""

    def _input(self, infiles: Union[str, list, pd.DataFrame]):
        """Load the sequence table and run the spectral-library build steps.

        A DataFrame argument is used directly (and is de-duplicated in place);
        anything else is read through `load_dfs()`.

        Raises
        ------
        KeyError
            If the table has no ``sequence`` column.
        """
        sequence_df = (
            infiles if isinstance(infiles, pd.DataFrame) else load_dfs(infiles)
        )
        if "sequence" not in sequence_df.columns:
            raise KeyError("`SequenceLibraryMaker` must contain `sequence` column")
        sequence_df.drop_duplicates(subset=["sequence"], inplace=True)

        spec_lib = self.spec_lib
        spec_lib._precursor_df = sequence_df
        # The steps below run in this fixed order.
        spec_lib.append_decoy_sequence()
        spec_lib.add_modifications()
        spec_lib.add_special_modifications()
        spec_lib.add_peptide_labeling()
        spec_lib.add_charge()
[docs]
class FastaLibraryMaker(PredictLibraryMakerBase):
    """For fasta or a list of fasta files"""

    def _input(self, fasta: Union[str, list]):
        """Build the precursor table from `fasta` via the spectral library.

        Parameters
        ----------
        fasta
            One fasta path or a list of them, passed unchanged to
            `get_peptides_from_fasta()`.
        """
        spec_lib = self.spec_lib
        spec_lib.get_peptides_from_fasta(fasta)
        # The steps below run in this fixed order.
        spec_lib.append_decoy_sequence()
        spec_lib.add_modifications()
        spec_lib.add_special_modifications()
        spec_lib.add_peptide_labeling()
        spec_lib.add_charge()
[docs]
class LibraryMakerProvider:
    """
    Factory class for library makers
    """

    def __init__(self):
        """Start with an empty registry."""
        # Maps lower-cased maker name -> maker class.
        self.library_maker_dict = {}

    def register_maker(self, maker_name: str, maker_class):
        """Register `maker_class` under `maker_name` (stored lower-cased).

        Registering an existing name replaces the previous entry.
        """
        self.library_maker_dict[maker_name.lower()] = maker_class

    def get_maker(
        self,
        maker_name: str,
        *,
        model_manager=None,
    ) -> "PredictLibraryMakerBase":
        """Instantiate the maker registered for `maker_name`.

        Lookup is case-insensitive. Explicitly registered makers take
        precedence; otherwise, a name known to `psm_reader_provider.reader_dict`
        yields a `PSMReaderLibraryMaker`.

        Parameters
        ----------
        maker_name : str
            Name of the maker or of a PSM reader type.
        model_manager : optional
            Passed as the only positional argument to the maker class.

        Raises
        ------
        KeyError
            If `maker_name` is neither registered nor a known PSM reader type.
        """
        maker_name = maker_name.lower()
        if maker_name in self.library_maker_dict:
            return self.library_maker_dict[maker_name](model_manager)
        if maker_name in psm_reader_provider.reader_dict:
            return PSMReaderLibraryMaker(model_manager)
        raise KeyError(f"Library maker `{maker_name}` is not registered.")
# Module-level provider, pre-populated with the built-in makers.
# Each maker class is reachable under a "*_table" and a "*_library" alias
# ("fasta" / "fasta_library" for the fasta maker).
library_maker_provider = LibraryMakerProvider()
for _maker_name, _maker_class in (
    ("precursor_table", PrecursorLibraryMaker),
    ("precursor_library", PrecursorLibraryMaker),
    ("peptide_table", PeptideLibraryMaker),
    ("peptide_library", PeptideLibraryMaker),
    ("sequence_table", SequenceLibraryMaker),
    ("sequence_library", SequenceLibraryMaker),
    ("fasta", FastaLibraryMaker),
    ("fasta_library", FastaLibraryMaker),
):
    library_maker_provider.register_maker(_maker_name, _maker_class)
# Keep the loop variables out of the module namespace.
del _maker_name, _maker_class