Source code for peptdeep.mass_spec.match

import numpy as np
import numba
import pandas as pd
import tqdm

from alphabase.peptide.fragment import (
    create_fragment_mz_dataframe,
    get_charged_frag_types,
)
from peptdeep.mass_spec.ms_reader import ms2_reader_provider, MSReaderBase



[docs]
@numba.njit
def match_centroid_mz(
    spec_mzs: np.ndarray,
    query_mzs: np.ndarray,
    spec_mz_tols: np.ndarray,
) -> np.ndarray:
    """
    Matching query masses against sorted MS2/spec centroid masses,
    only closest (minimal abs mass error) peaks are returned.

    Parameters
    ----------
    spec_mzs : np.ndarray
        MS2 or spec mz values, 1-D float array

    query_mzs : np.ndarray
        query mz values, n-D float array

    spec_mz_tols : np.ndarray
        Da tolerance array, same shape as spec_mzs

    Returns
    -------
    np.ndarray
        np.ndarray of int32, the shape is the same as query_mzs.
        -1 means no peaks are matched for the query mz

    """
    idxes = np.searchsorted(spec_mzs, query_mzs)
    ret_indices = np.empty_like(query_mzs, dtype=np.int32)
    # ret_indices[:] = -1
    for i, idx in np.ndenumerate(idxes):
        min_merr = abs(spec_mzs[idx - 1] - query_mzs[i])
        min_idx = -1
        if min_merr <= spec_mz_tols[idx - 1]:
            min_idx = idx - 1
        if idx < len(spec_mzs):
            merr = abs(spec_mzs[idx] - query_mzs[i])
            if merr <= spec_mz_tols[idx] and merr < min_merr:
                min_idx = idx
        ret_indices[i] = min_idx
    return ret_indices




[docs]
@numba.njit
def match_profile_mz(
    spec_mzs: np.ndarray,
    query_mzs: np.ndarray,
    spec_mz_tols: np.ndarray,
    spec_intens: np.ndarray,
) -> np.ndarray:
    """
    Matching query masses against sorted MS2/spec profile masses,
    only highest peaks are returned.

    Parameters
    ----------
    spec_mzs : np.ndarray
        MS2 or spec mz values, 1-D float array

    query_mzs : np.ndarray
        query mz values, n-D float array

    spec_mz_tols : np.ndarray
        Da tolerance array, same shape as spec_mzs

    Returns
    -------
    np.ndarray
        np.ndarray of int32, the shape is the same as query_mzs.
        -1 means no peaks are matched for the query mz

    """
    idxes = np.searchsorted(spec_mzs, query_mzs)
    ret_indices = np.empty_like(query_mzs, dtype=np.int32)
    for i, idx in np.ndenumerate(idxes):
        if idx == len(spec_mzs):
            idx = idx - 1

        highest = 0
        highest_idx = -1

        for _idx in range(idx, -1, -1):
            if abs(spec_mzs[_idx] - query_mzs[i]) > spec_mz_tols[_idx]:
                break
            if highest < spec_intens[_idx]:
                highest = spec_intens[_idx]
                highest_idx = _idx
        for _idx in range(idx + 1, len(spec_mzs)):
            if abs(spec_mzs[_idx] - query_mzs[i]) > spec_mz_tols[_idx]:
                break
            if highest < spec_intens[_idx]:
                highest = spec_intens[_idx]
                highest_idx = _idx
        ret_indices[i] = highest_idx
    return ret_indices




[docs]
@numba.njit
def match_first_last_profile_mz(
    spec_mzs: np.ndarray,
    query_mzs: np.ndarray,
    spec_mz_tols: np.ndarray,
) -> np.ndarray:
    """
    Matching query masses against sorted MS2/spec profile masses,
    both first and last m/z values are returned.

    Parameters
    ----------
    spec_mzs : np.ndarray
        MS2 or spec mz values, 1-D float array

    query_mzs : np.ndarray
        query mz values, n-D float array

    spec_mz_tols : np.ndarray
        Da tolerance array, same shape as spec_mzs

    Returns
    -------
    np.ndarray
        2D np.ndarray of int32 with first and last matched index
        for the query mz. The shape is the same as (len(query_mzs),2).
        -1 means no peaks are matched for the query mz
    """
    idxes = np.searchsorted(spec_mzs, query_mzs)
    first_indices = np.empty_like(query_mzs, dtype=np.int32)
    last_indices = np.empty_like(query_mzs, dtype=np.int32)
    first_indices[:] = -1
    last_indices[:] = -1
    for i, idx in np.ndenumerate(idxes):
        if idx == len(spec_mzs):
            idx = idx - 1
        for _idx in range(idx, -1, -1):
            if spec_mzs[_idx] < query_mzs[i] - spec_mz_tols[_idx]:
                break
            else:
                first_indices[i] = _idx
        for _idx in range(idx, len(spec_mzs)):
            if abs(spec_mzs[_idx] - query_mzs[i]) > spec_mz_tols[_idx]:
                break
            else:
                last_indices[i] = _idx
    return first_indices, last_indices




[docs]
@numba.njit
def match_one_raw_with_numba(
    spec_idxes,
    frag_start_idxes,
    frag_stop_idxes,
    all_frag_mzs,
    all_spec_mzs,
    all_spec_intensities,
    peak_start_idxes,
    peak_end_idxes,
    matched_intensities,
    matched_mz_errs,
    ppm,
    tol,
):
    """
    Internel function to match fragment mz values to spectrum mz values.
    Matched_mz_errs[i] = np.inf if no peaks are matched.
    """
    for spec_idx, frag_start, frag_end in zip(
        spec_idxes, frag_start_idxes, frag_stop_idxes
    ):
        peak_start = peak_start_idxes[spec_idx]
        peak_end = peak_end_idxes[spec_idx]
        if peak_end == peak_start:
            continue
        spec_mzs = all_spec_mzs[peak_start:peak_end]
        spec_intens = all_spec_intensities[peak_start:peak_end]

        if ppm:
            spec_mz_tols = spec_mzs * tol * 1e-6
        else:
            spec_mz_tols = np.full_like(spec_mzs, tol)

        frag_mzs = all_frag_mzs[frag_start:frag_end, :].copy()

        matched_idxes = match_centroid_mz(spec_mzs, frag_mzs, spec_mz_tols).reshape(-1)

        matched_intens = spec_intens[matched_idxes]
        matched_intens[matched_idxes == -1] = 0

        matched_mass_errs = np.abs(
            spec_mzs[matched_idxes.reshape(-1)] - frag_mzs.reshape(-1)
        )
        matched_mass_errs[matched_idxes == -1] = np.inf

        matched_intensities[frag_start:frag_end, :] = matched_intens.reshape(
            frag_mzs.shape
        )

        matched_mz_errs[frag_start:frag_end, :] = matched_mass_errs.reshape(
            frag_mzs.shape
        )




[docs]
class PepSpecMatch(object):
    """Main entry for peptide-spectrum matching.

    TODO: figure out relation with `peptdeep.match.psm_match.PepSpecMatch`.
    """


[docs]
    def __init__(
        self,
        charged_frag_types=get_charged_frag_types(
            ["b", "y", "b_modloss", "y_modloss"], 2
        ),
    ):
        self.charged_frag_types = charged_frag_types


    def _preprocess_psms(self, psm_df):
        pass


[docs]
    def get_fragment_mz_df(self, psm_df):
        return create_fragment_mz_dataframe(psm_df, self.charged_frag_types)



[docs]
    def match_ms2_one_raw(
        self,
        psm_df_one_raw: pd.DataFrame,
        ms2_file: str,
        ms2_file_type: str = "alphapept",
        ppm: bool = True,
        tol: float = 20.0,
    ) -> tuple:
        """Matching psm_df_one_raw against ms2_file

        Parameters
        ----------
        psm_df_one_raw : pd.DataFrame
            psm dataframe
            that contains only one raw file

        ms2_file : str
            ms2 file path

        ms2_file_type : str, optional
            ms2 file type, could be ["thermo","alphapept","mgf"].
            Default to 'alphapept'

        ppm : bool, optional
            if use ppm tolerance. Defaults to True.

        tol : float, optional
            tolerance value. Defaults to 20.0.

        Returns
        -------
        tuple:
            pd.DataFrame: psm dataframe with fragment index information.

            pd.DataFrame: fragment mz dataframe.

            pd.DataFrame: matched intensity dataframe.

            pd.DataFrame: matched mass error dataframe.
            np.inf if a fragment is not matched.

        """
        self._preprocess_psms(psm_df_one_raw)
        psm_df = psm_df_one_raw
        if isinstance(ms2_file, MSReaderBase):
            ms2_reader = ms2_file
        else:
            ms2_reader = ms2_reader_provider.get_reader(ms2_file_type)
            ms2_reader.load(ms2_file)

        add_spec_info_list = []
        if "rt_norm" not in psm_df.columns:
            add_spec_info_list.append("rt")

        if (
            "mobility" not in psm_df.columns
            and "mobility" in ms2_reader.spectrum_df.columns
        ):
            add_spec_info_list.append("mobility")

        if "nce" not in psm_df.columns and "nce" in ms2_reader.spectrum_df.columns:
            add_spec_info_list.append("nce")

        if len(add_spec_info_list) > 0:
            # pfind does not report RT in the result file
            psm_df = (
                psm_df.reset_index()
                .merge(
                    ms2_reader.spectrum_df[["spec_idx"] + add_spec_info_list],
                    how="left",
                    on="spec_idx",
                )
                .set_index("index")
            )

            if "rt" in add_spec_info_list:
                psm_df["rt_norm"] = psm_df.rt / ms2_reader.spectrum_df.rt.max()

        fragment_mz_df = self.get_fragment_mz_df(psm_df)

        matched_intensity_df = pd.DataFrame(
            np.zeros_like(fragment_mz_df.values, dtype=np.float64),
            columns=fragment_mz_df.columns,
        )

        matched_mz_err_df = pd.DataFrame(
            np.full_like(fragment_mz_df.values, np.inf, dtype=np.float64),
            columns=fragment_mz_df.columns,
        )

        for spec_idx, frag_start_idx, frag_stop_idx in psm_df[
            ["spec_idx", "frag_start_idx", "frag_stop_idx"]
        ].values:
            (spec_mzs, spec_intens) = ms2_reader.get_peaks(spec_idx)
            if len(spec_mzs) == 0:
                continue

            if ppm:
                mz_tols = spec_mzs * tol * 1e-6
            else:
                mz_tols = np.full_like(spec_mzs, tol)

            frag_mzs = fragment_mz_df.values[frag_start_idx:frag_stop_idx, :]

            matched_idxes = match_centroid_mz(spec_mzs, frag_mzs, mz_tols)
            matched_intens = spec_intens[matched_idxes]
            matched_intens[matched_idxes == -1] = 0

            matched_mz_errs = np.abs(spec_mzs[matched_idxes] - frag_mzs)
            matched_mz_errs[matched_idxes == -1] = np.inf

            matched_intensity_df.values[frag_start_idx:frag_stop_idx, :] = (
                matched_intens
            )

            matched_mz_err_df.values[frag_start_idx:frag_stop_idx, :] = matched_mz_errs

        return (psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df)


    def _match_ms2_centroid_one_raw(self, raw_name, df_group):
        if raw_name in self._ms2_file_dict:
            if isinstance(self._ms2_file_dict[raw_name], MSReaderBase):
                ms2_reader = self._ms2_file_dict[raw_name]
            else:
                ms2_reader = ms2_reader_provider.get_reader(self._ms2_file_type)
                ms2_reader.load(self._ms2_file_dict[raw_name])
            if self.rt_not_in_df:
                # pfind does not report RT in the result file
                _df = (
                    df_group.reset_index()
                    .merge(
                        ms2_reader.spectrum_df[["spec_idx", "rt"]],
                        how="left",
                        on="spec_idx",
                    )
                    .set_index("index")
                )

                _df["rt_norm"] = _df.rt / ms2_reader.spectrum_df.rt.max()
                self.psm_df.loc[_df.index, ["rt", "rt_norm"]] = _df[["rt", "rt_norm"]]

            match_one_raw_with_numba(
                df_group.spec_idx.values,
                df_group.frag_start_idx.values,
                df_group.frag_stop_idx.values,
                self.fragment_mz_df.values,
                ms2_reader.peak_df.mz.values,
                ms2_reader.peak_df.intensity.values,
                ms2_reader.spectrum_df.peak_start_idx.values,
                ms2_reader.spectrum_df.peak_end_idx.values,
                self.matched_intensity_df.values,
                self.matched_mz_err_df.values,
                self.ppm,
                self.tol,
            )


[docs]
    def match_ms2_centroid(
        self,
        psm_df: pd.DataFrame,
        ms2_file_dict: dict,  # raw_name: ms2_file_path or ms_reader object
        ms2_file_type: str = "alphapept",  # or 'mgf', or 'thermo'
        ppm=True,
        tol=20.0,
    ):
        """Matching PSM dataframe against the ms2 files in ms2_file_dict
        This method will store matched values as attributes:
        - self.psm_df
        - self.fragment_mz_df
        - self.matched_intensity_df
        - self.matched_mz_err_df

        Parameters
        ----------
        psm_df : pd.DataFrame
            PSM dataframe

        ms2_file_dict : dict
            {raw_name: ms2 path}

        ms2_file_type : str, optional
            Could be 'alphapept', 'mgf' or 'thermo'.
            Defaults to 'alphapept'.

        ppm : bool, optional
            Defaults to True.

        tol : float, optional
            PPM units, defaults to 20.0.

        """
        self._preprocess_psms(psm_df)
        self.psm_df = psm_df

        if "frag_start_idx" in self.psm_df.columns:
            del self.psm_df["frag_start_idx"]
            del self.psm_df["frag_stop_idx"]

        self.fragment_mz_df = self.get_fragment_mz_df(self.psm_df)

        self.matched_intensity_df = pd.DataFrame(
            np.zeros_like(self.fragment_mz_df.values, dtype=np.float64),
            columns=self.fragment_mz_df.columns,
        )

        self.matched_mz_err_df = pd.DataFrame(
            np.full_like(self.fragment_mz_df.values, np.inf, dtype=np.float64),
            columns=self.fragment_mz_df.columns,
        )

        self._ms2_file_dict = ms2_file_dict
        self._ms2_file_type = ms2_file_type
        self.ppm = ppm
        self.tol = tol

        if "rt_norm" not in self.psm_df.columns:
            self.rt_not_in_df = True
        else:
            self.rt_not_in_df = False
        for raw_name, df_group in tqdm.tqdm(self.psm_df.groupby("raw_name")):
            self._match_ms2_centroid_one_raw(raw_name, df_group)