Translate spectral libraries

Convert an AlphaBase/PeptDeep spectral library (HDF) to a TSV library for DIA-NN, Spectronaut, or OpenSWATH.

[ ]:
# Uncomment the next line to install peptdeep into the kernel's environment.
#%pip install peptdeep
[ ]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically before each cell runs (mode 2 = all modules).
%reload_ext autoreload
%autoreload 2
[ ]:
# User setting: path of the input spectral library (HDF). It is passed to
# `fasta_lib.load_hdf` and is also used to derive the output TSV file name.
alphapeptdeep_hdf = r'y:\User\Feng\speclib\human_swissprot.speclib.hdf'
# User setting: handed to translate_to_tsv as `keep_k_highest_fragments`
# and embedded in the output file name as `_frags=<value>`.
top_k_frag = 16

Do not change anything below this line.

[ ]:
import os

# Fragment filters forwarded to translate_to_tsv in the export cell.
frag_inten = 0.001  # passed as `min_frag_intensity`

min_frag_mz = 200  # passed as `min_frag_mz`
min_frag_nAA = 0  # passed as `min_frag_nAA`

# Derive the output path from the input path.
# Only strip '.speclib.hdf' when the input really ends with it; blindly
# slicing off len('.speclib.hdf') characters would mangle any other file
# name (e.g. 'lib.hdf'). For other names, drop just the extension.
_speclib_suffix = '.speclib.hdf'
if alphapeptdeep_hdf.lower().endswith(_speclib_suffix):
    _output_stem = alphapeptdeep_hdf[:-len(_speclib_suffix)]
else:
    _output_stem = os.path.splitext(alphapeptdeep_hdf)[0]

output_diann_tsv = f"{_output_stem}_frags={top_k_frag}.speclib.tsv"
# Display the resolved output path for a quick visual check.
output_diann_tsv
'y:\\User\\Feng\\speclib\\human_swissprot_frags=16.speclib.tsv'
[ ]:
from peptdeep.protein.fasta import PredictSpecLibFasta

# Create an empty library container; its contents are loaded from the HDF
# file in a following cell.
# NOTE(review): the positional `None` is presumably the model manager, since
# no prediction is run in this notebook -- confirm against the peptdeep API.
fasta_lib = PredictSpecLibFasta(None, decoy=None)
[ ]:
# Populate the library from the HDF file configured at the top of the notebook.
# NOTE(review): `load_mod_seq=True` presumably also loads the stored
# modified-sequence table -- confirm against the alphabase/peptdeep API.
fasta_lib.load_hdf(alphapeptdeep_hdf, load_mod_seq=True)
[ ]:
# Inspect the protein table that was loaded from the HDF file.
fasta_lib.protein_df
description full_name gene_name protein_id sequence
0 sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en... sp|Q9H9K5|MER34_HUMAN ERVMER34-1 Q9H9K5 MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL...
1 sp|P04439|HLAA_HUMAN HLA class I histocompatib... sp|P04439|HLAA_HUMAN HLA-A P04439 MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2 sp|P01911|DRB1_HUMAN HLA class II histocompati... sp|P01911|DRB1_HUMAN HLA-DRB1 P01911 MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF...
3 sp|P01889|HLAB_HUMAN HLA class I histocompatib... sp|P01889|HLAB_HUMAN HLA-B P01889 MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
4 sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A... sp|P31689|DNJA1_HUMAN DNAJA1 P31689 MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK...
... ... ... ... ... ...
20391 sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig... sp|Q8WVZ7|RN133_HUMAN RNF133 Q8WVZ7 MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...
20392 sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot... sp|P05387|RLA2_HUMAN RPLP2 P05387 MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...
20393 sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib... sp|P51991|ROA3_HUMAN HNRNPA3 P51991 MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...
20394 sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi... sp|Q9BZX4|ROP1B_HUMAN ROPN1B Q9BZX4 MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...
20395 sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s... sp|P34096|RNAS4_HUMAN RNASE4 P34096 MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET...

20396 rows × 5 columns

[ ]:
# Inspect the precursor table that was loaded from the HDF file.
fasta_lib.precursor_df
ccs_pred charge frag_stop_idx frag_start_idx miss_cleavage mobility_pred mod_seq_charge_hash mod_seq_hash nAA precursor_mz ... rt_pred instrument irt_pred is_prot_cterm is_prot_nterm mod_sites mods nce protein_idxes sequence
0 315.529022 2 6 0 1 0.775438 471662500970219630 471662500970219628 7 434.249018 ... 0.115377 timsTOF -37.187631 False False 30 19786 RIHTGQR
1 304.965790 2 12 6 0 0.748912 -5301076820607700088 -5301076820607700090 7 414.216952 ... 0.208976 timsTOF -16.331142 False False 30 9448 PMPMPVR
2 304.080536 2 18 12 0 0.746970 6057464136741449833 6057464136741449831 7 422.214409 ... 0.158058 timsTOF -27.677099 False False 4 Oxidation@M 30 9448 PMPMPVR
3 305.825348 2 24 18 0 0.751256 -6431722582867031754 -6431722582867031756 7 422.214409 ... 0.157143 timsTOF -27.881022 False False 2 Oxidation@M 30 9448 PMPMPVR
4 330.547638 2 30 24 0 0.814317 -7409729050206298799 -7409729050206298801 7 513.726727 ... 0.423747 timsTOF 31.526291 False False 5 Carbamidomethyl@C 30 12819 QEWFCTR
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3654202 891.748413 4 60404866 60404832 1 1.108824 7192344052213098708 7192344052213098704 35 866.228888 ... 0.831350 timsTOF 122.352159 False False 30 978 NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR
3654203 785.478699 3 60404900 60404866 1 1.302269 -1485306056792248108 -1485306056792248111 35 1159.967730 ... 0.826977 timsTOF 121.377815 False False 17 Oxidation@M 30 978 NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR
3654204 892.459656 4 60404934 60404900 1 1.109729 -1485306056792248107 -1485306056792248111 35 870.227616 ... 0.826977 timsTOF 121.377815 False False 17 Oxidation@M 30 978 NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR
3654205 791.322266 4 60404968 60404934 1 0.984398 5191231126132273755 5191231126132273751 35 976.910866 ... 0.670129 timsTOF 86.427514 False False 30 2299 KNQAADDDDEDLNDTNYDEFNGYAGSLFSSGPYEK
3654206 823.819214 4 60405002 60404968 1 1.024754 -7707559913944666934 -7707559913944666938 35 958.434460 ... 0.725150 timsTOF 98.687774 False False 24 Carbamidomethyl@C 30 10080 AYDADSGFNGKVLFTISDGNTDSCFNIDMETGQLK

3654207 rows × 21 columns

[ ]:
# Normalise the protein identifier column: when the table carries an 'id'
# column, rename it in place to 'protein_id'.
protein_table = fasta_lib.protein_df
if 'id' in protein_table.columns:
    protein_table.rename(columns={'id': 'protein_id'}, inplace=True)
# Show the (possibly renamed) protein table.
fasta_lib.protein_df
description full_name gene_name protein_id sequence
0 sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en... sp|Q9H9K5|MER34_HUMAN ERVMER34-1 Q9H9K5 MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL...
1 sp|P04439|HLAA_HUMAN HLA class I histocompatib... sp|P04439|HLAA_HUMAN HLA-A P04439 MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2 sp|P01911|DRB1_HUMAN HLA class II histocompati... sp|P01911|DRB1_HUMAN HLA-DRB1 P01911 MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF...
3 sp|P01889|HLAB_HUMAN HLA class I histocompatib... sp|P01889|HLAB_HUMAN HLA-B P01889 MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
4 sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A... sp|P31689|DNJA1_HUMAN DNAJA1 P31689 MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK...
... ... ... ... ... ...
20391 sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig... sp|Q8WVZ7|RN133_HUMAN RNF133 Q8WVZ7 MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...
20392 sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot... sp|P05387|RLA2_HUMAN RPLP2 P05387 MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...
20393 sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib... sp|P51991|ROA3_HUMAN HNRNPA3 P51991 MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...
20394 sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi... sp|Q9BZX4|ROP1B_HUMAN ROPN1B Q9BZX4 MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...
20395 sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s... sp|P34096|RNAS4_HUMAN RNASE4 P34096 MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET...

20396 rows × 5 columns

[ ]:
import os
import psutil
import numpy as np

# Report the library size (in millions) and the memory (RSS) currently
# held by this kernel process.
process = psutil.Process(os.getpid())
n_precursors_m = len(fasta_lib.precursor_df) * 1e-6
# Fragment count = rows x columns of the fragment m/z matrix.
n_fragments_m = np.prod(fasta_lib.fragment_mz_df.values.shape, dtype=float) * (1e-6)
rss_gb = process.memory_info().rss / 1024**3
print(f'{n_precursors_m:.2f}M precursors with {n_fragments_m:.2f}M fragments used {rss_gb:.4f} GB memory')
3.65M precursors with 241.62M fragments used 6.0145 GB memory
[ ]:
# NOTE(review): presumably annotates the precursor table with protein names
# resolved from `protein_df` -- confirm against the alphabase/peptdeep API.
fasta_lib.append_protein_name()
[ ]:
from peptdeep.spec_lib.translate import translate_to_tsv

# When the precursor table has a 'decoy' column, keep only the rows whose
# decoy flag is 0 before exporting.
if 'decoy' in fasta_lib.precursor_df.columns:
    decoy_is_zero = fasta_lib._precursor_df.decoy == 0
    fasta_lib._precursor_df = fasta_lib.precursor_df[decoy_is_zero]

# Export options, using the settings defined in the configuration cells.
export_options = dict(
    keep_k_highest_fragments=top_k_frag,
    min_frag_nAA=min_frag_nAA,
    min_frag_mz=min_frag_mz,
    min_frag_intensity=frag_inten,
    batch_size=100000,
    translate_mod_dict=None,
    multiprocessing=True,
)

# Write the library to the TSV path derived from the input file name.
translate_to_tsv(fasta_lib, output_diann_tsv, **export_options)
100%|██████████| 37/37 [1:02:05<00:00, 100.70s/it]
Translation finished, it will take several minutes to export the rest precursors to the tsv file...