Translate spectral libraries¶
Convert an AlphaBase/PeptDeep spectral library (HDF) into a TSV library for DIA-NN, Spectronaut, or OpenSWATH.
[ ]:
#%pip install peptdeep
[ ]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules
# before each cell runs, so local edits to imported packages are picked up.
%reload_ext autoreload
%autoreload 2
[ ]:
# Input spectral library (HDF) to translate. The output TSV path is derived
# from this path further down in the notebook.
alphapeptdeep_hdf = r'y:\User\Feng\speclib\human_swissprot.speclib.hdf'
# Forwarded to translate_to_tsv as keep_k_highest_fragments and embedded in
# the output file name.
top_k_frag = 16
Do not change below¶
[ ]:
# Fragment filters forwarded unchanged to translate_to_tsv.
frag_inten = 0.001   # -> min_frag_intensity
min_frag_mz = 200    # -> min_frag_mz
min_frag_nAA = 0     # -> min_frag_nAA

# Derive the output TSV path from the input library path.
# Only strip a suffix that is really there: blindly slicing off
# len('.speclib.hdf') characters would mangle paths such as 'foo.hdf'.
# The comparison is case-insensitive because Windows paths may be upper-case.
for _suffix in ('.speclib.hdf', '.hdf'):
    if alphapeptdeep_hdf.lower().endswith(_suffix):
        _output_stem = alphapeptdeep_hdf[:-len(_suffix)]
        break
else:
    # Unknown extension: keep the whole path as the stem.
    _output_stem = alphapeptdeep_hdf

output_diann_tsv = f"{_output_stem}_frags={top_k_frag}.speclib.tsv"
output_diann_tsv
'y:\\User\\Feng\\speclib\\human_swissprot_frags=16.speclib.tsv'
[ ]:
from peptdeep.protein.fasta import PredictSpecLibFasta

# Create the library object with both the first positional argument and
# `decoy` set to None.
# NOTE(review): presumably this means "no prediction models, no decoy
# generation", since the content is loaded from the HDF file afterwards --
# confirm against the PredictSpecLibFasta constructor.
fasta_lib = PredictSpecLibFasta(None, decoy=None)
[ ]:
# Populate fasta_lib from the HDF library file configured above.
# NOTE(review): load_mod_seq=True presumably also loads the sequence / mods /
# mod_sites columns needed for the TSV export -- confirm in the library docs.
fasta_lib.load_hdf(alphapeptdeep_hdf, load_mod_seq=True)
[ ]:
# Display the protein table of the loaded library.
fasta_lib.protein_df
| description | full_name | gene_name | protein_id | sequence | |
|---|---|---|---|---|---|
| 0 | sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en... | sp|Q9H9K5|MER34_HUMAN | ERVMER34-1 | Q9H9K5 | MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL... |
| 1 | sp|P04439|HLAA_HUMAN HLA class I histocompatib... | sp|P04439|HLAA_HUMAN | HLA-A | P04439 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF... |
| 2 | sp|P01911|DRB1_HUMAN HLA class II histocompati... | sp|P01911|DRB1_HUMAN | HLA-DRB1 | P01911 | MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF... |
| 3 | sp|P01889|HLAB_HUMAN HLA class I histocompatib... | sp|P01889|HLAB_HUMAN | HLA-B | P01889 | MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF... |
| 4 | sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A... | sp|P31689|DNJA1_HUMAN | DNAJA1 | P31689 | MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK... |
| ... | ... | ... | ... | ... | ... |
| 20391 | sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig... | sp|Q8WVZ7|RN133_HUMAN | RNF133 | Q8WVZ7 | MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF... |
| 20392 | sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot... | sp|P05387|RLA2_HUMAN | RPLP2 | P05387 | MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL... |
| 20393 | sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib... | sp|P51991|ROA3_HUMAN | HNRNPA3 | P51991 | MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET... |
| 20394 | sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi... | sp|Q9BZX4|ROP1B_HUMAN | ROPN1B | Q9BZX4 | MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR... |
| 20395 | sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s... | sp|P34096|RNAS4_HUMAN | RNASE4 | P34096 | MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET... |
20396 rows × 5 columns
[ ]:
# Display the precursor table of the loaded library.
fasta_lib.precursor_df
| ccs_pred | charge | frag_stop_idx | frag_start_idx | miss_cleavage | mobility_pred | mod_seq_charge_hash | mod_seq_hash | nAA | precursor_mz | ... | rt_pred | instrument | irt_pred | is_prot_cterm | is_prot_nterm | mod_sites | mods | nce | protein_idxes | sequence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 315.529022 | 2 | 6 | 0 | 1 | 0.775438 | 471662500970219630 | 471662500970219628 | 7 | 434.249018 | ... | 0.115377 | timsTOF | -37.187631 | False | False | 30 | 19786 | RIHTGQR | ||
| 1 | 304.965790 | 2 | 12 | 6 | 0 | 0.748912 | -5301076820607700088 | -5301076820607700090 | 7 | 414.216952 | ... | 0.208976 | timsTOF | -16.331142 | False | False | 30 | 9448 | PMPMPVR | ||
| 2 | 304.080536 | 2 | 18 | 12 | 0 | 0.746970 | 6057464136741449833 | 6057464136741449831 | 7 | 422.214409 | ... | 0.158058 | timsTOF | -27.677099 | False | False | 4 | Oxidation@M | 30 | 9448 | PMPMPVR |
| 3 | 305.825348 | 2 | 24 | 18 | 0 | 0.751256 | -6431722582867031754 | -6431722582867031756 | 7 | 422.214409 | ... | 0.157143 | timsTOF | -27.881022 | False | False | 2 | Oxidation@M | 30 | 9448 | PMPMPVR |
| 4 | 330.547638 | 2 | 30 | 24 | 0 | 0.814317 | -7409729050206298799 | -7409729050206298801 | 7 | 513.726727 | ... | 0.423747 | timsTOF | 31.526291 | False | False | 5 | Carbamidomethyl@C | 30 | 12819 | QEWFCTR |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3654202 | 891.748413 | 4 | 60404866 | 60404832 | 1 | 1.108824 | 7192344052213098708 | 7192344052213098704 | 35 | 866.228888 | ... | 0.831350 | timsTOF | 122.352159 | False | False | 30 | 978 | NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR | ||
| 3654203 | 785.478699 | 3 | 60404900 | 60404866 | 1 | 1.302269 | -1485306056792248108 | -1485306056792248111 | 35 | 1159.967730 | ... | 0.826977 | timsTOF | 121.377815 | False | False | 17 | Oxidation@M | 30 | 978 | NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR |
| 3654204 | 892.459656 | 4 | 60404934 | 60404900 | 1 | 1.109729 | -1485306056792248107 | -1485306056792248111 | 35 | 870.227616 | ... | 0.826977 | timsTOF | 121.377815 | False | False | 17 | Oxidation@M | 30 | 978 | NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR |
| 3654205 | 791.322266 | 4 | 60404968 | 60404934 | 1 | 0.984398 | 5191231126132273755 | 5191231126132273751 | 35 | 976.910866 | ... | 0.670129 | timsTOF | 86.427514 | False | False | 30 | 2299 | KNQAADDDDEDLNDTNYDEFNGYAGSLFSSGPYEK | ||
| 3654206 | 823.819214 | 4 | 60405002 | 60404968 | 1 | 1.024754 | -7707559913944666934 | -7707559913944666938 | 35 | 958.434460 | ... | 0.725150 | timsTOF | 98.687774 | False | False | 24 | Carbamidomethyl@C | 30 | 10080 | AYDADSGFNGKVLFTISDGNTDSCFNIDMETGQLK |
3654207 rows × 21 columns
[ ]:
# Normalise the protein table: if it has an 'id' column, rename it to
# 'protein_id' in place. Tables that already use 'protein_id' are untouched.
needs_rename = 'id' in fasta_lib.protein_df.columns
if needs_rename:
    fasta_lib.protein_df.rename(
        columns={'id':'protein_id'},
        inplace=True,
    )

# Display the (possibly renamed) protein table.
fasta_lib.protein_df
| description | full_name | gene_name | protein_id | sequence | |
|---|---|---|---|---|---|
| 0 | sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en... | sp|Q9H9K5|MER34_HUMAN | ERVMER34-1 | Q9H9K5 | MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL... |
| 1 | sp|P04439|HLAA_HUMAN HLA class I histocompatib... | sp|P04439|HLAA_HUMAN | HLA-A | P04439 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF... |
| 2 | sp|P01911|DRB1_HUMAN HLA class II histocompati... | sp|P01911|DRB1_HUMAN | HLA-DRB1 | P01911 | MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF... |
| 3 | sp|P01889|HLAB_HUMAN HLA class I histocompatib... | sp|P01889|HLAB_HUMAN | HLA-B | P01889 | MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF... |
| 4 | sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A... | sp|P31689|DNJA1_HUMAN | DNAJA1 | P31689 | MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK... |
| ... | ... | ... | ... | ... | ... |
| 20391 | sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig... | sp|Q8WVZ7|RN133_HUMAN | RNF133 | Q8WVZ7 | MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF... |
| 20392 | sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot... | sp|P05387|RLA2_HUMAN | RPLP2 | P05387 | MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL... |
| 20393 | sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib... | sp|P51991|ROA3_HUMAN | HNRNPA3 | P51991 | MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET... |
| 20394 | sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi... | sp|Q9BZX4|ROP1B_HUMAN | ROPN1B | Q9BZX4 | MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR... |
| 20395 | sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s... | sp|P34096|RNAS4_HUMAN | RNASE4 | P34096 | MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET... |
20396 rows × 5 columns
[ ]:
import os
import psutil
import numpy as np

# Report library size (in millions) and the resident memory of this process.
process = psutil.Process(os.getpid())

# Number of precursor rows, in millions.
n_precursors_m = len(fasta_lib.precursor_df)*1e-6
# Total cell count of the fragment m/z matrix (rows x columns), in millions;
# dtype=float keeps the product from overflowing an integer type.
n_fragments_m = np.prod(fasta_lib.fragment_mz_df.values.shape, dtype=float)*(1e-6)
# Resident set size converted from bytes to GB.
rss_gb = process.memory_info().rss/1024**3

print(f'{n_precursors_m:.2f}M precursors with {n_fragments_m:.2f}M fragments used {rss_gb:.4f} GB memory')
3.65M precursors with 241.62M fragments used 6.0145 GB memory
[ ]:
# NOTE(review): presumably annotates the precursor table with protein names
# taken from the protein table, for use in the exported TSV -- confirm
# against PredictSpecLibFasta.append_protein_name.
fasta_lib.append_protein_name()
[ ]:
from peptdeep.spec_lib.translate import translate_to_tsv

# Keep only target precursors (decoy == 0) when the library carries a
# 'decoy' column; libraries without that column are exported unchanged.
if 'decoy' in fasta_lib.precursor_df.columns:
    is_target = fasta_lib._precursor_df.decoy == 0
    fasta_lib._precursor_df = fasta_lib.precursor_df[is_target]

# Options for the TSV export; the filter values come from the parameter
# cells earlier in the notebook.
translate_options = dict(
    keep_k_highest_fragments=top_k_frag,
    min_frag_nAA=min_frag_nAA,
    min_frag_mz=min_frag_mz,
    min_frag_intensity=frag_inten,
    batch_size=100000,
    translate_mod_dict=None,
    multiprocessing=True,
)

# Write the library to output_diann_tsv.
translate_to_tsv(fasta_lib, output_diann_tsv, **translate_options)
100%|██████████| 37/37 [1:02:05<00:00, 100.70s/it]
Translation finished, it will take several minutes to export the rest precursors to the tsv file...