{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Translate spectral libraries\n",
  "\n",
  "Convert an AlphaBase/PeptDeep HDF spectral library to a TSV library (DIA-NN/Spectronaut/OpenSWATH)." ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [
  "#%pip install peptdeep" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [
  "%reload_ext autoreload\n",
  "%autoreload 2" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [
  "# User settings: input library path and number of fragments kept per precursor.\n",
  "alphapeptdeep_hdf = r'y:\\User\\Feng\\speclib\\human_swissprot.speclib.hdf'\n",
  "top_k_frag = 16" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "### Do not change anything below" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [ { "data": { "text/plain": [ "'y:\\\\User\\\\Feng\\\\speclib\\\\human_swissprot_frags=16.speclib.tsv'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "import os\n",
  "\n",
  "# Fragment filters handed to translate_to_tsv further down.\n",
  "frag_inten = 0.001\n",
  "min_frag_mz = 200\n",
  "min_frag_nAA = 0\n",
  "\n",
  "# Derive the output path next to the input library. Strip '.speclib.hdf' only\n",
  "# when the path really ends with it; otherwise drop just the last extension, so\n",
  "# an input such as 'lib.hdf' is not mangled by a blind 12-character slice.\n",
  "hdf_suffix = '.speclib.hdf'\n",
  "if alphapeptdeep_hdf.lower().endswith(hdf_suffix):\n",
  "    output_stem = alphapeptdeep_hdf[:-len(hdf_suffix)]\n",
  "else:\n",
  "    output_stem = os.path.splitext(alphapeptdeep_hdf)[0]\n",
  "output_diann_tsv = f\"{output_stem}_frags={top_k_frag}.speclib.tsv\"\n",
  "output_diann_tsv" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [
  "from peptdeep.protein.fasta import PredictSpecLibFasta\n",
  "\n",
  "# Both arguments are None here; the library content is loaded from the HDF file in the next cell.\n",
  "fasta_lib = PredictSpecLibFasta(\n",
  "    None,\n",
  "    decoy=None\n",
  ")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [
  "fasta_lib.load_hdf(alphapeptdeep_hdf, load_mod_seq=True)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptionfull_namegene_nameprotein_idsequence
0sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en...sp|Q9H9K5|MER34_HUMANERVMER34-1Q9H9K5MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL...
1sp|P04439|HLAA_HUMAN HLA class I histocompatib...sp|P04439|HLAA_HUMANHLA-AP04439MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2sp|P01911|DRB1_HUMAN HLA class II histocompati...sp|P01911|DRB1_HUMANHLA-DRB1P01911MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF...
3sp|P01889|HLAB_HUMAN HLA class I histocompatib...sp|P01889|HLAB_HUMANHLA-BP01889MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
4sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A...sp|P31689|DNJA1_HUMANDNAJA1P31689MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK...
..................
20391sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig...sp|Q8WVZ7|RN133_HUMANRNF133Q8WVZ7MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...
20392sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot...sp|P05387|RLA2_HUMANRPLP2P05387MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...
20393sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib...sp|P51991|ROA3_HUMANHNRNPA3P51991MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...
20394sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi...sp|Q9BZX4|ROP1B_HUMANROPN1BQ9BZX4MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...
20395sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s...sp|P34096|RNAS4_HUMANRNASE4P34096MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET...
\n", "

20396 rows × 5 columns

\n", "
" ], "text/plain": [ " description \\\n", "0 sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en... \n", "1 sp|P04439|HLAA_HUMAN HLA class I histocompatib... \n", "2 sp|P01911|DRB1_HUMAN HLA class II histocompati... \n", "3 sp|P01889|HLAB_HUMAN HLA class I histocompatib... \n", "4 sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A... \n", "... ... \n", "20391 sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig... \n", "20392 sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot... \n", "20393 sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib... \n", "20394 sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi... \n", "20395 sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s... \n", "\n", " full_name gene_name protein_id \\\n", "0 sp|Q9H9K5|MER34_HUMAN ERVMER34-1 Q9H9K5 \n", "1 sp|P04439|HLAA_HUMAN HLA-A P04439 \n", "2 sp|P01911|DRB1_HUMAN HLA-DRB1 P01911 \n", "3 sp|P01889|HLAB_HUMAN HLA-B P01889 \n", "4 sp|P31689|DNJA1_HUMAN DNAJA1 P31689 \n", "... ... ... ... \n", "20391 sp|Q8WVZ7|RN133_HUMAN RNF133 Q8WVZ7 \n", "20392 sp|P05387|RLA2_HUMAN RPLP2 P05387 \n", "20393 sp|P51991|ROA3_HUMAN HNRNPA3 P51991 \n", "20394 sp|Q9BZX4|ROP1B_HUMAN ROPN1B Q9BZX4 \n", "20395 sp|P34096|RNAS4_HUMAN RNASE4 P34096 \n", "\n", " sequence \n", "0 MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL... \n", "1 MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF... \n", "2 MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF... \n", "3 MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF... \n", "4 MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK... \n", "... ... \n", "20391 MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF... \n", "20392 MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL... \n", "20393 MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET... \n", "20394 MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR... \n", "20395 MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET... 
\n", "\n", "[20396 rows x 5 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.protein_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ccs_predchargefrag_stop_idxfrag_start_idxmiss_cleavagemobility_predmod_seq_charge_hashmod_seq_hashnAAprecursor_mz...rt_predinstrumentirt_predis_prot_ctermis_prot_ntermmod_sitesmodsnceprotein_idxessequence
0315.52902226010.7754384716625009702196304716625009702196287434.249018...0.115377timsTOF-37.187631FalseFalse3019786RIHTGQR
1304.965790212600.748912-5301076820607700088-53010768206077000907414.216952...0.208976timsTOF-16.331142FalseFalse309448PMPMPVR
2304.0805362181200.746970605746413674144983360574641367414498317422.214409...0.158058timsTOF-27.677099FalseFalse4Oxidation@M309448PMPMPVR
3305.8253482241800.751256-6431722582867031754-64317225828670317567422.214409...0.157143timsTOF-27.881022FalseFalse2Oxidation@M309448PMPMPVR
4330.5476382302400.814317-7409729050206298799-74097290502062988017513.726727...0.423747timsTOF31.526291FalseFalse5Carbamidomethyl@C3012819QEWFCTR
..................................................................
3654202891.7484134604048666040483211.1088247192344052213098708719234405221309870435866.228888...0.831350timsTOF122.352159FalseFalse30978NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR
3654203785.4786993604049006040486611.302269-1485306056792248108-1485306056792248111351159.967730...0.826977timsTOF121.377815FalseFalse17Oxidation@M30978NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR
3654204892.4596564604049346040490011.109729-1485306056792248107-148530605679224811135870.227616...0.826977timsTOF121.377815FalseFalse17Oxidation@M30978NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR
3654205791.3222664604049686040493410.9843985191231126132273755519123112613227375135976.910866...0.670129timsTOF86.427514FalseFalse302299KNQAADDDDEDLNDTNYDEFNGYAGSLFSSGPYEK
3654206823.8192144604050026040496811.024754-7707559913944666934-770755991394466693835958.434460...0.725150timsTOF98.687774FalseFalse24Carbamidomethyl@C3010080AYDADSGFNGKVLFTISDGNTDSCFNIDMETGQLK
\n", "

3654207 rows × 21 columns

\n", "
" ], "text/plain": [ " ccs_pred charge frag_stop_idx frag_start_idx miss_cleavage \\\n", "0 315.529022 2 6 0 1 \n", "1 304.965790 2 12 6 0 \n", "2 304.080536 2 18 12 0 \n", "3 305.825348 2 24 18 0 \n", "4 330.547638 2 30 24 0 \n", "... ... ... ... ... ... \n", "3654202 891.748413 4 60404866 60404832 1 \n", "3654203 785.478699 3 60404900 60404866 1 \n", "3654204 892.459656 4 60404934 60404900 1 \n", "3654205 791.322266 4 60404968 60404934 1 \n", "3654206 823.819214 4 60405002 60404968 1 \n", "\n", " mobility_pred mod_seq_charge_hash mod_seq_hash nAA \\\n", "0 0.775438 471662500970219630 471662500970219628 7 \n", "1 0.748912 -5301076820607700088 -5301076820607700090 7 \n", "2 0.746970 6057464136741449833 6057464136741449831 7 \n", "3 0.751256 -6431722582867031754 -6431722582867031756 7 \n", "4 0.814317 -7409729050206298799 -7409729050206298801 7 \n", "... ... ... ... ... \n", "3654202 1.108824 7192344052213098708 7192344052213098704 35 \n", "3654203 1.302269 -1485306056792248108 -1485306056792248111 35 \n", "3654204 1.109729 -1485306056792248107 -1485306056792248111 35 \n", "3654205 0.984398 5191231126132273755 5191231126132273751 35 \n", "3654206 1.024754 -7707559913944666934 -7707559913944666938 35 \n", "\n", " precursor_mz ... rt_pred instrument irt_pred is_prot_cterm \\\n", "0 434.249018 ... 0.115377 timsTOF -37.187631 False \n", "1 414.216952 ... 0.208976 timsTOF -16.331142 False \n", "2 422.214409 ... 0.158058 timsTOF -27.677099 False \n", "3 422.214409 ... 0.157143 timsTOF -27.881022 False \n", "4 513.726727 ... 0.423747 timsTOF 31.526291 False \n", "... ... ... ... ... ... ... \n", "3654202 866.228888 ... 0.831350 timsTOF 122.352159 False \n", "3654203 1159.967730 ... 0.826977 timsTOF 121.377815 False \n", "3654204 870.227616 ... 0.826977 timsTOF 121.377815 False \n", "3654205 976.910866 ... 0.670129 timsTOF 86.427514 False \n", "3654206 958.434460 ... 
0.725150 timsTOF 98.687774 False \n", "\n", " is_prot_nterm mod_sites mods nce protein_idxes \\\n", "0 False 30 19786 \n", "1 False 30 9448 \n", "2 False 4 Oxidation@M 30 9448 \n", "3 False 2 Oxidation@M 30 9448 \n", "4 False 5 Carbamidomethyl@C 30 12819 \n", "... ... ... ... .. ... \n", "3654202 False 30 978 \n", "3654203 False 17 Oxidation@M 30 978 \n", "3654204 False 17 Oxidation@M 30 978 \n", "3654205 False 30 2299 \n", "3654206 False 24 Carbamidomethyl@C 30 10080 \n", "\n", " sequence \n", "0 RIHTGQR \n", "1 PMPMPVR \n", "2 PMPMPVR \n", "3 PMPMPVR \n", "4 QEWFCTR \n", "... ... \n", "3654202 NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR \n", "3654203 NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR \n", "3654204 NLTYVRGSVGPATSTLMFVAGVVGNGLALGILSAR \n", "3654205 KNQAADDDDEDLNDTNYDEFNGYAGSLFSSGPYEK \n", "3654206 AYDADSGFNGKVLFTISDGNTDSCFNIDMETGQLK \n", "\n", "[3654207 rows x 21 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.precursor_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptionfull_namegene_nameprotein_idsequence
0sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en...sp|Q9H9K5|MER34_HUMANERVMER34-1Q9H9K5MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL...
1sp|P04439|HLAA_HUMAN HLA class I histocompatib...sp|P04439|HLAA_HUMANHLA-AP04439MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2sp|P01911|DRB1_HUMAN HLA class II histocompati...sp|P01911|DRB1_HUMANHLA-DRB1P01911MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF...
3sp|P01889|HLAB_HUMAN HLA class I histocompatib...sp|P01889|HLAB_HUMANHLA-BP01889MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
4sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A...sp|P31689|DNJA1_HUMANDNAJA1P31689MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK...
..................
20391sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig...sp|Q8WVZ7|RN133_HUMANRNF133Q8WVZ7MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...
20392sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot...sp|P05387|RLA2_HUMANRPLP2P05387MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...
20393sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib...sp|P51991|ROA3_HUMANHNRNPA3P51991MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...
20394sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi...sp|Q9BZX4|ROP1B_HUMANROPN1BQ9BZX4MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...
20395sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s...sp|P34096|RNAS4_HUMANRNASE4P34096MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET...
\n", "

20396 rows × 5 columns

\n", "
" ], "text/plain": [ " description \\\n", "0 sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en... \n", "1 sp|P04439|HLAA_HUMAN HLA class I histocompatib... \n", "2 sp|P01911|DRB1_HUMAN HLA class II histocompati... \n", "3 sp|P01889|HLAB_HUMAN HLA class I histocompatib... \n", "4 sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A... \n", "... ... \n", "20391 sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig... \n", "20392 sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot... \n", "20393 sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib... \n", "20394 sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi... \n", "20395 sp|P34096|RNAS4_HUMAN Ribonuclease 4 OS=Homo s... \n", "\n", " full_name gene_name protein_id \\\n", "0 sp|Q9H9K5|MER34_HUMAN ERVMER34-1 Q9H9K5 \n", "1 sp|P04439|HLAA_HUMAN HLA-A P04439 \n", "2 sp|P01911|DRB1_HUMAN HLA-DRB1 P01911 \n", "3 sp|P01889|HLAB_HUMAN HLA-B P01889 \n", "4 sp|P31689|DNJA1_HUMAN DNAJA1 P31689 \n", "... ... ... ... \n", "20391 sp|Q8WVZ7|RN133_HUMAN RNF133 Q8WVZ7 \n", "20392 sp|P05387|RLA2_HUMAN RPLP2 P05387 \n", "20393 sp|P51991|ROA3_HUMAN HNRNPA3 P51991 \n", "20394 sp|Q9BZX4|ROP1B_HUMAN ROPN1B Q9BZX4 \n", "20395 sp|P34096|RNAS4_HUMAN RNASE4 P34096 \n", "\n", " sequence \n", "0 MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL... \n", "1 MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF... \n", "2 MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF... \n", "3 MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF... \n", "4 MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK... \n", "... ... \n", "20391 MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF... \n", "20392 MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL... \n", "20393 MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET... \n", "20394 MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR... \n", "20395 MALQRTHSLLLLLLLTLLGLGLVQPSYGQDGMYQRFLRQHVHPEET... 
\n", "\n", "[20396 rows x 5 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# If the loaded protein table has an 'id' column, rename it to 'protein_id'.\n",
  "if 'id' in fasta_lib.protein_df.columns:\n",
  "    fasta_lib.protein_df.rename(columns={'id':'protein_id'}, inplace=True)\n",
  "fasta_lib.protein_df" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.65M precursors with 241.62M fragments used 6.0145 GB memory\n" ] } ], "source": [
  "import os, psutil\n",
  "import numpy as np\n",
  "\n",
  "# Report the library size and the resident memory of this kernel process.\n",
  "process = psutil.Process(os.getpid())\n",
  "n_precursors = len(fasta_lib.precursor_df)\n",
  "# DataFrame.size is rows * columns, so `.values` never has to be touched just to read a shape.\n",
  "n_fragments = fasta_lib.fragment_mz_df.size\n",
  "rss_gb = process.memory_info().rss/1024**3\n",
  "print(f'{n_precursors*1e-6:.2f}M precursors with {n_fragments*1e-6:.2f}M fragments used {rss_gb:.4f} GB memory')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [
  "fasta_lib.append_protein_name()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 37/37 [1:02:05<00:00, 100.70s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Translation finished, it will take several minutes to export the rest precursors to the tsv file...\n" ] } ], "source": [
  "from peptdeep.spec_lib.translate import translate_to_tsv\n",
  "\n",
  "# Keep only rows with decoy == 0 when the precursor table carries a 'decoy' column.\n",
  "if 'decoy' in fasta_lib.precursor_df.columns:\n",
  "    is_target = fasta_lib.precursor_df.decoy == 0\n",
  "    # NOTE(review): writes the private `_precursor_df` exactly as before, presumably to bypass the public setter - confirm.\n",
  "    fasta_lib._precursor_df = fasta_lib.precursor_df[is_target]\n",
  "\n",
  "# Write the TSV library using the fragment filters defined in the settings cells.\n",
  "translate_to_tsv(\n",
  "    fasta_lib,\n",
  "    output_diann_tsv,\n",
  "    keep_k_highest_fragments=top_k_frag,\n",
  "    min_frag_nAA=min_frag_nAA,\n",
  "    min_frag_mz=min_frag_mz,\n",
  "    min_frag_intensity=frag_inten,\n",
  "    batch_size=100000,\n",
  "    translate_mod_dict=None,\n",
  "    multiprocessing=True,\n",
  ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.3 ('base')",
"language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }