diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 5fa60135..00000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -ms2pip/_models_c/**/*.c filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml index 68f1bb3a..a400f556 100644 --- a/.github/workflows/build_and_publish.yml +++ b/.github/workflows/build_and_publish.yml @@ -5,49 +5,20 @@ on: types: [created] jobs: - build-sdist: - name: Build source distribution + build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - with: - lfs: "true" - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build sdist - run: python -m build --sdist --outdir dist - - uses: actions/upload-artifact@v4 - with: - name: dist-source - path: dist - - build-wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest, windows-latest, macos-15, macos-15-intel] - steps: - - uses: actions/checkout@v4 - with: - lfs: "true" - - name: Build wheels - uses: pypa/cibuildwheel@v3.4.0 - with: - output-dir: dist + - uses: astral-sh/setup-uv@v6 + - name: Build sdist and wheel + run: uv build --out-dir dist - uses: actions/upload-artifact@v4 with: - name: dist-${{ matrix.os }} + name: dist path: dist publish-to-pypi: - needs: [build-sdist, build-wheels] + needs: [build] runs-on: ubuntu-latest environment: name: pypi @@ -56,10 +27,8 @@ jobs: id-token: write steps: - uses: actions/download-artifact@v4 - - name: Move wheels to dist - run: | - mkdir dist/ - mv dist-*/*.whl dist/ - mv dist-*/*.tar.gz dist/ + with: + name: dist + path: dist - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 023fd57c..bc5ed0c2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,40 +8,36 @@ on: workflow_dispatch: jobs: - build: + lint: runs-on: ubuntu-latest - strategy: - max-parallel: 4 - fail-fast: false - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v6 + - run: uvx ruff check --output-format=github ./ms2pip + typecheck: + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v6 with: - lfs: "true" + enable-cache: true + - run: uv run --all-extras --group dev --python 3.13 ty check ms2pip/ - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v6 with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies + enable-cache: true + - name: Install and test run: | - python -m pip install --upgrade pip - pip install ruff - - - name: Check with Ruff - run: ruff check --output-format=github ./ms2pip - - - name: Build and install ms2pip - run: | - pip install .[dev] - - - name: Test with pytest - run: | - pytest - - - name: Test installation + uv run --python ${{ matrix.python-version }} pytest + - name: Test CLI run: | - ms2pip --help + uv run --python ${{ matrix.python-version }} ms2pip --help diff --git a/.gitignore b/.gitignore index 4e76cd43..8a137c9c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,5 @@ -.pytest_cache/ - -# Custom MS2PIP stuff +# Custom data/ -*_pyx.c -*_pyx_*.c -ms2pip/models_xgboost/*.xgboost # Pytest .pytest_cache/ @@ -98,6 +93,7 @@ celerybeat-schedule .venv/ venv/ ENV/ +.venv-*/ # Spyder project settings .spyderproject diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 248bf4ca..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include ms2pip/_cython_modules/*.c -include ms2pip/_models_c/*/*.c -include ms2pip/_models_c/*.h diff --git a/README.rst b/README.rst index 58a3f933..6557943e 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,8 @@ peptide fragmentation spectrum that accurately resembles its observed equivalent can be used to validate peptide identifications, generate proteome-wide spectral libraries, or to select discriminative transitions for targeted proteomics. MS²PIP employs the `XGBoost `_ machine learning algorithm and is written in -Python and C. +Python, with helper functions in Rust +(`ms2rescore-rs `_). .. figure:: https://raw.githubusercontent.com/compomics/ms2pip/v4.0.0/img/mirror-DVAQIFNNILR-2.png diff --git a/docs/source/api/ms2pip.result.rst b/docs/source/api/ms2pip.result.rst index aee97a08..5946933f 100644 --- a/docs/source/api/ms2pip.result.rst +++ b/docs/source/api/ms2pip.result.rst @@ -3,3 +3,4 @@ ms2pip.result ************* .. automodule:: ms2pip.result + :members: diff --git a/docs/source/api/ms2pip.spectrum.rst b/docs/source/api/ms2pip.spectrum.rst index 0bcea63b..0374cab5 100644 --- a/docs/source/api/ms2pip.spectrum.rst +++ b/docs/source/api/ms2pip.spectrum.rst @@ -3,3 +3,4 @@ ms2pip.spectrum *************** .. automodule:: ms2pip.spectrum + :members: diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 5ca4efbd..3604ef14 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -6,17 +6,12 @@ Pip package .. image:: https://flat.badgen.net/badge/install%20with/pip/green :target: https://pypi.org/project/ms2pip/ -With Python 3.9 or higher, run: +With Python 3.11 or higher, run: .. code-block:: bash pip install ms2pip -Compiled wheels are available for various Python versions on 64bit Linux, -Windows, and macOS. This should install MS²PIP in a few seconds. For other -platforms, MS²PIP can be built from source, although it can take a while -to compile the large prediction models. - We recommend using a `venv `__ or `conda `__ virtual environment. diff --git a/ms2pip/__init__.py b/ms2pip/__init__.py index 2eaef20c..a0fbe937 100644 --- a/ms2pip/__init__.py +++ b/ms2pip/__init__.py @@ -1,13 +1,14 @@ # isort: skip_file """MS2PIP: Accurate and versatile peptide fragmentation spectrum prediction.""" -__version__ = "4.2.0-alpha.2" +__version__ = "4.2.0-alpha.3" __all__ = [ "predict_single", "predict_batch", "predict_library", "correlate", "correlate_preloaded", + "correlate_single", "get_training_data", "annotate_spectra", "download_models", @@ -26,6 +27,7 @@ predict_library, correlate, correlate_preloaded, + correlate_single, get_training_data, annotate_spectra, download_models, diff --git a/ms2pip/__main__.py b/ms2pip/__main__.py index b01ec7e1..1dc6beaf 100644 --- a/ms2pip/__main__.py +++ b/ms2pip/__main__.py @@ -1,7 +1,6 @@ import logging import sys from pathlib import Path -from typing import Optional import click import rich @@ -32,7 +31,7 @@ def _infer_output_name( input_filename: str, - output_name: Optional[str] = None, + output_name: str | None = None, ) -> Path: """Infer output filename from input filename if output_filename was not defined.""" if output_name: @@ -50,9 +49,7 @@ def cli(*args, **kwargs): format="%(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=LOGGING_LEVELS[kwargs["logging_level"]], - handlers=[ - RichHandler(rich_tracebacks=True, show_level=True, show_path=False) - ], + handlers=[RichHandler(rich_tracebacks=True, show_level=True, show_path=False)], ) rich.print(build_credits()) @@ -79,7 +76,7 @@ def predict_single(*args, **kwargs): # Write output rich.print(build_prediction_table(predicted_spectrum)) write_spectra(output_name, [result], output_format) - if plot: + if plot and predicted_spectrum: spectrum_to_png(predicted_spectrum, output_name) @@ -142,6 +139,9 @@ def predict_library(*args, **kwargs): @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--model-dir") @click.option("--ms2-tolerance", type=float, default=0.02) +@click.option( + "--ms2-tolerance-mode", type=click.Choice(["Da", "ppm"], case_sensitive=False), default="Da" +) @click.option("--processes", "-n", type=int) def correlate(*args, **kwargs): # Parse arguments @@ -171,6 +171,9 @@ def correlate(*args, **kwargs): @click.option("--spectrum-id-pattern", "-p") @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--ms2-tolerance", type=float, default=0.02) +@click.option( + "--ms2-tolerance-mode", type=click.Choice(["Da", "ppm"], case_sensitive=False), default="Da" +) @click.option("--processes", "-n", type=int) def get_training_data(*args, **kwargs): # Parse arguments @@ -193,6 +196,9 @@ def get_training_data(*args, **kwargs): @click.option("--spectrum-id-pattern", "-p") @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--ms2-tolerance", type=float, default=0.02) +@click.option( + "--ms2-tolerance-mode", type=click.Choice(["Da", "ppm"], case_sensitive=False), default="Da" +) @click.option("--processes", "-n", type=int) def annotate_spectra(*args, **kwargs): # Parse arguments @@ -203,8 +209,8 @@ def annotate_spectra(*args, **kwargs): results = ms2pip.core.annotate_spectra(*args, **kwargs) # Write intensities - output_name_int = output_name.with_name(output_name.stem + "_observations").with_suffix() - logger.info(f"Writing intensities to {output_name_int.with_suffix('.tsv')}") + output_name = output_name.with_name(output_name.stem + "_observations") + logger.info(f"Writing intensities to {output_name.with_suffix('.tsv')}") write_spectra(output_name, results, "tsv") diff --git a/ms2pip/_cython_modules/__init__.py b/ms2pip/_cython_modules/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/ms2pip/_cython_modules/ms2pip_features_c.c b/ms2pip/_cython_modules/ms2pip_features_c.c deleted file mode 100644 index 19749192..00000000 --- a/ms2pip/_cython_modules/ms2pip_features_c.c +++ /dev/null @@ -1,128 +0,0 @@ -// Compute feature vectors from peptide -unsigned int* get_v_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge) - { - int i,j,k; - - int fnum = 1; //first value in v is its length - - for (i=0; i < 19; i++) { - count_n[i] = 0; - count_c[i] = 0; - } - - //I need this for Omega - //important for sptms!! - peptide_buf[0] = peptide[0]; - for (i=0; i < peplen; i++) { - if (peptide[i+1] > 18) { - peptide_buf[i+1] = sptm_mapper[peptide[i+1]]; - } - else { - peptide_buf[i+1] = peptide[i+1]; - } - count_c[peptide_buf[i+1]]++; - } - - int num_shared = 0; - - shared_features[num_shared++] = peplen; - shared_features[num_shared++] = charge; - - shared_features[num_shared] = 0; - if (charge == 1) { - shared_features[num_shared] = 1; - } - num_shared++; - shared_features[num_shared] = 0; - if (charge == 2) { - shared_features[num_shared] =1; - } - num_shared++; - shared_features[num_shared] = 0; - if (charge == 3) { - shared_features[num_shared] =1; - } - num_shared++; - shared_features[num_shared] = 0; - if (charge == 4) { - shared_features[num_shared] =1; - } - num_shared++; - shared_features[num_shared] = 0; - if (charge >= 5) { - shared_features[num_shared] =1; - } - num_shared++; - - for (j=0; j < num_props; j++) { - for (i=0; i < peplen; i++) { - props_buffer[i] = props[j][peptide_buf[i+1]]; - } - qsort(props_buffer,peplen,sizeof(unsigned int),cmpfunc); - shared_features[num_shared++] = props_buffer[0]; - shared_features[num_shared++] = props_buffer[(int)(0.25*(peplen-1))]; - shared_features[num_shared++] = props_buffer[(int)(0.5*(peplen-1))]; - shared_features[num_shared++] = props_buffer[(int)(0.75*(peplen-1))]; - shared_features[num_shared++] = props_buffer[peplen-1]; - } - - for (i=0; i < peplen-1; i++) { - for (j=0; j -#include -#include - - -float* amino_masses; -unsigned short* amino_F; -unsigned short* sptm_mapper; -float ntermmod; - - -// For ms2pip_features_c.c -unsigned int v[300000]; - -int num_props = 4; -unsigned int props[5][19] = { - {37,35,59,129,94,0,210,81,191,106,101,117,115,343,49,90,60,134,104}, //basicity - {68,23,33,29,70,58,41,73,32,66,38,0,40,39,44,53,71,51,55}, //helicity - {51,75,25,35,100,16,3,94,0,82,12,0,22,22,21,39,80,98,70}, //hydrophobicity - {32,23,0,4,27,32,48,32,69,29,26,35,28,79,29,28,31,31,28}, //pI - //{71,103,115,129,147,57,137,113,128,131,114,97,128,156,87,101,99,186,163} //mass -}; - -unsigned int props_buffer[100]; //100 is max pep length -unsigned int shared_features[100]; //100 is max num shared features -unsigned int count_n[19]; -unsigned int count_c[19]; -unsigned short peptide_buf[200]; //IONBOT - - -// Function required in ms2pip_features_c_general.c and ms2pip_features_c_catboost.c -int cmpfunc (const void * a, const void * b) { - return ( *(int*)a - *(int*)b ); -} - - -// This function initializes amino acid masses and PTMs from a configuration file generated by Omega -void init_ms2pip(char* amino_masses_fname, char* modifications_fname, char* modifications_fname_sptm) { - int i; - int nummods; - int nummods_sptm; - float mz; - int numptm; - int before; - int after; - - FILE* f = fopen(modifications_fname,"rt"); - fscanf(f,"%i\n",&nummods); - fclose(f); - - f = fopen(modifications_fname_sptm,"rt"); - fscanf(f,"%i\n",&nummods_sptm); - fclose(f); - - //malloc - amino_masses = (float*) malloc((38+nummods+nummods_sptm)*sizeof(float)); - amino_F = (unsigned short*) malloc((38+nummods+nummods_sptm)*sizeof(unsigned short)); - sptm_mapper = (unsigned short*) malloc((38+nummods+nummods_sptm)*sizeof(unsigned short)); - - f = fopen(amino_masses_fname,"rt"); - for (i=0; i< 19; i++) { - fscanf(f,"%f\n",&amino_masses[i]); - amino_F[i] = (unsigned short) (amino_masses[i]-57.021464); - } - fscanf(f,"%f\n",&ntermmod); - fclose(f); - - for (i=0; i< 19; i++) { - amino_masses[19+i]=amino_masses[i]; - amino_F[19+i] = amino_F[i]; - } - - f = fopen(modifications_fname_sptm,"rt"); - fscanf(f,"%i\n",&nummods_sptm); - for (i=0; i< nummods_sptm; i++) { - fscanf(f,"%f,%i,%i,%i\n",&mz,&numptm,&before,&after); - sptm_mapper[after] = before; - sptm_mapper[after] = before; - if (after > 18) { - if (before<0) { - amino_masses[after] = mz; - } - else - { - amino_masses[after] = amino_masses[before]+mz; - amino_F[after] = (unsigned short) (amino_masses[before]+mz - 57.021464); - } - } - } - fclose(f); - f = fopen(modifications_fname,"rt"); - fscanf(f,"%i\n",&nummods); - for (i=0; i< nummods; i++) { - fscanf(f,"%f,%i,%i,%i\n",&mz,&numptm,&before,&after); - if (after > 18) { - if (before<0) { - amino_masses[after] = mz; - } - else - { - amino_masses[after] = amino_masses[before]+mz; - amino_F[after] = (unsigned short) (amino_masses[before]+mz - 57.021464); - } - } - } - fclose(f); -} diff --git a/ms2pip/_cython_modules/ms2pip_peaks_c.c b/ms2pip/_cython_modules/ms2pip_peaks_c.c deleted file mode 100644 index 607e4250..00000000 --- a/ms2pip/_cython_modules/ms2pip_peaks_c.c +++ /dev/null @@ -1,982 +0,0 @@ -#include -#include -#include - -#include "ms2pip_init_c.c" -#include "ms2pip_features_c.c" - -#include "../_models_c/HCD-2019.h" -#include "../_models_c/TMT.h" - -float membuffer[10000]; -float ions[2000]; -float mzs[2000]; -float predictions[1000]; - -struct annotations{ - float* peaks; - float* msms; -}; -typedef struct annotations annotations; - -//compute feature vector from peptide + predict intensities -float* get_p_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge, int model_id, int ce) - { - unsigned int* v = get_v_ms2pip(peplen, peptide, modpeptide, charge); - int fnum = v[0]/(peplen-1); - int i; - - // HCD - if (model_id == 1) { - for (i=0; i < peplen-1; i++) { - predictions[0*(peplen-1)+i] = score_HCD_B(v+1+(i*fnum))+0.5; - predictions[2*(peplen-1)-i-1] = score_HCD_Y(v+1+(i*fnum))+0.5; - } - } - - // TMT - else if (model_id == 3) { - for (i=0; i < peplen-1; i++) { - predictions[0*(peplen-1)+i] = score_TMT_B(v+1+(i*fnum))+0.5; - predictions[2*(peplen-1)-i-1] = score_TMT_Y(v+1+(i*fnum))+0.5; - } - } - // EThcD - // else if (model_id == 6) { - // for (i=0; i < peplen-1; i++) { - // predictions[0*(peplen-1)+i] = score_EThcD_B(v+1+(i*fnum))+0.5; - // predictions[2*(peplen-1)-i-1] = score_EThcD_Y(v+1+(i*fnum))+0.5; - // predictions[2*(peplen-1)+i] = score_EThcD_C(v+1+(i*fnum))+0.5; - // predictions[4*(peplen-1)-i-1] = score_EThcD_Z(v+1+(i*fnum))+0.5; - // } - // } - - // HCDch2 - else if (model_id == 7) { - for (i=0; i < peplen-1; i++) { - predictions[0*(peplen-1)+i] = score_HCD_B(v+1+(i*fnum))+0.5; - predictions[2*(peplen-1)-i-1] = score_HCD_Y(v+1+(i*fnum))+0.5; - predictions[2*(peplen-1)+i] = score_HCD_B2(v+1+(i*fnum))+0.5; - predictions[4*(peplen-1)-i-1] = score_HCD_Y2(v+1+(i*fnum))+0.5; - } - } - - else { - return NULL; - } - return predictions; -} - - -//get fragment ion mz values (b, y) -float* get_mz_ms2pip_general(int peplen, unsigned short* modpeptide) - { - int i,j; - float mz; - j=0; - - mz = 0; - if (modpeptide[0] != 0) { - mz = amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = mz+1.007236; //b-ion - } - - mz = 0; - if (modpeptide[peplen+1] != 0) { - mz = amino_masses[modpeptide[peplen+1]]; - } - for (i=peplen; i > 1; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = 18.0105647 + mz + 1.007236; //y-ion - } - - mz = 0; - if (modpeptide[0] != 0) { - mz = amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = (mz + 1.007236 + 1.007236)/2; //b2-ion: (b-ion + H)/2 - } - - mz = 0; - if (modpeptide[peplen+1] != 0) { - mz = amino_masses[modpeptide[peplen+1]]; - } - for (i=peplen; i > 1; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = (18.0105647 + mz + 1.007236 + 1.007236)/2; //y2-ion: (y-ion + H)/2 - } - - return membuffer; -} - - -//get fragment ion mz values (b, y, c, z) -float* get_mz_ms2pip_etd(int peplen, unsigned short* modpeptide) - { - int i,j; - float mz; - j=0; - - mz = 0; - if (modpeptide[0] != 0) { - mz = amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = mz + 1.007236; //b-ion - } - - mz = 0; - if (modpeptide[peplen+1] != 0) { - mz = amino_masses[modpeptide[peplen+1]]; - } - for (i=peplen; i > 1; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = 18.0105647 + mz + 1.007236; //y-ion - } - - mz = 0; - if (modpeptide[0] != 0) { - mz = amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = mz + 1.007825032 + 17.0265491; //c-ion: peptide + H + NH3 - } - - mz = 0; - if (modpeptide[peplen+1] != 0) { - mz = amino_masses[modpeptide[peplen+1]]; - } - for (i=peplen; i > 1; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = mz + 17.00273965 - 15.01089904 + 1.007825032; //z-ion: peptide + OH - NH - } - - return membuffer; -} - - -//get fragment ion mz values (b, y, b++, y++) -float* get_mz_ms2pip_ch2(int peplen, unsigned short* modpeptide) - { - int i,j; - float mz; - j=0; - - mz = 0; - if (modpeptide[0] != 0) { - mz = amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = mz+1.007236; //b-ion - } - - mz = 0; - if (modpeptide[peplen+1] != 0) { - mz = amino_masses[modpeptide[peplen+1]]; - } - for (i=peplen; i > 1; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = 18.0105647 + mz + 1.007236; //y-ion - } - - mz = 0; - if (modpeptide[0] != 0) { - mz = amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = (mz + 1.007236 + 1.007236)/2; //b2-ion: (b-ion + H)/2 - } - - mz = 0; - if (modpeptide[peplen+1] != 0) { - mz = amino_masses[modpeptide[peplen+1]]; - } - for (i=peplen; i > 1; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j++] = (18.0105647 + mz + 1.007236 + 1.007236)/2; //y2-ion: (y-ion + H)/2 - } - - return membuffer; -} - -//get all fragment ion peaks from spectrum -annotations get_t_ms2pip_all(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - { - int i,tmp; - float mz; - int msms_pos; - int mem_pos; - float max, maxmz, tmp2; - - //for (i=0; i < numpeaks; i++) { - // fprintf(stderr,"m %f\n",msms[i]); - //} - - for (i=0; i < 18*(peplen-1); i++) { // 2*9 iontypes: b: a -H2O -NH3 b c y: -H2O z y x - ions[i] = -9.96578428466; //HARD CODED!! - mzs[i] = 0; //HARD CODED!! - } - - //b-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - int pos = 0; - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[pos++] = mz+1.007236-27.99492; // a - membuffer[pos++] = mz+1.007236-18.010565; // -H2O - membuffer[pos++] = mz+1.007236-17.026001; // -NH3 - membuffer[pos++] = mz+1.007236; // b - membuffer[pos++] = mz+1.007236+17.02654; // c - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= 5*(peplen-1)) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - maxmz = msms[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - maxmz = msms[tmp]; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[mem_pos] = max; - mzs[mem_pos] = maxmz; - mem_pos += 1; - } - } - - //b2-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - pos = 0; - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[pos++] = (mz+1.007236+1.007236-27.99492)/2.; // a - membuffer[pos++] = (mz+1.007236+1.007236-18.010565)/2.; // -H2O - membuffer[pos++] = (mz+1.007236+1.007236-17.026001)/2.; // -NH3 - membuffer[pos++] = (mz+1.007236+1.007236)/2.; // b - membuffer[pos++] = (mz+1.007236+1.007236+17.02654)/2.; // c - } - - msms_pos = 0; - mem_pos=0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= 5*(peplen-1)) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - maxmz = msms[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - maxmz = msms[tmp]; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[5*(peplen-1)+mem_pos] = max; - mzs[5*(peplen-1)+mem_pos] = maxmz; - mem_pos += 1; - } - } - - - // y-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - pos = 0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[pos++] = 18.0105647+mz+1.007236-18.010565; //-H2O - membuffer[pos++] = 18.0105647+mz+1.007236-17.02545; // z - membuffer[pos++] = 18.0105647+mz+1.007236; // y - membuffer[pos++] = 18.0105647+mz+1.007236+25.97926; // x - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= 4*(peplen-1)) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - maxmz = msms[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - maxmz = msms[tmp]; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[10*(peplen-1)+mem_pos] = max; - mzs[10*(peplen-1)+mem_pos] = maxmz; - mem_pos += 1; - } - } - - // y2-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - pos = 0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[pos++] = (18.0105647+mz+1.007236+1.007236-18.010565)/2.; //-H2O - membuffer[pos++] = (18.0105647+mz+1.007236+1.007236-17.02545)/2.; // z - membuffer[pos++] = (18.0105647+mz+1.007236+1.007236)/2.; // y - membuffer[pos++] = (18.0105647+mz+1.007236+1.007236+25.97926)/2.; // x - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= 4*(peplen-1)) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - maxmz = msms[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - maxmz = msms[tmp]; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[14*(peplen-1)+mem_pos] = max; - mzs[14*(peplen-1)+mem_pos] = maxmz; - mem_pos += 1; - } - } - - //for (i=0; i < 18*(peplen-1); i++) { // 2*9 iontypes: b: a -H2O -NH3 b c y: -H2O z y x - // fprintf(stderr,"%f ",ions[i]); //HARD CODED!! - //} - //fprintf(stderr,"\n"); - - struct annotations r = {ions,mzs}; - - return r; -} - -//get fragment ion peaks from spectrum -float* get_t_ms2pip_general(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - { - int i,j,tmp; - float mz; - int msms_pos; - int mem_pos; - float max, tmp2; - - //for (i=0; i < numpeaks; i++) { - // fprintf(stderr,"m %f\n",msms[i]); - //} - - for (i=0; i < 2*(peplen-1); i++) { - ions[i] = -9.96578428466; //HARD CODED!! - } - - //b-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[i-1] = mz+1.007236; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[mem_pos] = max; - mem_pos += 1; - } - } - - // y-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - j=0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j] = 18.0105647+mz+1.007236; - j++; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - - return ions; -} - - - -//get fragment ion peaks from spectrum (b, y, c, z) -float* get_t_ms2pip_etd(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - { - int i,j,tmp; - float mz; - int msms_pos; - int mem_pos; - float max, tmp2; - - for (i=0; i < 4*(peplen-1); i++) { - ions[i] = -9.96578428466; //HARD CODED!! - } - - //b-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[i-1] = mz+1.007236; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[mem_pos] = max; - mem_pos += 1; - } - } - - // y-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - j=0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j] = 18.0105647+mz+1.007236; - j++; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - //c-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[i-1] = mz + 1.007825032 + 17.026549; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[2*(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - // z-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - j=0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j] = mz + 17.00274 - 15.010899 + 1.007825032; - j++; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[3*(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - return ions; -} - - -//get fragment ion peaks from spectrum (b, y, b++, y++) -float* get_t_ms2pip_ch2(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - { - int i,j,tmp; - float mz; - int msms_pos; - int mem_pos; - float max, tmp2; - - //for (i=0; i < numpeaks; i++) { - // fprintf(stderr,"m %f\n",msms[i]); - //} - - for (i=0; i < 4*(peplen-1); i++) { - ions[i] = -9.96578428466; //HARD CODED!! - } - - //b-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[i-1] = mz+1.007236; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[mem_pos] = max; - mem_pos += 1; - } - } - - // y-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - j=0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j] = 18.0105647+mz+1.007236; - j++; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - //b2-ions - mz = ntermmod; - if (modpeptide[0] != 0) { - mz += amino_masses[modpeptide[0]]; - } - for (i=1; i < peplen; i++) { - mz += amino_masses[modpeptide[i]]; - membuffer[i-1] = (mz + 1.007236 + 1.007236)/2; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[2*(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - // y2-ions - mz = 0.; - if (modpeptide[peplen+1] != 0) { - mz += modpeptide[peplen+1]; - } - j=0; - for (i=peplen; i >= 2; i--) { - mz += amino_masses[modpeptide[i]]; - membuffer[j] = (18.0105647 + mz + 1.007236 + 1.007236)/2; - j++; - } - - msms_pos = 0; - mem_pos = 0; - while (1) { - if (msms_pos >= numpeaks) { - break; - } - if (mem_pos >= peplen-1) { - break; - } - mz = membuffer[mem_pos]; - if (msms[msms_pos] > (mz+tolmz)) { - mem_pos += 1; - } - else if (msms[msms_pos] < (mz-tolmz)) { - msms_pos += 1; - } - else { - max = peaks[msms_pos]; - tmp = msms_pos + 1; - if (tmp < numpeaks) { - while (msms[tmp] <= (mz+tolmz)) { - tmp2 = peaks[tmp]; - if (max < tmp2) { - max = tmp2; - } - tmp += 1; - if (tmp == numpeaks) { - break; - } - } - } - ions[3*(peplen-1)+mem_pos] = max; - mem_pos += 1; - } - } - - return ions; -} diff --git a/ms2pip/_cython_modules/ms2pip_pyx.pyx b/ms2pip/_cython_modules/ms2pip_pyx.pyx deleted file mode 100644 index 0e980ba6..00000000 --- a/ms2pip/_cython_modules/ms2pip_pyx.pyx +++ /dev/null @@ -1,187 +0,0 @@ -#cython: language_level=3 -import sys -import numpy as np -cimport numpy as np - - -NUM_ION_TYPES_MAPPING = {'general': 2, 'etd': 4, 'ch2': 4, 'all': 18} - - -cdef extern from "ms2pip_peaks_c.c": - - ctypedef struct annotations: - float* peaks - float* msms - - void init_ms2pip(char* amino_masses_fname, char* modifications_fname, char* modifications_fname_sptm) - - unsigned int* get_v_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge) - - float* get_p_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge, int model_id, int ce) - - float* get_mz_ms2pip_general(int peplen, unsigned short* modpeptide) - float* get_mz_ms2pip_etd(int peplen, unsigned short* modpeptide) - float* get_mz_ms2pip_ch2(int peplen, unsigned short* modpeptide) - - annotations get_t_ms2pip_all(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - float* get_t_ms2pip_general(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - float* get_t_ms2pip_etd(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - float* get_t_ms2pip_ch2(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz) - - -def ms2pip_init(amino_masses_fname, modifications_fname, modifications_fname_sptm): - if not isinstance(amino_masses_fname, bytearray): - amino_masses_fname = bytearray(amino_masses_fname.encode()) - if not isinstance(modifications_fname, bytearray): - modifications_fname = bytearray(modifications_fname.encode()) - if not isinstance(modifications_fname_sptm, bytearray): - modifications_fname_sptm = bytearray(modifications_fname_sptm.encode()) - init_ms2pip(amino_masses_fname, modifications_fname, modifications_fname_sptm) - - -def get_vector(np.ndarray[unsigned short, ndim=1, mode="c"] peptide, - np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - charge): - - cdef unsigned int* results = get_v_ms2pip(len(peptide)-2, &peptide[0], &modpeptide[0], charge) - - r = [] - offset = 0 - fnum = int(results[0] / (len(peptide) - 3)) - for i in range(len(peptide) - 3): - v = [] - for j in range(fnum): - v.append(results[j + 1 + offset]) - offset += fnum - r.append(np.array(v, dtype=np.uint16)) - - return r - - -def get_predictions(np.ndarray[unsigned short, ndim=1, mode="c"] peptide, - np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - charge, model_id, peaks_version, ce): - cdef float* results = get_p_ms2pip(len(peptide)-2, &peptide[0], &modpeptide[0], charge, model_id, ce) - if results is NULL: - raise NotImplementedError(model_id) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed - - -def get_targets(*args): - if args[4] == 'general': - result = get_targets_general(*args) - if args[4] == 'etd': - result = get_targets_etd(*args) - if args[4] == 'ch2': - result = get_targets_ch2(*args) - if args[4] == 'all': - result = get_targets_all(*args) - return result - - -def get_targets_all(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - np.ndarray[float, ndim=1, mode="c"] msms, - np.ndarray[float, ndim=1, mode="c"] peaks, - fragerror, peaks_version): - cdef annotations results - results = get_t_ms2pip_all(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror) - result_peaks = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]*(len(modpeptide)-3)): - result_peaks.append(results.peaks[i]) - result_mzs = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]*(len(modpeptide)-3)): - result_mzs.append(results.msms[i]) - return (result_mzs,result_peaks) - - -def get_targets_general(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - np.ndarray[float, ndim=1, mode="c"] msms, - np.ndarray[float, ndim=1, mode="c"] peaks, - fragerror, peaks_version): - cdef float* results = get_t_ms2pip_general(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): #SD: HAd to change this - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed - -def get_targets_etd(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - np.ndarray[float, ndim=1, mode="c"] msms, - np.ndarray[float, ndim=1, mode="c"] peaks, - fragerror, peaks_version): - cdef float* results = get_t_ms2pip_etd(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed - - -def get_targets_ch2(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - np.ndarray[float, ndim=1, mode="c"] msms, - np.ndarray[float, ndim=1, mode="c"] peaks, - fragerror, peaks_version): - cdef float* results = get_t_ms2pip_ch2(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed - - -def get_mzs(*args): - if args[1] == 'general': - result = get_mzs_general(*args) - if args[1] == 'etd': - result = get_mzs_etd(*args) - if args[1] == 'ch2': - result = get_mzs_ch2(*args) - return result - - -def get_mzs_general(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - peaks_version): - cdef float* results = get_mz_ms2pip_general(len(modpeptide)-2, &modpeptide[0]) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed - - -def get_mzs_etd(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - peaks_version): - cdef float* results = get_mz_ms2pip_etd(len(modpeptide)-2, &modpeptide[0]) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed - - -def get_mzs_ch2(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide, - peaks_version): - cdef float* results = get_mz_ms2pip_ch2(len(modpeptide)-2, &modpeptide[0]) - result_parsed = [] - for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): - tmp = [] - for j in range(len(modpeptide)-3): - tmp.append(results[(len(modpeptide)-3) * i + j]) - result_parsed.append(tmp) - return result_parsed diff --git a/ms2pip/_models_c/HCD-2019.h b/ms2pip/_models_c/HCD-2019.h deleted file mode 100644 index 4887791b..00000000 --- a/ms2pip/_models_c/HCD-2019.h +++ /dev/null @@ -1,4 +0,0 @@ -float score_HCD_B(unsigned int* v); -float score_HCD_B2(unsigned int* v); -float score_HCD_Y(unsigned int* v); -float score_HCD_Y2(unsigned int* v); diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B.c deleted file mode 100644 index ae8fcf62..00000000 --- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0ab6fdc7c3b2324228912757d1c55bd56fdebbe066a0b5fc0305f0c7bbb9969 -size 8726156 diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B2.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B2.c deleted file mode 100644 index 19a3d6a4..00000000 --- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B2.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2e27aab327de1a883500f7bb019ff25c9412141fb19ffe1a189eb092bdebd5b -size 1664299 diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y.c deleted file mode 100644 index 755926fc..00000000 --- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:105a9323043df6a192d300adaf3867391ef9e6e03599f6bc393289c3f62b982d -size 9116794 diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y2.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y2.c deleted file mode 100644 index 4dd0a7b4..00000000 --- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y2.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c1be44093ce4b3085b49142ed5a263c01a79c90b3bcaea4c0e829e3b587f2a3 -size 9063001 diff --git a/ms2pip/_models_c/TMT.h b/ms2pip/_models_c/TMT.h deleted file mode 100644 index 6ec67231..00000000 --- a/ms2pip/_models_c/TMT.h +++ /dev/null @@ -1,2 +0,0 @@ -float score_TMT_B(unsigned int* v); -float score_TMT_Y(unsigned int* v); diff --git a/ms2pip/_models_c/TMT/model_20190107_TMT_train_B.c b/ms2pip/_models_c/TMT/model_20190107_TMT_train_B.c deleted file mode 100644 index c23a7f9a..00000000 --- a/ms2pip/_models_c/TMT/model_20190107_TMT_train_B.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49252e3d1b42df924b422f118870f237ec541d58db0037b4800444999277d055 -size 2348087 diff --git a/ms2pip/_models_c/TMT/model_20190107_TMT_train_Y.c b/ms2pip/_models_c/TMT/model_20190107_TMT_train_Y.c deleted file mode 100644 index f3635860..00000000 --- a/ms2pip/_models_c/TMT/model_20190107_TMT_train_Y.c +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5429cc869363e3383ce41a78329870898ce6e6c3b5fcf9ed1ca5eaac9417df42 -size 4323734 diff --git a/ms2pip/_spectrum_processing.py b/ms2pip/_spectrum_processing.py new file mode 100644 index 00000000..8aed4cb6 --- /dev/null +++ b/ms2pip/_spectrum_processing.py @@ -0,0 +1,271 @@ +"""Internal spectrum annotation, target extraction, and PSM-spectrum matching.""" + +from __future__ import annotations + +import re +from collections import defaultdict +from collections.abc import Generator +from pathlib import Path + +import numpy as np +from psm_utils import PSM, PSMList +from psm_utils import Peptidoform +from ms2rescore_rs import ( + MS2Spectrum, # type: ignore[ty:unresolved-import] + Precursor, # type: ignore[ty:unresolved-import] + annotate_ms2_spectra, # type: ignore[ty:unresolved-import] + get_ms2_spectra, # type: ignore[ty:unresolved-import] +) + +import ms2pip.exceptions as exceptions +from ms2pip.constants import MODELS +from ms2pip.spectrum import ObservedSpectrum + + +def proforma_to_mass_shift(peptidoform: Peptidoform) -> str: + """ + Convert a Peptidoform to a mass-shift ProForma string. + + Replaces all modification labels with numeric mass shifts so that + ms2rescore-rs can parse them. Handles sequence modifications and + N/C-terminal modifications. + + Note: This does not handle ProForma features like labile modifications, + unlocalized modifications, tagged intervals, or isotope labels. These are + not used by ms2pip. + """ + parts = [] + n_term = peptidoform.properties.get("n_term") + if n_term: + for mod in n_term: + parts.append(f"[{mod.mass:+.4f}]-") + for aa, mods in peptidoform.parsed_sequence: + parts.append(aa) + if mods: + for mod in mods: + parts.append(f"[{mod.mass:+.4f}]") # type: ignore[ty:unresolved-attribute] + c_term = peptidoform.properties.get("c_term") + if c_term: + for mod in c_term: + parts.append(f"-[{mod.mass:+.4f}]") + if peptidoform.precursor_charge: + parts.append(f"/{peptidoform.precursor_charge}") + return "".join(parts) + + +def read_raw_spectra(spectrum_file: str) -> Generator[MS2Spectrum, None, None]: + """Read MS2 spectra as raw ms2rescore-rs objects (no conversion to ObservedSpectrum).""" + try: + spectra = get_ms2_spectra(str(spectrum_file)) + except ValueError as e: + raise exceptions.UnsupportedSpectrumFiletypeError(Path(spectrum_file).suffixes) from e + + for spectrum in spectra: + if str(spectrum.identifier) == "" or len(spectrum.mz) == 0 or len(spectrum.intensity) == 0: + continue + yield spectrum + + +def to_observed_spectrum(spectrum: MS2Spectrum) -> ObservedSpectrum: + """Convert an MS2Spectrum to an ObservedSpectrum.""" + return ObservedSpectrum( + mz=np.array(spectrum.mz, dtype=np.float32), + intensity=np.array(spectrum.intensity, dtype=np.float32), + identifier=str(spectrum.identifier), + precursor_mz=float(spectrum.precursor.mz), + precursor_charge=int(spectrum.precursor.charge), + retention_time=float(spectrum.precursor.rt), + ) + + +def annotate_spectrum( + spectrum: ObservedSpectrum, + psm: PSM, + model: str, + ms2_tolerance: float, + ms2_tolerance_mode: str, +) -> list[list[tuple]]: + """ + Annotate an ObservedSpectrum using ms2rescore-rs. + + Returns peak annotations as plain Python lists of ``(series, position, charge)`` tuples. + """ + ms2_spectrum = MS2Spectrum( + identifier=spectrum.identifier or "", + mz=list(spectrum.mz), + intensity=list(spectrum.intensity), + precursor=Precursor( + mz=float(spectrum.precursor_mz) if spectrum.precursor_mz else 0.0, + charge=int(spectrum.precursor_charge) if spectrum.precursor_charge else 0, + rt=float(spectrum.retention_time) if spectrum.retention_time else 0.0, + ), + ) + frag_model = MODELS[model]["fragmentation"] + proforma = proforma_to_mass_shift(psm.peptidoform) + seq_len = len(psm.peptidoform.parsed_sequence) + + annotated = annotate_ms2_spectra( + spectra=[ms2_spectrum], + proformas=[proforma], + seq_lens=[seq_len], + fragmentation_model=frag_model, + mass_mode="monoisotopic", + tolerance_value=float(ms2_tolerance), + tolerance_mode=ms2_tolerance_mode.lower(), + ) + return [ + [(a.series, a.position, a.charge) for a in peak_anns] + for peak_anns in annotated[0].peak_annotations + ] + + +def targets_from_annotations( + peak_annotations: list, + intensity: np.ndarray, + ion_types: list[str], + seq_len: int, +) -> dict[str, np.ndarray]: + """ + Extract observed intensity targets from peak annotations. + + Converts per-peak fragment annotations into per-ion-type intensity arrays. + + Parameters + ---------- + peak_annotations + Per-peak annotations. Each element is a list of annotations for that peak. + Annotations can be :py:class:`ms2rescore_rs.FragmentAnnotation` objects or + ``(series, position, charge)`` tuples. + intensity + Preprocessed intensity array (TIC-normalized, log2-transformed). + ion_types + Ion types to extract, e.g. ``["b", "y"]`` or ``["b", "y", "b2", "y2"]``. + seq_len + Length of the peptide sequence (number of amino acids). + + Returns + ------- + targets + Dict mapping ion type to intensity array of length ``seq_len - 1``. + + """ + n_ions = seq_len - 1 + floor_value = np.float32(np.log2(0.001)) + targets = {ion: np.full(n_ions, floor_value, dtype=np.float32) for ion in ion_types} + + for peak_idx, peak_anns in enumerate(peak_annotations): + for ann in peak_anns: + if isinstance(ann, tuple): + series, position, charge = ann + else: + series, position, charge = ann.series, ann.position, ann.charge + + ion_key = series if charge == 1 else f"{series}{charge}" + + if ion_key not in targets: + continue + + pos = position - 1 + if 0 <= pos < n_ions: + if intensity[peak_idx] > targets[ion_key][pos]: + targets[ion_key][pos] = intensity[peak_idx] + + return targets + + +def load_and_match_spectra( + psm_list: PSMList, + spectrum_file: str | Path, + spectrum_id_pattern: str, + model: str, + ms2_tolerance: float, + ms2_tolerance_mode: str, +) -> list[tuple[int, PSM, ObservedSpectrum, list]]: + """ + Read spectra from file, annotate, preprocess, and match to PSMs. + + Reads raw MS2Spectrum objects, matches to PSMs by spectrum ID, batch-annotates + all matched spectra in a single Rust call, then converts to ObservedSpectrum + and preprocesses. + + Returns list of (psm_index, psm, preprocessed_spectrum, peak_annotations) tuples. + """ + try: + spectrum_id_regex = re.compile(spectrum_id_pattern) + except TypeError: + spectrum_id_regex = re.compile(r"(.*)") + + psms_by_specid = defaultdict(list) + for i, psm in enumerate(psm_list): + psms_by_specid[str(psm.spectrum_id)].append((i, psm)) + + # Step 1: Read raw spectra and match to PSMs (no conversion yet) + matched_raw: list[tuple[str, MS2Spectrum, list[tuple[int, PSM]]]] = [] + for spectrum in read_raw_spectra(str(spectrum_file)): + match = spectrum_id_regex.search(str(spectrum.identifier)) + try: + spectrum_id = match[1] # type: ignore[ty:not-subscriptable] + except (TypeError, IndexError): + raise exceptions.TitlePatternError( + f"Spectrum title pattern `{spectrum_id_pattern}` could not be matched to " + f"spectrum ID `{spectrum.identifier}`. " + " Are you sure that the regex contains a capturing group?" + ) + + if spectrum_id not in psms_by_specid: + continue + + matched_raw.append((spectrum_id, spectrum, psms_by_specid[spectrum_id])) + + if not matched_raw: + return [] + + # Step 2: Batch annotate all matched spectra (single Rust call, Rayon-parallelized) + batch_spectra = [] + batch_proformas = [] + batch_seq_lens = [] + batch_indices = [] # (matched_raw_idx, psm_within_spectrum_idx) + + for raw_idx, (_, spectrum, psm_pairs) in enumerate(matched_raw): + for psm_idx, (_, psm) in enumerate(psm_pairs): + batch_spectra.append(spectrum) + batch_proformas.append(proforma_to_mass_shift(psm.peptidoform)) + batch_seq_lens.append(len(psm.peptidoform.parsed_sequence)) + batch_indices.append((raw_idx, psm_idx)) + + frag_model = MODELS[model]["fragmentation"] + annotated_spectra = annotate_ms2_spectra( + spectra=batch_spectra, + proformas=batch_proformas, + seq_lens=batch_seq_lens, + fragmentation_model=frag_model, + mass_mode="monoisotopic", + tolerance_value=float(ms2_tolerance), + tolerance_mode=ms2_tolerance_mode.lower(), + ) + + # Step 3: Convert to ObservedSpectrum, preprocess, and assemble results + preprocessed_cache: dict[str, ObservedSpectrum] = {} + results = [] + + for batch_idx, (raw_idx, psm_idx) in enumerate(batch_indices): + spec_id, raw_spectrum, psm_pairs = matched_raw[raw_idx] + psm_index, psm = psm_pairs[psm_idx] + + if spec_id not in preprocessed_cache: + obs = to_observed_spectrum(raw_spectrum) + for label_type in ["iTRAQ", "TMT"]: + if label_type in model: + obs.remove_reporter_ions(label_type) + obs.tic_norm() + obs.log2_transform() + preprocessed_cache[spec_id] = obs + + peak_annotations = [ + [(a.series, a.position, a.charge) for a in peak_anns] + for peak_anns in annotated_spectra[batch_idx].peak_annotations + ] + + results.append((psm_index, psm, preprocessed_cache[spec_id], peak_annotations)) + + return results diff --git a/ms2pip/_utils/dlib.py b/ms2pip/_utils/dlib.py index 50714dad..004ef3b7 100644 --- a/ms2pip/_utils/dlib.py +++ b/ms2pip/_utils/dlib.py @@ -2,7 +2,6 @@ import zlib from pathlib import Path -from typing import Union import numpy import sqlalchemy @@ -44,9 +43,9 @@ def process_result_value(self, value, dialect): decompressed = zlib.decompress(value) return numpy.frombuffer(decompressed, dtype=self.dtype).tolist() - def copy(self): + def copy(self): # type: ignore[ty:invalid-method-override] # NOTE: length will be passed through to BLOB - return CompressedArray(self.dtype, self.impl.length) + return CompressedArray(self.dtype, self.impl.length) # type: ignore[ty:unresolved-attribute] metadata = MetaData() @@ -100,6 +99,6 @@ def copy(self): ) -def open_sqlite(filename: Union[str, Path]) -> Connection: +def open_sqlite(filename: str | Path) -> Connection: engine = sqlalchemy.create_engine(f"sqlite:///{filename}") return engine.connect() diff --git a/ms2pip/_utils/encoder.py b/ms2pip/_utils/encoder.py deleted file mode 100644 index da1dc0be..00000000 --- a/ms2pip/_utils/encoder.py +++ /dev/null @@ -1,322 +0,0 @@ -"""Peptide and modification handling for MS2PIP.""" -from __future__ import annotations - -import logging -import os -import tempfile -from typing import Generator - -import numpy as np -from psm_utils import Peptidoform, PSMList -from pyteomics import proforma - -import ms2pip.exceptions as exceptions - -logger = logging.getLogger(__name__) - -AMINO_ACIDS = [ - "A", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "K", - "M", - "N", - "P", - "Q", - "R", - "S", - "T", - "V", - "W", - "Y", -] - -AMINO_ACID_MASSES = [ - 71.037114, - 103.00919, - 115.026943, - 129.042593, - 147.068414, - 57.021464, - 137.058912, - 113.084064, - 128.094963, - 131.040485, - 114.042927, - 97.052764, - 128.058578, - 156.101111, - 87.032028, - 101.047679, - 99.068414, - 186.079313, - 163.063329, -] - -AMINO_ACID_IDS = {a: i for i, a in enumerate(AMINO_ACIDS)} -AMINO_ACID_IDS["L"] = AMINO_ACID_IDS["I"] - - -class Encoder: - """Modification-aware encoding of peptidoforms.""" - - def __init__(self) -> None: - """ - Modification-aware encoding of peptidoforms. - - MS²PIP requires all modification mass shifts to be written to a file for use in C code - before running. This class handles the encoding of peptides and peptidoforms for - modifications that have been defined. - - Encoder files are to be passed on to the ``ms2pip_pyx.ms2pip_init`` function. E.g., - ``ms2pip_pyx.ms2pip_init(*encoder.encoder_files)``. - - Notes - ----- - - Either used as a context manager or manually call :py:meth:`write_encoder_files` before - use and :py:meth:`remove_encoder_files` after use. - - Fixed, labile, and unlocalized modifications are ignored. Fixed modifications - should therefore already have been applied (see - :py:meth:`psm_utils.psm_list.PSMList.apply_fixed_modifications`). - - """ - self.modifications = {} - self.encoder_files = None - - self._next_mod_id = 38 # Avoid clash with amino acids and mutations (ionbot compatibility) - - def __enter__(self): - if not self.encoder_files: - self.write_encoder_files() - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.remove_encoder_files() - - def __repr__(self) -> str: - return "{}.{}(modifications={})".format( - self.__class__.__module__, - self.__class__.__qualname__, - self.modifications, - ) - - @classmethod - def from_peptidoform(cls, peptidoform: Peptidoform) -> Encoder: - """ - Create Encoder instance from peptidoform. - - Parameters - ---------- - peptidoform : Peptidoform - Peptidoform to use for modification configuration. - - Returns - ------- - Encoder - Encoder instance with modifications configured. - - """ - encoder = cls() - encoder._configure_from_peptidoform(peptidoform) - encoder.write_encoder_files() - return encoder - - @classmethod - def from_psm_list(cls, psm_list: PSMList) -> Encoder: - """ - Create Encoder instance from PSMList. - - Parameters - ---------- - psm_list : PSMList - PSMList to use for modification configuration. - - Returns - ------- - Encoder - Encoder instance with modifications configured. - - """ - encoder = cls() - encoder._configure_from_psm_list(psm_list) - encoder.write_encoder_files() - return encoder - - def _configure_modification(self, target: str, modification: proforma.TagBase): - """ - Add single pyteomics.proforma modification to configuration. - - Parameters - ---------- - target : str - Target amino acid one-letter code or terminus (``n_term`` or ``c_term``). - modification : pyteomics.proforma.TagBase - Modification to add. - - """ - if target == "n_term": - amino_acid_id = -1 - elif target == "c_term": - amino_acid_id = -2 - elif target in AMINO_ACID_IDS: - amino_acid_id = AMINO_ACID_IDS[target] - else: - logger.warning(f"Skipping modification for invalid amino acid: {target}") - return None - - self.modifications[(target, str(modification))] = { - "mod_id": self._next_mod_id, - "mass_shift": modification.mass, - "amino_acid": target, - "amino_acid_id": amino_acid_id, - "modification": modification, - } - self._next_mod_id += 1 - - def _configure_from_peptidoform(self, peptidoform: Peptidoform): - """Configure encoder with modifications from single Peptidoform.""" - # Get unique modifications from psm - unique_modifications = dict() - try: - for aa, mods in peptidoform.parsed_sequence: - if mods: - unique_modifications.update({(aa, str(mod)): mod for mod in mods}) - for term in ["n_term", "c_term"]: - if peptidoform.properties[term]: - unique_modifications.update( - {(term, str(mod)): mod for mod in peptidoform.properties[term]} - ) - except KeyError as e: - raise exceptions.UnresolvableModificationError(e.args[0]) from e - - # Add modification entries - for (target, _), mod in unique_modifications.items(): - self._configure_modification(target, mod) - - def _configure_from_psm_list(self, psm_list: PSMList): - """Configure encoder with modifications from PSMList.""" - # Get unique modifications from psm_list - unique_modifications = dict() - try: - for psm in psm_list: - for aa, mods in psm.peptidoform.parsed_sequence: - if mods: - unique_modifications.update({(aa, str(mod)): mod for mod in mods}) - for term in ["n_term", "c_term"]: - if psm.peptidoform.properties[term]: - unique_modifications.update( - {(term, str(mod)): mod for mod in psm.peptidoform.properties[term]} - ) - except KeyError as e: - raise exceptions.UnresolvableModificationError(e.args[0]) from e - - # Add modification entries - for (target, _), mod in unique_modifications.items(): - self._configure_modification(target, mod) - - def write_encoder_files(self) -> str: - """Write configured masses to temporary files for use in C code.""" - # AA file - amino_file = tempfile.NamedTemporaryFile(delete=False, mode="w", newline="\n") - for m in AMINO_ACID_MASSES: - amino_file.write("{}\n".format(m)) - amino_file.write("0\n") - amino_file.close() - - # PTM file - mod_file = tempfile.NamedTemporaryFile(delete=False, mode="w", newline="\n") - mod_file.write("{}\n".format(len(self.modifications))) - for mod in self.modifications.values(): - mod_file.write( - "{},1,{},{}\n".format(mod["mass_shift"], mod["amino_acid_id"], mod["mod_id"]) - ) - mod_file.close() - - # SPTM file (ionbot compatibility) - mod_file2 = tempfile.NamedTemporaryFile(delete=False, mode="w", newline="\n") - mod_file2.write("0\n") - mod_file2.close() - - # Store temporary file names - self.encoder_files = (amino_file.name, mod_file.name, mod_file2.name) - - def remove_encoder_files(self): - """Remove temporary encoder files.""" - if self.encoder_files: - for f in self.encoder_files: - os.remove(f) - self.encoder_files = None - - @staticmethod - def validate_peptidoform(peptidoform: Peptidoform): - """Validate whether a peptidoform can be encoded for MS²PIP.""" - # Charge required - if peptidoform.precursor_charge is None: - raise exceptions.InvalidPeptidoformError("Peptidoform charge is required.") - - # Peptides longer then 101 lead to "Segmentation fault (core dumped)" - if len(peptidoform.parsed_sequence) > 100: - raise exceptions.InvalidPeptidoformError( - "Peptidoform sequence cannot be longer than 100 amino acids." - ) - elif len(peptidoform.parsed_sequence) < 4: - raise exceptions.InvalidPeptidoformError( - "Peptidoform sequence cannot be shorter than 4 amino acids." - ) - - def encode_peptide(self, peptidoform: Peptidoform) -> np.ndarray: - """Encode a peptide (without modifications) for MS²PIP.""" - self.validate_peptidoform(peptidoform) - - try: - encoded = [0] + [AMINO_ACID_IDS[aa] for aa, _ in peptidoform.parsed_sequence] + [0] - except KeyError as e: - raise exceptions.InvalidAminoAcidError( - f"Unsupported amino acid found in peptide `{peptidoform.proforma}`" - ) from e - return np.array(encoded, dtype=np.uint16) - - def encode_peptidoform(self, peptidoform: Peptidoform) -> np.ndarray: - """ - Encode a peptidoform for MS²PIP. - - Notes - ----- - - Multiple modifications per site is not supported. The first modification is used. - - Fixed, labile, and unlocalized modifications are ignored. Fixed modifications - should therefore already have been applied (see - :py:meth:`psm_utils.PSMList.apply_fixed_modifications`). - - """ - - def _generate_encoding(peptidoform) -> Generator[int, None, None]: - if peptidoform.properties["n_term"]: - mod_str = str(peptidoform.properties["n_term"][0]) - yield self.modifications["n_term", mod_str]["mod_id"] - else: - yield 0 - - for aa, mods in peptidoform.parsed_sequence: - try: - if not mods: - yield AMINO_ACID_IDS[aa] - else: - yield self.modifications[aa, str(mods[0])]["mod_id"] - except KeyError as e: - raise exceptions.InvalidAminoAcidError( - f"Unsupported amino acid found in peptide `{peptidoform.proforma}`" - ) from e - - if peptidoform.properties["c_term"]: - mod_str = str(peptidoform.properties["c_term"][0]) - yield self.modifications["c_term", mod_str]["mod_id"] - else: - yield 0 - - self.validate_peptidoform(peptidoform) - return np.array(list(_generate_encoding(peptidoform)), dtype=np.uint16) diff --git a/ms2pip/_utils/feature_names.py b/ms2pip/_utils/feature_names.py index 35632ab8..ea0c8fa7 100644 --- a/ms2pip/_utils/feature_names.py +++ b/ms2pip/_utils/feature_names.py @@ -1,4 +1,7 @@ -from ms2pip._utils.encoder import AMINO_ACIDS +AMINO_ACIDS = [ + "A", "C", "D", "E", "F", "G", "H", "I", "K", "M", + "N", "P", "Q", "R", "S", "T", "V", "W", "Y", +] def get_feature_names(): diff --git a/ms2pip/_utils/ion_mobility.py b/ms2pip/_utils/ion_mobility.py index 7dceb9f6..530703cd 100644 --- a/ms2pip/_utils/ion_mobility.py +++ b/ms2pip/_utils/ion_mobility.py @@ -14,7 +14,7 @@ class IonMobility: def __init__(self, processes=1) -> None: # Lazy import to avoid loading loading heavy dependencies when not needed try: - from im2deep.im2deep import predict_ccs # noqa: F401 + from im2deep.im2deep import predict_ccs # noqa: F401 # type: ignore[ty:unresolved-import] self.predict_fn = predict_ccs self.processes = processes diff --git a/ms2pip/_utils/psm_input.py b/ms2pip/_utils/psm_input.py index 7a0cacc6..ee57e461 100644 --- a/ms2pip/_utils/psm_input.py +++ b/ms2pip/_utils/psm_input.py @@ -1,8 +1,5 @@ -from __future__ import annotations - import logging from pathlib import Path -from typing import Union import psm_utils.io.peptide_record from psm_utils import PSMList @@ -10,7 +7,7 @@ logger = logging.getLogger(__name__) -def read_psms(psms: Union[str, Path, PSMList], filetype: Union[str, None]) -> PSMList: +def read_psms(psms: str | Path | PSMList, filetype: str | None) -> PSMList: """Read PSMList or PSM file.""" # Read PSMs if isinstance(psms, (str, Path)): diff --git a/ms2pip/_utils/retention_time.py b/ms2pip/_utils/retention_time.py index 043f24a2..675688cb 100644 --- a/ms2pip/_utils/retention_time.py +++ b/ms2pip/_utils/retention_time.py @@ -71,7 +71,7 @@ def _init_deeplc(self): """ # Only import if DeepLC will be used, otherwise lots of extra heavy # dependencies (e.g. Tensorflow) are imported as well - import deeplc + import deeplc # type: ignore[ty:unresolved-import] deeplc_params = self.config["deeplc"] if "calibration_file" in deeplc_params and deeplc_params["calibration_file"]: @@ -109,9 +109,7 @@ def _prepare_deeplc_peptide_df(self): def _run_deeplc(self): """Run DeepLC.""" logger.info("Predicting retention times with DeepLC...") - self.deeplc_preds = self.deeplc_predictor.make_preds( - seq_df=self.deeplc_pep_df.fillna("") - ) + self.deeplc_preds = self.deeplc_predictor.make_preds(seq_df=self.deeplc_pep_df.fillna("")) # type: ignore[ty:unresolved-attribute] def _parse_deeplc_preds(self): """Add DeepLC predictions to peprec DataFrame.""" diff --git a/ms2pip/_utils/xgb_models.py b/ms2pip/_utils/xgb_models.py index 6e933a91..f20a5733 100644 --- a/ms2pip/_utils/xgb_models.py +++ b/ms2pip/_utils/xgb_models.py @@ -5,58 +5,130 @@ import os import urllib.request from itertools import islice +from pathlib import Path import numpy as np import xgboost as xgb -from ms2pip.exceptions import InvalidXGBoostModelError +import ms2pip.exceptions as exceptions +from ms2pip.constants import MODELS logger = logging.getLogger(__name__) +_MAX_PREDICTION_THREADS = 16 -def validate_requested_xgb_model(xgboost_model_files, xgboost_model_hashes, model_dir): - """Validate requested XGBoost models, and download if necessary""" - for _, model_file in xgboost_model_files.items(): + +def validate_model(model: str, model_dir: str | Path | None = None) -> Path: + """ + Validate model name and ensure XGBoost model files are available. + + Downloads missing model files if necessary. + + Parameters + ---------- + model + Model name (must be a key in :data:`ms2pip.constants.MODELS`). + model_dir + Directory for XGBoost model files. Default: ``~/.ms2pip``. + + Returns + ------- + model_dir + Resolved model directory as Path. + + """ + model_dir = Path(model_dir) if model_dir else Path.home() / ".ms2pip" + if model not in MODELS: + raise exceptions.UnknownModelError(model) + logger.debug("Using %s model", model) + _validate_requested_xgb_model( + MODELS[model]["xgboost_model_files"], + MODELS[model]["model_hash"], + model_dir, + ) + return model_dir + + +def _validate_requested_xgb_model(xgboost_model_files, xgboost_model_hashes, model_dir): + """Validate requested XGBoost models, and download if necessary.""" + for model_file in xgboost_model_files.values(): if not _check_model_presence(model_file, xgboost_model_hashes[model_file], model_dir): _download_model(model_file, xgboost_model_hashes[model_file], model_dir) -def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1): +def load_xgb_models( + model_params: dict, + model_dir, + processes: int | None = None, +) -> dict: """ - Get predictions starting from feature vectors in DMatrix object. + Load XGBoost models from disk. + + Returns a dict of ion_type -> xgb.Booster that can be passed to + :func:`predict_intensities` to avoid re-loading on every call. Parameters ---------- - features: xgb.DMatrix, np.ndarray - Feature vectors in DMatrix object or as Numpy array. - num_ions: list[int] - List with number of ions (per series) for each peptide, i.e. peptide length - 1 - model_params: dict - Model configuration as defined in ms2pip.ms2pipC.MODELS. - model_dir: str + model_params + Model configuration dict (must contain ``xgboost_model_files``). + model_dir Directory where model files are stored. - processes: int - Number of CPUs to use in multiprocessing + processes + Number of threads for XGBoost prediction. Capped internally. """ - # Init models - xgboost_models = _initialize_xgb_models( - model_params["xgboost_model_files"], - model_dir, - processes, + nthread = min( + processes if processes is not None else (os.cpu_count() or 1), + _MAX_PREDICTION_THREADS, ) - if isinstance(features, np.ndarray): - features = xgb.DMatrix(features) + return _initialize_xgb_models(model_params["xgboost_model_files"], model_dir, nthread) + + +def predict_intensities( + features: np.ndarray, + num_ions: list[int], + model_params: dict, + model_dir, + processes: int | None = None, + xgb_models: dict | None = None, +) -> list[dict[str, np.ndarray]]: + """ + Predict intensities from feature vectors using XGBoost models. + + Parameters + ---------- + features + Feature vectors as numpy array. Will be converted to DMatrix internally. + num_ions + Number of ions (per series) for each peptide, i.e. peptide length - 1. + model_params + Model configuration dict (must contain ``xgboost_model_files``). + model_dir + Directory where model files are stored. + processes + Number of threads for XGBoost prediction. Capped internally to avoid + diminishing returns. By default, uses all available cores (up to cap). + xgb_models + Pre-loaded XGBoost models (from :func:`load_xgb_models`). When provided, + ``model_params``, ``model_dir``, and ``processes`` are ignored for model + loading. + + Returns + ------- + predictions + List of dicts mapping ion type to predicted intensity array, one per peptide. + + """ + if xgb_models is None: + xgb_models = load_xgb_models(model_params, model_dir, processes) + dmatrix = xgb.DMatrix(features) logger.debug("Predicting intensities from XGBoost model files...") prediction_dict = {} - for ion_type, xgb_model in xgboost_models.items(): - # Get predictions from XGBoost model - preds = xgb_model.predict(features) - preds = preds.clip(min=np.log2(0.001)) # Clip negative intensities - xgb_model.__del__() + for ion_type, xgb_model in xgb_models.items(): + preds = xgb_model.predict(dmatrix) + preds = preds.clip(min=np.log2(0.001)) - # Reshape into arrays for each peptide if ion_type.lower() in ["x", "y", "y2", "z"]: preds = _split_list_by_lengths(preds, num_ions, reverse=True) elif ion_type.lower() in ["a", "b", "b2", "c"]: @@ -65,10 +137,8 @@ def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1 raise ValueError(f"Unsupported ion_type: {ion_type}") prediction_dict[ion_type] = preds - # Convert to list per peptide with dicts per ion type num_peptides = len(list(prediction_dict.values())[0]) - predictions = [{k: v[i] for k, v in prediction_dict.items()} for i in range(num_peptides)] - return predictions + return [{k: v[i] for k, v in prediction_dict.items()} for i in range(num_peptides)] def _split_list_by_lengths(list_in, lengths, reverse=False): @@ -91,18 +161,19 @@ def _check_model_presence(model, model_hash, model_dir): def _download_model(model, model_hash, model_dir): """Download the xgboost model from the Genesis server.""" - if not os.path.isdir(model_dir): - os.mkdir(model_dir) + os.makedirs(model_dir, exist_ok=True) filename = os.path.join(model_dir, model) logger.info(f"Downloading {model} to {filename}...") try: - urllib.request.urlretrieve(f"https://genesis.ugent.be/uvpublicdata/ms2pip/{model}", filename) + urllib.request.urlretrieve( + f"https://genesis.ugent.be/uvpublicdata/ms2pip/{model}", filename + ) except Exception: logger.warning("Falling back to Zenodo for model downloads.") urllib.request.urlretrieve(f"https://zenodo.org/records/13270668/files/{model}", filename) if not _check_model_integrity(filename, model_hash): - raise InvalidXGBoostModelError() + raise exceptions.InvalidXGBoostModelError() def _check_model_integrity(filename, model_hash): @@ -117,7 +188,7 @@ def _check_model_integrity(filename, model_hash): if sha1_hash.hexdigest() == model_hash: return True else: - logger.warn("Model hash not recognized.") + logger.warning("Model hash not recognized.") return False diff --git a/ms2pip/constants.py b/ms2pip/constants.py index c72affe2..b55a2c31 100644 --- a/ms2pip/constants.py +++ b/ms2pip/constants.py @@ -1,15 +1,9 @@ """Constants and fixed configurations for MS²PIP.""" -# Models and their properties -# id is passed to get_predictions to select model -# ion_types is required to write the ion types in the headers of the result files -# features_version is required to select the features version MODELS = { "CID": { - "id": 0, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20190107_CID_train_B.xgboost", "y": "model_20190107_CID_train_Y.xgboost", @@ -20,16 +14,20 @@ }, }, "HCD2019": { - "id": 1, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", + "xgboost_model_files": { + "b": "model_20190107_HCD_train_B.xgboost", + "y": "model_20190107_HCD_train_Y.xgboost", + }, + "model_hash": { + "model_20190107_HCD_train_B.xgboost": "2503856c382806672e4b85f6b0ccc1f3093acc1b", + "model_20190107_HCD_train_Y.xgboost": "867bbc9940f75845b3f4f845d429b3780c997a02", + }, }, "TTOF5600": { - "id": 2, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20190107_TTOF5600_train_B.xgboost", "y": "model_20190107_TTOF5600_train_Y.xgboost", @@ -40,16 +38,20 @@ }, }, "TMT": { - "id": 3, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", + "xgboost_model_files": { + "b": "model_20190107_TMT_train_B.xgboost", + "y": "model_20190107_TMT_train_Y.xgboost", + }, + "model_hash": { + "model_20190107_TMT_train_B.xgboost": "352073a591d45a2e3181818f5feef99c22755af7", + "model_20190107_TMT_train_Y.xgboost": "d9a73bff21ab504bb91eb386f20cd8a86d60c95d", + }, }, "iTRAQ": { - "id": 4, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20190107_iTRAQ_train_B.xgboost", "y": "model_20190107_iTRAQ_train_Y.xgboost", @@ -60,10 +62,8 @@ }, }, "iTRAQphospho": { - "id": 5, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20190107_iTRAQphospho_train_B.xgboost", "y": "model_20190107_iTRAQphospho_train_Y.xgboost", @@ -73,18 +73,25 @@ "model_20190107_iTRAQphospho_train_Y.xgboost": "261b2e1810a299ed7ebf193ce1fb81a608c07d3b", }, }, - # ETD': {'id': 6, 'ion_types': ['B', 'Y', 'C', 'Z'], 'peaks_version': 'etd', 'features_version': 'normal'}, "HCDch2": { - "id": 7, "ion_types": ["B", "Y", "B2", "Y2"], - "peaks_version": "ch2", - "features_version": "normal", + "fragmentation": "cidhcd", + "xgboost_model_files": { + "b": "model_20190107_HCD_train_B.xgboost", + "y": "model_20190107_HCD_train_Y.xgboost", + "b2": "model_20190107_HCD_train_B2.xgboost", + "y2": "model_20190107_HCD_train_Y2.xgboost", + }, + "model_hash": { + "model_20190107_HCD_train_B.xgboost": "2503856c382806672e4b85f6b0ccc1f3093acc1b", + "model_20190107_HCD_train_Y.xgboost": "867bbc9940f75845b3f4f845d429b3780c997a02", + "model_20190107_HCD_train_B2.xgboost": "2df86d3576e85bfd25cc149d723f1613baf854d0", + "model_20190107_HCD_train_Y2.xgboost": "0a116ad9f14925fc70e3eceed9484b16ca8edddb", + }, }, "CIDch2": { - "id": 8, "ion_types": ["B", "Y", "B2", "Y2"], - "peaks_version": "ch2", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20190107_CID_train_B.xgboost", "y": "model_20190107_CID_train_Y.xgboost", @@ -99,10 +106,8 @@ }, }, "HCD2021": { - "id": 9, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20210416_HCD2021_B.xgboost", "y": "model_20210416_HCD2021_Y.xgboost", @@ -113,10 +118,8 @@ }, }, "Immuno-HCD": { - "id": 10, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20210316_Immuno_HCD_B.xgboost", "y": "model_20210316_Immuno_HCD_Y.xgboost", @@ -127,10 +130,8 @@ }, }, "CID-TMT": { - "id": 11, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20220104_CID_TMT_B.xgboost", "y": "model_20220104_CID_TMT_Y.xgboost", @@ -141,10 +142,8 @@ }, }, "timsTOF2023": { - "id": 12, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20230912_timsTOF_B.xgboost", "y": "model_20230912_timsTOF_Y.xgboost", @@ -155,10 +154,8 @@ }, }, "timsTOF2024": { - "id": 13, "ion_types": ["B", "Y"], - "peaks_version": "general", - "features_version": "normal", + "fragmentation": "cidhcd", "xgboost_model_files": { "b": "model_20240105_timsTOF_B.xgboost", "y": "model_20240105_timsTOF_Y.xgboost", diff --git a/ms2pip/core.py b/ms2pip/core.py index 5bf93c6b..70445f0e 100644 --- a/ms2pip/core.py +++ b/ms2pip/core.py @@ -1,87 +1,325 @@ #!/usr/bin/env python from __future__ import annotations -import itertools import logging -import multiprocessing -import multiprocessing.dummy -import re -from collections import defaultdict +import os +from collections.abc import Generator from math import ceil from pathlib import Path -from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union import numpy as np import pandas as pd +from ms2rescore_rs import ( + AnnotatedMS2Spectrum, # type: ignore[ty:unresolved-import] + MS2Spectrum, # type: ignore[ty:unresolved-import] + Precursor, # type: ignore[ty:unresolved-import] + annotate_ms2_spectra, # type: ignore[ty:unresolved-import] + ms2pip_compute_features, # type: ignore[ty:unresolved-import] + ms2pip_compute_theoretical_mz, # type: ignore[ty:unresolved-import] +) from psm_utils import PSM, Peptidoform, PSMList from rich.progress import track -from ms2rescore_rs import MS2Spectrum import ms2pip.exceptions as exceptions -from ms2pip._cython_modules import ms2pip_pyx -from ms2pip._utils.encoder import Encoder -from ms2pip._utils.feature_names import get_feature_names from ms2pip._utils.ion_mobility import IonMobility from ms2pip._utils.psm_input import read_psms from ms2pip._utils.retention_time import RetentionTime -from ms2pip._utils.xgb_models import get_predictions_xgb, validate_requested_xgb_model +from ms2pip._utils.xgb_models import load_xgb_models, predict_intensities, validate_model from ms2pip.constants import MODELS from ms2pip.result import ProcessingResult, calculate_correlations from ms2pip.search_space import ProteomeSearchSpace from ms2pip.spectrum import ObservedSpectrum -from ms2pip.spectrum_input import read_spectrum_file -from ms2pip.spectrum_output import SUPPORTED_FORMATS +from ms2pip._spectrum_processing import ( + annotate_spectrum, + load_and_match_spectra, + proforma_to_mass_shift, + targets_from_annotations, +) logger = logging.getLogger(__name__) +NUM_FEATURES = 139 -def predict_single( - peptidoform: Union[Peptidoform, str], - model: Optional[str] = "HCD", - model_dir: Optional[Union[str, Path]] = None, -) -> ProcessingResult: + +def _set_rayon_threads(processes: int | None) -> None: + """Set RAYON_NUM_THREADS if processes is specified and not already set.""" + if processes is None: + return + if "RAYON_NUM_THREADS" in os.environ: + logger.debug( + "RAYON_NUM_THREADS already set to %s; not overriding with processes=%d", + os.environ["RAYON_NUM_THREADS"], + processes, + ) + return + os.environ["RAYON_NUM_THREADS"] = str(processes) + + +def _predict_batch_internal( + psm_list: PSMList, + model: str, + model_dir: str | Path | None = None, + processes: int | None = None, + xgb_models: dict | None = None, +) -> list[ProcessingResult]: """ - Predict fragmentation spectrum for a single peptide.\f + Batch predict features, m/z, and intensities for all PSMs. + + Uses ms2rescore-rs for feature computation and theoretical m/z calculation + (both internally parallelized via Rayon), then XGBoost for intensity prediction. """ - if isinstance(peptidoform, str): - peptidoform = Peptidoform(peptidoform) - psm = PSM(peptidoform=peptidoform, spectrum_id=0) - model_dir = model_dir if model_dir else Path.home() / ".ms2pip" + model_dir = validate_model(model, model_dir) + if not psm_list: + return [] + + ion_types = [it.lower() for it in MODELS[model]["ion_types"]] + frag_model = MODELS[model]["fragmentation"] + + proformas = [proforma_to_mass_shift(psm.peptidoform) for psm in psm_list] + num_ions = [len(psm.peptidoform.parsed_sequence) - 1 for psm in psm_list] + + _set_rayon_threads(processes) + + # Batch compute theoretical m/z (single Rust call, Rayon-parallelized) + logger.debug("Computing theoretical m/z for %d peptides...", len(proformas)) + all_mz = ms2pip_compute_theoretical_mz(proformas, ion_types, frag_model, "monoisotopic") + + # Batch compute features (single Rust call, Rayon-parallelized) + logger.debug("Computing features for %d peptides...", len(proformas)) + all_features = ms2pip_compute_features(proformas) + + # Predict intensities with XGBoost + logger.debug("Predicting intensities with XGBoost...") + predictions = predict_intensities( + np.concatenate([f.reshape(-1, NUM_FEATURES) for f in all_features]), + num_ions, + MODELS[model], + model_dir, + processes=processes, + xgb_models=xgb_models, + ) + + # Assemble results + results = [] + for i, psm in enumerate(psm_list): + results.append( + ProcessingResult( + psm_index=i, + psm=psm, + theoretical_mz={k: np.array(v, dtype=np.float32) for k, v in all_mz[i].items()}, + predicted_intensity=predictions[i], + ) + ) + return results + + +def _correlate_internal( + psm_spectrum_annotations: list[tuple[int, PSM, ObservedSpectrum, list]], + model: str, + model_dir: str | Path | None = None, + vector_file: bool = False, + annotations_only: bool = False, + processes: int | None = None, +) -> list[ProcessingResult]: + """ + Core correlation logic: extract targets, compute features/predictions, assemble results. + + Parameters + ---------- + psm_spectrum_annotations + List of (psm_index, psm, preprocessed_spectrum, peak_annotations) tuples. + Annotations are per-peak lists of ``(series, position, charge)`` tuples. + model + Name of prediction model. + model_dir + Directory for XGBoost model files. + vector_file + If True, return feature vectors instead of predictions (for training). + annotations_only + If True, return only m/z and observed intensities (no predictions). + + """ + if not annotations_only and not vector_file: + model_dir = validate_model(model, model_dir) ion_types = [it.lower() for it in MODELS[model]["ion_types"]] + frag_model = MODELS[model]["fragmentation"] + + _set_rayon_threads(processes) + + if not psm_spectrum_annotations: + return [] + + # Step 1: Extract targets from pre-computed annotations + all_targets = [] + for psm_index, psm, spectrum, peak_annotations in psm_spectrum_annotations: + if not psm.peptidoform.precursor_charge: + psm.peptidoform.precursor_charge = spectrum.precursor_charge # type: ignore[ty:invalid-assignment] - with Encoder.from_peptidoform(peptidoform) as encoder: - ms2pip_pyx.ms2pip_init(*encoder.encoder_files) - result = _process_peptidoform(0, psm, model, encoder, ion_types=ion_types) + seq_len = len(psm.peptidoform.parsed_sequence) + targets = targets_from_annotations( + peak_annotations, spectrum.intensity.astype(np.float32), ion_types, seq_len + ) + all_targets.append(targets) + + proformas = [proforma_to_mass_shift(psm.peptidoform) for _, psm, _, _ in psm_spectrum_annotations] + num_ions = [ + len(psm.peptidoform.parsed_sequence) - 1 for _, psm, _, _ in psm_spectrum_annotations + ] + + # Step 2: Compute features (needed for training and prediction, not annotation-only) + all_features = None + if not annotations_only: + logger.debug("Computing features for %d peptides...", len(proformas)) + all_features = ms2pip_compute_features(proformas) + + # Step 3: Compute theoretical m/z (needed for annotation and prediction, not training) + all_mz = None + if not vector_file: + logger.debug("Computing theoretical m/z for %d peptides...", len(proformas)) + all_mz = ms2pip_compute_theoretical_mz(proformas, ion_types, frag_model, "monoisotopic") - if "xgboost_model_files" in MODELS[model].keys(): - validate_requested_xgb_model( - MODELS[model]["xgboost_model_files"], - MODELS[model]["model_hash"], - model_dir, + # Step 4: Assemble results based on mode + results = [] + + if vector_file: + # Training mode: return feature vectors + observed targets + assert all_features is not None + for i, (psm_index, psm, _spectrum, _ann) in enumerate(psm_spectrum_annotations): + results.append( + ProcessingResult( + psm_index=psm_index, + psm=psm, + theoretical_mz=None, + predicted_intensity=None, + observed_intensity=all_targets[i], + correlation=None, + feature_vectors=all_features[i], + ) ) - predictions = np.array( - get_predictions_xgb( - result.feature_vectors, - [len(peptidoform.parsed_sequence) - 1], - MODELS[model], - model_dir, + + elif annotations_only: + # Annotation mode: return m/z + observed targets + assert all_mz is not None + for i, (psm_index, psm, _spectrum, _ann) in enumerate(psm_spectrum_annotations): + mz = {k: np.array(v, dtype=np.float32) for k, v in all_mz[i].items()} + results.append( + ProcessingResult( + psm_index=psm_index, + psm=psm, + theoretical_mz=mz, + predicted_intensity=None, + observed_intensity=all_targets[i], + correlation=None, + feature_vectors=None, ) ) - result.predicted_intensity = predictions[0] # Only one spectrum in predictions - result.feature_vectors = None - return result + else: + # Prediction mode: compute XGBoost predictions + assert all_features is not None and all_mz is not None + logger.debug("Predicting intensities with XGBoost...") + predictions = predict_intensities( + np.concatenate([f.reshape(-1, NUM_FEATURES) for f in all_features]), + num_ions, + MODELS[model], + model_dir, + processes=processes, + ) + + for i, (psm_index, psm, _spectrum, _ann) in enumerate(psm_spectrum_annotations): + mz = {k: np.array(v, dtype=np.float32) for k, v in all_mz[i].items()} + results.append( + ProcessingResult( + psm_index=psm_index, + psm=psm, + theoretical_mz=mz, + predicted_intensity=predictions[i], + observed_intensity=all_targets[i], + ) + ) + + return results + + +def _into_batches(iterable, batch_size: int) -> Generator[list, None, None]: + """Accumulate iterator elements into batches of a given size.""" + batch = [] + for item in iterable: + batch.append(item) + if len(batch) == batch_size: + yield batch + batch = [] + if batch: + yield batch + + +def _assemble_training_data(results: list[ProcessingResult], model: str) -> pd.DataFrame: + """Assemble training data from results list to single pandas DataFrame.""" + from ms2pip._utils.feature_names import get_feature_names + + ion_types = [it.lower() for it in MODELS[model]["ion_types"]] + + training_data = pd.DataFrame( + np.vstack([r.feature_vectors for r in results if r.feature_vectors is not None]), + columns=get_feature_names(), + ) + training_data["psm_index"] = np.concatenate( + [ + np.repeat(r.psm_index, r.feature_vectors.shape[0]) + for r in results + if r.feature_vectors is not None + ] + ) + for ion_type in ion_types: + if ion_type in ["a", "b", "b2", "c"]: + training_data[f"target_{ion_type}"] = np.concatenate( + [ + r.observed_intensity[ion_type] + for r in results + if r.feature_vectors is not None and r.observed_intensity is not None + ] + ) + elif ion_type in ["x", "y", "y2", "z"]: + training_data[f"target_{ion_type}"] = np.concatenate( + [ + r.observed_intensity[ion_type][::-1] + for r in results + if r.feature_vectors is not None and r.observed_intensity is not None + ] + ) + + training_data = training_data[ + ["psm_index"] + get_feature_names() + [f"target_{it}" for it in ion_types] + ] + + return training_data + + +def predict_single( + peptidoform: Peptidoform | str, + model: str = "HCD", + model_dir: str | Path | None = None, +) -> ProcessingResult: + """ + Predict fragmentation spectrum for a single peptide.\f + """ + if isinstance(peptidoform, str): + peptidoform = Peptidoform(peptidoform) + psm = PSM(peptidoform=peptidoform, spectrum_id=0) + psm_list = PSMList(psm_list=[psm]) + results = _predict_batch_internal(psm_list, model, model_dir) + return results[0] def predict_batch( - psms: Union[PSMList, str, Path], + psms: PSMList | str | Path, add_retention_time: bool = False, add_ion_mobility: bool = False, - psm_filetype: Optional[str] = None, - model: Optional[str] = "HCD", - model_dir: Optional[Union[str, Path]] = None, - processes: Optional[int] = None, -) -> List[ProcessingResult]: + psm_filetype: str | None = None, + model: str = "HCD", + model_dir: str | Path | None = None, + processes: int | None = None, +) -> list[ProcessingResult]: """ Predict fragmentation spectra for a batch of peptides.\f @@ -101,11 +339,12 @@ def predict_batch( model_dir Directory where XGBoost model files are stored. Default: `~/.ms2pip`. processes - Number of parallel processes for multiprocessing steps. By default, all available. + Number of threads for Rayon (Rust) and XGBoost parallelism. By default, + all available. Returns ------- - predictions: List[ProcessingResult] + predictions: list[ProcessingResult] Predicted spectra with theoretical m/z and predicted intensity values. """ @@ -123,29 +362,20 @@ def predict_batch( im_predictor = IonMobility(processes=processes) im_predictor.add_im_predictions(psm_list) - with Encoder.from_psm_list(psm_list) as encoder: - ms2pip_parallelized = _Parallelized( - encoder=encoder, - model=model, - model_dir=model_dir, - processes=processes, - ) - logger.info("Processing peptides...") - results = ms2pip_parallelized.process_peptides(psm_list) - - return results + logger.info("Processing peptides...") + return _predict_batch_internal(psm_list, model, model_dir, processes=processes) def predict_library( - fasta_file: Optional[Union[str, Path]] = None, - config: Optional[Union[ProteomeSearchSpace, dict, str, Path]] = None, + fasta_file: str | Path | None = None, + config: ProteomeSearchSpace | dict | str | Path | None = None, add_retention_time: bool = False, add_ion_mobility: bool = False, - model: Optional[str] = "HCD", - model_dir: Optional[Union[str, Path]] = None, + model: str = "HCD", + model_dir: str | Path | None = None, batch_size: int = 100000, - processes: Optional[int] = None, -) -> Generator[ProcessingResult, None, None]: + processes: int | None = None, +) -> Generator[list[ProcessingResult], None, None]: """ Predict spectral library from protein FASTA file.\f @@ -168,58 +398,66 @@ def predict_library( batch_size Number of peptides to process in each batch. processes - Number of parallel processes for multiprocessing steps. By default, all available. + Number of threads for Rayon (Rust) and XGBoost parallelism. By default, + all available. Yields ------ - predictions: List[ProcessingResult] + predictions: list[ProcessingResult] Predicted spectra with theoretical m/z and predicted intensity values. """ if fasta_file and config: - # Use provided proteome, but overwrite fasta_file - config = ProteomeSearchSpace.from_any(config) - config.fasta_file = fasta_file + search_space = ProteomeSearchSpace.from_any(config) + search_space.fasta_file = Path(fasta_file) elif fasta_file and not config: - # Default proteome search space with provided fasta_file - config = ProteomeSearchSpace(fasta_file=fasta_file) + search_space = ProteomeSearchSpace(fasta_file=fasta_file) elif not fasta_file and config: - # Use provided proteome - config = ProteomeSearchSpace.from_any(config) + search_space = ProteomeSearchSpace.from_any(config) else: raise ValueError("Either `fasta_file` or `config` must be provided.") - search_space = ProteomeSearchSpace.from_any(config) search_space.build() + # Pre-load XGBoost models once for all batches + model_dir = validate_model(model, model_dir) + xgb_models = load_xgb_models(MODELS[model], model_dir, processes) + for batch in track( _into_batches(search_space, batch_size=batch_size), description="Predicting spectra...", total=ceil(len(search_space) / batch_size), ): - yield predict_batch( - search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))), - add_retention_time=add_retention_time, - add_ion_mobility=add_ion_mobility, - model=model, - model_dir=model_dir, + psm_list = search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))) + + if add_retention_time: + RetentionTime(processes=processes).add_rt_predictions(psm_list) + if add_ion_mobility: + IonMobility(processes=processes).add_im_predictions(psm_list) + + yield _predict_batch_internal( + psm_list, + model, + model_dir, processes=processes, + xgb_models=xgb_models, ) def correlate( - psms: Union[PSMList, str, Path], - spectrum_file: Union[str, Path], - psm_filetype: Optional[str] = None, - spectrum_id_pattern: Optional[str] = None, + psms: PSMList | str | Path, + spectrum_file: str | Path, + psm_filetype: str | None = None, + spectrum_id_pattern: str | None = None, compute_correlations: bool = False, add_retention_time: bool = False, add_ion_mobility: bool = False, - model: Optional[str] = "HCD", - model_dir: Optional[Union[str, Path]] = None, + model: str = "HCD", + model_dir: str | Path | None = None, ms2_tolerance: float = 0.02, - processes: Optional[int] = None, -) -> List[ProcessingResult]: + ms2_tolerance_mode: str = "Da", + processes: int | None = None, +) -> list[ProcessingResult]: """ Compare predicted and observed intensities and optionally compute correlations.\f @@ -246,13 +484,16 @@ def correlate( model_dir Directory where XGBoost model files are stored. Default: `~/.ms2pip`. ms2_tolerance - MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da. + MS2 tolerance for observed spectrum peak annotation. By default, 0.02. + ms2_tolerance_mode + Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``. processes - Number of parallel processes for multiprocessing steps. By default, all available. + Number of threads for Rayon (Rust) and XGBoost parallelism. By default, + all available. Returns ------- - results: List[ProcessingResult] + results: list[ProcessingResult] Predicted spectra with theoretical m/z and predicted intensity values, and optionally, correlations. @@ -270,18 +511,22 @@ def correlate( im_predictor = IonMobility(processes=processes) im_predictor.add_im_predictions(psm_list) - with Encoder.from_psm_list(psm_list) as encoder: - ms2pip_parallelized = _Parallelized( - encoder=encoder, - model=model, - model_dir=model_dir, - ms2_tolerance=ms2_tolerance, - processes=processes, + # Validate runs and collections + if len(psm_list.collections) != 1 or len(psm_list.runs) != 1: + raise exceptions.InvalidInputError("PSMs should be for a single run and collection.") + + logger.info("Processing spectra and peptides...") + matched = load_and_match_spectra( + psm_list, spectrum_file, spectrum_id_pattern, model, ms2_tolerance, ms2_tolerance_mode + ) + + if not matched: + raise exceptions.NoMatchingSpectraFound( + "No spectra matching spectrum IDs from PSM list could be found in provided file." ) - logger.info("Processing spectra and peptides...") - results = ms2pip_parallelized.process_spectra(psm_list, spectrum_file, spectrum_id_pattern) - # Correlations also requested + results = _correlate_internal(matched, model, model_dir, processes=processes) + if compute_correlations: logger.info("Computing correlations") calculate_correlations(results) @@ -291,25 +536,28 @@ def correlate( def correlate_preloaded( - psms: Union[PSMList, List[PSM]], + psms: PSMList | list[PSM], compute_correlations: bool = False, - model: Optional[str] = "HCD", - model_dir: Optional[Union[str, Path]] = None, + model: str = "HCD", + model_dir: str | Path | None = None, ms2_tolerance: float = 0.02, - processes: Optional[int] = None, -) -> List[ProcessingResult]: + ms2_tolerance_mode: str = "Da", + processes: int | None = None, +) -> list[ProcessingResult]: """ Compare predicted and observed intensities for PSMs with preloaded spectra.\f - Processes PSMs that already have :py:class:`ms2rescore_rs.MS2Spectrum` - objects in their ``spectrum`` attribute. It extracts the spectra, performs predictions, - and optionally computes correlations. + Processes PSMs that already have :py:class:`ms2rescore_rs.MS2Spectrum` or + :py:class:`ms2rescore_rs.AnnotatedMS2Spectrum` objects in their ``spectrum`` + attribute. Parameters ---------- psms PSMList or list of PSM objects. Each PSM must have an - :py:class:`ms2rescore_rs.MS2Spectrum` object in its ``spectrum`` attribute. + :py:class:`ms2rescore_rs.MS2Spectrum` or + :py:class:`ms2rescore_rs.AnnotatedMS2Spectrum` object in its ``spectrum`` + attribute. compute_correlations Compute correlations between predictions and targets. Default: False. model @@ -317,21 +565,25 @@ def correlate_preloaded( model_dir Directory where XGBoost model files are stored. Default: `~/.ms2pip`. ms2_tolerance - MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da. + MS2 tolerance for observed spectrum peak annotation. By default, 0.02. + Only used when spectra are not already annotated. + ms2_tolerance_mode + Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``. + Only used when spectra are not already annotated. processes - Number of parallel processes for multiprocessing steps. By default, all available. + Number of threads for Rayon (Rust) and XGBoost parallelism. By default, + all available. Returns ------- - results: List[ProcessingResult] + results: list[ProcessingResult] ProcessingResult objects with theoretical m/z, predicted intensity, and observed intensity values, and optionally, correlations. Raises ------ ValueError - If PSMs do not contain :py:class:`ms2rescore_rs.MS2Spectrum` objects in the - ``spectrum`` attribute. + If PSMs do not contain spectrum objects in the ``spectrum`` attribute. """ if isinstance(psms, list): @@ -339,16 +591,26 @@ def correlate_preloaded( else: psm_list = psms - if not all(psm_list["spectrum"]) or not (isinstance(psm_list["spectrum"][0], MS2Spectrum)): - raise ValueError("PSMs must contain MS2Spectrum objects in the 'spectrum' attribute.") + first_spectrum = psm_list["spectrum"][0] + if not all(psm_list["spectrum"]) or not isinstance( + first_spectrum, (MS2Spectrum, AnnotatedMS2Spectrum) + ): + raise ValueError( + "PSMs must contain MS2Spectrum or AnnotatedMS2Spectrum objects " + "in the 'spectrum' attribute." + ) + + spectra_are_annotated = isinstance(first_spectrum, AnnotatedMS2Spectrum) - # Convert MS2Spectrum -> ObservedSpectrum and preprocess - preloaded_spectra: Dict[str, ObservedSpectrum] = {} + # Convert to ObservedSpectrum and preprocess; store annotations if present + preloaded_spectra: dict[str, ObservedSpectrum] = {} + preloaded_annotations: dict[str, list] | None = {} if spectra_are_annotated else None for psm in psm_list: spec_id = str(psm.spectrum_id) if spec_id in preloaded_spectra: continue spectrum = psm.spectrum + assert spectrum is not None obs = ObservedSpectrum( mz=np.array(spectrum.mz, dtype=np.float32), intensity=np.array(spectrum.intensity, dtype=np.float32), @@ -363,23 +625,86 @@ def correlate_preloaded( obs.tic_norm() obs.log2_transform() preloaded_spectra[spec_id] = obs + if spectra_are_annotated: + assert isinstance(spectrum, AnnotatedMS2Spectrum) + assert preloaded_annotations is not None + preloaded_annotations[spec_id] = [ + [(a.series, a.position, a.charge) for a in peak_anns] + for peak_anns in spectrum.peak_annotations + ] + + # Build PSM-spectrum-annotation tuples + # For unannotated spectra, batch annotate using ms2rescore-rs + psm_spectrum_annotations = [] + needs_annotation = [] # indices into psm_spectrum_annotations that need annotation + + for i, psm in enumerate(psm_list): + spec_id = str(psm.spectrum_id) + spectrum = preloaded_spectra.get(spec_id) + if spectrum is None: + continue + if preloaded_annotations is not None and spec_id in preloaded_annotations: + psm_spectrum_annotations.append((i, psm, spectrum, preloaded_annotations[spec_id])) + else: + psm_spectrum_annotations.append((i, psm, spectrum, None)) + needs_annotation.append(len(psm_spectrum_annotations) - 1) - # Delegate to _Parallelized - with Encoder.from_psm_list(psm_list) as encoder: - ms2pip_parallelized = _Parallelized( - encoder=encoder, - model=model, - model_dir=model_dir, - ms2_tolerance=ms2_tolerance, - processes=processes, + if not psm_spectrum_annotations: + raise exceptions.NoMatchingSpectraFound( + "No spectra matching spectrum IDs from PSM list could be found." + ) + + # Batch annotate any unannotated spectra + if needs_annotation: + frag_model = MODELS[model]["fragmentation"] + batch_spectra = [] + batch_proformas = [] + batch_seq_lens = [] + for idx in needs_annotation: + _, psm, spectrum, _ = psm_spectrum_annotations[idx] + batch_spectra.append( + MS2Spectrum( + identifier=spectrum.identifier or "", + mz=list(spectrum.mz), + intensity=list(spectrum.intensity), + precursor=Precursor( + mz=float(spectrum.precursor_mz) if spectrum.precursor_mz else 0.0, + charge=int(spectrum.precursor_charge) if spectrum.precursor_charge else 0, + rt=float(spectrum.retention_time) if spectrum.retention_time else 0.0, + ), + ) + ) + batch_proformas.append(proforma_to_mass_shift(psm.peptidoform)) + batch_seq_lens.append(len(psm.peptidoform.parsed_sequence)) + + annotated = annotate_ms2_spectra( + spectra=batch_spectra, + proformas=batch_proformas, + seq_lens=batch_seq_lens, + fragmentation_model=frag_model, + mass_mode="monoisotopic", + tolerance_value=float(ms2_tolerance), + tolerance_mode=ms2_tolerance_mode.lower(), ) - logger.info("Processing spectra and peptides...") - results = ms2pip_parallelized.process_preloaded_spectra(psm_list, preloaded_spectra) + + for j, idx in enumerate(needs_annotation): + psm_index, psm, spectrum, _ = psm_spectrum_annotations[idx] + peak_annotations = [ + [(a.series, a.position, a.charge) for a in peak_anns] + for peak_anns in annotated[j].peak_annotations + ] + psm_spectrum_annotations[idx] = (psm_index, psm, spectrum, peak_annotations) + + logger.info("Processing spectra and peptides...") + results = _correlate_internal(psm_spectrum_annotations, model, model_dir, processes=processes) if compute_correlations: logger.info("Computing correlations") calculate_correlations(results) - logger.info(f"Median correlation: {np.median([r.correlation for r in results if r.correlation is not None])}") + logger.info( + f"Median correlation: " + f"{np.median([r.correlation for r in results if r.correlation is not None])}" + ) return results @@ -387,6 +712,7 @@ def correlate_preloaded( def correlate_single( observed_spectrum: ObservedSpectrum, ms2_tolerance: float = 0.02, + ms2_tolerance_mode: str = "Da", model: str = "HCD", ) -> ProcessingResult: """ @@ -397,7 +723,9 @@ def correlate_single( observed_spectrum ObservedSpectrum instance with observed m/z and intensity values and peptidoform. ms2_tolerance - MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da. + MS2 tolerance for observed spectrum peak annotation. By default, 0.02. + ms2_tolerance_mode + Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``. model Model to use for prediction. Default: "HCD". @@ -407,47 +735,41 @@ def correlate_single( Result with theoretical m/z, predicted intensity, observed intensity, and correlation. """ - # Check peptidoform in observed spectrum if not isinstance(observed_spectrum.peptidoform, Peptidoform): raise ValueError("Peptidoform must be set in observed spectrum to correlate.") - # Annotate spectrum and get target intensities - with Encoder.from_peptidoform(observed_spectrum.peptidoform) as encoder: - ms2pip_pyx.ms2pip_init(*encoder.encoder_files) - enc_peptidoform = encoder.encode_peptidoform(observed_spectrum.peptidoform) - targets = ms2pip_pyx.get_targets( - enc_peptidoform, - observed_spectrum.mz.astype(np.float32), - observed_spectrum.intensity.astype(np.float32), - float(ms2_tolerance), - MODELS[model]["peaks_version"], - ) + # Preprocess a copy of the spectrum (TIC normalization + log2 transform) + preprocessed = observed_spectrum.model_copy(deep=True) + for label_type in ["iTRAQ", "TMT"]: + if label_type in model: + preprocessed.remove_reporter_ions(label_type) + preprocessed.tic_norm() + preprocessed.log2_transform() - # Reshape to dict with intensities per ion type + psm = PSM(peptidoform=observed_spectrum.peptidoform, spectrum_id=0) + annotated = annotate_spectrum(preprocessed, psm, model, ms2_tolerance, ms2_tolerance_mode) ion_types = [it.lower() for it in MODELS[model]["ion_types"]] - observed_intensity = { - i: np.array(p, dtype=np.float32).clip(min=np.log2(0.001)) # Clip negative intensities - for i, p in zip(ion_types, targets) - } + seq_len = len(observed_spectrum.peptidoform.parsed_sequence) + observed_intensity = targets_from_annotations( + annotated, preprocessed.intensity.astype(np.float32), ion_types, seq_len + ) - # Predict spectrum and add target intensities result = predict_single(observed_spectrum.peptidoform, model=model) result.observed_intensity = observed_intensity - # Add correlation calculate_correlations([result]) - return result def get_training_data( - psms: Union[PSMList, str, Path], - spectrum_file: Union[str, Path], - psm_filetype: Optional[str] = None, - spectrum_id_pattern: Optional[str] = None, - model: Optional[str] = "HCD", + psms: PSMList | str | Path, + spectrum_file: str | Path, + psm_filetype: str | None = None, + spectrum_id_pattern: str | None = None, + model: str = "HCD", ms2_tolerance: float = 0.02, - processes: Optional[int] = None, + ms2_tolerance_mode: str = "Da", + processes: int | None = None, ): """ Extract feature vectors and target intensities from observed spectra for training.\f @@ -468,9 +790,12 @@ def get_training_data( Model to use as reference for the ion types that are extracted from the observed spectra. Default: "HCD", which results in the extraction of singly charged b- and y-ions. ms2_tolerance - MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da. + MS2 tolerance for observed spectrum peak annotation. By default, 0.02. + ms2_tolerance_mode + Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``. processes - Number of parallel processes for multiprocessing steps. By default, all available. + Number of threads for Rayon (Rust) and XGBoost parallelism. By default, + all available. Returns ------- @@ -481,32 +806,40 @@ def get_training_data( psm_list = read_psms(psms, filetype=psm_filetype) spectrum_id_pattern = spectrum_id_pattern if spectrum_id_pattern else "(.*)" - with Encoder.from_psm_list(psm_list) as encoder: - ms2pip_parallelized = _Parallelized( - encoder=encoder, - model=model, - ms2_tolerance=ms2_tolerance, - processes=processes, - ) - logger.info("Processing spectra and peptides...") - results = ms2pip_parallelized.process_spectra( - psm_list, spectrum_file, spectrum_id_pattern, vector_file=True + if len(psm_list.collections) != 1 or len(psm_list.runs) != 1: + raise exceptions.InvalidInputError("PSMs should be for a single run and collection.") + + logger.info("Processing spectra and peptides...") + matched = load_and_match_spectra( + psm_list, spectrum_file, spectrum_id_pattern, model, ms2_tolerance, ms2_tolerance_mode + ) + + if not matched: + raise exceptions.NoMatchingSpectraFound( + "No spectra matching spectrum IDs from PSM list could be found in provided file." ) - logger.info("Assembling training data in DataFrame...") - training_data = _assemble_training_data(results, model) + results = _correlate_internal( + matched, + model, + model_dir=None, + vector_file=True, + processes=processes, + ) - return training_data + logger.info("Assembling training data in DataFrame...") + return _assemble_training_data(results, model) def annotate_spectra( - psms: Union[PSMList, str, Path], - spectrum_file: Union[str, Path], - psm_filetype: Optional[str] = None, - spectrum_id_pattern: Optional[str] = None, - model: Optional[str] = "HCD", + psms: PSMList | str | Path, + spectrum_file: str | Path, + psm_filetype: str | None = None, + spectrum_id_pattern: str | None = None, + model: str = "HCD", ms2_tolerance: float = 0.02, - processes: Optional[int] = None, + ms2_tolerance_mode: str = "Da", + processes: int | None = None, ): """ Annotate observed spectra.\f @@ -527,37 +860,45 @@ def annotate_spectra( Model to use as reference for the ion types that are extracted from the observed spectra. Default: "HCD", which results in the extraction of singly charged b- and y-ions. ms2_tolerance - MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da. + MS2 tolerance for observed spectrum peak annotation. By default, 0.02. + ms2_tolerance_mode + Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``. processes - Number of parallel processes for multiprocessing steps. By default, all available. + Number of threads for Rayon (Rust) and XGBoost parallelism. By default, + all available. Returns ------- - results: List[ProcessingResult] + results: list[ProcessingResult] List of ProcessingResult objects with theoretical m/z and observed intensity values. """ psm_list = read_psms(psms, filetype=psm_filetype) spectrum_id_pattern = spectrum_id_pattern if spectrum_id_pattern else "(.*)" - with Encoder.from_psm_list(psm_list) as encoder: - ms2pip_parallelized = _Parallelized( - encoder=encoder, - model=model, - ms2_tolerance=ms2_tolerance, - processes=processes, - ) - logger.info("Processing spectra and peptides...") - results = ms2pip_parallelized.process_spectra( - psm_list, spectrum_file, spectrum_id_pattern, vector_file=False, annotations_only=True + if len(psm_list.collections) != 1 or len(psm_list.runs) != 1: + raise exceptions.InvalidInputError("PSMs should be for a single run and collection.") + + logger.info("Processing spectra and peptides...") + matched = load_and_match_spectra( + psm_list, spectrum_file, spectrum_id_pattern, model, ms2_tolerance, ms2_tolerance_mode + ) + + if not matched: + raise exceptions.NoMatchingSpectraFound( + "No spectra matching spectrum IDs from PSM list could be found in provided file." ) - return results + return _correlate_internal( + matched, + model, + model_dir=None, + annotations_only=True, + processes=processes, + ) -def download_models( - models: Optional[List[str]] = None, model_dir: Optional[Union[str, Path]] = None -): +def download_models(models: list[str] | None = None, model_dir: str | Path | None = None): """ Download all specified models to the specified directory. @@ -577,619 +918,9 @@ def download_models( models = list(MODELS.keys()) for model in models: - try: - if "xgb_model_files" in MODELS[model].keys(): - continue - except KeyError: + if model not in MODELS: raise exceptions.UnknownModelError(model) + if "xgboost_model_files" not in MODELS[model]: + continue logger.debug("Downloading %s model files", model) - validate_requested_xgb_model( - MODELS[model]["xgboost_model_files"], - MODELS[model]["model_hash"], - model_dir, - ) - - -class _Parallelized: - """Implementations of common multiprocessing functionality across MS²PIP usage modes.""" - - def __init__( - self, - encoder: Encoder = None, - model: Optional[str] = None, - model_dir: Optional[Union[str, Path]] = None, - ms2_tolerance: float = 0.02, - processes: Optional[int] = None, - ): - """ - Implementations of common multiprocessing functionality across MS²PIP usage modes. - - Parameters - ---------- - encoding - Configured encoding class instance. Required if input peptides contain modifications. - model - Name of the model to use for predictions. Overrides configuration file. - model_dir - Custom directory for downloaded XGBoost model files. By default, `~/.ms2pip` is used. - ms2_tolerance - MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da. - processes - Number of parallel processes for multiprocessing steps. By default, all available. - - """ - # Input parameters - self.encoder = encoder - self.model = model - self.model_dir = model_dir if model_dir else Path.home() / ".ms2pip" - self.ms2_tolerance = ms2_tolerance - self.processes = processes if processes else multiprocessing.cpu_count() - - # Setup encoder if not configured - if not self.encoder: - self.encoder = Encoder() - self.encoder.write_encoder_files() - - # Validate requested model - if self.model in MODELS.keys(): - logger.debug("Using %s model", self.model) - if "xgboost_model_files" in MODELS[self.model].keys(): - validate_requested_xgb_model( - MODELS[self.model]["xgboost_model_files"], - MODELS[self.model]["model_hash"], - self.model_dir, - ) - else: - raise exceptions.UnknownModelError(self.model) - - def _get_pool(self): - """Get multiprocessing pool.""" - logger.debug(f"Starting workers (processes={self.processes})...") - if multiprocessing.current_process().daemon: - logger.warning( - "MS²PIP is running in a daemon process. Disabling multiprocessing as daemonic " - "processes cannot have children." - ) - return multiprocessing.dummy.Pool(1) - elif self.processes == 1: - logger.debug("Using dummy multiprocessing pool.") - return multiprocessing.dummy.Pool(1) - else: - return multiprocessing.get_context("spawn").Pool(self.processes) - - def _validate_output_formats(self, output_formats: List[str]) -> List[str]: - """Validate requested output formats.""" - if not output_formats: - self.output_formats = ["csv"] - else: - for output_format in output_formats: - if output_format not in SUPPORTED_FORMATS: - raise exceptions.UnknownOutputFormatError(output_format) - self.output_formats = output_formats - - def _execute_in_pool(self, psm_list: PSMList, func: Callable, args: tuple): - """Execute function in multiprocessing pool.""" - - def get_chunk_size(n_items, n_processes): - """Get optimal chunk size for multiprocessing.""" - if n_items < 5000: - return n_items - else: - max_chunk_size = 50000 - n_chunks = ceil(ceil(n_items / n_processes) / max_chunk_size) * n_processes - return ceil(n_items / n_chunks) - - def to_chunks(_list, chunk_size): - """Split _list into chunks of size chunk_size.""" - - def _generate_chunks(): - for i in range(0, len(_list), chunk_size): - yield _list[i : i + chunk_size] - - _list = list(_list) - return list(_generate_chunks()) - - def _enumerated_psm_list_by_spectrum_id(psm_list, spectrum_ids_chunk): - selected_indices = np.flatnonzero(np.isin(psm_list["spectrum_id"], spectrum_ids_chunk)) - return [(i, psm_list.psm_list[i]) for i in selected_indices] - - with self._get_pool() as pool: - if not psm_list: - logger.warning("No PSMs to process.") - return [] - - # Split PSMList into chunks - if func == _process_spectra: - # Split by spectrum_id to keep PSMs for same spectrum together - spectrum_ids = set(psm_list["spectrum_id"]) - chunk_size = get_chunk_size(len(spectrum_ids), pool._processes) - chunks = [ - _enumerated_psm_list_by_spectrum_id(psm_list, spectrum_ids_chunk) - for spectrum_ids_chunk in to_chunks(spectrum_ids, chunk_size) - ] - else: - # Simple split by PSM - chunk_size = get_chunk_size(len(psm_list), pool._processes) - chunks = to_chunks(list(enumerate(psm_list)), chunk_size) - - logger.debug(f"Processing {len(chunks)} chunk(s) of ~{chunk_size} entries each.") - - # Add jobs to pool - mp_results = [] - for psm_list_chunk in chunks: - mp_results.append(pool.apply_async(func, args=(psm_list_chunk, *args))) - - # Gather results - # results = [ - # r.get() - # for r in track( - # mp_results, - # disable=len(chunks) == 1, - # description="Processing chunks...", - # transient=True, - # show_speed=False, - # ) - # ] - results = [r.get() for r in mp_results] - - # Sort results by input order - results = list( - sorted( - itertools.chain.from_iterable(results), - key=lambda result: result.psm_index, - ) - ) - - return results - - def process_peptides(self, psm_list: PSMList) -> List[ProcessingResult]: - """Process peptides in parallel.""" - results = self._execute_in_pool( - psm_list, - _process_peptides, - (self.encoder, self.model), - ) - logger.debug(f"Gathered data for {len(results)} peptides.") - - # Add XGBoost predictions if required - if "xgboost_model_files" in MODELS[self.model].keys(): - results = self._add_xgboost_predictions(results) - - return results - - def process_spectra( - self, - psm_list: PSMList, - spectrum_file: Union[str, Path], - spectrum_id_pattern: str, - vector_file: bool = False, - annotations_only: bool = False, - ) -> List[ProcessingResult]: - """ - Process PSMs and observed spectra in parallel - - Parameters - ---------- - psm_list - psm_utils.PSMList instance with PSMs to process - spectrum_file - Filename of spectrum file - spectrum_id_pattern - Regular expression pattern to apply to spectrum titles before matching to - peptide file entries - vector_file - If feature vectors should be extracted instead of predictions - annotations_only - If only peak annotations should be extracted from the spectrum file - - """ - # Validate runs and collections - if not len(psm_list.collections) == 1 or not len(psm_list.runs) == 1: - raise exceptions.InvalidInputError("PSMs should be for a single run and collection.") - - args = ( - spectrum_file, - vector_file, - self.encoder, - self.model, - self.ms2_tolerance, - spectrum_id_pattern, - annotations_only, - None, # preloaded_spectra - ) - results = self._execute_in_pool(psm_list, _process_spectra, args) - - # Validate number of results - if not results: - raise exceptions.NoMatchingSpectraFound( - "No spectra matching spectrum IDs from PSM list could be found in provided file." - ) - logger.debug(f"Gathered data for {len(results)} PSMs.") - - # Add XGBoost predictions if required - if ( - not (vector_file or annotations_only) - and "xgboost_model_files" in MODELS[self.model].keys() - ): - results = self._add_xgboost_predictions(results) - - return results - - def process_preloaded_spectra( - self, - psm_list: PSMList, - preloaded_spectra: Dict[str, ObservedSpectrum], - ) -> List[ProcessingResult]: - """ - Process PSMs with pre-loaded and preprocessed ObservedSpectrum objects. - - Parameters - ---------- - psm_list - psm_utils.PSMList instance with PSMs to process. - preloaded_spectra - Dictionary mapping spectrum IDs to preprocessed ObservedSpectrum objects. - - """ - args = ( - None, # spec_file - False, # vector_file - self.encoder, - self.model, - self.ms2_tolerance, - None, # spectrum_id_pattern - False, # annotations_only - preloaded_spectra, - ) - results = self._execute_in_pool(psm_list, _process_spectra, args) - - if not results: - raise exceptions.NoMatchingSpectraFound( - "No spectra matching spectrum IDs from PSM list could be found." - ) - logger.debug(f"Gathered data for {len(results)} PSMs.") - - if "xgboost_model_files" in MODELS[self.model].keys(): - results = self._add_xgboost_predictions(results) - - return results - - def _add_xgboost_predictions(self, results: List[ProcessingResult]) -> List[ProcessingResult]: - """ - Add XGBoost predictions to results. - - Notes - ----- - This functions is applied after the parallel processing, as XGBoost implements its own - multiprocessing. - """ - - if "xgboost_model_files" not in MODELS[self.model].keys(): - raise ValueError("XGBoost model files not found in MODELS dictionary.") - - logger.debug("Converting feature vectors to XGBoost DMatrix...") - import xgboost as xgb - - results_to_predict = [r for r in results if r.feature_vectors is not None] - - if not results_to_predict: - return results - - num_ions = [len(r.psm.peptidoform.parsed_sequence) - 1 for r in results_to_predict] - xgb_vector = xgb.DMatrix(np.vstack(list(r.feature_vectors for r in results_to_predict))) - - predictions = get_predictions_xgb( - xgb_vector, - num_ions, - MODELS[self.model], - self.model_dir, - processes=self.processes, - ) - - logger.debug("Adding XGBoost predictions to results...") - for result, preds in zip(results_to_predict, predictions): - result.predicted_intensity = preds - result.feature_vectors = None - - return results - - -def _process_peptidoform( - psm_index: int, - psm: PSM, - model: str, - encoder: Encoder, - ion_types: Optional[List[str]] = None, -) -> ProcessingResult: - """ - Process a single peptidoform from a PSM. - - Get theoretical m/z and predicted intensities (from C model) or feature vectors (for XGBoost - model) for a single peptidoform from a PSM. - - Notes - ----- - - ``ms2pip_pyx.init()`` must be called before this function is called. - - Optionally, lowercase version of ``ion_types`` from the model configuration can be provided - to save computational time. - - """ - peptidoform = psm.peptidoform - if not ion_types: - ion_types = [it.lower() for it in MODELS[model]["ion_types"]] - - enc_peptide = encoder.encode_peptide(peptidoform) - enc_peptidoform = encoder.encode_peptidoform(peptidoform) - - # Get ion mzs and map to ion types - mz = ms2pip_pyx.get_mzs(enc_peptidoform, MODELS[model]["peaks_version"]) - mz = {i: np.array(mz, dtype=np.float32) for i, mz in zip(ion_types, mz)} - - # Get predictions from XGBoost models. - if "xgboost_model_files" in MODELS[model].keys(): - predictions = None - feature_vectors = np.array( - ms2pip_pyx.get_vector(enc_peptide, enc_peptidoform, peptidoform.precursor_charge), - dtype=np.uint16, - ) - # Or get predictions from C models. - else: - predictions = ms2pip_pyx.get_predictions( - enc_peptide, - enc_peptidoform, - peptidoform.precursor_charge, - MODELS[model]["id"], - MODELS[model]["peaks_version"], - 30.0, # TODO: Remove CE feature - ) - predictions = { - i: np.array(p, dtype=np.float32).clip(min=np.log2(0.001)) # Clip negative intensities - for i, p in zip(ion_types, predictions) - } - feature_vectors = None - - return ProcessingResult( - psm_index=psm_index, - psm=psm, - theoretical_mz=mz, - predicted_intensity=predictions, - observed_intensity=None, - feature_vectors=feature_vectors, - ) - - -def _process_peptides( - enumerated_psm_list: List[Tuple[int, PSM]], - encoder: Encoder, - model: str, -) -> List[ProcessingResult]: - """ - Predict spectrum for each entry in PeptideRecord DataFrame. - - Parameters - ---------- - enumerated_psm_list - List of tuples of (index, PSM) for each PSM in the input file. - encoder - Configured encoder to use for peptide and peptidoform encoding - model - Name of prediction model to be used - - """ - ms2pip_pyx.ms2pip_init(*encoder.encoder_files) - results = [] - ion_types = [it.lower() for it in MODELS[model]["ion_types"]] - - for psm_index, psm in enumerated_psm_list: - try: - result = _process_peptidoform(psm_index, psm, model, encoder, ion_types) - except ( - exceptions.InvalidPeptidoformError, - exceptions.InvalidAminoAcidError, - ): - result = ProcessingResult(psm_index=psm_index, psm=psm) - results.append(result) - - return results - - -def _process_spectra( - enumerated_psm_list: List[Tuple[int, PSM]], - spec_file: Optional[str], - vector_file: bool, - encoder: Encoder, - model: str, - ms2_tolerance: float, - spectrum_id_pattern: Optional[str], - annotations_only: bool = False, - preloaded_spectra: Optional[Dict[str, ObservedSpectrum]] = None, -) -> List[ProcessingResult]: - """ - Perform requested tasks for each spectrum in spectrum file or from preloaded spectra. - - Parameters - ---------- - enumerated_psm_list - List of tuples of (index, PSM) for each PSM in the input file. - spec_file - Filename of spectrum file. Not used when ``preloaded_spectra`` is provided. - vector_file - If feature vectors should be extracted instead of predictions. - encoder: Encoder - Configured encoder to use for peptide and peptidoform encoding. - model - Name of prediction model to be used. - ms2_tolerance - Fragmentation spectrum m/z error tolerance in Dalton. - spectrum_id_pattern - Regular expression pattern to apply to spectrum titles before matching to - peptide file entries. Not used when ``preloaded_spectra`` is provided. - annotations_only - If only peak annotations should be extracted from the spectrum file. - preloaded_spectra - Optional dictionary mapping spectrum IDs to preprocessed ObservedSpectrum objects. - When provided, spectra are looked up directly instead of reading from ``spec_file``. - - """ - ms2pip_pyx.ms2pip_init(*encoder.encoder_files) - results = [] - ion_types = [it.lower() for it in MODELS[model]["ion_types"]] - - def _process_psm_with_spectrum(psm_index, psm, spectrum): - """Process a single PSM against an observed spectrum.""" - try: - enc_peptidoform = encoder.encode_peptidoform(psm.peptidoform) - except exceptions.InvalidAminoAcidError: - return ProcessingResult(psm_index=psm_index, psm=psm) - - targets = ms2pip_pyx.get_targets( - enc_peptidoform, - spectrum.mz.astype(np.float32), - spectrum.intensity.astype(np.float32), - float(ms2_tolerance), - MODELS[model]["peaks_version"], - ) - targets = {i: np.array(t, dtype=np.float32) for i, t in zip(ion_types, targets)} - - if not psm.peptidoform.precursor_charge: - psm.peptidoform.precursor_charge = spectrum.precursor_charge - - if vector_file: - enc_peptide = encoder.encode_peptide(psm.peptidoform) - feature_vectors = np.array( - ms2pip_pyx.get_vector( - enc_peptide, enc_peptidoform, psm.peptidoform.precursor_charge - ), - dtype=np.uint16, - ) - return ProcessingResult( - psm_index=psm_index, - psm=psm, - theoretical_mz=None, - predicted_intensity=None, - observed_intensity=targets, - correlation=None, - feature_vectors=feature_vectors, - ) - - elif annotations_only: - mz = ms2pip_pyx.get_mzs(enc_peptidoform, MODELS[model]["peaks_version"]) - mz = {i: np.array(mz, dtype=np.float32) for i, mz in zip(ion_types, mz)} - return ProcessingResult( - psm_index=psm_index, - psm=psm, - theoretical_mz=mz, - predicted_intensity=None, - observed_intensity=targets, - correlation=None, - feature_vectors=None, - ) - - else: - try: - result = _process_peptidoform(psm_index, psm, model, encoder, ion_types) - except ( - exceptions.InvalidPeptidoformError, - exceptions.InvalidAminoAcidError, - ): - return ProcessingResult(psm_index=psm_index, psm=psm) - else: - result.observed_intensity = targets - return result - - if preloaded_spectra is not None: - # Spectra are already preprocessed; look up by spectrum ID - for psm_index, psm in enumerated_psm_list: - spectrum = preloaded_spectra.get(str(psm.spectrum_id)) - if spectrum is None: - continue - results.append(_process_psm_with_spectrum(psm_index, psm, spectrum)) - else: - # Read spectra from file and preprocess - assert spec_file is not None, "spec_file is required when preloaded_spectra is not provided" - try: - spectrum_id_regex = re.compile(spectrum_id_pattern) - except TypeError: - spectrum_id_regex = re.compile(r"(.*)") - - psms_by_specid = defaultdict(list) - for psm_index, psm in enumerated_psm_list: - psms_by_specid[str(psm.spectrum_id)].append((psm_index, psm)) - - for spectrum in read_spectrum_file(spec_file): - match = spectrum_id_regex.search(spectrum.identifier) - try: - spectrum_id = match[1] - except (TypeError, IndexError): - raise exceptions.TitlePatternError( - f"Spectrum title pattern `{spectrum_id_pattern}` could not be matched to " - f"spectrum ID `{spectrum.identifier}`. " - " Are you sure that the regex contains a capturing group?" - ) - - if spectrum_id not in psms_by_specid: - continue - - # Spectrum preprocessing - for label_type in ["iTRAQ", "TMT"]: - if label_type in model: - spectrum.remove_reporter_ions(label_type) - # spectrum.remove_precursor() # TODO: Decide to implement this or not - spectrum.tic_norm() - spectrum.log2_transform() - - for psm_index, psm in psms_by_specid[spectrum_id]: - results.append(_process_psm_with_spectrum(psm_index, psm, spectrum)) - - return results - - -def _assemble_training_data(results: List[ProcessingResult], model: str) -> pd.DataFrame: - """Assemble training data from results list to single pandas DataFrame.""" - # Get ion types - ion_types = [it.lower() for it in MODELS[model]["ion_types"]] - - # Assemble feature vectors, PSM indices, and targets - training_data = pd.DataFrame( - np.vstack([r.feature_vectors for r in results if r.feature_vectors is not None]), - columns=get_feature_names(), - ) - training_data["psm_index"] = np.concatenate( - [ - np.repeat(r.psm_index, r.feature_vectors.shape[0]) - for r in results - if r.feature_vectors is not None - ] - ) - for ion_type in ion_types: - if ion_type in ["a", "b", "b2", "c"]: - training_data[f"target_{ion_type}"] = np.concatenate( - [r.observed_intensity[ion_type] for r in results if r.feature_vectors is not None] - ) - elif ion_type in ["x", "y", "y2", "z"]: - training_data[f"target_{ion_type}"] = np.concatenate( - [ - r.observed_intensity[ion_type][::-1] - for r in results - if r.feature_vectors is not None - ] - ) - - # Reorder columns - training_data = training_data[ - ["psm_index"] + get_feature_names() + [f"target_{it}" for it in ion_types] - ] - - return training_data - - -def _into_batches(iterable: Iterable[Any], batch_size: int) -> Generator[List[Any], None, None]: - """Accumulate iterator elements into batches of a given size.""" - batch = [] - for item in iterable: - batch.append(item) - if len(batch) == batch_size: - yield batch - batch = [] - if batch: - yield batch + validate_model(model, model_dir) diff --git a/ms2pip/plot.py b/ms2pip/plot.py index 56f7bc63..5a9c2b97 100644 --- a/ms2pip/plot.py +++ b/ms2pip/plot.py @@ -1,5 +1,4 @@ from pathlib import Path -from typing import Union from ms2pip.spectrum import Spectrum @@ -12,7 +11,7 @@ _can_plot = False -def spectrum_to_png(spectrum: Spectrum, filepath: Union[str, Path]): +def spectrum_to_png(spectrum: Spectrum, filepath: str | Path): """Plot a single spectrum and write to a PNG file.""" if not _can_plot: raise ImportError("Matplotlib and spectrum_utils are required to plot spectra.") diff --git a/ms2pip/result.py b/ms2pip/result.py index 3cf7eb73..37ef6beb 100644 --- a/ms2pip/result.py +++ b/ms2pip/result.py @@ -3,8 +3,9 @@ from __future__ import annotations import csv -from typing import Any, Dict, List, Optional, Tuple from logging import getLogger +from pathlib import Path +from typing import Any import numpy as np from psm_utils import PSM @@ -12,32 +13,31 @@ try: import spectrum_utils.plot as sup - import spectrum_utils.spectrum as sus except ImportError: - sus = None - sup = None + sup = None # type: ignore[ty:invalid-assignment] from ms2pip.spectrum import ObservedSpectrum, PredictedSpectrum logger = getLogger(__name__) + class ProcessingResult(BaseModel): """Result of processing a single PSM.""" psm_index: int - psm: Optional[PSM] = None - theoretical_mz: Optional[Dict[str, np.ndarray]] = None - predicted_intensity: Optional[Dict[str, np.ndarray]] = None - observed_intensity: Optional[Dict[str, np.ndarray]] = None - correlation: Optional[float] = None - feature_vectors: Optional[np.ndarray] = None + psm: PSM + theoretical_mz: dict[str, np.ndarray] | None = None + predicted_intensity: dict[str, np.ndarray] | None = None + observed_intensity: dict[str, np.ndarray] | None = None + correlation: float | None = None + feature_vectors: np.ndarray | None = None model_config = ConfigDict(arbitrary_types_allowed=True) def __init__(__pydantic_self__, **data: Any) -> None: """Result of processing a single PSM.""" super().__init__(**data) - def as_spectra(self) -> Tuple[Optional[PredictedSpectrum], Optional[ObservedSpectrum]]: + def as_spectra(self) -> tuple[PredictedSpectrum | None, ObservedSpectrum | None]: """Convert result to predicted and observed spectra.""" if not self.theoretical_mz: raise ValueError("Theoretical m/z values required to convert to spectra.") @@ -110,15 +110,17 @@ def plot_spectra(self): return ax -def calculate_correlations(results: List[ProcessingResult]) -> None: +def calculate_correlations(results: list[ProcessingResult]) -> None: """Calculate and add Pearson correlations to list of results.""" for result in results: - pred_int = np.concatenate([i for i in result.predicted_intensity.values()]) - obs_int = np.concatenate([i for i in result.observed_intensity.values()]) + if result.predicted_intensity is None or result.observed_intensity is None: + continue + pred_int = np.concatenate(list(result.predicted_intensity.values())) + obs_int = np.concatenate(list(result.observed_intensity.values())) result.correlation = np.corrcoef(pred_int, obs_int)[0][1] -def write_correlations(results: List["ProcessingResult"], output_file: str) -> None: +def write_correlations(results: list[ProcessingResult], output_file: str | Path) -> None: """Write correlations to CSV file.""" with open(output_file, "wt") as f: fieldnames = ["psm_index", "correlation"] diff --git a/ms2pip/search_space.py b/ms2pip/search_space.py index 72eab413..c9c88240 100644 --- a/ms2pip/search_space.py +++ b/ms2pip/search_space.py @@ -73,12 +73,14 @@ import multiprocessing import multiprocessing.dummy +import multiprocessing.pool from collections import defaultdict +from collections.abc import Generator from functools import partial from itertools import chain, combinations, product from logging import getLogger from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any import numpy as np import pyteomics.fasta @@ -94,12 +96,12 @@ class ModificationConfig(BaseModel): """Configuration for a single modification in the search space.""" label: str - amino_acid: Optional[str] = None - peptide_n_term: Optional[bool] = False - protein_n_term: Optional[bool] = False - peptide_c_term: Optional[bool] = False - protein_c_term: Optional[bool] = False - fixed: Optional[bool] = False + amino_acid: str | None = None + peptide_n_term: bool | None = False + protein_n_term: bool | None = False + peptide_c_term: bool | None = False + protein_c_term: bool | None = False + fixed: bool | None = False def __init__(self, **data: Any): """ @@ -159,15 +161,15 @@ class ProteomeSearchSpace(BaseModel): fasta_file: Path min_length: int = 8 max_length: int = 30 - min_precursor_mz: Optional[float] = 0 - max_precursor_mz: Optional[float] = np.inf + min_precursor_mz: float | None = 0 + max_precursor_mz: float | None = np.inf cleavage_rule: str = "trypsin" missed_cleavages: int = 2 semi_specific: bool = False add_decoys: bool = False - modifications: List[ModificationConfig] = DEFAULT_MODIFICATIONS + modifications: list[ModificationConfig] = DEFAULT_MODIFICATIONS max_variable_modifications: int = 3 - charges: List[int] = [2, 3] + charges: list[int] = [2, 3] def __init__(self, **data: Any): """ @@ -204,7 +206,7 @@ def __init__(self, **data: Any): """ super().__init__(**data) - self._peptidoform_spaces: List[_PeptidoformSearchSpace] = [] + self._peptidoform_spaces: list[_PeptidoformSearchSpace] = [] @field_validator("modifications") @classmethod @@ -234,7 +236,7 @@ def __len__(self): return sum(len(pep_space) for pep_space in self._peptidoform_spaces) @classmethod - def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace: + def from_any(cls, _input: dict | str | Path | ProteomeSearchSpace) -> ProteomeSearchSpace: """ Create ProteomeSearchSpace from various input types. @@ -271,7 +273,7 @@ def build(self, processes: int = 1): self._add_modifications(processes) self._add_charges() - def __iter__(self) -> Generator[PSM, None, None]: + def __iter__(self) -> Generator[PSM, None, None]: # type: ignore[ty:invalid-method-override] """ Generate PSMs from search space. @@ -304,14 +306,14 @@ def filter_psms_by_mz(self, psms: PSMList) -> PSMList: psm_list=[ psm for psm in psms - if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz + if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz # type: ignore[ty:unsupported-operator] ] ) def _digest_fasta(self, processes: int = 1): """Digest FASTA file to peptides and populate search space.""" # Convert to string to avoid issues with Path objects - self.fasta_file = str(self.fasta_file) + self.fasta_file = str(self.fasta_file) # type: ignore[ty:invalid-assignment] n_proteins = _count_fasta_entries(self.fasta_file) if self.add_decoys: fasta_db = pyteomics.fasta.decoy_db( @@ -394,11 +396,11 @@ class _PeptidoformSearchSpace(BaseModel): """Search space for a given amino acid sequence.""" sequence: str - proteins: List[str] - is_n_term: Optional[bool] = None - is_c_term: Optional[bool] = None - modification_options: List[Dict[int, ModificationConfig]] = [] - charge_options: List[int] = [] + proteins: list[str] + is_n_term: bool | None = None + is_c_term: bool | None = None + modification_options: list[dict[int, ModificationConfig]] = [] + charge_options: list[int] = [] def __init__(self, **data: Any): """ @@ -425,7 +427,7 @@ def __init__(self, **data: Any): def __len__(self): return len(self.modification_options) * len(self.charge_options) - def __iter__(self) -> Generator[str, None, None]: + def __iter__(self) -> Generator[str, None, None]: # type: ignore[ty:invalid-method-override] """Yield peptidoform strings with given charges and modifications.""" if not self.charge_options: raise ValueError("Peptide charge options not defined.") @@ -437,7 +439,7 @@ def __iter__(self) -> Generator[str, None, None]: @staticmethod def _construct_peptidoform_string( - sequence: str, modifications: Dict[int, ModificationConfig], charge: int + sequence: str, modifications: dict[int, ModificationConfig], charge: int ) -> str: if not modifications: return f"{sequence}/{charge}" @@ -469,7 +471,7 @@ def _digest_single_protein( cleavage_rule: str = "trypsin", missed_cleavages: int = 2, semi_specific: bool = False, -) -> List[_PeptidoformSearchSpace]: +) -> list[_PeptidoformSearchSpace]: """Digest protein sequence and return a list of validated peptides.""" def valid_residues(sequence: str) -> bool: @@ -516,8 +518,8 @@ def _count_fasta_entries(filename: Path) -> int: def _restructure_modifications_by_target( - modifications: List[ModificationConfig], -) -> Dict[str, Dict[str, List[ModificationConfig]]]: + modifications: list[ModificationConfig], +) -> dict[str, dict[str, list[ModificationConfig]]]: """Restructure variable modifications to options per side chain or terminus.""" modifications_by_target = { "sidechain": defaultdict(lambda: []), @@ -552,9 +554,9 @@ def add_mod(mod, target, amino_acid): def _get_modification_possibilities_by_site( peptide: _PeptidoformSearchSpace, - modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]], - modifications: List[ModificationConfig], -) -> Dict[Union[str, int], List[ModificationConfig]]: + modifications_by_target: dict[str, dict[str, list[ModificationConfig]]], + modifications: list[ModificationConfig], +) -> dict[str | int, list[ModificationConfig]]: """Get all possible modifications for each site in a peptide sequence.""" possibilities_by_site = defaultdict(list) @@ -609,10 +611,10 @@ def _get_modification_possibilities_by_site( def _get_peptidoform_modification_versions( peptide: _PeptidoformSearchSpace, - modifications: List[ModificationConfig], - modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]], + modifications: list[ModificationConfig], + modifications_by_target: dict[str, dict[str, list[ModificationConfig]]], max_variable_modifications: int = 3, -) -> List[Dict[Union[str, int], List[ModificationConfig]]]: +) -> list[dict[str | int, list[ModificationConfig]]]: """ Get all potential combinations of modifications for a peptide sequence. @@ -660,7 +662,7 @@ def _get_peptidoform_modification_versions( return modification_versions -def _get_pool(processes: int) -> Union[multiprocessing.Pool, multiprocessing.dummy.Pool]: +def _get_pool(processes: int) -> multiprocessing.pool.Pool: """Get a multiprocessing pool with the given number of processes.""" # TODO: fix None default value for processes if processes > 1: diff --git a/ms2pip/spectrum.py b/ms2pip/spectrum.py index 8e49a389..c4830b0c 100644 --- a/ms2pip/spectrum.py +++ b/ms2pip/spectrum.py @@ -3,17 +3,18 @@ from __future__ import annotations import warnings -from typing import Any, Optional, Union +from typing import Any import numpy as np from psm_utils import Peptidoform from pydantic import model_validator, field_validator, ConfigDict, BaseModel + try: import spectrum_utils.spectrum as sus import spectrum_utils.plot as sup except ImportError: - sus = None - sup = None + sus = None # type: ignore[ty:invalid-assignment] + sup = None # type: ignore[ty:invalid-assignment] class Spectrum(BaseModel): @@ -21,14 +22,14 @@ class Spectrum(BaseModel): mz: np.ndarray intensity: np.ndarray - annotations: Optional[np.ndarray] = None - identifier: Optional[str] = None - peptidoform: Optional[Union[Peptidoform, str]] = None - precursor_mz: Optional[float] = None - precursor_charge: Optional[int] = None - retention_time: Optional[float] = None - mass_tolerance: Optional[float] = None - mass_tolerance_unit: Optional[str] = None + annotations: np.ndarray | None = None + identifier: str | None = None + peptidoform: Peptidoform | str | None = None + precursor_mz: float | None = None + precursor_charge: int | None = None + retention_time: float | None = None + mass_tolerance: float | None = None + mass_tolerance_unit: str | None = None model_config = ConfigDict(arbitrary_types_allowed=True) @@ -71,7 +72,7 @@ def __repr__(self) -> str: @model_validator(mode="after") @classmethod - def check_array_lengths(cls, data: dict): + def check_array_lengths(cls, data): if len(data.mz) != len(data.intensity): raise ValueError("Array lengths do not match.") if data.annotations is not None: @@ -149,7 +150,7 @@ def to_spectrum_utils(self): if not self.peptidoform: raise ValueError("`precursor_charge` or `peptidoform` must be set.") else: - precursor_charge = self.peptidoform.precursor_charge + precursor_charge = self.peptidoform.precursor_charge # type: ignore[ty:unresolved-attribute] if self.precursor_mz: precursor_mz = self.precursor_mz @@ -158,19 +159,21 @@ def to_spectrum_utils(self): raise ValueError("`precursor_mz` or `peptidoform` must be set.") else: warnings.warn("precursor_mz not set, using theoretical precursor m/z.") - precursor_mz = self.peptidoform.theoretical_mz + precursor_mz = self.peptidoform.theoretical_mz # type: ignore[ty:unresolved-attribute] spectrum = sus.MsmsSpectrum( identifier=self.identifier if self.identifier else "spectrum", - precursor_mz=precursor_mz, - precursor_charge=precursor_charge, + precursor_mz=precursor_mz, # type: ignore[ty:invalid-argument-type] + precursor_charge=precursor_charge, # type: ignore[ty:invalid-argument-type] mz=self.mz, intensity=self.intensity, - retention_time=self.retention_time, + retention_time=self.retention_time, # type: ignore[ty:invalid-argument-type] ) if self.peptidoform: spectrum.annotate_proforma( - str(self.peptidoform), self.mass_tolerance, self.mass_tolerance_unit + str(self.peptidoform), + self.mass_tolerance, # type: ignore[ty:invalid-argument-type] + self.mass_tolerance_unit, # type: ignore[ty:invalid-argument-type] ) return spectrum @@ -184,7 +187,5 @@ class ObservedSpectrum(Spectrum): class PredictedSpectrum(Spectrum): """Predicted MS2 spectrum.""" - mass_tolerance: Optional[float] = 0.001 - mass_tolerance_unit: Optional[str] = "Da" - - pass + mass_tolerance: float | None = 0.001 + mass_tolerance_unit: str | None = "Da" diff --git a/ms2pip/spectrum_input.py b/ms2pip/spectrum_input.py index 8674c722..1ff8cab3 100644 --- a/ms2pip/spectrum_input.py +++ b/ms2pip/spectrum_input.py @@ -1,12 +1,14 @@ """Read MS2 spectra.""" +from __future__ import annotations + +from collections.abc import Generator from pathlib import Path -from typing import Generator import numpy as np -from ms2rescore_rs import get_ms2_spectra +from ms2rescore_rs import get_ms2_spectra # type: ignore[ty:unresolved-import] -from ms2pip.exceptions import UnsupportedSpectrumFiletypeError +import ms2pip.exceptions as exceptions from ms2pip.spectrum import ObservedSpectrum @@ -31,8 +33,8 @@ def read_spectrum_file(spectrum_file: str) -> Generator[ObservedSpectrum, None, """ try: spectra = get_ms2_spectra(str(spectrum_file)) - except ValueError: - raise UnsupportedSpectrumFiletypeError(Path(spectrum_file).suffixes) + except ValueError as e: + raise exceptions.UnsupportedSpectrumFiletypeError(Path(spectrum_file).suffixes) from e for spectrum in spectra: obs_spectrum = ObservedSpectrum( @@ -40,7 +42,7 @@ def read_spectrum_file(spectrum_file: str) -> Generator[ObservedSpectrum, None, intensity=np.array(spectrum.intensity, dtype=np.float32), identifier=str(spectrum.identifier), precursor_mz=float(spectrum.precursor.mz), - precursor_charge=float(spectrum.precursor.charge), + precursor_charge=int(spectrum.precursor.charge), retention_time=float(spectrum.precursor.rt), ) # Workaround for mobiusklein/mzdata#3 diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index c7690246..da3c081e 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -45,11 +45,12 @@ import warnings from abc import ABC, abstractmethod from collections import defaultdict +from collections.abc import Generator from io import StringIO -from pathlib import Path from os import PathLike +from pathlib import Path from time import localtime, strftime -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any import numpy as np from psm_utils import PSM, Peptidoform @@ -64,8 +65,8 @@ def write_spectra( - filename: Union[str, PathLike], - processing_results: List[ProcessingResult], + filename: str | PathLike, + processing_results: list[ProcessingResult], file_format: str = "tsv", write_mode: str = "w", ): @@ -94,7 +95,7 @@ class _Writer(ABC): suffix = "" - def __init__(self, filename: Union[str, PathLike], write_mode: str = "w"): + def __init__(self, filename: str | PathLike, write_mode: str = "w"): self.filename = Path(filename).with_suffix(self.suffix) self.write_mode = write_mode @@ -136,7 +137,7 @@ def _file_object(self): self.open() return self._open_file - def write(self, processing_results: List[ProcessingResult]): + def write(self, processing_results: list[ProcessingResult]): """Write multiple processing results to file.""" for result in processing_results: self._write_result(result) @@ -162,7 +163,7 @@ class TSV(_Writer): "im", ] - def write(self, processing_results: List[ProcessingResult]): + def write(self, processing_results: list[ProcessingResult]): """Write multiple processing results to file.""" writer = csv.DictWriter( self._file_object, fieldnames=self.field_names, delimiter="\t", lineterminator="\n" @@ -172,7 +173,7 @@ def write(self, processing_results: List[ProcessingResult]): for result in processing_results: self._write_result(result, writer) - def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): + def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): # type: ignore[ty:invalid-method-override] """Write single processing result to file.""" # Only write results with predictions or observations if not result.theoretical_mz: @@ -189,7 +190,7 @@ def _write_row(result: ProcessingResult, ion_type: str, ion_index: int): "psm_index": result.psm_index, "ion_type": ion_type, "ion_number": ion_index + 1, - "mz": "{:.8f}".format(result.theoretical_mz[ion_type][ion_index]), + "mz": "{:.8f}".format(result.theoretical_mz[ion_type][ion_index]), # type: ignore[ty:not-subscriptable] "predicted": "{:.8f}".format(result.predicted_intensity[ion_type][ion_index]) if result.predicted_intensity else None, @@ -206,7 +207,7 @@ class MSP(_Writer): suffix = ".msp" - def write(self, results: List[ProcessingResult]): + def write(self, results: list[ProcessingResult]): # type: ignore[ty:invalid-method-override] """Write multiple processing results to file.""" for result in results: self._write_result(result) @@ -214,20 +215,20 @@ def write(self, results: List[ProcessingResult]): def _write_result(self, result: ProcessingResult): """Write single processing result to file.""" predicted_spectrum = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 - peaks = zip(predicted_spectrum.mz, intensity_normalized, predicted_spectrum.annotations) + intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute] + peaks = zip(predicted_spectrum.mz, intensity_normalized, predicted_spectrum.annotations) # type: ignore[ty:invalid-argument-type, ty:unresolved-attribute] # Header lines = [ f"Name: {result.psm.peptidoform.sequence}/{result.psm.get_precursor_charge()}", f"MW: {result.psm.peptidoform.theoretical_mass}", self._format_comment_line(result.psm), - f"Num peaks: {len(predicted_spectrum.mz)}", + f"Num peaks: {len(predicted_spectrum.mz)}", # type: ignore[ty:unresolved-attribute] ] # Peaks lines.extend( - f"{mz:.8f}\t{intensity:.8f}\t{annotation}/0.0" for mz, intensity, annotation in peaks + f"{mz:.8f}\t{intensity:.8f}\t{annotation}/0.0" for mz, intensity, annotation in peaks # type: ignore[ty:not-iterable] ) # Write to file @@ -241,8 +242,8 @@ def _format_modifications(peptidoform: Peptidoform): def _format_single_modification( amino_acid: str, position: int, - modifications: Optional[List[proforma.ModificationBase]], - ) -> Union[str, None]: + modifications: list[proforma.ModificationBase] | None, + ) -> str | None: """Get modification label from :py:class:`proforma.ModificationBase` list.""" if not modifications: return None @@ -255,7 +256,7 @@ def _format_single_modification( return f"{position},{amino_acid},{modification.value}" sequence_mods = [ - _format_single_modification(aa, pos + 1, mods) + _format_single_modification(aa, pos + 1, mods) # type: ignore[ty:invalid-argument-type] for pos, (aa, mods) in enumerate(peptidoform.parsed_sequence) ] n_term = _format_single_modification( @@ -278,7 +279,7 @@ def _format_parent_mass(peptidoform: Peptidoform) -> str: return f"Parent={peptidoform.theoretical_mz}" @staticmethod - def _format_protein_string(psm: PSM) -> Union[str, None]: + def _format_protein_string(psm: PSM) -> str | None: """Format protein list as string.""" if psm.protein_list: return f"Protein={','.join(psm.protein_list)}" @@ -286,7 +287,7 @@ def _format_protein_string(psm: PSM) -> Union[str, None]: return None @staticmethod - def _format_retention_time(psm: PSM) -> Union[str, None]: + def _format_retention_time(psm: PSM) -> str | None: """Format retention time as string.""" if psm.retention_time: return f"RetentionTime={psm.retention_time}" @@ -294,7 +295,7 @@ def _format_retention_time(psm: PSM) -> Union[str, None]: return None @staticmethod - def _format_ion_mobility(psm: PSM) -> Union[str, None]: + def _format_ion_mobility(psm: PSM) -> str | None: """Format ion mobility as string.""" if psm.ion_mobility: return f"IonMobility={psm.ion_mobility}" @@ -334,7 +335,7 @@ class MGF(_Writer): suffix = ".mgf" - def write(self, results: List[ProcessingResult]): + def write(self, results: list[ProcessingResult]): # type: ignore[ty:invalid-method-override] """Write multiple processing results to file.""" for result in results: self._write_result(result) @@ -342,8 +343,8 @@ def write(self, results: List[ProcessingResult]): def _write_result(self, result: ProcessingResult): """Write single processing result to file.""" predicted_spectrum = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 - peaks = zip(predicted_spectrum.mz, intensity_normalized) + intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute] + peaks = zip(predicted_spectrum.mz, intensity_normalized) # type: ignore[ty:unresolved-attribute] # Header lines = [ @@ -384,7 +385,7 @@ class Spectronaut(_Writer): "FragmentLossType", ] - def write(self, processing_results: List[ProcessingResult]): + def write(self, processing_results: list[ProcessingResult]): """Write multiple processing results to file.""" writer = csv.DictWriter( self._file_object, fieldnames=self.field_names, delimiter="\t", lineterminator="\n" @@ -394,7 +395,7 @@ def write(self, processing_results: List[ProcessingResult]): for result in processing_results: self._write_result(result, writer) - def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): + def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): # type: ignore[ty:invalid-method-override] """Write single processing result to file.""" # Only write results with predictions if result.predicted_intensity is None: @@ -404,7 +405,7 @@ def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): writer.writerow({**psm_info, **fragment_info}) @staticmethod - def _process_psm(psm: PSM) -> Dict[str, Any]: + def _process_psm(psm: PSM) -> dict[str, Any]: """Process PSM to Spectronaut format.""" return { "ModifiedPeptide": _peptidoform_str_without_charge(psm.peptidoform), @@ -417,23 +418,23 @@ def _process_psm(psm: PSM) -> Dict[str, Any]: } @staticmethod - def _yield_fragment_info(result: ProcessingResult) -> Generator[Dict[str, Any], None, None]: + def _yield_fragment_info(result: ProcessingResult) -> Generator[dict[str, Any], None, None]: """Yield fragment information for a processing result.""" # Normalize intensities intensities = { ion_type: _unlogarithmize(intensities) - for ion_type, intensities in result.predicted_intensity.items() + for ion_type, intensities in result.predicted_intensity.items() # type: ignore[ty:unresolved-attribute] } max_intensity = max(itertools.chain(*intensities.values())) intensities = { ion_type: _basepeak_normalize(intensities[ion_type], basepeak=max_intensity) for ion_type in intensities } - for ion_type in result.predicted_intensity: + for ion_type in result.predicted_intensity: # type: ignore[ty:not-iterable] fragment_type = ion_type[0].lower() fragment_charge = ion_type[1:] if len(ion_type) > 1 else "1" for ion_index, (intensity, mz) in enumerate( - zip(intensities[ion_type], result.theoretical_mz[ion_type]) + zip(intensities[ion_type], result.theoretical_mz[ion_type]) # type: ignore[ty:invalid-argument-type, ty:not-subscriptable] ): yield { "RelativeFragmentIntensity": f"{intensity:.8f}", @@ -468,7 +469,7 @@ class Bibliospec(_Writer): "ion-mobility", ] - def __init__(self, filename: Union[str, PathLike], write_mode: str = "w"): + def __init__(self, filename: str | PathLike, write_mode: str = "w"): super().__init__(filename, write_mode) self.ssl_file = self.filename.with_suffix(self.ssl_suffix) self.ms2_file = self.filename.with_suffix(self.ms2_suffix) @@ -514,7 +515,7 @@ def _ms2_file_object(self): self.open() return self._open_ms2_file - def write(self, processing_results: List[ProcessingResult]): + def write(self, processing_results: list[ProcessingResult]): """Write multiple processing results to file.""" # Create CSV writer ssl_dict_writer = csv.DictWriter( @@ -553,7 +554,7 @@ def _write_result( modified_sequence: str, scan_number: int, writer: csv.DictWriter, - ): + ): # type: ignore[ty:invalid-method-override] """Write single processing result to files.""" self._write_result_to_ssl(result, modified_sequence, scan_number, writer) self._write_result_to_ms2(result, modified_sequence, scan_number) @@ -584,8 +585,8 @@ def _write_result_to_ms2( ): """Write single processing result to the MS2 file.""" predicted_spectrum = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 - peaks = zip(predicted_spectrum.mz, intensity_normalized) + intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute] + peaks = zip(predicted_spectrum.mz, intensity_normalized) # type: ignore[ty:unresolved-attribute] # Header lines = [ @@ -607,8 +608,8 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str: """Format modified sequence as string for Spectronaut.""" modification_dict = defaultdict(list) for term, position in [("n_term", 0), ("c_term", len(peptidoform) - 1)]: - if peptidoform.properties[term]: - modification_dict[position].extend(peptidoform.properties[term]) + if peptidoform.properties[term]: # type: ignore[ty:invalid-key] + modification_dict[position].extend(peptidoform.properties[term]) # type: ignore[ty:invalid-key] for position, (_, mods) in enumerate(peptidoform.parsed_sequence): if mods: modification_dict[position].extend(mods) @@ -620,7 +621,7 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str: ) @staticmethod - def _get_last_ssl_scan_number(ssl_file: Union[str, PathLike, StringIO]): + def _get_last_ssl_scan_number(ssl_file: str | PathLike | StringIO): """Read scan number of last line in a Bibliospec SSL file.""" if isinstance(ssl_file, StringIO): ssl_file.seek(0) @@ -652,7 +653,7 @@ def open(self): self._open_file = self.filename.unlink(missing_ok=True) self._open_file = dlib.open_sqlite(self.filename) - def write(self, processing_results: List[ProcessingResult]): + def write(self, processing_results: list[ProcessingResult]): """Write MS2PIP predictions to a DLIB SQLite file.""" connection = self._file_object dlib.metadata.create_all(connection.engine) @@ -667,13 +668,13 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str: """Format modified sequence as string for DLIB.""" # Sum all sequential mass shifts for each position masses = [ - sum(mod.mass for mod in mods) if mods else 0 for _, mods in peptidoform.parsed_sequence + sum(mod.mass for mod in mods) if mods else 0 for _, mods in peptidoform.parsed_sequence # type: ignore[ty:unresolved-attribute] ] # Add N- and C-terminal modifications for term, position in [("n_term", 0), ("c_term", len(peptidoform) - 1)]: - if peptidoform.properties[term]: - masses[position] += sum(mod.mass for mod in peptidoform.properties[term]) + if peptidoform.properties[term]: # type: ignore[ty:invalid-key] + masses[position] += sum(mod.mass for mod in peptidoform.properties[term]) # type: ignore[ty:invalid-key] # Format modified sequence return "".join( @@ -700,9 +701,9 @@ def _write_metadata(connection: Connection): @staticmethod def _write_entries( - processing_results: List[ProcessingResult], + processing_results: list[ProcessingResult], connection: Connection, - output_filename: Union[str, PathLike], + output_filename: str | PathLike, ): """Write spectra to DLIB SQLite file.""" with connection.begin(): @@ -711,8 +712,8 @@ def _write_entries( raise ValueError("Retention time required to write DLIB file.") spectrum = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(spectrum.intensity) * 1e4 - n_peaks = len(spectrum.mz) + intensity_normalized = _basepeak_normalize(spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute] + n_peaks = len(spectrum.mz) # type: ignore[ty:unresolved-attribute] connection.execute( dlib.Entry.insert().values( @@ -724,7 +725,7 @@ def _write_entries( RTInSeconds=result.psm.retention_time, Score=0, MassEncodedLength=n_peaks, - MassArray=spectrum.mz.tolist(), + MassArray=spectrum.mz.tolist(), # type: ignore[ty:unresolved-attribute] IntensityEncodedLength=n_peaks, IntensityArray=intensity_normalized.tolist(), SourceFile=str(output_filename), @@ -732,7 +733,7 @@ def _write_entries( ) @staticmethod - def _write_peptide_to_protein(results: List[ProcessingResult], connection: Connection): + def _write_peptide_to_protein(results: list[ProcessingResult], connection: Connection): """Write peptide-to-protein mappings to DLIB SQLite file.""" peptide_to_proteins = { (result.psm.peptidoform.sequence, protein) @@ -780,12 +781,12 @@ def _peptidoform_str_without_charge(peptidoform: Peptidoform) -> str: return re.sub(r"\/\d+$", "", str(peptidoform)) -def _unlogarithmize(intensities: np.array) -> np.array: +def _unlogarithmize(intensities: np.array) -> np.array: # type: ignore[ty:invalid-type-form] """Undo logarithmic transformation of intensities.""" return (2**intensities) - 0.001 -def _basepeak_normalize(intensities: np.array, basepeak: Optional[float] = None) -> np.array: +def _basepeak_normalize(intensities: np.array, basepeak: float | None = None) -> np.array: # type: ignore[ty:invalid-type-form] """Normalize intensities to most intense peak.""" if not basepeak: basepeak = intensities.max() diff --git a/pyproject.toml b/pyproject.toml index 002ca21e..61b4df8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,14 +24,12 @@ authors = [ classifiers = [ "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", - "Operating System :: POSIX :: Linux", - "Operating System :: MacOS :: MacOS X", - "Operating System :: Microsoft :: Windows", + "Operating System :: OS Independent", "Programming Language :: Python :: 3 :: Only", "Topic :: Scientific/Engineering :: Bio-Informatics", "Development Status :: 5 - Production/Stable", ] -requires-python = ">=3.9" +requires-python = ">=3.11" dependencies = [ "numpy>=1.25,<3", "pandas>=1,<3", @@ -45,12 +43,14 @@ dependencies = [ "pydantic>=2", "werkzeug>=2", "psm_utils>=1.0", - "ms2rescore-rs>=0.5.0a0,<2", + "ms2rescore-rs>=0.5.0a1,<2", ] - [project.optional-dependencies] plotting = ["matplotlib>=3.0", "spectrum-utils>=0.4"] -dev = ["black", "isort>5", "pytest"] + +[dependency-groups] +notebook = ["ipykernel", "ipywidgets", "pip"] +dev = ["pytest", "ruff", "ty"] docs = [ "sphinx", "numpydoc>=1,<2", @@ -74,7 +74,7 @@ publication = "https://doi.org/10.1093/nar/gkad335/" ms2pip = "ms2pip.__main__:main" [build-system] -requires = ["setuptools", "cython", "numpy>=2.0"] +requires = ["setuptools"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] @@ -82,19 +82,8 @@ include = ["ms2pip*"] [tool.black] line-length = 99 -target-version = ['py310'] +target-version = ['py311'] [tool.ruff] line-length = 99 -target-version = 'py310' - -[tool.cibuildwheel] -# Keep Python targets explicit so CI updates don't start publishing untested wheels. -build = "cp3{10,11,12,13}-manylinux_x86_64 cp3{10,11,12,13}-win_amd64 cp3{10,11,12,13}-macosx_{x86_64,arm64}" -test-command = "ms2pip --help" - -# Prevent building from source for packages with complex C/C++/Rust dependencies -environment = { PIP_ONLY_BINARY = "pyarrow,pandas,numpy,lxml,xgboost,scipy,ms2rescore-rs" } - -[tool.cibuildwheel.macos] -before-all = "brew install libomp" +target-version = 'py311' diff --git a/setup.py b/setup.py deleted file mode 100644 index b389ab08..00000000 --- a/setup.py +++ /dev/null @@ -1,58 +0,0 @@ -import os -import platform -from glob import glob - -import numpy -from Cython.Distutils import build_ext -from setuptools import setup -from setuptools.extension import Extension - - -def _get_version(): - with open("ms2pip/__init__.py") as f: - for line in f: - if line.startswith("__version__"): - return line.split("=")[1].strip().strip('"').strip("'") - - -to_remove = [ - "ms2pip/_cython_modules/ms2pip_pyx.c*", - "ms2pip/_cython_modules/ms2pip_pyx.so", -] -_ = [[os.remove(f) for f in glob(pat)] for pat in to_remove] - -# Large machine-written C model files require optimization to be disabled -compile_args = { - "Linux": [ - "-O0", - "-fno-var-tracking", - "-Wno-unused-result", - "-Wno-cpp", - "-Wno-unused-function", - ], - "Darwin": [ - "-O0", - ], - "Windows": [ - "/Od", - "/DEBUG", - "/GL-", - "/bigobj", - "/wd4244", - ], -} - -extensions = [ - Extension( - "ms2pip._cython_modules.ms2pip_pyx", - sources=["ms2pip/_cython_modules/ms2pip_pyx.pyx"] + glob("ms2pip/_models_c/*/*.c"), - extra_compile_args=compile_args[platform.system()], - ) -] - -setup( - version=_get_version(), - ext_modules=extensions, - include_dirs=[numpy.get_include()], - cmdclass={"build_ext": build_ext}, -) diff --git a/tests/test_core.py b/tests/test_core.py index 771521f9..5f90dea2 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,22 +1,10 @@ import numpy as np -from psm_utils import PSM, Peptidoform -import pandas as pd +from psm_utils import PSM, PSMList, Peptidoform -from ms2pip.core import get_training_data, predict_single +from ms2pip.core import predict_batch, predict_library, predict_single from ms2pip.result import ProcessingResult -def _test_get_training_data(): - expected_df = pd.read_feather("tests/test_data/massivekb_selected_500.feather") - output_df = get_training_data( - "tests/test_data/massivekb_selected_500.peprec", - "tests/test_data/massivekb_selected_500.mgf", - model="HCD", - ms2_tolerance=0.02, - processes=1 - ) - pd.testing.assert_frame_equal(expected_df, output_df) - def test_predict_single(): pep = Peptidoform("ACDE/2") result = predict_single(pep) @@ -25,8 +13,8 @@ def test_predict_single(): psm_index=0, psm=PSM(peptidoform=pep, spectrum_id=0), theoretical_mz={ - "b": np.array([72.04435, 175.05354, 290.08047], dtype=np.float32), - "y": np.array([148.0604, 263.0873, 366.0965], dtype=np.float32), + "b": np.array([72.04439, 175.05357, 290.0805], dtype=np.float32), + "y": np.array([148.06044, 263.08737, 366.09656], dtype=np.float32), }, predicted_intensity={ "b": np.array([-9.14031, -7.6102686, -7.746709], dtype=np.float32), @@ -41,8 +29,65 @@ def test_predict_single(): assert result.psm == expected.psm np.testing.assert_array_almost_equal(result.theoretical_mz["b"], expected.theoretical_mz["b"]) np.testing.assert_array_almost_equal(result.theoretical_mz["y"], expected.theoretical_mz["y"]) - np.testing.assert_array_almost_equal(result.predicted_intensity["b"], expected.predicted_intensity["b"]) - np.testing.assert_array_almost_equal(result.predicted_intensity["y"], expected.predicted_intensity["y"]) + np.testing.assert_array_almost_equal( + result.predicted_intensity["b"], expected.predicted_intensity["b"] + ) + np.testing.assert_array_almost_equal( + result.predicted_intensity["y"], expected.predicted_intensity["y"] + ) assert result.observed_intensity == expected.observed_intensity assert result.correlation == expected.correlation assert result.feature_vectors == expected.feature_vectors + + +def test_predict_single_modified(): + result = predict_single("AC[+57.0215]M[+15.9949]DEK/2") + + assert result.predicted_intensity is not None + assert result.theoretical_mz is not None + assert set(result.theoretical_mz.keys()) == {"b", "y"} + # 6 residues → 5 cleavage sites + assert len(result.theoretical_mz["b"]) == 5 + assert len(result.theoretical_mz["y"]) == 5 + assert len(result.predicted_intensity["b"]) == 5 + # m/z values should be positive and increasing for b-ions + assert all(result.theoretical_mz["b"] > 0) + assert all(np.diff(result.theoretical_mz["b"]) > 0) + + +def test_predict_batch(): + psm_list = PSMList( + psm_list=[ + PSM(peptidoform=Peptidoform("ACDE/2"), spectrum_id=0), + PSM(peptidoform=Peptidoform("PEPTIDEK/3"), spectrum_id=1), + PSM(peptidoform=Peptidoform("AAAAAAA/2"), spectrum_id=2), + ] + ) + results = predict_batch(psm_list) + + assert len(results) == 3 + for i, result in enumerate(results): + assert result.psm_index == i + assert result.predicted_intensity is not None + assert result.theoretical_mz is not None + assert set(result.theoretical_mz.keys()) == {"b", "y"} + assert result.feature_vectors is None + + # Check correct number of ions per peptide + assert len(results[0].theoretical_mz["b"]) == 3 # ACDE: 4 residues → 3 + assert len(results[1].theoretical_mz["b"]) == 7 # PEPTIDEK: 8 residues → 7 + assert len(results[2].theoretical_mz["b"]) == 6 # AAAAAAA: 7 residues → 6 + + +def test_predict_library(): + batches = list( + predict_library(fasta_file="tests/test_data/test.fasta", batch_size=100) + ) + + assert len(batches) >= 1 + for batch in batches: + assert isinstance(batch, list) + for result in batch: + assert isinstance(result, ProcessingResult) + assert result.predicted_intensity is not None + assert result.theoretical_mz is not None diff --git a/tests/test_correlate.py b/tests/test_correlate.py new file mode 100644 index 00000000..6056c2ab --- /dev/null +++ b/tests/test_correlate.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest +from psm_utils import PSM, Peptidoform +from ms2rescore_rs import AnnotatedMS2Spectrum, FragmentAnnotation, MS2Spectrum, Precursor + +from ms2pip.core import correlate_preloaded, correlate_single +from ms2pip.spectrum import ObservedSpectrum + + +def _make_observed_spectrum(peptidoform="ACDEFK/2"): + """Create an ObservedSpectrum with realistic peaks for testing.""" + return ObservedSpectrum( + mz=np.array( + [72.044, 147.113, 175.054, 276.167, 290.081, 389.251, 488.319, 566.192], + dtype=np.float32, + ), + intensity=np.array( + [50.0, 100.0, 80.0, 200.0, 60.0, 150.0, 300.0, 40.0], + dtype=np.float32, + ), + identifier="test_spectrum", + peptidoform=peptidoform, + precursor_mz=341.65, + precursor_charge=2, + retention_time=100.0, + ) + + +def test_correlate_single(): + obs = _make_observed_spectrum() + result = correlate_single(obs, ms2_tolerance=0.02, model="HCD") + + assert result.predicted_intensity is not None + assert result.observed_intensity is not None + assert result.correlation is not None + assert not np.isnan(result.correlation) + assert set(result.predicted_intensity.keys()) == {"b", "y"} + assert set(result.observed_intensity.keys()) == {"b", "y"} + + +def test_correlate_single_ppm_tolerance(): + obs = _make_observed_spectrum() + result = correlate_single(obs, ms2_tolerance=20.0, ms2_tolerance_mode="ppm", model="HCD") + + assert result.predicted_intensity is not None + assert result.observed_intensity is not None + assert result.correlation is not None + + +def test_correlate_single_requires_peptidoform(): + obs = ObservedSpectrum( + mz=np.array([100.0], dtype=np.float32), + intensity=np.array([1.0], dtype=np.float32), + identifier="test", + ) + with pytest.raises(ValueError, match="Peptidoform must be set"): + correlate_single(obs) + + +def test_correlate_preloaded_annotated(): + ann_spec = AnnotatedMS2Spectrum( + identifier="test_spectrum", + mz=[175.054, 276.167, 488.319], + intensity=[100.0, 200.0, 300.0], + precursor=Precursor(mz=341.65, charge=2, rt=100.0), + peak_annotations=[ + [FragmentAnnotation(series="b", position=2, charge=1)], + [FragmentAnnotation(series="y", position=4, charge=1)], + [], + ], + ) + psm = PSM( + peptidoform=Peptidoform("ACDEFK/2"), + spectrum_id="test_spectrum", + spectrum=ann_spec, + ) + + results = correlate_preloaded([psm], model="HCD") + + assert len(results) == 1 + result = results[0] + assert result.predicted_intensity is not None + assert result.observed_intensity is not None + assert set(result.observed_intensity.keys()) == {"b", "y"} + # Annotated b2 position should have non-floor intensity + floor = np.log2(0.001) + assert result.observed_intensity["b"][1] > floor + + +def test_correlate_preloaded_raw(): + raw_spec = MS2Spectrum( + identifier="test_spectrum", + mz=[175.054, 276.167, 488.319], + intensity=[100.0, 200.0, 300.0], + precursor=Precursor(mz=341.65, charge=2, rt=100.0), + ) + psm = PSM( + peptidoform=Peptidoform("ACDEFK/2"), + spectrum_id="test_spectrum", + spectrum=raw_spec, + ) + + results = correlate_preloaded( + [psm], model="HCD", ms2_tolerance=0.02, ms2_tolerance_mode="Da" + ) + + assert len(results) == 1 + assert results[0].predicted_intensity is not None + assert results[0].observed_intensity is not None + + +def test_correlate_preloaded_ppm_tolerance(): + raw_spec = MS2Spectrum( + identifier="test_spectrum", + mz=[175.054, 276.167], + intensity=[100.0, 200.0], + precursor=Precursor(mz=341.65, charge=2, rt=100.0), + ) + psm = PSM( + peptidoform=Peptidoform("ACDEFK/2"), + spectrum_id="test_spectrum", + spectrum=raw_spec, + ) + + results = correlate_preloaded( + [psm], model="HCD", ms2_tolerance=20.0, ms2_tolerance_mode="ppm" + ) + + assert len(results) == 1 + assert results[0].observed_intensity is not None + + +def test_correlate_preloaded_invalid_spectrum(): + psm = PSM( + peptidoform=Peptidoform("ACDEFK/2"), + spectrum_id="test", + ) + with pytest.raises(ValueError, match="MS2Spectrum or AnnotatedMS2Spectrum"): + correlate_preloaded([psm]) + + +def test_correlate_preloaded_multiple_psms(): + psms = [] + for i, pep in enumerate(["ACDEFK/2", "PEPTIDEK/3"]): + spec = MS2Spectrum( + identifier=f"spec_{i}", + mz=[175.054, 276.167], + intensity=[100.0, 200.0], + precursor=Precursor(mz=400.0, charge=2, rt=float(i * 10)), + ) + psms.append(PSM(peptidoform=Peptidoform(pep), spectrum_id=f"spec_{i}", spectrum=spec)) + + results = correlate_preloaded(psms, model="HCD") + + assert len(results) == 2 + for result in results: + assert result.predicted_intensity is not None + assert result.observed_intensity is not None diff --git a/tests/test_correlation.py b/tests/test_correlation.py new file mode 100644 index 00000000..62eaaaaa --- /dev/null +++ b/tests/test_correlation.py @@ -0,0 +1,55 @@ +import numpy as np + +from ms2pip.correlation import ms2pip_pearson, spectral_angle + + +def test_ms2pip_pearson(): + true = np.array([100.0, 200.0, 50.0, 300.0]) + pred = np.array([90.0, 210.0, 40.0, 310.0]) + + corr = ms2pip_pearson(true, pred) + + assert isinstance(corr, float) + assert 0.9 < corr <= 1.0 + + +def test_ms2pip_pearson_identical(): + arr = np.array([100.0, 200.0, 50.0, 300.0]) + corr = ms2pip_pearson(arr, arr) + + assert abs(corr - 1.0) < 1e-6 + + +def test_ms2pip_pearson_anticorrelated(): + true = np.array([100.0, 200.0, 300.0]) + pred = np.array([300.0, 200.0, 100.0]) + + corr = ms2pip_pearson(true, pred) + + assert corr < 0 + + +def test_spectral_angle(): + true = np.array([100.0, 200.0, 50.0, 300.0]) + pred = np.array([90.0, 210.0, 40.0, 310.0]) + + sa = spectral_angle(true, pred) + + assert isinstance(sa, float) + assert 0.0 < sa <= 1.0 + + +def test_spectral_angle_identical(): + arr = np.array([100.0, 200.0, 50.0, 300.0]) + sa = spectral_angle(arr, arr) + + assert abs(sa - 1.0) < 1e-6 + + +def test_spectral_angle_orthogonal(): + true = np.array([1.0, 0.0]) + pred = np.array([0.0, 1.0]) + + sa = spectral_angle(true, pred) + + assert sa < 0.5 diff --git a/tests/test_encoder.py b/tests/test_encoder.py deleted file mode 100644 index 5feb5f96..00000000 --- a/tests/test_encoder.py +++ /dev/null @@ -1,91 +0,0 @@ -import pytest -from psm_utils import Peptidoform, PSM, PSMList - -from ms2pip._utils.encoder import Encoder - - -class TestEncoder: - def test_from_peptidoform(self): - test_cases = [ - # Peptidoform, {(target, label): (amino_acid, amino_acid_id, mass_shift)} - ("ACDEK", {}), - ("AC[+57.021464]DEK", {("C", "+57.021464"): ("C", 1, 57.021464)}), - ("AC[U:4]", {("C", "UNIMOD:4"): ("C", 1, 57.021464)}), - ("AC[formula:H3C2NO]", {("C", "Formula:H3C2NO"): ("C", 1, 57.021464)}), - ("[Acetyl]-ACDE", {("n_term", "Acetyl"): ("n_term", -1, 42.010565)}), - ("ACDE-[Amidated]", {("c_term", "Amidated"): ("c_term", -2, -0.984016)}), - ( - "AC[+57.021464]DE-[Amidated]", - { - ("C", "+57.021464"): ("C", 1, 57.021464), - ("c_term", "Amidated"): ("c_term", -2, -0.984016), - }, - ), - ( - "[Acetyl]-AC[+57.021464]DE", - { - ("n_term", "Acetyl"): ("n_term", -1, 42.010565), - ("C", "+57.021464"): ("C", 1, 57.021464), - }, - ), - ] - - for peptidoform, expected_mods in test_cases: - encoder = Encoder.from_peptidoform(Peptidoform(peptidoform)) - for key, modification in encoder.modifications.items(): - for item_key, expected_item in zip( - ["amino_acid", "amino_acid_id", "mass_shift"], expected_mods[key] - ): - if isinstance(expected_item, float): - assert modification[item_key] == pytest.approx(expected_item) - else: - assert modification[item_key] == expected_item - - def test_from_psm_list(self): - psm_list = PSMList(psm_list=[ - PSM(peptidoform="AC[+57.021464]DEK", spectrum_id=0), - PSM(peptidoform="AC[U:4]", spectrum_id=1), - PSM(peptidoform="AC[formula:H3C2NO]", spectrum_id=2), - PSM(peptidoform="[Acetyl]-ACDE", spectrum_id=3), - PSM(peptidoform="ACDE-[Amidated]",spectrum_id= 4) - ]) - expected = { - ("C", "+57.021464"): { - "mod_id": 38, - "mass_shift": 57.021464, - "amino_acid": "C", - "amino_acid_id": 1, - }, - ("C", "UNIMOD:4"): { - "mod_id": 39, - "mass_shift": 57.021464, - "amino_acid": "C", - "amino_acid_id": 1, - }, - ("C", "Formula:H3C2NO"): { - "mod_id": 40, - "mass_shift": 57.02146372057, - "amino_acid": "C", - "amino_acid_id": 1, - }, - ("n_term", "Acetyl"): { - "mod_id": 41, - "mass_shift": 42.010565, - "amino_acid": "n_term", - "amino_acid_id": -1, - }, - ("c_term", "Amidated"): { - "mod_id": 42, - "mass_shift": -0.984016, - "amino_acid": "c_term", - "amino_acid_id": -2, - }, - } - - encoder = Encoder.from_psm_list(psm_list) - for modification_key, modification_dict in encoder.modifications.items(): - for item_key, expected_item in expected[modification_key].items(): - if isinstance(expected_item, float): - assert modification_dict[item_key] == pytest.approx(expected_item) - else: - assert modification_dict[item_key] == expected_item diff --git a/tests/test_spectrum_processing.py b/tests/test_spectrum_processing.py new file mode 100644 index 00000000..2a174b48 --- /dev/null +++ b/tests/test_spectrum_processing.py @@ -0,0 +1,138 @@ +import numpy as np +from psm_utils import PSM, Peptidoform + +from ms2pip._spectrum_processing import ( + annotate_spectrum, + proforma_to_mass_shift, + targets_from_annotations, +) + + +def test_targets_from_annotations_basic(): + """Test basic target extraction from annotations.""" + # 4-residue peptide → 3 cleavage sites + peak_annotations = [ + [("b", 1, 1)], # peak 0 matches b1 + [], # peak 1 unmatched + [("y", 2, 1)], # peak 2 matches y2 + ] + intensity = np.array([5.0, 3.0, 8.0], dtype=np.float32) + + targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4) + + floor = np.float32(np.log2(0.001)) + assert targets["b"][0] == 5.0 # b1 at position 0 + assert targets["b"][1] == floor # b2 unmatched + assert targets["b"][2] == floor # b3 unmatched + assert targets["y"][0] == floor # y1 unmatched + assert targets["y"][1] == 8.0 # y2 at position 1 + assert targets["y"][2] == floor # y3 unmatched + + +def test_targets_from_annotations_charge2(): + """Test that charge-2 annotations map to b2/y2 ion types.""" + peak_annotations = [ + [("b", 1, 2)], # charge 2 → maps to "b2" + ] + intensity = np.array([10.0], dtype=np.float32) + + targets = targets_from_annotations( + peak_annotations, intensity, ["b", "y", "b2", "y2"], seq_len=4 + ) + + assert targets["b2"][0] == 10.0 + floor = np.float32(np.log2(0.001)) + assert targets["b"][0] == floor # charge-1 b is not matched + + +def test_targets_from_annotations_highest_intensity(): + """Test that the highest intensity is kept when multiple peaks match.""" + peak_annotations = [ + [("b", 1, 1)], # peak 0: b1 with intensity 3.0 + [("b", 1, 1)], # peak 1: b1 with intensity 7.0 + ] + intensity = np.array([3.0, 7.0], dtype=np.float32) + + targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4) + + assert targets["b"][0] == 7.0 # highest wins + + +def test_targets_from_annotations_ignores_unknown_ion_types(): + """Test that annotations for ion types not in the target list are ignored.""" + peak_annotations = [ + [("c", 1, 1)], # c-ion not in requested types + ] + intensity = np.array([10.0], dtype=np.float32) + + targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4) + + floor = np.float32(np.log2(0.001)) + assert all(v == floor for v in targets["b"]) + assert all(v == floor for v in targets["y"]) + + +def test_targets_from_annotations_fragment_annotation_objects(): + """Test that FragmentAnnotation objects (not tuples) also work.""" + from ms2rescore_rs import FragmentAnnotation + + peak_annotations = [ + [FragmentAnnotation(series="b", position=1, charge=1)], + ] + intensity = np.array([5.0], dtype=np.float32) + + targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4) + + assert targets["b"][0] == 5.0 + + +def test_annotate_spectrum(): + """Test that annotate_spectrum returns annotations for matching peaks.""" + spectrum = __import__("ms2pip.spectrum", fromlist=["ObservedSpectrum"]).ObservedSpectrum( + mz=np.array([72.044, 175.054, 290.081], dtype=np.float32), + intensity=np.array([100.0, 200.0, 300.0], dtype=np.float32), + identifier="test", + precursor_mz=250.0, + precursor_charge=2, + retention_time=100.0, + ) + psm = PSM(peptidoform=Peptidoform("ACDE/2"), spectrum_id="test") + + annotations = annotate_spectrum(spectrum, psm, "HCD", 0.02, "Da") + + # Should return one list of annotations per peak + assert len(annotations) == 3 + # At least some peaks should be annotated (b1 at ~72.044, b2 at ~175.054) + annotated_peaks = [i for i, anns in enumerate(annotations) if len(anns) > 0] + assert len(annotated_peaks) > 0 + + +def test_proforma_to_mass_shift_unmodified(): + result = proforma_to_mass_shift(Peptidoform("PEPTIDE/2")) + assert result == "PEPTIDE/2" + + +def test_proforma_to_mass_shift_unimod_names(): + result = proforma_to_mass_shift(Peptidoform("PEPTC[UNIMOD:Carbamidomethyl]M[UNIMOD:Oxidation]IDE/2")) + assert "[+" in result + assert "Carbamidomethyl" not in result + assert "Oxidation" not in result + assert result.startswith("PEPTC[+57.0215]M[+15.9949]") + assert result.endswith("/2") + + +def test_proforma_to_mass_shift_nterm(): + result = proforma_to_mass_shift(Peptidoform("[UNIMOD:Acetyl]-PEPTIDE/2")) + assert result.startswith("[+42.0106]-PEPTIDE") + assert result.endswith("/2") + + +def test_proforma_to_mass_shift_already_mass_shift(): + result = proforma_to_mass_shift(Peptidoform("PEPTC[+57.0215]IDE/2")) + assert result == "PEPTC[+57.0215]IDE/2" + + +def test_proforma_to_mass_shift_no_charge(): + result = proforma_to_mass_shift(Peptidoform("PEPTIDE")) + assert result == "PEPTIDE" + assert "/" not in result