diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index 5fa60135..00000000
--- a/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-ms2pip/_models_c/**/*.c filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml
index 68f1bb3a..a400f556 100644
--- a/.github/workflows/build_and_publish.yml
+++ b/.github/workflows/build_and_publish.yml
@@ -5,49 +5,20 @@ on:
types: [created]
jobs:
- build-sdist:
- name: Build source distribution
+ build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- with:
- lfs: "true"
- - name: Set up Python
- uses: actions/setup-python@v5
- with:
- python-version: "3.11"
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install build
- - name: Build sdist
- run: python -m build --sdist --outdir dist
- - uses: actions/upload-artifact@v4
- with:
- name: dist-source
- path: dist
-
- build-wheels:
- name: Build wheels on ${{ matrix.os }}
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- os: [ubuntu-latest, windows-latest, macos-15, macos-15-intel]
- steps:
- - uses: actions/checkout@v4
- with:
- lfs: "true"
- - name: Build wheels
- uses: pypa/cibuildwheel@v3.4.0
- with:
- output-dir: dist
+ - uses: astral-sh/setup-uv@v6
+ - name: Build sdist and wheel
+ run: uv build --out-dir dist
- uses: actions/upload-artifact@v4
with:
- name: dist-${{ matrix.os }}
+ name: dist
path: dist
publish-to-pypi:
- needs: [build-sdist, build-wheels]
+ needs: [build]
runs-on: ubuntu-latest
environment:
name: pypi
@@ -56,10 +27,8 @@ jobs:
id-token: write
steps:
- uses: actions/download-artifact@v4
- - name: Move wheels to dist
- run: |
- mkdir dist/
- mv dist-*/*.whl dist/
- mv dist-*/*.tar.gz dist/
+ with:
+ name: dist
+ path: dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 023fd57c..bc5ed0c2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -8,40 +8,36 @@ on:
workflow_dispatch:
jobs:
- build:
+ lint:
runs-on: ubuntu-latest
- strategy:
- max-parallel: 4
- fail-fast: false
- matrix:
- python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: astral-sh/setup-uv@v6
+ - run: uvx ruff check --output-format=github ./ms2pip
+ typecheck:
+ runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
+ - uses: astral-sh/setup-uv@v6
with:
- lfs: "true"
+ enable-cache: true
+ - run: uv run --all-extras --group dev --python 3.13 ty check ms2pip/
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.11", "3.12", "3.13"]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: astral-sh/setup-uv@v6
with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install dependencies
+ enable-cache: true
+ - name: Install and test
run: |
- python -m pip install --upgrade pip
- pip install ruff
-
- - name: Check with Ruff
- run: ruff check --output-format=github ./ms2pip
-
- - name: Build and install ms2pip
- run: |
- pip install .[dev]
-
- - name: Test with pytest
- run: |
- pytest
-
- - name: Test installation
+ uv run --python ${{ matrix.python-version }} pytest
+ - name: Test CLI
run: |
- ms2pip --help
+ uv run --python ${{ matrix.python-version }} ms2pip --help
diff --git a/.gitignore b/.gitignore
index 4e76cd43..8a137c9c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,5 @@
-.pytest_cache/
-
-# Custom MS2PIP stuff
+# Custom
data/
-*_pyx.c
-*_pyx_*.c
-ms2pip/models_xgboost/*.xgboost
# Pytest
.pytest_cache/
@@ -98,6 +93,7 @@ celerybeat-schedule
.venv/
venv/
ENV/
+.venv-*/
# Spyder project settings
.spyderproject
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 248bf4ca..00000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,3 +0,0 @@
-include ms2pip/_cython_modules/*.c
-include ms2pip/_models_c/*/*.c
-include ms2pip/_models_c/*.h
diff --git a/README.rst b/README.rst
index 58a3f933..6557943e 100644
--- a/README.rst
+++ b/README.rst
@@ -36,7 +36,8 @@ peptide fragmentation spectrum that accurately resembles its observed equivalent
can be used to validate peptide identifications, generate proteome-wide spectral libraries, or to
select discriminative transitions for targeted proteomics. MS²PIP employs the
`XGBoost `_ machine learning algorithm and is written in
-Python and C.
+Python, with helper functions in Rust
+(`ms2rescore-rs `_).
.. figure:: https://raw.githubusercontent.com/compomics/ms2pip/v4.0.0/img/mirror-DVAQIFNNILR-2.png
diff --git a/docs/source/api/ms2pip.result.rst b/docs/source/api/ms2pip.result.rst
index aee97a08..5946933f 100644
--- a/docs/source/api/ms2pip.result.rst
+++ b/docs/source/api/ms2pip.result.rst
@@ -3,3 +3,4 @@ ms2pip.result
*************
.. automodule:: ms2pip.result
+ :members:
diff --git a/docs/source/api/ms2pip.spectrum.rst b/docs/source/api/ms2pip.spectrum.rst
index 0bcea63b..0374cab5 100644
--- a/docs/source/api/ms2pip.spectrum.rst
+++ b/docs/source/api/ms2pip.spectrum.rst
@@ -3,3 +3,4 @@ ms2pip.spectrum
***************
.. automodule:: ms2pip.spectrum
+ :members:
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 5ca4efbd..3604ef14 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -6,17 +6,12 @@ Pip package
.. image:: https://flat.badgen.net/badge/install%20with/pip/green
:target: https://pypi.org/project/ms2pip/
-With Python 3.9 or higher, run:
+With Python 3.11 or higher, run:
.. code-block:: bash
pip install ms2pip
-Compiled wheels are available for various Python versions on 64bit Linux,
-Windows, and macOS. This should install MS²PIP in a few seconds. For other
-platforms, MS²PIP can be built from source, although it can take a while
-to compile the large prediction models.
-
We recommend using a `venv `__ or
`conda `__ virtual environment.
diff --git a/ms2pip/__init__.py b/ms2pip/__init__.py
index 2eaef20c..a0fbe937 100644
--- a/ms2pip/__init__.py
+++ b/ms2pip/__init__.py
@@ -1,13 +1,14 @@
# isort: skip_file
"""MS2PIP: Accurate and versatile peptide fragmentation spectrum prediction."""
-__version__ = "4.2.0-alpha.2"
+__version__ = "4.2.0-alpha.3"
__all__ = [
"predict_single",
"predict_batch",
"predict_library",
"correlate",
"correlate_preloaded",
+ "correlate_single",
"get_training_data",
"annotate_spectra",
"download_models",
@@ -26,6 +27,7 @@
predict_library,
correlate,
correlate_preloaded,
+ correlate_single,
get_training_data,
annotate_spectra,
download_models,
diff --git a/ms2pip/__main__.py b/ms2pip/__main__.py
index b01ec7e1..1dc6beaf 100644
--- a/ms2pip/__main__.py
+++ b/ms2pip/__main__.py
@@ -1,7 +1,6 @@
import logging
import sys
from pathlib import Path
-from typing import Optional
import click
import rich
@@ -32,7 +31,7 @@
def _infer_output_name(
input_filename: str,
- output_name: Optional[str] = None,
+ output_name: str | None = None,
) -> Path:
"""Infer output filename from input filename if output_filename was not defined."""
if output_name:
@@ -50,9 +49,7 @@ def cli(*args, **kwargs):
format="%(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=LOGGING_LEVELS[kwargs["logging_level"]],
- handlers=[
- RichHandler(rich_tracebacks=True, show_level=True, show_path=False)
- ],
+ handlers=[RichHandler(rich_tracebacks=True, show_level=True, show_path=False)],
)
rich.print(build_credits())
@@ -79,7 +76,7 @@ def predict_single(*args, **kwargs):
# Write output
rich.print(build_prediction_table(predicted_spectrum))
write_spectra(output_name, [result], output_format)
- if plot:
+ if plot and predicted_spectrum:
spectrum_to_png(predicted_spectrum, output_name)
@@ -142,6 +139,9 @@ def predict_library(*args, **kwargs):
@click.option("--model", type=click.Choice(MODELS), default="HCD")
@click.option("--model-dir")
@click.option("--ms2-tolerance", type=float, default=0.02)
+@click.option(
+ "--ms2-tolerance-mode", type=click.Choice(["Da", "ppm"], case_sensitive=False), default="Da"
+)
@click.option("--processes", "-n", type=int)
def correlate(*args, **kwargs):
# Parse arguments
@@ -171,6 +171,9 @@ def correlate(*args, **kwargs):
@click.option("--spectrum-id-pattern", "-p")
@click.option("--model", type=click.Choice(MODELS), default="HCD")
@click.option("--ms2-tolerance", type=float, default=0.02)
+@click.option(
+ "--ms2-tolerance-mode", type=click.Choice(["Da", "ppm"], case_sensitive=False), default="Da"
+)
@click.option("--processes", "-n", type=int)
def get_training_data(*args, **kwargs):
# Parse arguments
@@ -193,6 +196,9 @@ def get_training_data(*args, **kwargs):
@click.option("--spectrum-id-pattern", "-p")
@click.option("--model", type=click.Choice(MODELS), default="HCD")
@click.option("--ms2-tolerance", type=float, default=0.02)
+@click.option(
+ "--ms2-tolerance-mode", type=click.Choice(["Da", "ppm"], case_sensitive=False), default="Da"
+)
@click.option("--processes", "-n", type=int)
def annotate_spectra(*args, **kwargs):
# Parse arguments
@@ -203,8 +209,8 @@ def annotate_spectra(*args, **kwargs):
results = ms2pip.core.annotate_spectra(*args, **kwargs)
# Write intensities
- output_name_int = output_name.with_name(output_name.stem + "_observations").with_suffix()
- logger.info(f"Writing intensities to {output_name_int.with_suffix('.tsv')}")
+ output_name = output_name.with_name(output_name.stem + "_observations")
+ logger.info(f"Writing intensities to {output_name.with_suffix('.tsv')}")
write_spectra(output_name, results, "tsv")
diff --git a/ms2pip/_cython_modules/__init__.py b/ms2pip/_cython_modules/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ms2pip/_cython_modules/ms2pip_features_c.c b/ms2pip/_cython_modules/ms2pip_features_c.c
deleted file mode 100644
index 19749192..00000000
--- a/ms2pip/_cython_modules/ms2pip_features_c.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// Compute feature vectors from peptide
-unsigned int* get_v_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge)
- {
- int i,j,k;
-
- int fnum = 1; //first value in v is its length
-
- for (i=0; i < 19; i++) {
- count_n[i] = 0;
- count_c[i] = 0;
- }
-
- //I need this for Omega
- //important for sptms!!
- peptide_buf[0] = peptide[0];
- for (i=0; i < peplen; i++) {
- if (peptide[i+1] > 18) {
- peptide_buf[i+1] = sptm_mapper[peptide[i+1]];
- }
- else {
- peptide_buf[i+1] = peptide[i+1];
- }
- count_c[peptide_buf[i+1]]++;
- }
-
- int num_shared = 0;
-
- shared_features[num_shared++] = peplen;
- shared_features[num_shared++] = charge;
-
- shared_features[num_shared] = 0;
- if (charge == 1) {
- shared_features[num_shared] = 1;
- }
- num_shared++;
- shared_features[num_shared] = 0;
- if (charge == 2) {
- shared_features[num_shared] =1;
- }
- num_shared++;
- shared_features[num_shared] = 0;
- if (charge == 3) {
- shared_features[num_shared] =1;
- }
- num_shared++;
- shared_features[num_shared] = 0;
- if (charge == 4) {
- shared_features[num_shared] =1;
- }
- num_shared++;
- shared_features[num_shared] = 0;
- if (charge >= 5) {
- shared_features[num_shared] =1;
- }
- num_shared++;
-
- for (j=0; j < num_props; j++) {
- for (i=0; i < peplen; i++) {
- props_buffer[i] = props[j][peptide_buf[i+1]];
- }
- qsort(props_buffer,peplen,sizeof(unsigned int),cmpfunc);
- shared_features[num_shared++] = props_buffer[0];
- shared_features[num_shared++] = props_buffer[(int)(0.25*(peplen-1))];
- shared_features[num_shared++] = props_buffer[(int)(0.5*(peplen-1))];
- shared_features[num_shared++] = props_buffer[(int)(0.75*(peplen-1))];
- shared_features[num_shared++] = props_buffer[peplen-1];
- }
-
- for (i=0; i < peplen-1; i++) {
- for (j=0; j
-#include
-#include
-
-
-float* amino_masses;
-unsigned short* amino_F;
-unsigned short* sptm_mapper;
-float ntermmod;
-
-
-// For ms2pip_features_c.c
-unsigned int v[300000];
-
-int num_props = 4;
-unsigned int props[5][19] = {
- {37,35,59,129,94,0,210,81,191,106,101,117,115,343,49,90,60,134,104}, //basicity
- {68,23,33,29,70,58,41,73,32,66,38,0,40,39,44,53,71,51,55}, //helicity
- {51,75,25,35,100,16,3,94,0,82,12,0,22,22,21,39,80,98,70}, //hydrophobicity
- {32,23,0,4,27,32,48,32,69,29,26,35,28,79,29,28,31,31,28}, //pI
- //{71,103,115,129,147,57,137,113,128,131,114,97,128,156,87,101,99,186,163} //mass
-};
-
-unsigned int props_buffer[100]; //100 is max pep length
-unsigned int shared_features[100]; //100 is max num shared features
-unsigned int count_n[19];
-unsigned int count_c[19];
-unsigned short peptide_buf[200]; //IONBOT
-
-
-// Function required in ms2pip_features_c_general.c and ms2pip_features_c_catboost.c
-int cmpfunc (const void * a, const void * b) {
- return ( *(int*)a - *(int*)b );
-}
-
-
-// This function initializes amino acid masses and PTMs from a configuration file generated by Omega
-void init_ms2pip(char* amino_masses_fname, char* modifications_fname, char* modifications_fname_sptm) {
- int i;
- int nummods;
- int nummods_sptm;
- float mz;
- int numptm;
- int before;
- int after;
-
- FILE* f = fopen(modifications_fname,"rt");
- fscanf(f,"%i\n",&nummods);
- fclose(f);
-
- f = fopen(modifications_fname_sptm,"rt");
- fscanf(f,"%i\n",&nummods_sptm);
- fclose(f);
-
- //malloc
- amino_masses = (float*) malloc((38+nummods+nummods_sptm)*sizeof(float));
- amino_F = (unsigned short*) malloc((38+nummods+nummods_sptm)*sizeof(unsigned short));
- sptm_mapper = (unsigned short*) malloc((38+nummods+nummods_sptm)*sizeof(unsigned short));
-
- f = fopen(amino_masses_fname,"rt");
- for (i=0; i< 19; i++) {
- fscanf(f,"%f\n",&amino_masses[i]);
- amino_F[i] = (unsigned short) (amino_masses[i]-57.021464);
- }
- fscanf(f,"%f\n",&ntermmod);
- fclose(f);
-
- for (i=0; i< 19; i++) {
- amino_masses[19+i]=amino_masses[i];
- amino_F[19+i] = amino_F[i];
- }
-
- f = fopen(modifications_fname_sptm,"rt");
- fscanf(f,"%i\n",&nummods_sptm);
- for (i=0; i< nummods_sptm; i++) {
- fscanf(f,"%f,%i,%i,%i\n",&mz,&numptm,&before,&after);
- sptm_mapper[after] = before;
- sptm_mapper[after] = before;
- if (after > 18) {
- if (before<0) {
- amino_masses[after] = mz;
- }
- else
- {
- amino_masses[after] = amino_masses[before]+mz;
- amino_F[after] = (unsigned short) (amino_masses[before]+mz - 57.021464);
- }
- }
- }
- fclose(f);
- f = fopen(modifications_fname,"rt");
- fscanf(f,"%i\n",&nummods);
- for (i=0; i< nummods; i++) {
- fscanf(f,"%f,%i,%i,%i\n",&mz,&numptm,&before,&after);
- if (after > 18) {
- if (before<0) {
- amino_masses[after] = mz;
- }
- else
- {
- amino_masses[after] = amino_masses[before]+mz;
- amino_F[after] = (unsigned short) (amino_masses[before]+mz - 57.021464);
- }
- }
- }
- fclose(f);
-}
diff --git a/ms2pip/_cython_modules/ms2pip_peaks_c.c b/ms2pip/_cython_modules/ms2pip_peaks_c.c
deleted file mode 100644
index 607e4250..00000000
--- a/ms2pip/_cython_modules/ms2pip_peaks_c.c
+++ /dev/null
@@ -1,982 +0,0 @@
-#include
-#include
-#include
-
-#include "ms2pip_init_c.c"
-#include "ms2pip_features_c.c"
-
-#include "../_models_c/HCD-2019.h"
-#include "../_models_c/TMT.h"
-
-float membuffer[10000];
-float ions[2000];
-float mzs[2000];
-float predictions[1000];
-
-struct annotations{
- float* peaks;
- float* msms;
-};
-typedef struct annotations annotations;
-
-//compute feature vector from peptide + predict intensities
-float* get_p_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge, int model_id, int ce)
- {
- unsigned int* v = get_v_ms2pip(peplen, peptide, modpeptide, charge);
- int fnum = v[0]/(peplen-1);
- int i;
-
- // HCD
- if (model_id == 1) {
- for (i=0; i < peplen-1; i++) {
- predictions[0*(peplen-1)+i] = score_HCD_B(v+1+(i*fnum))+0.5;
- predictions[2*(peplen-1)-i-1] = score_HCD_Y(v+1+(i*fnum))+0.5;
- }
- }
-
- // TMT
- else if (model_id == 3) {
- for (i=0; i < peplen-1; i++) {
- predictions[0*(peplen-1)+i] = score_TMT_B(v+1+(i*fnum))+0.5;
- predictions[2*(peplen-1)-i-1] = score_TMT_Y(v+1+(i*fnum))+0.5;
- }
- }
- // EThcD
- // else if (model_id == 6) {
- // for (i=0; i < peplen-1; i++) {
- // predictions[0*(peplen-1)+i] = score_EThcD_B(v+1+(i*fnum))+0.5;
- // predictions[2*(peplen-1)-i-1] = score_EThcD_Y(v+1+(i*fnum))+0.5;
- // predictions[2*(peplen-1)+i] = score_EThcD_C(v+1+(i*fnum))+0.5;
- // predictions[4*(peplen-1)-i-1] = score_EThcD_Z(v+1+(i*fnum))+0.5;
- // }
- // }
-
- // HCDch2
- else if (model_id == 7) {
- for (i=0; i < peplen-1; i++) {
- predictions[0*(peplen-1)+i] = score_HCD_B(v+1+(i*fnum))+0.5;
- predictions[2*(peplen-1)-i-1] = score_HCD_Y(v+1+(i*fnum))+0.5;
- predictions[2*(peplen-1)+i] = score_HCD_B2(v+1+(i*fnum))+0.5;
- predictions[4*(peplen-1)-i-1] = score_HCD_Y2(v+1+(i*fnum))+0.5;
- }
- }
-
- else {
- return NULL;
- }
- return predictions;
-}
-
-
-//get fragment ion mz values (b, y)
-float* get_mz_ms2pip_general(int peplen, unsigned short* modpeptide)
- {
- int i,j;
- float mz;
- j=0;
-
- mz = 0;
- if (modpeptide[0] != 0) {
- mz = amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = mz+1.007236; //b-ion
- }
-
- mz = 0;
- if (modpeptide[peplen+1] != 0) {
- mz = amino_masses[modpeptide[peplen+1]];
- }
- for (i=peplen; i > 1; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = 18.0105647 + mz + 1.007236; //y-ion
- }
-
- mz = 0;
- if (modpeptide[0] != 0) {
- mz = amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = (mz + 1.007236 + 1.007236)/2; //b2-ion: (b-ion + H)/2
- }
-
- mz = 0;
- if (modpeptide[peplen+1] != 0) {
- mz = amino_masses[modpeptide[peplen+1]];
- }
- for (i=peplen; i > 1; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = (18.0105647 + mz + 1.007236 + 1.007236)/2; //y2-ion: (y-ion + H)/2
- }
-
- return membuffer;
-}
-
-
-//get fragment ion mz values (b, y, c, z)
-float* get_mz_ms2pip_etd(int peplen, unsigned short* modpeptide)
- {
- int i,j;
- float mz;
- j=0;
-
- mz = 0;
- if (modpeptide[0] != 0) {
- mz = amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = mz + 1.007236; //b-ion
- }
-
- mz = 0;
- if (modpeptide[peplen+1] != 0) {
- mz = amino_masses[modpeptide[peplen+1]];
- }
- for (i=peplen; i > 1; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = 18.0105647 + mz + 1.007236; //y-ion
- }
-
- mz = 0;
- if (modpeptide[0] != 0) {
- mz = amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = mz + 1.007825032 + 17.0265491; //c-ion: peptide + H + NH3
- }
-
- mz = 0;
- if (modpeptide[peplen+1] != 0) {
- mz = amino_masses[modpeptide[peplen+1]];
- }
- for (i=peplen; i > 1; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = mz + 17.00273965 - 15.01089904 + 1.007825032; //z-ion: peptide + OH - NH
- }
-
- return membuffer;
-}
-
-
-//get fragment ion mz values (b, y, b++, y++)
-float* get_mz_ms2pip_ch2(int peplen, unsigned short* modpeptide)
- {
- int i,j;
- float mz;
- j=0;
-
- mz = 0;
- if (modpeptide[0] != 0) {
- mz = amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = mz+1.007236; //b-ion
- }
-
- mz = 0;
- if (modpeptide[peplen+1] != 0) {
- mz = amino_masses[modpeptide[peplen+1]];
- }
- for (i=peplen; i > 1; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = 18.0105647 + mz + 1.007236; //y-ion
- }
-
- mz = 0;
- if (modpeptide[0] != 0) {
- mz = amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = (mz + 1.007236 + 1.007236)/2; //b2-ion: (b-ion + H)/2
- }
-
- mz = 0;
- if (modpeptide[peplen+1] != 0) {
- mz = amino_masses[modpeptide[peplen+1]];
- }
- for (i=peplen; i > 1; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j++] = (18.0105647 + mz + 1.007236 + 1.007236)/2; //y2-ion: (y-ion + H)/2
- }
-
- return membuffer;
-}
-
-//get all fragment ion peaks from spectrum
-annotations get_t_ms2pip_all(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- {
- int i,tmp;
- float mz;
- int msms_pos;
- int mem_pos;
- float max, maxmz, tmp2;
-
- //for (i=0; i < numpeaks; i++) {
- // fprintf(stderr,"m %f\n",msms[i]);
- //}
-
- for (i=0; i < 18*(peplen-1); i++) { // 2*9 iontypes: b: a -H2O -NH3 b c y: -H2O z y x
- ions[i] = -9.96578428466; //HARD CODED!!
- mzs[i] = 0; //HARD CODED!!
- }
-
- //b-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- int pos = 0;
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[pos++] = mz+1.007236-27.99492; // a
- membuffer[pos++] = mz+1.007236-18.010565; // -H2O
- membuffer[pos++] = mz+1.007236-17.026001; // -NH3
- membuffer[pos++] = mz+1.007236; // b
- membuffer[pos++] = mz+1.007236+17.02654; // c
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= 5*(peplen-1)) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- maxmz = msms[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- maxmz = msms[tmp];
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[mem_pos] = max;
- mzs[mem_pos] = maxmz;
- mem_pos += 1;
- }
- }
-
- //b2-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- pos = 0;
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[pos++] = (mz+1.007236+1.007236-27.99492)/2.; // a
- membuffer[pos++] = (mz+1.007236+1.007236-18.010565)/2.; // -H2O
- membuffer[pos++] = (mz+1.007236+1.007236-17.026001)/2.; // -NH3
- membuffer[pos++] = (mz+1.007236+1.007236)/2.; // b
- membuffer[pos++] = (mz+1.007236+1.007236+17.02654)/2.; // c
- }
-
- msms_pos = 0;
- mem_pos=0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= 5*(peplen-1)) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- maxmz = msms[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- maxmz = msms[tmp];
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[5*(peplen-1)+mem_pos] = max;
- mzs[5*(peplen-1)+mem_pos] = maxmz;
- mem_pos += 1;
- }
- }
-
-
- // y-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- pos = 0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[pos++] = 18.0105647+mz+1.007236-18.010565; //-H2O
- membuffer[pos++] = 18.0105647+mz+1.007236-17.02545; // z
- membuffer[pos++] = 18.0105647+mz+1.007236; // y
- membuffer[pos++] = 18.0105647+mz+1.007236+25.97926; // x
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= 4*(peplen-1)) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- maxmz = msms[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- maxmz = msms[tmp];
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[10*(peplen-1)+mem_pos] = max;
- mzs[10*(peplen-1)+mem_pos] = maxmz;
- mem_pos += 1;
- }
- }
-
- // y2-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- pos = 0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[pos++] = (18.0105647+mz+1.007236+1.007236-18.010565)/2.; //-H2O
- membuffer[pos++] = (18.0105647+mz+1.007236+1.007236-17.02545)/2.; // z
- membuffer[pos++] = (18.0105647+mz+1.007236+1.007236)/2.; // y
- membuffer[pos++] = (18.0105647+mz+1.007236+1.007236+25.97926)/2.; // x
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= 4*(peplen-1)) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- maxmz = msms[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- maxmz = msms[tmp];
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[14*(peplen-1)+mem_pos] = max;
- mzs[14*(peplen-1)+mem_pos] = maxmz;
- mem_pos += 1;
- }
- }
-
- //for (i=0; i < 18*(peplen-1); i++) { // 2*9 iontypes: b: a -H2O -NH3 b c y: -H2O z y x
- // fprintf(stderr,"%f ",ions[i]); //HARD CODED!!
- //}
- //fprintf(stderr,"\n");
-
- struct annotations r = {ions,mzs};
-
- return r;
-}
-
-//get fragment ion peaks from spectrum
-float* get_t_ms2pip_general(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- {
- int i,j,tmp;
- float mz;
- int msms_pos;
- int mem_pos;
- float max, tmp2;
-
- //for (i=0; i < numpeaks; i++) {
- // fprintf(stderr,"m %f\n",msms[i]);
- //}
-
- for (i=0; i < 2*(peplen-1); i++) {
- ions[i] = -9.96578428466; //HARD CODED!!
- }
-
- //b-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[i-1] = mz+1.007236;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- // y-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- j=0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j] = 18.0105647+mz+1.007236;
- j++;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
-
- return ions;
-}
-
-
-
-//get fragment ion peaks from spectrum (b, y, c, z)
-float* get_t_ms2pip_etd(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- {
- int i,j,tmp;
- float mz;
- int msms_pos;
- int mem_pos;
- float max, tmp2;
-
- for (i=0; i < 4*(peplen-1); i++) {
- ions[i] = -9.96578428466; //HARD CODED!!
- }
-
- //b-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[i-1] = mz+1.007236;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- // y-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- j=0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j] = 18.0105647+mz+1.007236;
- j++;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- //c-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[i-1] = mz + 1.007825032 + 17.026549;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[2*(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- // z-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- j=0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j] = mz + 17.00274 - 15.010899 + 1.007825032;
- j++;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[3*(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- return ions;
-}
-
-
-//get fragment ion peaks from spectrum (b, y, b++, y++)
-float* get_t_ms2pip_ch2(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- {
- int i,j,tmp;
- float mz;
- int msms_pos;
- int mem_pos;
- float max, tmp2;
-
- //for (i=0; i < numpeaks; i++) {
- // fprintf(stderr,"m %f\n",msms[i]);
- //}
-
- for (i=0; i < 4*(peplen-1); i++) {
- ions[i] = -9.96578428466; //HARD CODED!!
- }
-
- //b-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[i-1] = mz+1.007236;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- // y-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- j=0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j] = 18.0105647+mz+1.007236;
- j++;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- //b2-ions
- mz = ntermmod;
- if (modpeptide[0] != 0) {
- mz += amino_masses[modpeptide[0]];
- }
- for (i=1; i < peplen; i++) {
- mz += amino_masses[modpeptide[i]];
- membuffer[i-1] = (mz + 1.007236 + 1.007236)/2;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[2*(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- // y2-ions
- mz = 0.;
- if (modpeptide[peplen+1] != 0) {
- mz += modpeptide[peplen+1];
- }
- j=0;
- for (i=peplen; i >= 2; i--) {
- mz += amino_masses[modpeptide[i]];
- membuffer[j] = (18.0105647 + mz + 1.007236 + 1.007236)/2;
- j++;
- }
-
- msms_pos = 0;
- mem_pos = 0;
- while (1) {
- if (msms_pos >= numpeaks) {
- break;
- }
- if (mem_pos >= peplen-1) {
- break;
- }
- mz = membuffer[mem_pos];
- if (msms[msms_pos] > (mz+tolmz)) {
- mem_pos += 1;
- }
- else if (msms[msms_pos] < (mz-tolmz)) {
- msms_pos += 1;
- }
- else {
- max = peaks[msms_pos];
- tmp = msms_pos + 1;
- if (tmp < numpeaks) {
- while (msms[tmp] <= (mz+tolmz)) {
- tmp2 = peaks[tmp];
- if (max < tmp2) {
- max = tmp2;
- }
- tmp += 1;
- if (tmp == numpeaks) {
- break;
- }
- }
- }
- ions[3*(peplen-1)+mem_pos] = max;
- mem_pos += 1;
- }
- }
-
- return ions;
-}
diff --git a/ms2pip/_cython_modules/ms2pip_pyx.pyx b/ms2pip/_cython_modules/ms2pip_pyx.pyx
deleted file mode 100644
index 0e980ba6..00000000
--- a/ms2pip/_cython_modules/ms2pip_pyx.pyx
+++ /dev/null
@@ -1,187 +0,0 @@
-#cython: language_level=3
-import sys
-import numpy as np
-cimport numpy as np
-
-
-NUM_ION_TYPES_MAPPING = {'general': 2, 'etd': 4, 'ch2': 4, 'all': 18}
-
-
-cdef extern from "ms2pip_peaks_c.c":
-
- ctypedef struct annotations:
- float* peaks
- float* msms
-
- void init_ms2pip(char* amino_masses_fname, char* modifications_fname, char* modifications_fname_sptm)
-
- unsigned int* get_v_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge)
-
- float* get_p_ms2pip(int peplen, unsigned short* peptide, unsigned short* modpeptide, int charge, int model_id, int ce)
-
- float* get_mz_ms2pip_general(int peplen, unsigned short* modpeptide)
- float* get_mz_ms2pip_etd(int peplen, unsigned short* modpeptide)
- float* get_mz_ms2pip_ch2(int peplen, unsigned short* modpeptide)
-
- annotations get_t_ms2pip_all(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- float* get_t_ms2pip_general(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- float* get_t_ms2pip_etd(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
- float* get_t_ms2pip_ch2(int peplen, unsigned short* modpeptide, int numpeaks, float* msms, float* peaks, float tolmz)
-
-
-def ms2pip_init(amino_masses_fname, modifications_fname, modifications_fname_sptm):
- if not isinstance(amino_masses_fname, bytearray):
- amino_masses_fname = bytearray(amino_masses_fname.encode())
- if not isinstance(modifications_fname, bytearray):
- modifications_fname = bytearray(modifications_fname.encode())
- if not isinstance(modifications_fname_sptm, bytearray):
- modifications_fname_sptm = bytearray(modifications_fname_sptm.encode())
- init_ms2pip(amino_masses_fname, modifications_fname, modifications_fname_sptm)
-
-
-def get_vector(np.ndarray[unsigned short, ndim=1, mode="c"] peptide,
- np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- charge):
-
- cdef unsigned int* results = get_v_ms2pip(len(peptide)-2, &peptide[0], &modpeptide[0], charge)
-
- r = []
- offset = 0
- fnum = int(results[0] / (len(peptide) - 3))
- for i in range(len(peptide) - 3):
- v = []
- for j in range(fnum):
- v.append(results[j + 1 + offset])
- offset += fnum
- r.append(np.array(v, dtype=np.uint16))
-
- return r
-
-
-def get_predictions(np.ndarray[unsigned short, ndim=1, mode="c"] peptide,
- np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- charge, model_id, peaks_version, ce):
- cdef float* results = get_p_ms2pip(len(peptide)-2, &peptide[0], &modpeptide[0], charge, model_id, ce)
- if results is NULL:
- raise NotImplementedError(model_id)
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]):
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
-
-
-def get_targets(*args):
- if args[4] == 'general':
- result = get_targets_general(*args)
- if args[4] == 'etd':
- result = get_targets_etd(*args)
- if args[4] == 'ch2':
- result = get_targets_ch2(*args)
- if args[4] == 'all':
- result = get_targets_all(*args)
- return result
-
-
-def get_targets_all(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- np.ndarray[float, ndim=1, mode="c"] msms,
- np.ndarray[float, ndim=1, mode="c"] peaks,
- fragerror, peaks_version):
- cdef annotations results
- results = get_t_ms2pip_all(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror)
- result_peaks = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]*(len(modpeptide)-3)):
- result_peaks.append(results.peaks[i])
- result_mzs = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]*(len(modpeptide)-3)):
- result_mzs.append(results.msms[i])
- return (result_mzs,result_peaks)
-
-
-def get_targets_general(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- np.ndarray[float, ndim=1, mode="c"] msms,
- np.ndarray[float, ndim=1, mode="c"] peaks,
- fragerror, peaks_version):
- cdef float* results = get_t_ms2pip_general(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror)
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]): #SD: HAd to change this
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
-
-def get_targets_etd(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- np.ndarray[float, ndim=1, mode="c"] msms,
- np.ndarray[float, ndim=1, mode="c"] peaks,
- fragerror, peaks_version):
- cdef float* results = get_t_ms2pip_etd(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror)
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]):
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
-
-
-def get_targets_ch2(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- np.ndarray[float, ndim=1, mode="c"] msms,
- np.ndarray[float, ndim=1, mode="c"] peaks,
- fragerror, peaks_version):
- cdef float* results = get_t_ms2pip_ch2(len(modpeptide)-2, &modpeptide[0], len(peaks), &msms[0], &peaks[0], fragerror)
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]):
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
-
-
-def get_mzs(*args):
- if args[1] == 'general':
- result = get_mzs_general(*args)
- if args[1] == 'etd':
- result = get_mzs_etd(*args)
- if args[1] == 'ch2':
- result = get_mzs_ch2(*args)
- return result
-
-
-def get_mzs_general(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- peaks_version):
- cdef float* results = get_mz_ms2pip_general(len(modpeptide)-2, &modpeptide[0])
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]):
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
-
-
-def get_mzs_etd(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- peaks_version):
- cdef float* results = get_mz_ms2pip_etd(len(modpeptide)-2, &modpeptide[0])
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]):
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
-
-
-def get_mzs_ch2(np.ndarray[unsigned short, ndim=1, mode="c"] modpeptide,
- peaks_version):
- cdef float* results = get_mz_ms2pip_ch2(len(modpeptide)-2, &modpeptide[0])
- result_parsed = []
- for i in range(NUM_ION_TYPES_MAPPING[peaks_version]):
- tmp = []
- for j in range(len(modpeptide)-3):
- tmp.append(results[(len(modpeptide)-3) * i + j])
- result_parsed.append(tmp)
- return result_parsed
diff --git a/ms2pip/_models_c/HCD-2019.h b/ms2pip/_models_c/HCD-2019.h
deleted file mode 100644
index 4887791b..00000000
--- a/ms2pip/_models_c/HCD-2019.h
+++ /dev/null
@@ -1,4 +0,0 @@
-float score_HCD_B(unsigned int* v);
-float score_HCD_B2(unsigned int* v);
-float score_HCD_Y(unsigned int* v);
-float score_HCD_Y2(unsigned int* v);
diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B.c
deleted file mode 100644
index ae8fcf62..00000000
--- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B.c
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d0ab6fdc7c3b2324228912757d1c55bd56fdebbe066a0b5fc0305f0c7bbb9969
-size 8726156
diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B2.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B2.c
deleted file mode 100644
index 19a3d6a4..00000000
--- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_B2.c
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b2e27aab327de1a883500f7bb019ff25c9412141fb19ffe1a189eb092bdebd5b
-size 1664299
diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y.c
deleted file mode 100644
index 755926fc..00000000
--- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y.c
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:105a9323043df6a192d300adaf3867391ef9e6e03599f6bc393289c3f62b982d
-size 9116794
diff --git a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y2.c b/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y2.c
deleted file mode 100644
index 4dd0a7b4..00000000
--- a/ms2pip/_models_c/HCD-2019/model_20190107_HCD_train_Y2.c
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c1be44093ce4b3085b49142ed5a263c01a79c90b3bcaea4c0e829e3b587f2a3
-size 9063001
diff --git a/ms2pip/_models_c/TMT.h b/ms2pip/_models_c/TMT.h
deleted file mode 100644
index 6ec67231..00000000
--- a/ms2pip/_models_c/TMT.h
+++ /dev/null
@@ -1,2 +0,0 @@
-float score_TMT_B(unsigned int* v);
-float score_TMT_Y(unsigned int* v);
diff --git a/ms2pip/_models_c/TMT/model_20190107_TMT_train_B.c b/ms2pip/_models_c/TMT/model_20190107_TMT_train_B.c
deleted file mode 100644
index c23a7f9a..00000000
--- a/ms2pip/_models_c/TMT/model_20190107_TMT_train_B.c
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:49252e3d1b42df924b422f118870f237ec541d58db0037b4800444999277d055
-size 2348087
diff --git a/ms2pip/_models_c/TMT/model_20190107_TMT_train_Y.c b/ms2pip/_models_c/TMT/model_20190107_TMT_train_Y.c
deleted file mode 100644
index f3635860..00000000
--- a/ms2pip/_models_c/TMT/model_20190107_TMT_train_Y.c
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5429cc869363e3383ce41a78329870898ce6e6c3b5fcf9ed1ca5eaac9417df42
-size 4323734
diff --git a/ms2pip/_spectrum_processing.py b/ms2pip/_spectrum_processing.py
new file mode 100644
index 00000000..8aed4cb6
--- /dev/null
+++ b/ms2pip/_spectrum_processing.py
@@ -0,0 +1,271 @@
+"""Internal spectrum annotation, target extraction, and PSM-spectrum matching."""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+from collections.abc import Generator
+from pathlib import Path
+
+import numpy as np
+from psm_utils import PSM, PSMList
+from psm_utils import Peptidoform
+from ms2rescore_rs import (
+ MS2Spectrum, # type: ignore[ty:unresolved-import]
+ Precursor, # type: ignore[ty:unresolved-import]
+ annotate_ms2_spectra, # type: ignore[ty:unresolved-import]
+ get_ms2_spectra, # type: ignore[ty:unresolved-import]
+)
+
+import ms2pip.exceptions as exceptions
+from ms2pip.constants import MODELS
+from ms2pip.spectrum import ObservedSpectrum
+
+
+def proforma_to_mass_shift(peptidoform: Peptidoform) -> str:
+ """
+ Convert a Peptidoform to a mass-shift ProForma string.
+
+ Replaces all modification labels with numeric mass shifts so that
+ ms2rescore-rs can parse them. Handles sequence modifications and
+ N/C-terminal modifications.
+
+ Note: This does not handle ProForma features like labile modifications,
+ unlocalized modifications, tagged intervals, or isotope labels. These are
+ not used by ms2pip.
+ """
+ parts = []
+ n_term = peptidoform.properties.get("n_term")
+ if n_term:
+ for mod in n_term:
+ parts.append(f"[{mod.mass:+.4f}]-")
+ for aa, mods in peptidoform.parsed_sequence:
+ parts.append(aa)
+ if mods:
+ for mod in mods:
+ parts.append(f"[{mod.mass:+.4f}]") # type: ignore[ty:unresolved-attribute]
+ c_term = peptidoform.properties.get("c_term")
+ if c_term:
+ for mod in c_term:
+ parts.append(f"-[{mod.mass:+.4f}]")
+ if peptidoform.precursor_charge:
+ parts.append(f"/{peptidoform.precursor_charge}")
+ return "".join(parts)
+
+
+def read_raw_spectra(spectrum_file: str) -> Generator[MS2Spectrum, None, None]:
+ """Read MS2 spectra as raw ms2rescore-rs objects (no conversion to ObservedSpectrum)."""
+ try:
+ spectra = get_ms2_spectra(str(spectrum_file))
+ except ValueError as e:
+ raise exceptions.UnsupportedSpectrumFiletypeError(Path(spectrum_file).suffixes) from e
+
+ for spectrum in spectra:
+ if str(spectrum.identifier) == "" or len(spectrum.mz) == 0 or len(spectrum.intensity) == 0:
+ continue
+ yield spectrum
+
+
+def to_observed_spectrum(spectrum: MS2Spectrum) -> ObservedSpectrum:
+ """Convert an MS2Spectrum to an ObservedSpectrum."""
+ return ObservedSpectrum(
+ mz=np.array(spectrum.mz, dtype=np.float32),
+ intensity=np.array(spectrum.intensity, dtype=np.float32),
+ identifier=str(spectrum.identifier),
+ precursor_mz=float(spectrum.precursor.mz),
+ precursor_charge=int(spectrum.precursor.charge),
+ retention_time=float(spectrum.precursor.rt),
+ )
+
+
+def annotate_spectrum(
+ spectrum: ObservedSpectrum,
+ psm: PSM,
+ model: str,
+ ms2_tolerance: float,
+ ms2_tolerance_mode: str,
+) -> list[list[tuple]]:
+ """
+ Annotate an ObservedSpectrum using ms2rescore-rs.
+
+ Returns peak annotations as plain Python lists of ``(series, position, charge)`` tuples.
+ """
+ ms2_spectrum = MS2Spectrum(
+ identifier=spectrum.identifier or "",
+ mz=list(spectrum.mz),
+ intensity=list(spectrum.intensity),
+ precursor=Precursor(
+ mz=float(spectrum.precursor_mz) if spectrum.precursor_mz else 0.0,
+ charge=int(spectrum.precursor_charge) if spectrum.precursor_charge else 0,
+ rt=float(spectrum.retention_time) if spectrum.retention_time else 0.0,
+ ),
+ )
+ frag_model = MODELS[model]["fragmentation"]
+ proforma = proforma_to_mass_shift(psm.peptidoform)
+ seq_len = len(psm.peptidoform.parsed_sequence)
+
+ annotated = annotate_ms2_spectra(
+ spectra=[ms2_spectrum],
+ proformas=[proforma],
+ seq_lens=[seq_len],
+ fragmentation_model=frag_model,
+ mass_mode="monoisotopic",
+ tolerance_value=float(ms2_tolerance),
+ tolerance_mode=ms2_tolerance_mode.lower(),
+ )
+ return [
+ [(a.series, a.position, a.charge) for a in peak_anns]
+ for peak_anns in annotated[0].peak_annotations
+ ]
+
+
+def targets_from_annotations(
+ peak_annotations: list,
+ intensity: np.ndarray,
+ ion_types: list[str],
+ seq_len: int,
+) -> dict[str, np.ndarray]:
+ """
+ Extract observed intensity targets from peak annotations.
+
+ Converts per-peak fragment annotations into per-ion-type intensity arrays.
+
+ Parameters
+ ----------
+ peak_annotations
+ Per-peak annotations. Each element is a list of annotations for that peak.
+ Annotations can be :py:class:`ms2rescore_rs.FragmentAnnotation` objects or
+ ``(series, position, charge)`` tuples.
+ intensity
+ Preprocessed intensity array (TIC-normalized, log2-transformed).
+ ion_types
+ Ion types to extract, e.g. ``["b", "y"]`` or ``["b", "y", "b2", "y2"]``.
+ seq_len
+ Length of the peptide sequence (number of amino acids).
+
+ Returns
+ -------
+ targets
+ Dict mapping ion type to intensity array of length ``seq_len - 1``.
+
+ """
+ n_ions = seq_len - 1
+ floor_value = np.float32(np.log2(0.001))
+ targets = {ion: np.full(n_ions, floor_value, dtype=np.float32) for ion in ion_types}
+
+ for peak_idx, peak_anns in enumerate(peak_annotations):
+ for ann in peak_anns:
+ if isinstance(ann, tuple):
+ series, position, charge = ann
+ else:
+ series, position, charge = ann.series, ann.position, ann.charge
+
+ ion_key = series if charge == 1 else f"{series}{charge}"
+
+ if ion_key not in targets:
+ continue
+
+ pos = position - 1
+ if 0 <= pos < n_ions:
+ if intensity[peak_idx] > targets[ion_key][pos]:
+ targets[ion_key][pos] = intensity[peak_idx]
+
+ return targets
+
+
+def load_and_match_spectra(
+ psm_list: PSMList,
+ spectrum_file: str | Path,
+ spectrum_id_pattern: str,
+ model: str,
+ ms2_tolerance: float,
+ ms2_tolerance_mode: str,
+) -> list[tuple[int, PSM, ObservedSpectrum, list]]:
+ """
+ Read spectra from file, annotate, preprocess, and match to PSMs.
+
+ Reads raw MS2Spectrum objects, matches to PSMs by spectrum ID, batch-annotates
+ all matched spectra in a single Rust call, then converts to ObservedSpectrum
+ and preprocesses.
+
+ Returns list of (psm_index, psm, preprocessed_spectrum, peak_annotations) tuples.
+ """
+ try:
+ spectrum_id_regex = re.compile(spectrum_id_pattern)
+ except TypeError:
+ spectrum_id_regex = re.compile(r"(.*)")
+
+ psms_by_specid = defaultdict(list)
+ for i, psm in enumerate(psm_list):
+ psms_by_specid[str(psm.spectrum_id)].append((i, psm))
+
+ # Step 1: Read raw spectra and match to PSMs (no conversion yet)
+ matched_raw: list[tuple[str, MS2Spectrum, list[tuple[int, PSM]]]] = []
+ for spectrum in read_raw_spectra(str(spectrum_file)):
+ match = spectrum_id_regex.search(str(spectrum.identifier))
+ try:
+ spectrum_id = match[1] # type: ignore[ty:not-subscriptable]
+ except (TypeError, IndexError):
+ raise exceptions.TitlePatternError(
+ f"Spectrum title pattern `{spectrum_id_pattern}` could not be matched to "
+ f"spectrum ID `{spectrum.identifier}`. "
+ " Are you sure that the regex contains a capturing group?"
+ )
+
+ if spectrum_id not in psms_by_specid:
+ continue
+
+ matched_raw.append((spectrum_id, spectrum, psms_by_specid[spectrum_id]))
+
+ if not matched_raw:
+ return []
+
+ # Step 2: Batch annotate all matched spectra (single Rust call, Rayon-parallelized)
+ batch_spectra = []
+ batch_proformas = []
+ batch_seq_lens = []
+ batch_indices = [] # (matched_raw_idx, psm_within_spectrum_idx)
+
+ for raw_idx, (_, spectrum, psm_pairs) in enumerate(matched_raw):
+ for psm_idx, (_, psm) in enumerate(psm_pairs):
+ batch_spectra.append(spectrum)
+ batch_proformas.append(proforma_to_mass_shift(psm.peptidoform))
+ batch_seq_lens.append(len(psm.peptidoform.parsed_sequence))
+ batch_indices.append((raw_idx, psm_idx))
+
+ frag_model = MODELS[model]["fragmentation"]
+ annotated_spectra = annotate_ms2_spectra(
+ spectra=batch_spectra,
+ proformas=batch_proformas,
+ seq_lens=batch_seq_lens,
+ fragmentation_model=frag_model,
+ mass_mode="monoisotopic",
+ tolerance_value=float(ms2_tolerance),
+ tolerance_mode=ms2_tolerance_mode.lower(),
+ )
+
+ # Step 3: Convert to ObservedSpectrum, preprocess, and assemble results
+ preprocessed_cache: dict[str, ObservedSpectrum] = {}
+ results = []
+
+ for batch_idx, (raw_idx, psm_idx) in enumerate(batch_indices):
+ spec_id, raw_spectrum, psm_pairs = matched_raw[raw_idx]
+ psm_index, psm = psm_pairs[psm_idx]
+
+ if spec_id not in preprocessed_cache:
+ obs = to_observed_spectrum(raw_spectrum)
+ for label_type in ["iTRAQ", "TMT"]:
+ if label_type in model:
+ obs.remove_reporter_ions(label_type)
+ obs.tic_norm()
+ obs.log2_transform()
+ preprocessed_cache[spec_id] = obs
+
+ peak_annotations = [
+ [(a.series, a.position, a.charge) for a in peak_anns]
+ for peak_anns in annotated_spectra[batch_idx].peak_annotations
+ ]
+
+ results.append((psm_index, psm, preprocessed_cache[spec_id], peak_annotations))
+
+ return results
diff --git a/ms2pip/_utils/dlib.py b/ms2pip/_utils/dlib.py
index 50714dad..004ef3b7 100644
--- a/ms2pip/_utils/dlib.py
+++ b/ms2pip/_utils/dlib.py
@@ -2,7 +2,6 @@
import zlib
from pathlib import Path
-from typing import Union
import numpy
import sqlalchemy
@@ -44,9 +43,9 @@ def process_result_value(self, value, dialect):
decompressed = zlib.decompress(value)
return numpy.frombuffer(decompressed, dtype=self.dtype).tolist()
- def copy(self):
+ def copy(self): # type: ignore[ty:invalid-method-override]
# NOTE: length will be passed through to BLOB
- return CompressedArray(self.dtype, self.impl.length)
+ return CompressedArray(self.dtype, self.impl.length) # type: ignore[ty:unresolved-attribute]
metadata = MetaData()
@@ -100,6 +99,6 @@ def copy(self):
)
-def open_sqlite(filename: Union[str, Path]) -> Connection:
+def open_sqlite(filename: str | Path) -> Connection:
engine = sqlalchemy.create_engine(f"sqlite:///{filename}")
return engine.connect()
diff --git a/ms2pip/_utils/encoder.py b/ms2pip/_utils/encoder.py
deleted file mode 100644
index da1dc0be..00000000
--- a/ms2pip/_utils/encoder.py
+++ /dev/null
@@ -1,322 +0,0 @@
-"""Peptide and modification handling for MS2PIP."""
-from __future__ import annotations
-
-import logging
-import os
-import tempfile
-from typing import Generator
-
-import numpy as np
-from psm_utils import Peptidoform, PSMList
-from pyteomics import proforma
-
-import ms2pip.exceptions as exceptions
-
-logger = logging.getLogger(__name__)
-
-AMINO_ACIDS = [
- "A",
- "C",
- "D",
- "E",
- "F",
- "G",
- "H",
- "I",
- "K",
- "M",
- "N",
- "P",
- "Q",
- "R",
- "S",
- "T",
- "V",
- "W",
- "Y",
-]
-
-AMINO_ACID_MASSES = [
- 71.037114,
- 103.00919,
- 115.026943,
- 129.042593,
- 147.068414,
- 57.021464,
- 137.058912,
- 113.084064,
- 128.094963,
- 131.040485,
- 114.042927,
- 97.052764,
- 128.058578,
- 156.101111,
- 87.032028,
- 101.047679,
- 99.068414,
- 186.079313,
- 163.063329,
-]
-
-AMINO_ACID_IDS = {a: i for i, a in enumerate(AMINO_ACIDS)}
-AMINO_ACID_IDS["L"] = AMINO_ACID_IDS["I"]
-
-
-class Encoder:
- """Modification-aware encoding of peptidoforms."""
-
- def __init__(self) -> None:
- """
- Modification-aware encoding of peptidoforms.
-
- MS²PIP requires all modification mass shifts to be written to a file for use in C code
- before running. This class handles the encoding of peptides and peptidoforms for
- modifications that have been defined.
-
- Encoder files are to be passed on to the ``ms2pip_pyx.ms2pip_init`` function. E.g.,
- ``ms2pip_pyx.ms2pip_init(*encoder.encoder_files)``.
-
- Notes
- -----
- - Either used as a context manager or manually call :py:meth:`write_encoder_files` before
- use and :py:meth:`remove_encoder_files` after use.
- - Fixed, labile, and unlocalized modifications are ignored. Fixed modifications
- should therefore already have been applied (see
- :py:meth:`psm_utils.psm_list.PSMList.apply_fixed_modifications`).
-
- """
- self.modifications = {}
- self.encoder_files = None
-
- self._next_mod_id = 38 # Avoid clash with amino acids and mutations (ionbot compatibility)
-
- def __enter__(self):
- if not self.encoder_files:
- self.write_encoder_files()
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.remove_encoder_files()
-
- def __repr__(self) -> str:
- return "{}.{}(modifications={})".format(
- self.__class__.__module__,
- self.__class__.__qualname__,
- self.modifications,
- )
-
- @classmethod
- def from_peptidoform(cls, peptidoform: Peptidoform) -> Encoder:
- """
- Create Encoder instance from peptidoform.
-
- Parameters
- ----------
- peptidoform : Peptidoform
- Peptidoform to use for modification configuration.
-
- Returns
- -------
- Encoder
- Encoder instance with modifications configured.
-
- """
- encoder = cls()
- encoder._configure_from_peptidoform(peptidoform)
- encoder.write_encoder_files()
- return encoder
-
- @classmethod
- def from_psm_list(cls, psm_list: PSMList) -> Encoder:
- """
- Create Encoder instance from PSMList.
-
- Parameters
- ----------
- psm_list : PSMList
- PSMList to use for modification configuration.
-
- Returns
- -------
- Encoder
- Encoder instance with modifications configured.
-
- """
- encoder = cls()
- encoder._configure_from_psm_list(psm_list)
- encoder.write_encoder_files()
- return encoder
-
- def _configure_modification(self, target: str, modification: proforma.TagBase):
- """
- Add single pyteomics.proforma modification to configuration.
-
- Parameters
- ----------
- target : str
- Target amino acid one-letter code or terminus (``n_term`` or ``c_term``).
- modification : pyteomics.proforma.TagBase
- Modification to add.
-
- """
- if target == "n_term":
- amino_acid_id = -1
- elif target == "c_term":
- amino_acid_id = -2
- elif target in AMINO_ACID_IDS:
- amino_acid_id = AMINO_ACID_IDS[target]
- else:
- logger.warning(f"Skipping modification for invalid amino acid: {target}")
- return None
-
- self.modifications[(target, str(modification))] = {
- "mod_id": self._next_mod_id,
- "mass_shift": modification.mass,
- "amino_acid": target,
- "amino_acid_id": amino_acid_id,
- "modification": modification,
- }
- self._next_mod_id += 1
-
- def _configure_from_peptidoform(self, peptidoform: Peptidoform):
- """Configure encoder with modifications from single Peptidoform."""
- # Get unique modifications from psm
- unique_modifications = dict()
- try:
- for aa, mods in peptidoform.parsed_sequence:
- if mods:
- unique_modifications.update({(aa, str(mod)): mod for mod in mods})
- for term in ["n_term", "c_term"]:
- if peptidoform.properties[term]:
- unique_modifications.update(
- {(term, str(mod)): mod for mod in peptidoform.properties[term]}
- )
- except KeyError as e:
- raise exceptions.UnresolvableModificationError(e.args[0]) from e
-
- # Add modification entries
- for (target, _), mod in unique_modifications.items():
- self._configure_modification(target, mod)
-
- def _configure_from_psm_list(self, psm_list: PSMList):
- """Configure encoder with modifications from PSMList."""
- # Get unique modifications from psm_list
- unique_modifications = dict()
- try:
- for psm in psm_list:
- for aa, mods in psm.peptidoform.parsed_sequence:
- if mods:
- unique_modifications.update({(aa, str(mod)): mod for mod in mods})
- for term in ["n_term", "c_term"]:
- if psm.peptidoform.properties[term]:
- unique_modifications.update(
- {(term, str(mod)): mod for mod in psm.peptidoform.properties[term]}
- )
- except KeyError as e:
- raise exceptions.UnresolvableModificationError(e.args[0]) from e
-
- # Add modification entries
- for (target, _), mod in unique_modifications.items():
- self._configure_modification(target, mod)
-
- def write_encoder_files(self) -> str:
- """Write configured masses to temporary files for use in C code."""
- # AA file
- amino_file = tempfile.NamedTemporaryFile(delete=False, mode="w", newline="\n")
- for m in AMINO_ACID_MASSES:
- amino_file.write("{}\n".format(m))
- amino_file.write("0\n")
- amino_file.close()
-
- # PTM file
- mod_file = tempfile.NamedTemporaryFile(delete=False, mode="w", newline="\n")
- mod_file.write("{}\n".format(len(self.modifications)))
- for mod in self.modifications.values():
- mod_file.write(
- "{},1,{},{}\n".format(mod["mass_shift"], mod["amino_acid_id"], mod["mod_id"])
- )
- mod_file.close()
-
- # SPTM file (ionbot compatibility)
- mod_file2 = tempfile.NamedTemporaryFile(delete=False, mode="w", newline="\n")
- mod_file2.write("0\n")
- mod_file2.close()
-
- # Store temporary file names
- self.encoder_files = (amino_file.name, mod_file.name, mod_file2.name)
-
- def remove_encoder_files(self):
- """Remove temporary encoder files."""
- if self.encoder_files:
- for f in self.encoder_files:
- os.remove(f)
- self.encoder_files = None
-
- @staticmethod
- def validate_peptidoform(peptidoform: Peptidoform):
- """Validate whether a peptidoform can be encoded for MS²PIP."""
- # Charge required
- if peptidoform.precursor_charge is None:
- raise exceptions.InvalidPeptidoformError("Peptidoform charge is required.")
-
- # Peptides longer then 101 lead to "Segmentation fault (core dumped)"
- if len(peptidoform.parsed_sequence) > 100:
- raise exceptions.InvalidPeptidoformError(
- "Peptidoform sequence cannot be longer than 100 amino acids."
- )
- elif len(peptidoform.parsed_sequence) < 4:
- raise exceptions.InvalidPeptidoformError(
- "Peptidoform sequence cannot be shorter than 4 amino acids."
- )
-
- def encode_peptide(self, peptidoform: Peptidoform) -> np.ndarray:
- """Encode a peptide (without modifications) for MS²PIP."""
- self.validate_peptidoform(peptidoform)
-
- try:
- encoded = [0] + [AMINO_ACID_IDS[aa] for aa, _ in peptidoform.parsed_sequence] + [0]
- except KeyError as e:
- raise exceptions.InvalidAminoAcidError(
- f"Unsupported amino acid found in peptide `{peptidoform.proforma}`"
- ) from e
- return np.array(encoded, dtype=np.uint16)
-
- def encode_peptidoform(self, peptidoform: Peptidoform) -> np.ndarray:
- """
- Encode a peptidoform for MS²PIP.
-
- Notes
- -----
- - Multiple modifications per site is not supported. The first modification is used.
- - Fixed, labile, and unlocalized modifications are ignored. Fixed modifications
- should therefore already have been applied (see
- :py:meth:`psm_utils.PSMList.apply_fixed_modifications`).
-
- """
-
- def _generate_encoding(peptidoform) -> Generator[int, None, None]:
- if peptidoform.properties["n_term"]:
- mod_str = str(peptidoform.properties["n_term"][0])
- yield self.modifications["n_term", mod_str]["mod_id"]
- else:
- yield 0
-
- for aa, mods in peptidoform.parsed_sequence:
- try:
- if not mods:
- yield AMINO_ACID_IDS[aa]
- else:
- yield self.modifications[aa, str(mods[0])]["mod_id"]
- except KeyError as e:
- raise exceptions.InvalidAminoAcidError(
- f"Unsupported amino acid found in peptide `{peptidoform.proforma}`"
- ) from e
-
- if peptidoform.properties["c_term"]:
- mod_str = str(peptidoform.properties["c_term"][0])
- yield self.modifications["c_term", mod_str]["mod_id"]
- else:
- yield 0
-
- self.validate_peptidoform(peptidoform)
- return np.array(list(_generate_encoding(peptidoform)), dtype=np.uint16)
diff --git a/ms2pip/_utils/feature_names.py b/ms2pip/_utils/feature_names.py
index 35632ab8..ea0c8fa7 100644
--- a/ms2pip/_utils/feature_names.py
+++ b/ms2pip/_utils/feature_names.py
@@ -1,4 +1,7 @@
-from ms2pip._utils.encoder import AMINO_ACIDS
+AMINO_ACIDS = [
+ "A", "C", "D", "E", "F", "G", "H", "I", "K", "M",
+ "N", "P", "Q", "R", "S", "T", "V", "W", "Y",
+]
def get_feature_names():
diff --git a/ms2pip/_utils/ion_mobility.py b/ms2pip/_utils/ion_mobility.py
index 7dceb9f6..530703cd 100644
--- a/ms2pip/_utils/ion_mobility.py
+++ b/ms2pip/_utils/ion_mobility.py
@@ -14,7 +14,7 @@ class IonMobility:
def __init__(self, processes=1) -> None:
# Lazy import to avoid loading loading heavy dependencies when not needed
try:
- from im2deep.im2deep import predict_ccs # noqa: F401
+ from im2deep.im2deep import predict_ccs # noqa: F401 # type: ignore[ty:unresolved-import]
self.predict_fn = predict_ccs
self.processes = processes
diff --git a/ms2pip/_utils/psm_input.py b/ms2pip/_utils/psm_input.py
index 7a0cacc6..ee57e461 100644
--- a/ms2pip/_utils/psm_input.py
+++ b/ms2pip/_utils/psm_input.py
@@ -1,8 +1,5 @@
-from __future__ import annotations
-
import logging
from pathlib import Path
-from typing import Union
import psm_utils.io.peptide_record
from psm_utils import PSMList
@@ -10,7 +7,7 @@
logger = logging.getLogger(__name__)
-def read_psms(psms: Union[str, Path, PSMList], filetype: Union[str, None]) -> PSMList:
+def read_psms(psms: str | Path | PSMList, filetype: str | None) -> PSMList:
"""Read PSMList or PSM file."""
# Read PSMs
if isinstance(psms, (str, Path)):
diff --git a/ms2pip/_utils/retention_time.py b/ms2pip/_utils/retention_time.py
index 043f24a2..675688cb 100644
--- a/ms2pip/_utils/retention_time.py
+++ b/ms2pip/_utils/retention_time.py
@@ -71,7 +71,7 @@ def _init_deeplc(self):
"""
# Only import if DeepLC will be used, otherwise lots of extra heavy
# dependencies (e.g. Tensorflow) are imported as well
- import deeplc
+ import deeplc # type: ignore[ty:unresolved-import]
deeplc_params = self.config["deeplc"]
if "calibration_file" in deeplc_params and deeplc_params["calibration_file"]:
@@ -109,9 +109,7 @@ def _prepare_deeplc_peptide_df(self):
def _run_deeplc(self):
"""Run DeepLC."""
logger.info("Predicting retention times with DeepLC...")
- self.deeplc_preds = self.deeplc_predictor.make_preds(
- seq_df=self.deeplc_pep_df.fillna("")
- )
+ self.deeplc_preds = self.deeplc_predictor.make_preds(seq_df=self.deeplc_pep_df.fillna("")) # type: ignore[ty:unresolved-attribute]
def _parse_deeplc_preds(self):
"""Add DeepLC predictions to peprec DataFrame."""
diff --git a/ms2pip/_utils/xgb_models.py b/ms2pip/_utils/xgb_models.py
index 6e933a91..f20a5733 100644
--- a/ms2pip/_utils/xgb_models.py
+++ b/ms2pip/_utils/xgb_models.py
@@ -5,58 +5,130 @@
import os
import urllib.request
from itertools import islice
+from pathlib import Path
import numpy as np
import xgboost as xgb
-from ms2pip.exceptions import InvalidXGBoostModelError
+import ms2pip.exceptions as exceptions
+from ms2pip.constants import MODELS
logger = logging.getLogger(__name__)
+_MAX_PREDICTION_THREADS = 16
-def validate_requested_xgb_model(xgboost_model_files, xgboost_model_hashes, model_dir):
- """Validate requested XGBoost models, and download if necessary"""
- for _, model_file in xgboost_model_files.items():
+
+def validate_model(model: str, model_dir: str | Path | None = None) -> Path:
+ """
+ Validate model name and ensure XGBoost model files are available.
+
+ Downloads missing model files if necessary.
+
+ Parameters
+ ----------
+ model
+ Model name (must be a key in :data:`ms2pip.constants.MODELS`).
+ model_dir
+ Directory for XGBoost model files. Default: ``~/.ms2pip``.
+
+ Returns
+ -------
+ model_dir
+ Resolved model directory as Path.
+
+ """
+ model_dir = Path(model_dir) if model_dir else Path.home() / ".ms2pip"
+ if model not in MODELS:
+ raise exceptions.UnknownModelError(model)
+ logger.debug("Using %s model", model)
+ _validate_requested_xgb_model(
+ MODELS[model]["xgboost_model_files"],
+ MODELS[model]["model_hash"],
+ model_dir,
+ )
+ return model_dir
+
+
+def _validate_requested_xgb_model(xgboost_model_files, xgboost_model_hashes, model_dir):
+ """Validate requested XGBoost models, and download if necessary."""
+ for model_file in xgboost_model_files.values():
if not _check_model_presence(model_file, xgboost_model_hashes[model_file], model_dir):
_download_model(model_file, xgboost_model_hashes[model_file], model_dir)
-def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1):
+def load_xgb_models(
+ model_params: dict,
+ model_dir,
+ processes: int | None = None,
+) -> dict:
"""
- Get predictions starting from feature vectors in DMatrix object.
+ Load XGBoost models from disk.
+
+ Returns a dict of ion_type -> xgb.Booster that can be passed to
+ :func:`predict_intensities` to avoid re-loading on every call.
Parameters
----------
- features: xgb.DMatrix, np.ndarray
- Feature vectors in DMatrix object or as Numpy array.
- num_ions: list[int]
- List with number of ions (per series) for each peptide, i.e. peptide length - 1
- model_params: dict
- Model configuration as defined in ms2pip.ms2pipC.MODELS.
- model_dir: str
+ model_params
+ Model configuration dict (must contain ``xgboost_model_files``).
+ model_dir
Directory where model files are stored.
- processes: int
- Number of CPUs to use in multiprocessing
+ processes
+ Number of threads for XGBoost prediction. Capped internally.
"""
- # Init models
- xgboost_models = _initialize_xgb_models(
- model_params["xgboost_model_files"],
- model_dir,
- processes,
+ nthread = min(
+ processes if processes is not None else (os.cpu_count() or 1),
+ _MAX_PREDICTION_THREADS,
)
- if isinstance(features, np.ndarray):
- features = xgb.DMatrix(features)
+ return _initialize_xgb_models(model_params["xgboost_model_files"], model_dir, nthread)
+
+
+def predict_intensities(
+ features: np.ndarray,
+ num_ions: list[int],
+ model_params: dict,
+ model_dir,
+ processes: int | None = None,
+ xgb_models: dict | None = None,
+) -> list[dict[str, np.ndarray]]:
+ """
+ Predict intensities from feature vectors using XGBoost models.
+
+ Parameters
+ ----------
+ features
+ Feature vectors as numpy array. Will be converted to DMatrix internally.
+ num_ions
+ Number of ions (per series) for each peptide, i.e. peptide length - 1.
+ model_params
+ Model configuration dict (must contain ``xgboost_model_files``).
+ model_dir
+ Directory where model files are stored.
+ processes
+ Number of threads for XGBoost prediction. Capped internally to avoid
+ diminishing returns. By default, uses all available cores (up to cap).
+ xgb_models
+ Pre-loaded XGBoost models (from :func:`load_xgb_models`). When provided,
+ ``model_params``, ``model_dir``, and ``processes`` are ignored for model
+ loading.
+
+ Returns
+ -------
+ predictions
+ List of dicts mapping ion type to predicted intensity array, one per peptide.
+
+ """
+ if xgb_models is None:
+ xgb_models = load_xgb_models(model_params, model_dir, processes)
+ dmatrix = xgb.DMatrix(features)
logger.debug("Predicting intensities from XGBoost model files...")
prediction_dict = {}
- for ion_type, xgb_model in xgboost_models.items():
- # Get predictions from XGBoost model
- preds = xgb_model.predict(features)
- preds = preds.clip(min=np.log2(0.001)) # Clip negative intensities
- xgb_model.__del__()
+ for ion_type, xgb_model in xgb_models.items():
+ preds = xgb_model.predict(dmatrix)
+ preds = preds.clip(min=np.log2(0.001))
- # Reshape into arrays for each peptide
if ion_type.lower() in ["x", "y", "y2", "z"]:
preds = _split_list_by_lengths(preds, num_ions, reverse=True)
elif ion_type.lower() in ["a", "b", "b2", "c"]:
@@ -65,10 +137,8 @@ def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1
raise ValueError(f"Unsupported ion_type: {ion_type}")
prediction_dict[ion_type] = preds
- # Convert to list per peptide with dicts per ion type
num_peptides = len(list(prediction_dict.values())[0])
- predictions = [{k: v[i] for k, v in prediction_dict.items()} for i in range(num_peptides)]
- return predictions
+ return [{k: v[i] for k, v in prediction_dict.items()} for i in range(num_peptides)]
def _split_list_by_lengths(list_in, lengths, reverse=False):
@@ -91,18 +161,19 @@ def _check_model_presence(model, model_hash, model_dir):
def _download_model(model, model_hash, model_dir):
"""Download the xgboost model from the Genesis server."""
- if not os.path.isdir(model_dir):
- os.mkdir(model_dir)
+ os.makedirs(model_dir, exist_ok=True)
filename = os.path.join(model_dir, model)
logger.info(f"Downloading {model} to {filename}...")
try:
- urllib.request.urlretrieve(f"https://genesis.ugent.be/uvpublicdata/ms2pip/{model}", filename)
+ urllib.request.urlretrieve(
+ f"https://genesis.ugent.be/uvpublicdata/ms2pip/{model}", filename
+ )
except Exception:
logger.warning("Falling back to Zenodo for model downloads.")
urllib.request.urlretrieve(f"https://zenodo.org/records/13270668/files/{model}", filename)
if not _check_model_integrity(filename, model_hash):
- raise InvalidXGBoostModelError()
+ raise exceptions.InvalidXGBoostModelError()
def _check_model_integrity(filename, model_hash):
@@ -117,7 +188,7 @@ def _check_model_integrity(filename, model_hash):
if sha1_hash.hexdigest() == model_hash:
return True
else:
- logger.warn("Model hash not recognized.")
+ logger.warning("Model hash not recognized.")
return False
diff --git a/ms2pip/constants.py b/ms2pip/constants.py
index c72affe2..b55a2c31 100644
--- a/ms2pip/constants.py
+++ b/ms2pip/constants.py
@@ -1,15 +1,9 @@
"""Constants and fixed configurations for MS²PIP."""
-# Models and their properties
-# id is passed to get_predictions to select model
-# ion_types is required to write the ion types in the headers of the result files
-# features_version is required to select the features version
MODELS = {
"CID": {
- "id": 0,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20190107_CID_train_B.xgboost",
"y": "model_20190107_CID_train_Y.xgboost",
@@ -20,16 +14,20 @@
},
},
"HCD2019": {
- "id": 1,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
+ "xgboost_model_files": {
+ "b": "model_20190107_HCD_train_B.xgboost",
+ "y": "model_20190107_HCD_train_Y.xgboost",
+ },
+ "model_hash": {
+ "model_20190107_HCD_train_B.xgboost": "2503856c382806672e4b85f6b0ccc1f3093acc1b",
+ "model_20190107_HCD_train_Y.xgboost": "867bbc9940f75845b3f4f845d429b3780c997a02",
+ },
},
"TTOF5600": {
- "id": 2,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20190107_TTOF5600_train_B.xgboost",
"y": "model_20190107_TTOF5600_train_Y.xgboost",
@@ -40,16 +38,20 @@
},
},
"TMT": {
- "id": 3,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
+ "xgboost_model_files": {
+ "b": "model_20190107_TMT_train_B.xgboost",
+ "y": "model_20190107_TMT_train_Y.xgboost",
+ },
+ "model_hash": {
+ "model_20190107_TMT_train_B.xgboost": "352073a591d45a2e3181818f5feef99c22755af7",
+ "model_20190107_TMT_train_Y.xgboost": "d9a73bff21ab504bb91eb386f20cd8a86d60c95d",
+ },
},
"iTRAQ": {
- "id": 4,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20190107_iTRAQ_train_B.xgboost",
"y": "model_20190107_iTRAQ_train_Y.xgboost",
@@ -60,10 +62,8 @@
},
},
"iTRAQphospho": {
- "id": 5,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20190107_iTRAQphospho_train_B.xgboost",
"y": "model_20190107_iTRAQphospho_train_Y.xgboost",
@@ -73,18 +73,25 @@
"model_20190107_iTRAQphospho_train_Y.xgboost": "261b2e1810a299ed7ebf193ce1fb81a608c07d3b",
},
},
- # ETD': {'id': 6, 'ion_types': ['B', 'Y', 'C', 'Z'], 'peaks_version': 'etd', 'features_version': 'normal'},
"HCDch2": {
- "id": 7,
"ion_types": ["B", "Y", "B2", "Y2"],
- "peaks_version": "ch2",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
+ "xgboost_model_files": {
+ "b": "model_20190107_HCD_train_B.xgboost",
+ "y": "model_20190107_HCD_train_Y.xgboost",
+ "b2": "model_20190107_HCD_train_B2.xgboost",
+ "y2": "model_20190107_HCD_train_Y2.xgboost",
+ },
+ "model_hash": {
+ "model_20190107_HCD_train_B.xgboost": "2503856c382806672e4b85f6b0ccc1f3093acc1b",
+ "model_20190107_HCD_train_Y.xgboost": "867bbc9940f75845b3f4f845d429b3780c997a02",
+ "model_20190107_HCD_train_B2.xgboost": "2df86d3576e85bfd25cc149d723f1613baf854d0",
+ "model_20190107_HCD_train_Y2.xgboost": "0a116ad9f14925fc70e3eceed9484b16ca8edddb",
+ },
},
"CIDch2": {
- "id": 8,
"ion_types": ["B", "Y", "B2", "Y2"],
- "peaks_version": "ch2",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20190107_CID_train_B.xgboost",
"y": "model_20190107_CID_train_Y.xgboost",
@@ -99,10 +106,8 @@
},
},
"HCD2021": {
- "id": 9,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20210416_HCD2021_B.xgboost",
"y": "model_20210416_HCD2021_Y.xgboost",
@@ -113,10 +118,8 @@
},
},
"Immuno-HCD": {
- "id": 10,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20210316_Immuno_HCD_B.xgboost",
"y": "model_20210316_Immuno_HCD_Y.xgboost",
@@ -127,10 +130,8 @@
},
},
"CID-TMT": {
- "id": 11,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20220104_CID_TMT_B.xgboost",
"y": "model_20220104_CID_TMT_Y.xgboost",
@@ -141,10 +142,8 @@
},
},
"timsTOF2023": {
- "id": 12,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20230912_timsTOF_B.xgboost",
"y": "model_20230912_timsTOF_Y.xgboost",
@@ -155,10 +154,8 @@
},
},
"timsTOF2024": {
- "id": 13,
"ion_types": ["B", "Y"],
- "peaks_version": "general",
- "features_version": "normal",
+ "fragmentation": "cidhcd",
"xgboost_model_files": {
"b": "model_20240105_timsTOF_B.xgboost",
"y": "model_20240105_timsTOF_Y.xgboost",
diff --git a/ms2pip/core.py b/ms2pip/core.py
index 5bf93c6b..70445f0e 100644
--- a/ms2pip/core.py
+++ b/ms2pip/core.py
@@ -1,87 +1,325 @@
#!/usr/bin/env python
from __future__ import annotations
-import itertools
import logging
-import multiprocessing
-import multiprocessing.dummy
-import re
-from collections import defaultdict
+import os
+from collections.abc import Generator
from math import ceil
from pathlib import Path
-from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
+from ms2rescore_rs import (
+ AnnotatedMS2Spectrum, # type: ignore[ty:unresolved-import]
+ MS2Spectrum, # type: ignore[ty:unresolved-import]
+ Precursor, # type: ignore[ty:unresolved-import]
+ annotate_ms2_spectra, # type: ignore[ty:unresolved-import]
+ ms2pip_compute_features, # type: ignore[ty:unresolved-import]
+ ms2pip_compute_theoretical_mz, # type: ignore[ty:unresolved-import]
+)
from psm_utils import PSM, Peptidoform, PSMList
from rich.progress import track
-from ms2rescore_rs import MS2Spectrum
import ms2pip.exceptions as exceptions
-from ms2pip._cython_modules import ms2pip_pyx
-from ms2pip._utils.encoder import Encoder
-from ms2pip._utils.feature_names import get_feature_names
from ms2pip._utils.ion_mobility import IonMobility
from ms2pip._utils.psm_input import read_psms
from ms2pip._utils.retention_time import RetentionTime
-from ms2pip._utils.xgb_models import get_predictions_xgb, validate_requested_xgb_model
+from ms2pip._utils.xgb_models import load_xgb_models, predict_intensities, validate_model
from ms2pip.constants import MODELS
from ms2pip.result import ProcessingResult, calculate_correlations
from ms2pip.search_space import ProteomeSearchSpace
from ms2pip.spectrum import ObservedSpectrum
-from ms2pip.spectrum_input import read_spectrum_file
-from ms2pip.spectrum_output import SUPPORTED_FORMATS
+from ms2pip._spectrum_processing import (
+ annotate_spectrum,
+ load_and_match_spectra,
+ proforma_to_mass_shift,
+ targets_from_annotations,
+)
logger = logging.getLogger(__name__)
+NUM_FEATURES = 139
-def predict_single(
- peptidoform: Union[Peptidoform, str],
- model: Optional[str] = "HCD",
- model_dir: Optional[Union[str, Path]] = None,
-) -> ProcessingResult:
+
+def _set_rayon_threads(processes: int | None) -> None:
+ """Set RAYON_NUM_THREADS if processes is specified and not already set."""
+ if processes is None:
+ return
+ if "RAYON_NUM_THREADS" in os.environ:
+ logger.debug(
+ "RAYON_NUM_THREADS already set to %s; not overriding with processes=%d",
+ os.environ["RAYON_NUM_THREADS"],
+ processes,
+ )
+ return
+ os.environ["RAYON_NUM_THREADS"] = str(processes)
+
+
+def _predict_batch_internal(
+ psm_list: PSMList,
+ model: str,
+ model_dir: str | Path | None = None,
+ processes: int | None = None,
+ xgb_models: dict | None = None,
+) -> list[ProcessingResult]:
"""
- Predict fragmentation spectrum for a single peptide.\f
+ Batch predict features, m/z, and intensities for all PSMs.
+
+ Uses ms2rescore-rs for feature computation and theoretical m/z calculation
+ (both internally parallelized via Rayon), then XGBoost for intensity prediction.
"""
- if isinstance(peptidoform, str):
- peptidoform = Peptidoform(peptidoform)
- psm = PSM(peptidoform=peptidoform, spectrum_id=0)
- model_dir = model_dir if model_dir else Path.home() / ".ms2pip"
+ model_dir = validate_model(model, model_dir)
+ if not psm_list:
+ return []
+
+ ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
+ frag_model = MODELS[model]["fragmentation"]
+
+ proformas = [proforma_to_mass_shift(psm.peptidoform) for psm in psm_list]
+ num_ions = [len(psm.peptidoform.parsed_sequence) - 1 for psm in psm_list]
+
+ _set_rayon_threads(processes)
+
+ # Batch compute theoretical m/z (single Rust call, Rayon-parallelized)
+ logger.debug("Computing theoretical m/z for %d peptides...", len(proformas))
+ all_mz = ms2pip_compute_theoretical_mz(proformas, ion_types, frag_model, "monoisotopic")
+
+ # Batch compute features (single Rust call, Rayon-parallelized)
+ logger.debug("Computing features for %d peptides...", len(proformas))
+ all_features = ms2pip_compute_features(proformas)
+
+ # Predict intensities with XGBoost
+ logger.debug("Predicting intensities with XGBoost...")
+ predictions = predict_intensities(
+ np.concatenate([f.reshape(-1, NUM_FEATURES) for f in all_features]),
+ num_ions,
+ MODELS[model],
+ model_dir,
+ processes=processes,
+ xgb_models=xgb_models,
+ )
+
+ # Assemble results
+ results = []
+ for i, psm in enumerate(psm_list):
+ results.append(
+ ProcessingResult(
+ psm_index=i,
+ psm=psm,
+ theoretical_mz={k: np.array(v, dtype=np.float32) for k, v in all_mz[i].items()},
+ predicted_intensity=predictions[i],
+ )
+ )
+ return results
+
+
+def _correlate_internal(
+ psm_spectrum_annotations: list[tuple[int, PSM, ObservedSpectrum, list]],
+ model: str,
+ model_dir: str | Path | None = None,
+ vector_file: bool = False,
+ annotations_only: bool = False,
+ processes: int | None = None,
+) -> list[ProcessingResult]:
+ """
+ Core correlation logic: extract targets, compute features/predictions, assemble results.
+
+ Parameters
+ ----------
+ psm_spectrum_annotations
+ List of (psm_index, psm, preprocessed_spectrum, peak_annotations) tuples.
+ Annotations are per-peak lists of ``(series, position, charge)`` tuples.
+ model
+ Name of prediction model.
+ model_dir
+ Directory for XGBoost model files.
+ vector_file
+ If True, return feature vectors instead of predictions (for training).
+ annotations_only
+ If True, return only m/z and observed intensities (no predictions).
+
+ """
+ if not annotations_only and not vector_file:
+ model_dir = validate_model(model, model_dir)
ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
+ frag_model = MODELS[model]["fragmentation"]
+
+ _set_rayon_threads(processes)
+
+ if not psm_spectrum_annotations:
+ return []
+
+ # Step 1: Extract targets from pre-computed annotations
+ all_targets = []
+ for psm_index, psm, spectrum, peak_annotations in psm_spectrum_annotations:
+ if not psm.peptidoform.precursor_charge:
+ psm.peptidoform.precursor_charge = spectrum.precursor_charge # type: ignore[ty:invalid-assignment]
- with Encoder.from_peptidoform(peptidoform) as encoder:
- ms2pip_pyx.ms2pip_init(*encoder.encoder_files)
- result = _process_peptidoform(0, psm, model, encoder, ion_types=ion_types)
+ seq_len = len(psm.peptidoform.parsed_sequence)
+ targets = targets_from_annotations(
+ peak_annotations, spectrum.intensity.astype(np.float32), ion_types, seq_len
+ )
+ all_targets.append(targets)
+
+ proformas = [proforma_to_mass_shift(psm.peptidoform) for _, psm, _, _ in psm_spectrum_annotations]
+ num_ions = [
+ len(psm.peptidoform.parsed_sequence) - 1 for _, psm, _, _ in psm_spectrum_annotations
+ ]
+
+ # Step 2: Compute features (needed for training and prediction, not annotation-only)
+ all_features = None
+ if not annotations_only:
+ logger.debug("Computing features for %d peptides...", len(proformas))
+ all_features = ms2pip_compute_features(proformas)
+
+ # Step 3: Compute theoretical m/z (needed for annotation and prediction, not training)
+ all_mz = None
+ if not vector_file:
+ logger.debug("Computing theoretical m/z for %d peptides...", len(proformas))
+ all_mz = ms2pip_compute_theoretical_mz(proformas, ion_types, frag_model, "monoisotopic")
- if "xgboost_model_files" in MODELS[model].keys():
- validate_requested_xgb_model(
- MODELS[model]["xgboost_model_files"],
- MODELS[model]["model_hash"],
- model_dir,
+ # Step 4: Assemble results based on mode
+ results = []
+
+ if vector_file:
+ # Training mode: return feature vectors + observed targets
+ assert all_features is not None
+ for i, (psm_index, psm, _spectrum, _ann) in enumerate(psm_spectrum_annotations):
+ results.append(
+ ProcessingResult(
+ psm_index=psm_index,
+ psm=psm,
+ theoretical_mz=None,
+ predicted_intensity=None,
+ observed_intensity=all_targets[i],
+ correlation=None,
+ feature_vectors=all_features[i],
+ )
)
- predictions = np.array(
- get_predictions_xgb(
- result.feature_vectors,
- [len(peptidoform.parsed_sequence) - 1],
- MODELS[model],
- model_dir,
+
+ elif annotations_only:
+ # Annotation mode: return m/z + observed targets
+ assert all_mz is not None
+ for i, (psm_index, psm, _spectrum, _ann) in enumerate(psm_spectrum_annotations):
+ mz = {k: np.array(v, dtype=np.float32) for k, v in all_mz[i].items()}
+ results.append(
+ ProcessingResult(
+ psm_index=psm_index,
+ psm=psm,
+ theoretical_mz=mz,
+ predicted_intensity=None,
+ observed_intensity=all_targets[i],
+ correlation=None,
+ feature_vectors=None,
)
)
- result.predicted_intensity = predictions[0] # Only one spectrum in predictions
- result.feature_vectors = None
- return result
+ else:
+ # Prediction mode: compute XGBoost predictions
+ assert all_features is not None and all_mz is not None
+ logger.debug("Predicting intensities with XGBoost...")
+ predictions = predict_intensities(
+ np.concatenate([f.reshape(-1, NUM_FEATURES) for f in all_features]),
+ num_ions,
+ MODELS[model],
+ model_dir,
+ processes=processes,
+ )
+
+ for i, (psm_index, psm, _spectrum, _ann) in enumerate(psm_spectrum_annotations):
+ mz = {k: np.array(v, dtype=np.float32) for k, v in all_mz[i].items()}
+ results.append(
+ ProcessingResult(
+ psm_index=psm_index,
+ psm=psm,
+ theoretical_mz=mz,
+ predicted_intensity=predictions[i],
+ observed_intensity=all_targets[i],
+ )
+ )
+
+ return results
+
+
+def _into_batches(iterable, batch_size: int) -> Generator[list, None, None]:
+ """Accumulate iterator elements into batches of a given size."""
+ batch = []
+ for item in iterable:
+ batch.append(item)
+ if len(batch) == batch_size:
+ yield batch
+ batch = []
+ if batch:
+ yield batch
+
+
+def _assemble_training_data(results: list[ProcessingResult], model: str) -> pd.DataFrame:
+ """Assemble training data from results list to single pandas DataFrame."""
+ from ms2pip._utils.feature_names import get_feature_names
+
+ ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
+
+ training_data = pd.DataFrame(
+ np.vstack([r.feature_vectors for r in results if r.feature_vectors is not None]),
+ columns=get_feature_names(),
+ )
+ training_data["psm_index"] = np.concatenate(
+ [
+ np.repeat(r.psm_index, r.feature_vectors.shape[0])
+ for r in results
+ if r.feature_vectors is not None
+ ]
+ )
+ for ion_type in ion_types:
+ if ion_type in ["a", "b", "b2", "c"]:
+ training_data[f"target_{ion_type}"] = np.concatenate(
+ [
+ r.observed_intensity[ion_type]
+ for r in results
+ if r.feature_vectors is not None and r.observed_intensity is not None
+ ]
+ )
+ elif ion_type in ["x", "y", "y2", "z"]:
+ training_data[f"target_{ion_type}"] = np.concatenate(
+ [
+ r.observed_intensity[ion_type][::-1]
+ for r in results
+ if r.feature_vectors is not None and r.observed_intensity is not None
+ ]
+ )
+
+ training_data = training_data[
+ ["psm_index"] + get_feature_names() + [f"target_{it}" for it in ion_types]
+ ]
+
+ return training_data
+
+
+def predict_single(
+ peptidoform: Peptidoform | str,
+ model: str = "HCD",
+ model_dir: str | Path | None = None,
+) -> ProcessingResult:
+ """
+ Predict fragmentation spectrum for a single peptide.\f
+ """
+ if isinstance(peptidoform, str):
+ peptidoform = Peptidoform(peptidoform)
+ psm = PSM(peptidoform=peptidoform, spectrum_id=0)
+ psm_list = PSMList(psm_list=[psm])
+ results = _predict_batch_internal(psm_list, model, model_dir)
+ return results[0]
def predict_batch(
- psms: Union[PSMList, str, Path],
+ psms: PSMList | str | Path,
add_retention_time: bool = False,
add_ion_mobility: bool = False,
- psm_filetype: Optional[str] = None,
- model: Optional[str] = "HCD",
- model_dir: Optional[Union[str, Path]] = None,
- processes: Optional[int] = None,
-) -> List[ProcessingResult]:
+ psm_filetype: str | None = None,
+ model: str = "HCD",
+ model_dir: str | Path | None = None,
+ processes: int | None = None,
+) -> list[ProcessingResult]:
"""
Predict fragmentation spectra for a batch of peptides.\f
@@ -101,11 +339,12 @@ def predict_batch(
model_dir
Directory where XGBoost model files are stored. Default: `~/.ms2pip`.
processes
- Number of parallel processes for multiprocessing steps. By default, all available.
+ Number of threads for Rayon (Rust) and XGBoost parallelism. By default,
+ all available.
Returns
-------
- predictions: List[ProcessingResult]
+ predictions: list[ProcessingResult]
Predicted spectra with theoretical m/z and predicted intensity values.
"""
@@ -123,29 +362,20 @@ def predict_batch(
im_predictor = IonMobility(processes=processes)
im_predictor.add_im_predictions(psm_list)
- with Encoder.from_psm_list(psm_list) as encoder:
- ms2pip_parallelized = _Parallelized(
- encoder=encoder,
- model=model,
- model_dir=model_dir,
- processes=processes,
- )
- logger.info("Processing peptides...")
- results = ms2pip_parallelized.process_peptides(psm_list)
-
- return results
+ logger.info("Processing peptides...")
+ return _predict_batch_internal(psm_list, model, model_dir, processes=processes)
def predict_library(
- fasta_file: Optional[Union[str, Path]] = None,
- config: Optional[Union[ProteomeSearchSpace, dict, str, Path]] = None,
+ fasta_file: str | Path | None = None,
+ config: ProteomeSearchSpace | dict | str | Path | None = None,
add_retention_time: bool = False,
add_ion_mobility: bool = False,
- model: Optional[str] = "HCD",
- model_dir: Optional[Union[str, Path]] = None,
+ model: str = "HCD",
+ model_dir: str | Path | None = None,
batch_size: int = 100000,
- processes: Optional[int] = None,
-) -> Generator[ProcessingResult, None, None]:
+ processes: int | None = None,
+) -> Generator[list[ProcessingResult], None, None]:
"""
Predict spectral library from protein FASTA file.\f
@@ -168,58 +398,66 @@ def predict_library(
batch_size
Number of peptides to process in each batch.
processes
- Number of parallel processes for multiprocessing steps. By default, all available.
+ Number of threads for Rayon (Rust) and XGBoost parallelism. By default,
+ all available.
Yields
------
- predictions: List[ProcessingResult]
+ predictions: list[ProcessingResult]
Predicted spectra with theoretical m/z and predicted intensity values.
"""
if fasta_file and config:
- # Use provided proteome, but overwrite fasta_file
- config = ProteomeSearchSpace.from_any(config)
- config.fasta_file = fasta_file
+ search_space = ProteomeSearchSpace.from_any(config)
+ search_space.fasta_file = Path(fasta_file)
elif fasta_file and not config:
- # Default proteome search space with provided fasta_file
- config = ProteomeSearchSpace(fasta_file=fasta_file)
+ search_space = ProteomeSearchSpace(fasta_file=fasta_file)
elif not fasta_file and config:
- # Use provided proteome
- config = ProteomeSearchSpace.from_any(config)
+ search_space = ProteomeSearchSpace.from_any(config)
else:
raise ValueError("Either `fasta_file` or `config` must be provided.")
- search_space = ProteomeSearchSpace.from_any(config)
search_space.build()
+ # Pre-load XGBoost models once for all batches
+ model_dir = validate_model(model, model_dir)
+ xgb_models = load_xgb_models(MODELS[model], model_dir, processes)
+
for batch in track(
_into_batches(search_space, batch_size=batch_size),
description="Predicting spectra...",
total=ceil(len(search_space) / batch_size),
):
- yield predict_batch(
- search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))),
- add_retention_time=add_retention_time,
- add_ion_mobility=add_ion_mobility,
- model=model,
- model_dir=model_dir,
+ psm_list = search_space.filter_psms_by_mz(PSMList(psm_list=list(batch)))
+
+ if add_retention_time:
+ RetentionTime(processes=processes).add_rt_predictions(psm_list)
+ if add_ion_mobility:
+ IonMobility(processes=processes).add_im_predictions(psm_list)
+
+ yield _predict_batch_internal(
+ psm_list,
+ model,
+ model_dir,
processes=processes,
+ xgb_models=xgb_models,
)
def correlate(
- psms: Union[PSMList, str, Path],
- spectrum_file: Union[str, Path],
- psm_filetype: Optional[str] = None,
- spectrum_id_pattern: Optional[str] = None,
+ psms: PSMList | str | Path,
+ spectrum_file: str | Path,
+ psm_filetype: str | None = None,
+ spectrum_id_pattern: str | None = None,
compute_correlations: bool = False,
add_retention_time: bool = False,
add_ion_mobility: bool = False,
- model: Optional[str] = "HCD",
- model_dir: Optional[Union[str, Path]] = None,
+ model: str = "HCD",
+ model_dir: str | Path | None = None,
ms2_tolerance: float = 0.02,
- processes: Optional[int] = None,
-) -> List[ProcessingResult]:
+ ms2_tolerance_mode: str = "Da",
+ processes: int | None = None,
+) -> list[ProcessingResult]:
"""
Compare predicted and observed intensities and optionally compute correlations.\f
@@ -246,13 +484,16 @@ def correlate(
model_dir
Directory where XGBoost model files are stored. Default: `~/.ms2pip`.
ms2_tolerance
- MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
+ MS2 tolerance for observed spectrum peak annotation. By default, 0.02.
+ ms2_tolerance_mode
+ Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``.
processes
- Number of parallel processes for multiprocessing steps. By default, all available.
+ Number of threads for Rayon (Rust) and XGBoost parallelism. By default,
+ all available.
Returns
-------
- results: List[ProcessingResult]
+ results: list[ProcessingResult]
Predicted spectra with theoretical m/z and predicted intensity values, and optionally,
correlations.
@@ -270,18 +511,22 @@ def correlate(
im_predictor = IonMobility(processes=processes)
im_predictor.add_im_predictions(psm_list)
- with Encoder.from_psm_list(psm_list) as encoder:
- ms2pip_parallelized = _Parallelized(
- encoder=encoder,
- model=model,
- model_dir=model_dir,
- ms2_tolerance=ms2_tolerance,
- processes=processes,
+ # Validate runs and collections
+ if len(psm_list.collections) != 1 or len(psm_list.runs) != 1:
+ raise exceptions.InvalidInputError("PSMs should be for a single run and collection.")
+
+ logger.info("Processing spectra and peptides...")
+ matched = load_and_match_spectra(
+ psm_list, spectrum_file, spectrum_id_pattern, model, ms2_tolerance, ms2_tolerance_mode
+ )
+
+ if not matched:
+ raise exceptions.NoMatchingSpectraFound(
+ "No spectra matching spectrum IDs from PSM list could be found in provided file."
)
- logger.info("Processing spectra and peptides...")
- results = ms2pip_parallelized.process_spectra(psm_list, spectrum_file, spectrum_id_pattern)
- # Correlations also requested
+ results = _correlate_internal(matched, model, model_dir, processes=processes)
+
if compute_correlations:
logger.info("Computing correlations")
calculate_correlations(results)
@@ -291,25 +536,28 @@ def correlate(
def correlate_preloaded(
- psms: Union[PSMList, List[PSM]],
+ psms: PSMList | list[PSM],
compute_correlations: bool = False,
- model: Optional[str] = "HCD",
- model_dir: Optional[Union[str, Path]] = None,
+ model: str = "HCD",
+ model_dir: str | Path | None = None,
ms2_tolerance: float = 0.02,
- processes: Optional[int] = None,
-) -> List[ProcessingResult]:
+ ms2_tolerance_mode: str = "Da",
+ processes: int | None = None,
+) -> list[ProcessingResult]:
"""
Compare predicted and observed intensities for PSMs with preloaded spectra.\f
- Processes PSMs that already have :py:class:`ms2rescore_rs.MS2Spectrum`
- objects in their ``spectrum`` attribute. It extracts the spectra, performs predictions,
- and optionally computes correlations.
+ Processes PSMs that already have :py:class:`ms2rescore_rs.MS2Spectrum` or
+ :py:class:`ms2rescore_rs.AnnotatedMS2Spectrum` objects in their ``spectrum``
+ attribute.
Parameters
----------
psms
PSMList or list of PSM objects. Each PSM must have an
- :py:class:`ms2rescore_rs.MS2Spectrum` object in its ``spectrum`` attribute.
+ :py:class:`ms2rescore_rs.MS2Spectrum` or
+ :py:class:`ms2rescore_rs.AnnotatedMS2Spectrum` object in its ``spectrum``
+ attribute.
compute_correlations
Compute correlations between predictions and targets. Default: False.
model
@@ -317,21 +565,25 @@ def correlate_preloaded(
model_dir
Directory where XGBoost model files are stored. Default: `~/.ms2pip`.
ms2_tolerance
- MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
+ MS2 tolerance for observed spectrum peak annotation. By default, 0.02.
+ Only used when spectra are not already annotated.
+ ms2_tolerance_mode
+ Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``.
+ Only used when spectra are not already annotated.
processes
- Number of parallel processes for multiprocessing steps. By default, all available.
+ Number of threads for Rayon (Rust) and XGBoost parallelism. By default,
+ all available.
Returns
-------
- results: List[ProcessingResult]
+ results: list[ProcessingResult]
ProcessingResult objects with theoretical m/z, predicted intensity, and observed
intensity values, and optionally, correlations.
Raises
------
ValueError
- If PSMs do not contain :py:class:`ms2rescore_rs.MS2Spectrum` objects in the
- ``spectrum`` attribute.
+ If PSMs do not contain spectrum objects in the ``spectrum`` attribute.
"""
if isinstance(psms, list):
@@ -339,16 +591,26 @@ def correlate_preloaded(
else:
psm_list = psms
- if not all(psm_list["spectrum"]) or not (isinstance(psm_list["spectrum"][0], MS2Spectrum)):
- raise ValueError("PSMs must contain MS2Spectrum objects in the 'spectrum' attribute.")
+ first_spectrum = psm_list["spectrum"][0]
+ if not all(psm_list["spectrum"]) or not isinstance(
+ first_spectrum, (MS2Spectrum, AnnotatedMS2Spectrum)
+ ):
+ raise ValueError(
+ "PSMs must contain MS2Spectrum or AnnotatedMS2Spectrum objects "
+ "in the 'spectrum' attribute."
+ )
+
+ spectra_are_annotated = isinstance(first_spectrum, AnnotatedMS2Spectrum)
- # Convert MS2Spectrum -> ObservedSpectrum and preprocess
- preloaded_spectra: Dict[str, ObservedSpectrum] = {}
+ # Convert to ObservedSpectrum and preprocess; store annotations if present
+ preloaded_spectra: dict[str, ObservedSpectrum] = {}
+ preloaded_annotations: dict[str, list] | None = {} if spectra_are_annotated else None
for psm in psm_list:
spec_id = str(psm.spectrum_id)
if spec_id in preloaded_spectra:
continue
spectrum = psm.spectrum
+ assert spectrum is not None
obs = ObservedSpectrum(
mz=np.array(spectrum.mz, dtype=np.float32),
intensity=np.array(spectrum.intensity, dtype=np.float32),
@@ -363,23 +625,86 @@ def correlate_preloaded(
obs.tic_norm()
obs.log2_transform()
preloaded_spectra[spec_id] = obs
+ if spectra_are_annotated:
+ assert isinstance(spectrum, AnnotatedMS2Spectrum)
+ assert preloaded_annotations is not None
+ preloaded_annotations[spec_id] = [
+ [(a.series, a.position, a.charge) for a in peak_anns]
+ for peak_anns in spectrum.peak_annotations
+ ]
+
+ # Build PSM-spectrum-annotation tuples
+ # For unannotated spectra, batch annotate using ms2rescore-rs
+ psm_spectrum_annotations = []
+ needs_annotation = [] # indices into psm_spectrum_annotations that need annotation
+
+ for i, psm in enumerate(psm_list):
+ spec_id = str(psm.spectrum_id)
+ spectrum = preloaded_spectra.get(spec_id)
+ if spectrum is None:
+ continue
+ if preloaded_annotations is not None and spec_id in preloaded_annotations:
+ psm_spectrum_annotations.append((i, psm, spectrum, preloaded_annotations[spec_id]))
+ else:
+ psm_spectrum_annotations.append((i, psm, spectrum, None))
+ needs_annotation.append(len(psm_spectrum_annotations) - 1)
- # Delegate to _Parallelized
- with Encoder.from_psm_list(psm_list) as encoder:
- ms2pip_parallelized = _Parallelized(
- encoder=encoder,
- model=model,
- model_dir=model_dir,
- ms2_tolerance=ms2_tolerance,
- processes=processes,
+ if not psm_spectrum_annotations:
+ raise exceptions.NoMatchingSpectraFound(
+ "No spectra matching spectrum IDs from PSM list could be found."
+ )
+
+ # Batch annotate any unannotated spectra
+ if needs_annotation:
+ frag_model = MODELS[model]["fragmentation"]
+ batch_spectra = []
+ batch_proformas = []
+ batch_seq_lens = []
+ for idx in needs_annotation:
+ _, psm, spectrum, _ = psm_spectrum_annotations[idx]
+ batch_spectra.append(
+ MS2Spectrum(
+ identifier=spectrum.identifier or "",
+ mz=list(spectrum.mz),
+ intensity=list(spectrum.intensity),
+ precursor=Precursor(
+ mz=float(spectrum.precursor_mz) if spectrum.precursor_mz else 0.0,
+ charge=int(spectrum.precursor_charge) if spectrum.precursor_charge else 0,
+ rt=float(spectrum.retention_time) if spectrum.retention_time else 0.0,
+ ),
+ )
+ )
+ batch_proformas.append(proforma_to_mass_shift(psm.peptidoform))
+ batch_seq_lens.append(len(psm.peptidoform.parsed_sequence))
+
+ annotated = annotate_ms2_spectra(
+ spectra=batch_spectra,
+ proformas=batch_proformas,
+ seq_lens=batch_seq_lens,
+ fragmentation_model=frag_model,
+ mass_mode="monoisotopic",
+ tolerance_value=float(ms2_tolerance),
+ tolerance_mode=ms2_tolerance_mode.lower(),
)
- logger.info("Processing spectra and peptides...")
- results = ms2pip_parallelized.process_preloaded_spectra(psm_list, preloaded_spectra)
+
+ for j, idx in enumerate(needs_annotation):
+ psm_index, psm, spectrum, _ = psm_spectrum_annotations[idx]
+ peak_annotations = [
+ [(a.series, a.position, a.charge) for a in peak_anns]
+ for peak_anns in annotated[j].peak_annotations
+ ]
+ psm_spectrum_annotations[idx] = (psm_index, psm, spectrum, peak_annotations)
+
+ logger.info("Processing spectra and peptides...")
+ results = _correlate_internal(psm_spectrum_annotations, model, model_dir, processes=processes)
if compute_correlations:
logger.info("Computing correlations")
calculate_correlations(results)
- logger.info(f"Median correlation: {np.median([r.correlation for r in results if r.correlation is not None])}")
+ logger.info(
+ f"Median correlation: "
+ f"{np.median([r.correlation for r in results if r.correlation is not None])}"
+ )
return results
@@ -387,6 +712,7 @@ def correlate_preloaded(
def correlate_single(
observed_spectrum: ObservedSpectrum,
ms2_tolerance: float = 0.02,
+ ms2_tolerance_mode: str = "Da",
model: str = "HCD",
) -> ProcessingResult:
"""
@@ -397,7 +723,9 @@ def correlate_single(
observed_spectrum
ObservedSpectrum instance with observed m/z and intensity values and peptidoform.
ms2_tolerance
- MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
+ MS2 tolerance for observed spectrum peak annotation. By default, 0.02.
+ ms2_tolerance_mode
+ Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``.
model
Model to use for prediction. Default: "HCD".
@@ -407,47 +735,41 @@ def correlate_single(
Result with theoretical m/z, predicted intensity, observed intensity, and correlation.
"""
- # Check peptidoform in observed spectrum
if not isinstance(observed_spectrum.peptidoform, Peptidoform):
raise ValueError("Peptidoform must be set in observed spectrum to correlate.")
- # Annotate spectrum and get target intensities
- with Encoder.from_peptidoform(observed_spectrum.peptidoform) as encoder:
- ms2pip_pyx.ms2pip_init(*encoder.encoder_files)
- enc_peptidoform = encoder.encode_peptidoform(observed_spectrum.peptidoform)
- targets = ms2pip_pyx.get_targets(
- enc_peptidoform,
- observed_spectrum.mz.astype(np.float32),
- observed_spectrum.intensity.astype(np.float32),
- float(ms2_tolerance),
- MODELS[model]["peaks_version"],
- )
+ # Preprocess a copy of the spectrum (TIC normalization + log2 transform)
+ preprocessed = observed_spectrum.model_copy(deep=True)
+ for label_type in ["iTRAQ", "TMT"]:
+ if label_type in model:
+ preprocessed.remove_reporter_ions(label_type)
+ preprocessed.tic_norm()
+ preprocessed.log2_transform()
- # Reshape to dict with intensities per ion type
+ psm = PSM(peptidoform=observed_spectrum.peptidoform, spectrum_id=0)
+ annotated = annotate_spectrum(preprocessed, psm, model, ms2_tolerance, ms2_tolerance_mode)
ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
- observed_intensity = {
- i: np.array(p, dtype=np.float32).clip(min=np.log2(0.001)) # Clip negative intensities
- for i, p in zip(ion_types, targets)
- }
+ seq_len = len(observed_spectrum.peptidoform.parsed_sequence)
+ observed_intensity = targets_from_annotations(
+ annotated, preprocessed.intensity.astype(np.float32), ion_types, seq_len
+ )
- # Predict spectrum and add target intensities
result = predict_single(observed_spectrum.peptidoform, model=model)
result.observed_intensity = observed_intensity
- # Add correlation
calculate_correlations([result])
-
return result
def get_training_data(
- psms: Union[PSMList, str, Path],
- spectrum_file: Union[str, Path],
- psm_filetype: Optional[str] = None,
- spectrum_id_pattern: Optional[str] = None,
- model: Optional[str] = "HCD",
+ psms: PSMList | str | Path,
+ spectrum_file: str | Path,
+ psm_filetype: str | None = None,
+ spectrum_id_pattern: str | None = None,
+ model: str = "HCD",
ms2_tolerance: float = 0.02,
- processes: Optional[int] = None,
+ ms2_tolerance_mode: str = "Da",
+ processes: int | None = None,
):
"""
Extract feature vectors and target intensities from observed spectra for training.\f
@@ -468,9 +790,12 @@ def get_training_data(
Model to use as reference for the ion types that are extracted from the observed spectra.
Default: "HCD", which results in the extraction of singly charged b- and y-ions.
ms2_tolerance
- MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
+ MS2 tolerance for observed spectrum peak annotation. By default, 0.02.
+ ms2_tolerance_mode
+ Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``.
processes
- Number of parallel processes for multiprocessing steps. By default, all available.
+ Number of threads for Rayon (Rust) and XGBoost parallelism. By default,
+ all available.
Returns
-------
@@ -481,32 +806,40 @@ def get_training_data(
psm_list = read_psms(psms, filetype=psm_filetype)
spectrum_id_pattern = spectrum_id_pattern if spectrum_id_pattern else "(.*)"
- with Encoder.from_psm_list(psm_list) as encoder:
- ms2pip_parallelized = _Parallelized(
- encoder=encoder,
- model=model,
- ms2_tolerance=ms2_tolerance,
- processes=processes,
- )
- logger.info("Processing spectra and peptides...")
- results = ms2pip_parallelized.process_spectra(
- psm_list, spectrum_file, spectrum_id_pattern, vector_file=True
+ if len(psm_list.collections) != 1 or len(psm_list.runs) != 1:
+ raise exceptions.InvalidInputError("PSMs should be for a single run and collection.")
+
+ logger.info("Processing spectra and peptides...")
+ matched = load_and_match_spectra(
+ psm_list, spectrum_file, spectrum_id_pattern, model, ms2_tolerance, ms2_tolerance_mode
+ )
+
+ if not matched:
+ raise exceptions.NoMatchingSpectraFound(
+ "No spectra matching spectrum IDs from PSM list could be found in provided file."
)
- logger.info("Assembling training data in DataFrame...")
- training_data = _assemble_training_data(results, model)
+ results = _correlate_internal(
+ matched,
+ model,
+ model_dir=None,
+ vector_file=True,
+ processes=processes,
+ )
- return training_data
+ logger.info("Assembling training data in DataFrame...")
+ return _assemble_training_data(results, model)
def annotate_spectra(
- psms: Union[PSMList, str, Path],
- spectrum_file: Union[str, Path],
- psm_filetype: Optional[str] = None,
- spectrum_id_pattern: Optional[str] = None,
- model: Optional[str] = "HCD",
+ psms: PSMList | str | Path,
+ spectrum_file: str | Path,
+ psm_filetype: str | None = None,
+ spectrum_id_pattern: str | None = None,
+ model: str = "HCD",
ms2_tolerance: float = 0.02,
- processes: Optional[int] = None,
+ ms2_tolerance_mode: str = "Da",
+ processes: int | None = None,
):
"""
Annotate observed spectra.\f
@@ -527,37 +860,45 @@ def annotate_spectra(
Model to use as reference for the ion types that are extracted from the observed spectra.
Default: "HCD", which results in the extraction of singly charged b- and y-ions.
ms2_tolerance
- MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
+ MS2 tolerance for observed spectrum peak annotation. By default, 0.02.
+ ms2_tolerance_mode
+ Unit of the MS2 tolerance: ``"Da"`` or ``"ppm"``. By default, ``"Da"``.
processes
- Number of parallel processes for multiprocessing steps. By default, all available.
+ Number of threads for Rayon (Rust) and XGBoost parallelism. By default,
+ all available.
Returns
-------
- results: List[ProcessingResult]
+ results: list[ProcessingResult]
List of ProcessingResult objects with theoretical m/z and observed intensity values.
"""
psm_list = read_psms(psms, filetype=psm_filetype)
spectrum_id_pattern = spectrum_id_pattern if spectrum_id_pattern else "(.*)"
- with Encoder.from_psm_list(psm_list) as encoder:
- ms2pip_parallelized = _Parallelized(
- encoder=encoder,
- model=model,
- ms2_tolerance=ms2_tolerance,
- processes=processes,
- )
- logger.info("Processing spectra and peptides...")
- results = ms2pip_parallelized.process_spectra(
- psm_list, spectrum_file, spectrum_id_pattern, vector_file=False, annotations_only=True
+ if len(psm_list.collections) != 1 or len(psm_list.runs) != 1:
+ raise exceptions.InvalidInputError("PSMs should be for a single run and collection.")
+
+ logger.info("Processing spectra and peptides...")
+ matched = load_and_match_spectra(
+ psm_list, spectrum_file, spectrum_id_pattern, model, ms2_tolerance, ms2_tolerance_mode
+ )
+
+ if not matched:
+ raise exceptions.NoMatchingSpectraFound(
+ "No spectra matching spectrum IDs from PSM list could be found in provided file."
)
- return results
+ return _correlate_internal(
+ matched,
+ model,
+ model_dir=None,
+ annotations_only=True,
+ processes=processes,
+ )
-def download_models(
- models: Optional[List[str]] = None, model_dir: Optional[Union[str, Path]] = None
-):
+def download_models(models: list[str] | None = None, model_dir: str | Path | None = None):
"""
Download all specified models to the specified directory.
@@ -577,619 +918,9 @@ def download_models(
models = list(MODELS.keys())
for model in models:
- try:
- if "xgb_model_files" in MODELS[model].keys():
- continue
- except KeyError:
+ if model not in MODELS:
raise exceptions.UnknownModelError(model)
+ if "xgboost_model_files" not in MODELS[model]:
+ continue
logger.debug("Downloading %s model files", model)
- validate_requested_xgb_model(
- MODELS[model]["xgboost_model_files"],
- MODELS[model]["model_hash"],
- model_dir,
- )
-
-
-class _Parallelized:
- """Implementations of common multiprocessing functionality across MS²PIP usage modes."""
-
- def __init__(
- self,
- encoder: Encoder = None,
- model: Optional[str] = None,
- model_dir: Optional[Union[str, Path]] = None,
- ms2_tolerance: float = 0.02,
- processes: Optional[int] = None,
- ):
- """
- Implementations of common multiprocessing functionality across MS²PIP usage modes.
-
- Parameters
- ----------
- encoding
- Configured encoding class instance. Required if input peptides contain modifications.
- model
- Name of the model to use for predictions. Overrides configuration file.
- model_dir
- Custom directory for downloaded XGBoost model files. By default, `~/.ms2pip` is used.
- ms2_tolerance
- MS2 tolerance in Da for observed spectrum peak annotation. By default, 0.02 Da.
- processes
- Number of parallel processes for multiprocessing steps. By default, all available.
-
- """
- # Input parameters
- self.encoder = encoder
- self.model = model
- self.model_dir = model_dir if model_dir else Path.home() / ".ms2pip"
- self.ms2_tolerance = ms2_tolerance
- self.processes = processes if processes else multiprocessing.cpu_count()
-
- # Setup encoder if not configured
- if not self.encoder:
- self.encoder = Encoder()
- self.encoder.write_encoder_files()
-
- # Validate requested model
- if self.model in MODELS.keys():
- logger.debug("Using %s model", self.model)
- if "xgboost_model_files" in MODELS[self.model].keys():
- validate_requested_xgb_model(
- MODELS[self.model]["xgboost_model_files"],
- MODELS[self.model]["model_hash"],
- self.model_dir,
- )
- else:
- raise exceptions.UnknownModelError(self.model)
-
- def _get_pool(self):
- """Get multiprocessing pool."""
- logger.debug(f"Starting workers (processes={self.processes})...")
- if multiprocessing.current_process().daemon:
- logger.warning(
- "MS²PIP is running in a daemon process. Disabling multiprocessing as daemonic "
- "processes cannot have children."
- )
- return multiprocessing.dummy.Pool(1)
- elif self.processes == 1:
- logger.debug("Using dummy multiprocessing pool.")
- return multiprocessing.dummy.Pool(1)
- else:
- return multiprocessing.get_context("spawn").Pool(self.processes)
-
- def _validate_output_formats(self, output_formats: List[str]) -> List[str]:
- """Validate requested output formats."""
- if not output_formats:
- self.output_formats = ["csv"]
- else:
- for output_format in output_formats:
- if output_format not in SUPPORTED_FORMATS:
- raise exceptions.UnknownOutputFormatError(output_format)
- self.output_formats = output_formats
-
- def _execute_in_pool(self, psm_list: PSMList, func: Callable, args: tuple):
- """Execute function in multiprocessing pool."""
-
- def get_chunk_size(n_items, n_processes):
- """Get optimal chunk size for multiprocessing."""
- if n_items < 5000:
- return n_items
- else:
- max_chunk_size = 50000
- n_chunks = ceil(ceil(n_items / n_processes) / max_chunk_size) * n_processes
- return ceil(n_items / n_chunks)
-
- def to_chunks(_list, chunk_size):
- """Split _list into chunks of size chunk_size."""
-
- def _generate_chunks():
- for i in range(0, len(_list), chunk_size):
- yield _list[i : i + chunk_size]
-
- _list = list(_list)
- return list(_generate_chunks())
-
- def _enumerated_psm_list_by_spectrum_id(psm_list, spectrum_ids_chunk):
- selected_indices = np.flatnonzero(np.isin(psm_list["spectrum_id"], spectrum_ids_chunk))
- return [(i, psm_list.psm_list[i]) for i in selected_indices]
-
- with self._get_pool() as pool:
- if not psm_list:
- logger.warning("No PSMs to process.")
- return []
-
- # Split PSMList into chunks
- if func == _process_spectra:
- # Split by spectrum_id to keep PSMs for same spectrum together
- spectrum_ids = set(psm_list["spectrum_id"])
- chunk_size = get_chunk_size(len(spectrum_ids), pool._processes)
- chunks = [
- _enumerated_psm_list_by_spectrum_id(psm_list, spectrum_ids_chunk)
- for spectrum_ids_chunk in to_chunks(spectrum_ids, chunk_size)
- ]
- else:
- # Simple split by PSM
- chunk_size = get_chunk_size(len(psm_list), pool._processes)
- chunks = to_chunks(list(enumerate(psm_list)), chunk_size)
-
- logger.debug(f"Processing {len(chunks)} chunk(s) of ~{chunk_size} entries each.")
-
- # Add jobs to pool
- mp_results = []
- for psm_list_chunk in chunks:
- mp_results.append(pool.apply_async(func, args=(psm_list_chunk, *args)))
-
- # Gather results
- # results = [
- # r.get()
- # for r in track(
- # mp_results,
- # disable=len(chunks) == 1,
- # description="Processing chunks...",
- # transient=True,
- # show_speed=False,
- # )
- # ]
- results = [r.get() for r in mp_results]
-
- # Sort results by input order
- results = list(
- sorted(
- itertools.chain.from_iterable(results),
- key=lambda result: result.psm_index,
- )
- )
-
- return results
-
- def process_peptides(self, psm_list: PSMList) -> List[ProcessingResult]:
- """Process peptides in parallel."""
- results = self._execute_in_pool(
- psm_list,
- _process_peptides,
- (self.encoder, self.model),
- )
- logger.debug(f"Gathered data for {len(results)} peptides.")
-
- # Add XGBoost predictions if required
- if "xgboost_model_files" in MODELS[self.model].keys():
- results = self._add_xgboost_predictions(results)
-
- return results
-
- def process_spectra(
- self,
- psm_list: PSMList,
- spectrum_file: Union[str, Path],
- spectrum_id_pattern: str,
- vector_file: bool = False,
- annotations_only: bool = False,
- ) -> List[ProcessingResult]:
- """
- Process PSMs and observed spectra in parallel
-
- Parameters
- ----------
- psm_list
- psm_utils.PSMList instance with PSMs to process
- spectrum_file
- Filename of spectrum file
- spectrum_id_pattern
- Regular expression pattern to apply to spectrum titles before matching to
- peptide file entries
- vector_file
- If feature vectors should be extracted instead of predictions
- annotations_only
- If only peak annotations should be extracted from the spectrum file
-
- """
- # Validate runs and collections
- if not len(psm_list.collections) == 1 or not len(psm_list.runs) == 1:
- raise exceptions.InvalidInputError("PSMs should be for a single run and collection.")
-
- args = (
- spectrum_file,
- vector_file,
- self.encoder,
- self.model,
- self.ms2_tolerance,
- spectrum_id_pattern,
- annotations_only,
- None, # preloaded_spectra
- )
- results = self._execute_in_pool(psm_list, _process_spectra, args)
-
- # Validate number of results
- if not results:
- raise exceptions.NoMatchingSpectraFound(
- "No spectra matching spectrum IDs from PSM list could be found in provided file."
- )
- logger.debug(f"Gathered data for {len(results)} PSMs.")
-
- # Add XGBoost predictions if required
- if (
- not (vector_file or annotations_only)
- and "xgboost_model_files" in MODELS[self.model].keys()
- ):
- results = self._add_xgboost_predictions(results)
-
- return results
-
- def process_preloaded_spectra(
- self,
- psm_list: PSMList,
- preloaded_spectra: Dict[str, ObservedSpectrum],
- ) -> List[ProcessingResult]:
- """
- Process PSMs with pre-loaded and preprocessed ObservedSpectrum objects.
-
- Parameters
- ----------
- psm_list
- psm_utils.PSMList instance with PSMs to process.
- preloaded_spectra
- Dictionary mapping spectrum IDs to preprocessed ObservedSpectrum objects.
-
- """
- args = (
- None, # spec_file
- False, # vector_file
- self.encoder,
- self.model,
- self.ms2_tolerance,
- None, # spectrum_id_pattern
- False, # annotations_only
- preloaded_spectra,
- )
- results = self._execute_in_pool(psm_list, _process_spectra, args)
-
- if not results:
- raise exceptions.NoMatchingSpectraFound(
- "No spectra matching spectrum IDs from PSM list could be found."
- )
- logger.debug(f"Gathered data for {len(results)} PSMs.")
-
- if "xgboost_model_files" in MODELS[self.model].keys():
- results = self._add_xgboost_predictions(results)
-
- return results
-
- def _add_xgboost_predictions(self, results: List[ProcessingResult]) -> List[ProcessingResult]:
- """
- Add XGBoost predictions to results.
-
- Notes
- -----
- This functions is applied after the parallel processing, as XGBoost implements its own
- multiprocessing.
- """
-
- if "xgboost_model_files" not in MODELS[self.model].keys():
- raise ValueError("XGBoost model files not found in MODELS dictionary.")
-
- logger.debug("Converting feature vectors to XGBoost DMatrix...")
- import xgboost as xgb
-
- results_to_predict = [r for r in results if r.feature_vectors is not None]
-
- if not results_to_predict:
- return results
-
- num_ions = [len(r.psm.peptidoform.parsed_sequence) - 1 for r in results_to_predict]
- xgb_vector = xgb.DMatrix(np.vstack(list(r.feature_vectors for r in results_to_predict)))
-
- predictions = get_predictions_xgb(
- xgb_vector,
- num_ions,
- MODELS[self.model],
- self.model_dir,
- processes=self.processes,
- )
-
- logger.debug("Adding XGBoost predictions to results...")
- for result, preds in zip(results_to_predict, predictions):
- result.predicted_intensity = preds
- result.feature_vectors = None
-
- return results
-
-
-def _process_peptidoform(
- psm_index: int,
- psm: PSM,
- model: str,
- encoder: Encoder,
- ion_types: Optional[List[str]] = None,
-) -> ProcessingResult:
- """
- Process a single peptidoform from a PSM.
-
- Get theoretical m/z and predicted intensities (from C model) or feature vectors (for XGBoost
- model) for a single peptidoform from a PSM.
-
- Notes
- -----
- - ``ms2pip_pyx.init()`` must be called before this function is called.
- - Optionally, lowercase version of ``ion_types`` from the model configuration can be provided
- to save computational time.
-
- """
- peptidoform = psm.peptidoform
- if not ion_types:
- ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
-
- enc_peptide = encoder.encode_peptide(peptidoform)
- enc_peptidoform = encoder.encode_peptidoform(peptidoform)
-
- # Get ion mzs and map to ion types
- mz = ms2pip_pyx.get_mzs(enc_peptidoform, MODELS[model]["peaks_version"])
- mz = {i: np.array(mz, dtype=np.float32) for i, mz in zip(ion_types, mz)}
-
- # Get predictions from XGBoost models.
- if "xgboost_model_files" in MODELS[model].keys():
- predictions = None
- feature_vectors = np.array(
- ms2pip_pyx.get_vector(enc_peptide, enc_peptidoform, peptidoform.precursor_charge),
- dtype=np.uint16,
- )
- # Or get predictions from C models.
- else:
- predictions = ms2pip_pyx.get_predictions(
- enc_peptide,
- enc_peptidoform,
- peptidoform.precursor_charge,
- MODELS[model]["id"],
- MODELS[model]["peaks_version"],
- 30.0, # TODO: Remove CE feature
- )
- predictions = {
- i: np.array(p, dtype=np.float32).clip(min=np.log2(0.001)) # Clip negative intensities
- for i, p in zip(ion_types, predictions)
- }
- feature_vectors = None
-
- return ProcessingResult(
- psm_index=psm_index,
- psm=psm,
- theoretical_mz=mz,
- predicted_intensity=predictions,
- observed_intensity=None,
- feature_vectors=feature_vectors,
- )
-
-
-def _process_peptides(
- enumerated_psm_list: List[Tuple[int, PSM]],
- encoder: Encoder,
- model: str,
-) -> List[ProcessingResult]:
- """
- Predict spectrum for each entry in PeptideRecord DataFrame.
-
- Parameters
- ----------
- enumerated_psm_list
- List of tuples of (index, PSM) for each PSM in the input file.
- encoder
- Configured encoder to use for peptide and peptidoform encoding
- model
- Name of prediction model to be used
-
- """
- ms2pip_pyx.ms2pip_init(*encoder.encoder_files)
- results = []
- ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
-
- for psm_index, psm in enumerated_psm_list:
- try:
- result = _process_peptidoform(psm_index, psm, model, encoder, ion_types)
- except (
- exceptions.InvalidPeptidoformError,
- exceptions.InvalidAminoAcidError,
- ):
- result = ProcessingResult(psm_index=psm_index, psm=psm)
- results.append(result)
-
- return results
-
-
-def _process_spectra(
- enumerated_psm_list: List[Tuple[int, PSM]],
- spec_file: Optional[str],
- vector_file: bool,
- encoder: Encoder,
- model: str,
- ms2_tolerance: float,
- spectrum_id_pattern: Optional[str],
- annotations_only: bool = False,
- preloaded_spectra: Optional[Dict[str, ObservedSpectrum]] = None,
-) -> List[ProcessingResult]:
- """
- Perform requested tasks for each spectrum in spectrum file or from preloaded spectra.
-
- Parameters
- ----------
- enumerated_psm_list
- List of tuples of (index, PSM) for each PSM in the input file.
- spec_file
- Filename of spectrum file. Not used when ``preloaded_spectra`` is provided.
- vector_file
- If feature vectors should be extracted instead of predictions.
- encoder: Encoder
- Configured encoder to use for peptide and peptidoform encoding.
- model
- Name of prediction model to be used.
- ms2_tolerance
- Fragmentation spectrum m/z error tolerance in Dalton.
- spectrum_id_pattern
- Regular expression pattern to apply to spectrum titles before matching to
- peptide file entries. Not used when ``preloaded_spectra`` is provided.
- annotations_only
- If only peak annotations should be extracted from the spectrum file.
- preloaded_spectra
- Optional dictionary mapping spectrum IDs to preprocessed ObservedSpectrum objects.
- When provided, spectra are looked up directly instead of reading from ``spec_file``.
-
- """
- ms2pip_pyx.ms2pip_init(*encoder.encoder_files)
- results = []
- ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
-
- def _process_psm_with_spectrum(psm_index, psm, spectrum):
- """Process a single PSM against an observed spectrum."""
- try:
- enc_peptidoform = encoder.encode_peptidoform(psm.peptidoform)
- except exceptions.InvalidAminoAcidError:
- return ProcessingResult(psm_index=psm_index, psm=psm)
-
- targets = ms2pip_pyx.get_targets(
- enc_peptidoform,
- spectrum.mz.astype(np.float32),
- spectrum.intensity.astype(np.float32),
- float(ms2_tolerance),
- MODELS[model]["peaks_version"],
- )
- targets = {i: np.array(t, dtype=np.float32) for i, t in zip(ion_types, targets)}
-
- if not psm.peptidoform.precursor_charge:
- psm.peptidoform.precursor_charge = spectrum.precursor_charge
-
- if vector_file:
- enc_peptide = encoder.encode_peptide(psm.peptidoform)
- feature_vectors = np.array(
- ms2pip_pyx.get_vector(
- enc_peptide, enc_peptidoform, psm.peptidoform.precursor_charge
- ),
- dtype=np.uint16,
- )
- return ProcessingResult(
- psm_index=psm_index,
- psm=psm,
- theoretical_mz=None,
- predicted_intensity=None,
- observed_intensity=targets,
- correlation=None,
- feature_vectors=feature_vectors,
- )
-
- elif annotations_only:
- mz = ms2pip_pyx.get_mzs(enc_peptidoform, MODELS[model]["peaks_version"])
- mz = {i: np.array(mz, dtype=np.float32) for i, mz in zip(ion_types, mz)}
- return ProcessingResult(
- psm_index=psm_index,
- psm=psm,
- theoretical_mz=mz,
- predicted_intensity=None,
- observed_intensity=targets,
- correlation=None,
- feature_vectors=None,
- )
-
- else:
- try:
- result = _process_peptidoform(psm_index, psm, model, encoder, ion_types)
- except (
- exceptions.InvalidPeptidoformError,
- exceptions.InvalidAminoAcidError,
- ):
- return ProcessingResult(psm_index=psm_index, psm=psm)
- else:
- result.observed_intensity = targets
- return result
-
- if preloaded_spectra is not None:
- # Spectra are already preprocessed; look up by spectrum ID
- for psm_index, psm in enumerated_psm_list:
- spectrum = preloaded_spectra.get(str(psm.spectrum_id))
- if spectrum is None:
- continue
- results.append(_process_psm_with_spectrum(psm_index, psm, spectrum))
- else:
- # Read spectra from file and preprocess
- assert spec_file is not None, "spec_file is required when preloaded_spectra is not provided"
- try:
- spectrum_id_regex = re.compile(spectrum_id_pattern)
- except TypeError:
- spectrum_id_regex = re.compile(r"(.*)")
-
- psms_by_specid = defaultdict(list)
- for psm_index, psm in enumerated_psm_list:
- psms_by_specid[str(psm.spectrum_id)].append((psm_index, psm))
-
- for spectrum in read_spectrum_file(spec_file):
- match = spectrum_id_regex.search(spectrum.identifier)
- try:
- spectrum_id = match[1]
- except (TypeError, IndexError):
- raise exceptions.TitlePatternError(
- f"Spectrum title pattern `{spectrum_id_pattern}` could not be matched to "
- f"spectrum ID `{spectrum.identifier}`. "
- " Are you sure that the regex contains a capturing group?"
- )
-
- if spectrum_id not in psms_by_specid:
- continue
-
- # Spectrum preprocessing
- for label_type in ["iTRAQ", "TMT"]:
- if label_type in model:
- spectrum.remove_reporter_ions(label_type)
- # spectrum.remove_precursor() # TODO: Decide to implement this or not
- spectrum.tic_norm()
- spectrum.log2_transform()
-
- for psm_index, psm in psms_by_specid[spectrum_id]:
- results.append(_process_psm_with_spectrum(psm_index, psm, spectrum))
-
- return results
-
-
-def _assemble_training_data(results: List[ProcessingResult], model: str) -> pd.DataFrame:
- """Assemble training data from results list to single pandas DataFrame."""
- # Get ion types
- ion_types = [it.lower() for it in MODELS[model]["ion_types"]]
-
- # Assemble feature vectors, PSM indices, and targets
- training_data = pd.DataFrame(
- np.vstack([r.feature_vectors for r in results if r.feature_vectors is not None]),
- columns=get_feature_names(),
- )
- training_data["psm_index"] = np.concatenate(
- [
- np.repeat(r.psm_index, r.feature_vectors.shape[0])
- for r in results
- if r.feature_vectors is not None
- ]
- )
- for ion_type in ion_types:
- if ion_type in ["a", "b", "b2", "c"]:
- training_data[f"target_{ion_type}"] = np.concatenate(
- [r.observed_intensity[ion_type] for r in results if r.feature_vectors is not None]
- )
- elif ion_type in ["x", "y", "y2", "z"]:
- training_data[f"target_{ion_type}"] = np.concatenate(
- [
- r.observed_intensity[ion_type][::-1]
- for r in results
- if r.feature_vectors is not None
- ]
- )
-
- # Reorder columns
- training_data = training_data[
- ["psm_index"] + get_feature_names() + [f"target_{it}" for it in ion_types]
- ]
-
- return training_data
-
-
-def _into_batches(iterable: Iterable[Any], batch_size: int) -> Generator[List[Any], None, None]:
- """Accumulate iterator elements into batches of a given size."""
- batch = []
- for item in iterable:
- batch.append(item)
- if len(batch) == batch_size:
- yield batch
- batch = []
- if batch:
- yield batch
+ validate_model(model, model_dir)
diff --git a/ms2pip/plot.py b/ms2pip/plot.py
index 56f7bc63..5a9c2b97 100644
--- a/ms2pip/plot.py
+++ b/ms2pip/plot.py
@@ -1,5 +1,4 @@
from pathlib import Path
-from typing import Union
from ms2pip.spectrum import Spectrum
@@ -12,7 +11,7 @@
_can_plot = False
-def spectrum_to_png(spectrum: Spectrum, filepath: Union[str, Path]):
+def spectrum_to_png(spectrum: Spectrum, filepath: str | Path):
"""Plot a single spectrum and write to a PNG file."""
if not _can_plot:
raise ImportError("Matplotlib and spectrum_utils are required to plot spectra.")
diff --git a/ms2pip/result.py b/ms2pip/result.py
index 3cf7eb73..37ef6beb 100644
--- a/ms2pip/result.py
+++ b/ms2pip/result.py
@@ -3,8 +3,9 @@
from __future__ import annotations
import csv
-from typing import Any, Dict, List, Optional, Tuple
from logging import getLogger
+from pathlib import Path
+from typing import Any
import numpy as np
from psm_utils import PSM
@@ -12,32 +13,31 @@
try:
import spectrum_utils.plot as sup
- import spectrum_utils.spectrum as sus
except ImportError:
- sus = None
- sup = None
+ sup = None # type: ignore[ty:invalid-assignment]
from ms2pip.spectrum import ObservedSpectrum, PredictedSpectrum
logger = getLogger(__name__)
+
class ProcessingResult(BaseModel):
"""Result of processing a single PSM."""
psm_index: int
- psm: Optional[PSM] = None
- theoretical_mz: Optional[Dict[str, np.ndarray]] = None
- predicted_intensity: Optional[Dict[str, np.ndarray]] = None
- observed_intensity: Optional[Dict[str, np.ndarray]] = None
- correlation: Optional[float] = None
- feature_vectors: Optional[np.ndarray] = None
+ psm: PSM
+ theoretical_mz: dict[str, np.ndarray] | None = None
+ predicted_intensity: dict[str, np.ndarray] | None = None
+ observed_intensity: dict[str, np.ndarray] | None = None
+ correlation: float | None = None
+ feature_vectors: np.ndarray | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(__pydantic_self__, **data: Any) -> None:
"""Result of processing a single PSM."""
super().__init__(**data)
- def as_spectra(self) -> Tuple[Optional[PredictedSpectrum], Optional[ObservedSpectrum]]:
+ def as_spectra(self) -> tuple[PredictedSpectrum | None, ObservedSpectrum | None]:
"""Convert result to predicted and observed spectra."""
if not self.theoretical_mz:
raise ValueError("Theoretical m/z values required to convert to spectra.")
@@ -110,15 +110,17 @@ def plot_spectra(self):
return ax
-def calculate_correlations(results: List[ProcessingResult]) -> None:
+def calculate_correlations(results: list[ProcessingResult]) -> None:
"""Calculate and add Pearson correlations to list of results."""
for result in results:
- pred_int = np.concatenate([i for i in result.predicted_intensity.values()])
- obs_int = np.concatenate([i for i in result.observed_intensity.values()])
+ if result.predicted_intensity is None or result.observed_intensity is None:
+ continue
+ pred_int = np.concatenate(list(result.predicted_intensity.values()))
+ obs_int = np.concatenate(list(result.observed_intensity.values()))
result.correlation = np.corrcoef(pred_int, obs_int)[0][1]
-def write_correlations(results: List["ProcessingResult"], output_file: str) -> None:
+def write_correlations(results: list[ProcessingResult], output_file: str | Path) -> None:
"""Write correlations to CSV file."""
with open(output_file, "wt") as f:
fieldnames = ["psm_index", "correlation"]
diff --git a/ms2pip/search_space.py b/ms2pip/search_space.py
index 72eab413..c9c88240 100644
--- a/ms2pip/search_space.py
+++ b/ms2pip/search_space.py
@@ -73,12 +73,14 @@
import multiprocessing
import multiprocessing.dummy
+import multiprocessing.pool
from collections import defaultdict
+from collections.abc import Generator
from functools import partial
from itertools import chain, combinations, product
from logging import getLogger
from pathlib import Path
-from typing import Any, Dict, Generator, List, Optional, Union
+from typing import Any
import numpy as np
import pyteomics.fasta
@@ -94,12 +96,12 @@ class ModificationConfig(BaseModel):
"""Configuration for a single modification in the search space."""
label: str
- amino_acid: Optional[str] = None
- peptide_n_term: Optional[bool] = False
- protein_n_term: Optional[bool] = False
- peptide_c_term: Optional[bool] = False
- protein_c_term: Optional[bool] = False
- fixed: Optional[bool] = False
+ amino_acid: str | None = None
+ peptide_n_term: bool | None = False
+ protein_n_term: bool | None = False
+ peptide_c_term: bool | None = False
+ protein_c_term: bool | None = False
+ fixed: bool | None = False
def __init__(self, **data: Any):
"""
@@ -159,15 +161,15 @@ class ProteomeSearchSpace(BaseModel):
fasta_file: Path
min_length: int = 8
max_length: int = 30
- min_precursor_mz: Optional[float] = 0
- max_precursor_mz: Optional[float] = np.inf
+ min_precursor_mz: float | None = 0
+ max_precursor_mz: float | None = np.inf
cleavage_rule: str = "trypsin"
missed_cleavages: int = 2
semi_specific: bool = False
add_decoys: bool = False
- modifications: List[ModificationConfig] = DEFAULT_MODIFICATIONS
+ modifications: list[ModificationConfig] = DEFAULT_MODIFICATIONS
max_variable_modifications: int = 3
- charges: List[int] = [2, 3]
+ charges: list[int] = [2, 3]
def __init__(self, **data: Any):
"""
@@ -204,7 +206,7 @@ def __init__(self, **data: Any):
"""
super().__init__(**data)
- self._peptidoform_spaces: List[_PeptidoformSearchSpace] = []
+ self._peptidoform_spaces: list[_PeptidoformSearchSpace] = []
@field_validator("modifications")
@classmethod
@@ -234,7 +236,7 @@ def __len__(self):
return sum(len(pep_space) for pep_space in self._peptidoform_spaces)
@classmethod
- def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace:
+ def from_any(cls, _input: dict | str | Path | ProteomeSearchSpace) -> ProteomeSearchSpace:
"""
Create ProteomeSearchSpace from various input types.
@@ -271,7 +273,7 @@ def build(self, processes: int = 1):
self._add_modifications(processes)
self._add_charges()
- def __iter__(self) -> Generator[PSM, None, None]:
+ def __iter__(self) -> Generator[PSM, None, None]: # type: ignore[ty:invalid-method-override]
"""
Generate PSMs from search space.
@@ -304,14 +306,14 @@ def filter_psms_by_mz(self, psms: PSMList) -> PSMList:
psm_list=[
psm
for psm in psms
- if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz
+ if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz # type: ignore[ty:unsupported-operator]
]
)
def _digest_fasta(self, processes: int = 1):
"""Digest FASTA file to peptides and populate search space."""
# Convert to string to avoid issues with Path objects
- self.fasta_file = str(self.fasta_file)
+ self.fasta_file = str(self.fasta_file) # type: ignore[ty:invalid-assignment]
n_proteins = _count_fasta_entries(self.fasta_file)
if self.add_decoys:
fasta_db = pyteomics.fasta.decoy_db(
@@ -394,11 +396,11 @@ class _PeptidoformSearchSpace(BaseModel):
"""Search space for a given amino acid sequence."""
sequence: str
- proteins: List[str]
- is_n_term: Optional[bool] = None
- is_c_term: Optional[bool] = None
- modification_options: List[Dict[int, ModificationConfig]] = []
- charge_options: List[int] = []
+ proteins: list[str]
+ is_n_term: bool | None = None
+ is_c_term: bool | None = None
+ modification_options: list[dict[int, ModificationConfig]] = []
+ charge_options: list[int] = []
def __init__(self, **data: Any):
"""
@@ -425,7 +427,7 @@ def __init__(self, **data: Any):
def __len__(self):
return len(self.modification_options) * len(self.charge_options)
- def __iter__(self) -> Generator[str, None, None]:
+ def __iter__(self) -> Generator[str, None, None]: # type: ignore[ty:invalid-method-override]
"""Yield peptidoform strings with given charges and modifications."""
if not self.charge_options:
raise ValueError("Peptide charge options not defined.")
@@ -437,7 +439,7 @@ def __iter__(self) -> Generator[str, None, None]:
@staticmethod
def _construct_peptidoform_string(
- sequence: str, modifications: Dict[int, ModificationConfig], charge: int
+ sequence: str, modifications: dict[int, ModificationConfig], charge: int
) -> str:
if not modifications:
return f"{sequence}/{charge}"
@@ -469,7 +471,7 @@ def _digest_single_protein(
cleavage_rule: str = "trypsin",
missed_cleavages: int = 2,
semi_specific: bool = False,
-) -> List[_PeptidoformSearchSpace]:
+) -> list[_PeptidoformSearchSpace]:
"""Digest protein sequence and return a list of validated peptides."""
def valid_residues(sequence: str) -> bool:
@@ -516,8 +518,8 @@ def _count_fasta_entries(filename: Path) -> int:
def _restructure_modifications_by_target(
- modifications: List[ModificationConfig],
-) -> Dict[str, Dict[str, List[ModificationConfig]]]:
+ modifications: list[ModificationConfig],
+) -> dict[str, dict[str, list[ModificationConfig]]]:
"""Restructure variable modifications to options per side chain or terminus."""
modifications_by_target = {
"sidechain": defaultdict(lambda: []),
@@ -552,9 +554,9 @@ def add_mod(mod, target, amino_acid):
def _get_modification_possibilities_by_site(
peptide: _PeptidoformSearchSpace,
- modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]],
- modifications: List[ModificationConfig],
-) -> Dict[Union[str, int], List[ModificationConfig]]:
+ modifications_by_target: dict[str, dict[str, list[ModificationConfig]]],
+ modifications: list[ModificationConfig],
+) -> dict[str | int, list[ModificationConfig]]:
"""Get all possible modifications for each site in a peptide sequence."""
possibilities_by_site = defaultdict(list)
@@ -609,10 +611,10 @@ def _get_modification_possibilities_by_site(
def _get_peptidoform_modification_versions(
peptide: _PeptidoformSearchSpace,
- modifications: List[ModificationConfig],
- modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]],
+ modifications: list[ModificationConfig],
+ modifications_by_target: dict[str, dict[str, list[ModificationConfig]]],
max_variable_modifications: int = 3,
-) -> List[Dict[Union[str, int], List[ModificationConfig]]]:
+) -> list[dict[str | int, list[ModificationConfig]]]:
"""
Get all potential combinations of modifications for a peptide sequence.
@@ -660,7 +662,7 @@ def _get_peptidoform_modification_versions(
return modification_versions
-def _get_pool(processes: int) -> Union[multiprocessing.Pool, multiprocessing.dummy.Pool]:
+def _get_pool(processes: int) -> multiprocessing.pool.Pool:
"""Get a multiprocessing pool with the given number of processes."""
# TODO: fix None default value for processes
if processes > 1:
diff --git a/ms2pip/spectrum.py b/ms2pip/spectrum.py
index 8e49a389..c4830b0c 100644
--- a/ms2pip/spectrum.py
+++ b/ms2pip/spectrum.py
@@ -3,17 +3,18 @@
from __future__ import annotations
import warnings
-from typing import Any, Optional, Union
+from typing import Any
import numpy as np
from psm_utils import Peptidoform
from pydantic import model_validator, field_validator, ConfigDict, BaseModel
+
try:
import spectrum_utils.spectrum as sus
import spectrum_utils.plot as sup
except ImportError:
- sus = None
- sup = None
+ sus = None # type: ignore[ty:invalid-assignment]
+ sup = None # type: ignore[ty:invalid-assignment]
class Spectrum(BaseModel):
@@ -21,14 +22,14 @@ class Spectrum(BaseModel):
mz: np.ndarray
intensity: np.ndarray
- annotations: Optional[np.ndarray] = None
- identifier: Optional[str] = None
- peptidoform: Optional[Union[Peptidoform, str]] = None
- precursor_mz: Optional[float] = None
- precursor_charge: Optional[int] = None
- retention_time: Optional[float] = None
- mass_tolerance: Optional[float] = None
- mass_tolerance_unit: Optional[str] = None
+ annotations: np.ndarray | None = None
+ identifier: str | None = None
+ peptidoform: Peptidoform | str | None = None
+ precursor_mz: float | None = None
+ precursor_charge: int | None = None
+ retention_time: float | None = None
+ mass_tolerance: float | None = None
+ mass_tolerance_unit: str | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -71,7 +72,7 @@ def __repr__(self) -> str:
@model_validator(mode="after")
@classmethod
- def check_array_lengths(cls, data: dict):
+ def check_array_lengths(cls, data):
if len(data.mz) != len(data.intensity):
raise ValueError("Array lengths do not match.")
if data.annotations is not None:
@@ -149,7 +150,7 @@ def to_spectrum_utils(self):
if not self.peptidoform:
raise ValueError("`precursor_charge` or `peptidoform` must be set.")
else:
- precursor_charge = self.peptidoform.precursor_charge
+ precursor_charge = self.peptidoform.precursor_charge # type: ignore[ty:unresolved-attribute]
if self.precursor_mz:
precursor_mz = self.precursor_mz
@@ -158,19 +159,21 @@ def to_spectrum_utils(self):
raise ValueError("`precursor_mz` or `peptidoform` must be set.")
else:
warnings.warn("precursor_mz not set, using theoretical precursor m/z.")
- precursor_mz = self.peptidoform.theoretical_mz
+ precursor_mz = self.peptidoform.theoretical_mz # type: ignore[ty:unresolved-attribute]
spectrum = sus.MsmsSpectrum(
identifier=self.identifier if self.identifier else "spectrum",
- precursor_mz=precursor_mz,
- precursor_charge=precursor_charge,
+ precursor_mz=precursor_mz, # type: ignore[ty:invalid-argument-type]
+ precursor_charge=precursor_charge, # type: ignore[ty:invalid-argument-type]
mz=self.mz,
intensity=self.intensity,
- retention_time=self.retention_time,
+ retention_time=self.retention_time, # type: ignore[ty:invalid-argument-type]
)
if self.peptidoform:
spectrum.annotate_proforma(
- str(self.peptidoform), self.mass_tolerance, self.mass_tolerance_unit
+ str(self.peptidoform),
+ self.mass_tolerance, # type: ignore[ty:invalid-argument-type]
+ self.mass_tolerance_unit, # type: ignore[ty:invalid-argument-type]
)
return spectrum
@@ -184,7 +187,5 @@ class ObservedSpectrum(Spectrum):
class PredictedSpectrum(Spectrum):
"""Predicted MS2 spectrum."""
- mass_tolerance: Optional[float] = 0.001
- mass_tolerance_unit: Optional[str] = "Da"
-
- pass
+ mass_tolerance: float | None = 0.001
+ mass_tolerance_unit: str | None = "Da"
diff --git a/ms2pip/spectrum_input.py b/ms2pip/spectrum_input.py
index 8674c722..1ff8cab3 100644
--- a/ms2pip/spectrum_input.py
+++ b/ms2pip/spectrum_input.py
@@ -1,12 +1,14 @@
"""Read MS2 spectra."""
+from __future__ import annotations
+
+from collections.abc import Generator
from pathlib import Path
-from typing import Generator
import numpy as np
-from ms2rescore_rs import get_ms2_spectra
+from ms2rescore_rs import get_ms2_spectra # type: ignore[ty:unresolved-import]
-from ms2pip.exceptions import UnsupportedSpectrumFiletypeError
+import ms2pip.exceptions as exceptions
from ms2pip.spectrum import ObservedSpectrum
@@ -31,8 +33,8 @@ def read_spectrum_file(spectrum_file: str) -> Generator[ObservedSpectrum, None,
"""
try:
spectra = get_ms2_spectra(str(spectrum_file))
- except ValueError:
- raise UnsupportedSpectrumFiletypeError(Path(spectrum_file).suffixes)
+ except ValueError as e:
+ raise exceptions.UnsupportedSpectrumFiletypeError(Path(spectrum_file).suffixes) from e
for spectrum in spectra:
obs_spectrum = ObservedSpectrum(
@@ -40,7 +42,7 @@ def read_spectrum_file(spectrum_file: str) -> Generator[ObservedSpectrum, None,
intensity=np.array(spectrum.intensity, dtype=np.float32),
identifier=str(spectrum.identifier),
precursor_mz=float(spectrum.precursor.mz),
- precursor_charge=float(spectrum.precursor.charge),
+ precursor_charge=int(spectrum.precursor.charge),
retention_time=float(spectrum.precursor.rt),
)
# Workaround for mobiusklein/mzdata#3
diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py
index c7690246..da3c081e 100644
--- a/ms2pip/spectrum_output.py
+++ b/ms2pip/spectrum_output.py
@@ -45,11 +45,12 @@
import warnings
from abc import ABC, abstractmethod
from collections import defaultdict
+from collections.abc import Generator
from io import StringIO
-from pathlib import Path
from os import PathLike
+from pathlib import Path
from time import localtime, strftime
-from typing import Any, Dict, Generator, List, Optional, Union
+from typing import Any
import numpy as np
from psm_utils import PSM, Peptidoform
@@ -64,8 +65,8 @@
def write_spectra(
- filename: Union[str, PathLike],
- processing_results: List[ProcessingResult],
+ filename: str | PathLike,
+ processing_results: list[ProcessingResult],
file_format: str = "tsv",
write_mode: str = "w",
):
@@ -94,7 +95,7 @@ class _Writer(ABC):
suffix = ""
- def __init__(self, filename: Union[str, PathLike], write_mode: str = "w"):
+ def __init__(self, filename: str | PathLike, write_mode: str = "w"):
self.filename = Path(filename).with_suffix(self.suffix)
self.write_mode = write_mode
@@ -136,7 +137,7 @@ def _file_object(self):
self.open()
return self._open_file
- def write(self, processing_results: List[ProcessingResult]):
+ def write(self, processing_results: list[ProcessingResult]):
"""Write multiple processing results to file."""
for result in processing_results:
self._write_result(result)
@@ -162,7 +163,7 @@ class TSV(_Writer):
"im",
]
- def write(self, processing_results: List[ProcessingResult]):
+ def write(self, processing_results: list[ProcessingResult]):
"""Write multiple processing results to file."""
writer = csv.DictWriter(
self._file_object, fieldnames=self.field_names, delimiter="\t", lineterminator="\n"
@@ -172,7 +173,7 @@ def write(self, processing_results: List[ProcessingResult]):
for result in processing_results:
self._write_result(result, writer)
- def _write_result(self, result: ProcessingResult, writer: csv.DictWriter):
+ def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): # type: ignore[ty:invalid-method-override]
"""Write single processing result to file."""
# Only write results with predictions or observations
if not result.theoretical_mz:
@@ -189,7 +190,7 @@ def _write_row(result: ProcessingResult, ion_type: str, ion_index: int):
"psm_index": result.psm_index,
"ion_type": ion_type,
"ion_number": ion_index + 1,
- "mz": "{:.8f}".format(result.theoretical_mz[ion_type][ion_index]),
+ "mz": "{:.8f}".format(result.theoretical_mz[ion_type][ion_index]), # type: ignore[ty:not-subscriptable]
"predicted": "{:.8f}".format(result.predicted_intensity[ion_type][ion_index])
if result.predicted_intensity
else None,
@@ -206,7 +207,7 @@ class MSP(_Writer):
suffix = ".msp"
- def write(self, results: List[ProcessingResult]):
+ def write(self, results: list[ProcessingResult]): # type: ignore[ty:invalid-method-override]
"""Write multiple processing results to file."""
for result in results:
self._write_result(result)
@@ -214,20 +215,20 @@ def write(self, results: List[ProcessingResult]):
def _write_result(self, result: ProcessingResult):
"""Write single processing result to file."""
predicted_spectrum = result.as_spectra()[0]
- intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4
- peaks = zip(predicted_spectrum.mz, intensity_normalized, predicted_spectrum.annotations)
+ intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute]
+ peaks = zip(predicted_spectrum.mz, intensity_normalized, predicted_spectrum.annotations) # type: ignore[ty:invalid-argument-type, ty:unresolved-attribute]
# Header
lines = [
f"Name: {result.psm.peptidoform.sequence}/{result.psm.get_precursor_charge()}",
f"MW: {result.psm.peptidoform.theoretical_mass}",
self._format_comment_line(result.psm),
- f"Num peaks: {len(predicted_spectrum.mz)}",
+ f"Num peaks: {len(predicted_spectrum.mz)}", # type: ignore[ty:unresolved-attribute]
]
# Peaks
lines.extend(
- f"{mz:.8f}\t{intensity:.8f}\t{annotation}/0.0" for mz, intensity, annotation in peaks
+ f"{mz:.8f}\t{intensity:.8f}\t{annotation}/0.0" for mz, intensity, annotation in peaks # type: ignore[ty:not-iterable]
)
# Write to file
@@ -241,8 +242,8 @@ def _format_modifications(peptidoform: Peptidoform):
def _format_single_modification(
amino_acid: str,
position: int,
- modifications: Optional[List[proforma.ModificationBase]],
- ) -> Union[str, None]:
+ modifications: list[proforma.ModificationBase] | None,
+ ) -> str | None:
"""Get modification label from :py:class:`proforma.ModificationBase` list."""
if not modifications:
return None
@@ -255,7 +256,7 @@ def _format_single_modification(
return f"{position},{amino_acid},{modification.value}"
sequence_mods = [
- _format_single_modification(aa, pos + 1, mods)
+ _format_single_modification(aa, pos + 1, mods) # type: ignore[ty:invalid-argument-type]
for pos, (aa, mods) in enumerate(peptidoform.parsed_sequence)
]
n_term = _format_single_modification(
@@ -278,7 +279,7 @@ def _format_parent_mass(peptidoform: Peptidoform) -> str:
return f"Parent={peptidoform.theoretical_mz}"
@staticmethod
- def _format_protein_string(psm: PSM) -> Union[str, None]:
+ def _format_protein_string(psm: PSM) -> str | None:
"""Format protein list as string."""
if psm.protein_list:
return f"Protein={','.join(psm.protein_list)}"
@@ -286,7 +287,7 @@ def _format_protein_string(psm: PSM) -> Union[str, None]:
return None
@staticmethod
- def _format_retention_time(psm: PSM) -> Union[str, None]:
+ def _format_retention_time(psm: PSM) -> str | None:
"""Format retention time as string."""
if psm.retention_time:
return f"RetentionTime={psm.retention_time}"
@@ -294,7 +295,7 @@ def _format_retention_time(psm: PSM) -> Union[str, None]:
return None
@staticmethod
- def _format_ion_mobility(psm: PSM) -> Union[str, None]:
+ def _format_ion_mobility(psm: PSM) -> str | None:
"""Format ion mobility as string."""
if psm.ion_mobility:
return f"IonMobility={psm.ion_mobility}"
@@ -334,7 +335,7 @@ class MGF(_Writer):
suffix = ".mgf"
- def write(self, results: List[ProcessingResult]):
+ def write(self, results: list[ProcessingResult]): # type: ignore[ty:invalid-method-override]
"""Write multiple processing results to file."""
for result in results:
self._write_result(result)
@@ -342,8 +343,8 @@ def write(self, results: List[ProcessingResult]):
def _write_result(self, result: ProcessingResult):
"""Write single processing result to file."""
predicted_spectrum = result.as_spectra()[0]
- intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4
- peaks = zip(predicted_spectrum.mz, intensity_normalized)
+ intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute]
+ peaks = zip(predicted_spectrum.mz, intensity_normalized) # type: ignore[ty:unresolved-attribute]
# Header
lines = [
@@ -384,7 +385,7 @@ class Spectronaut(_Writer):
"FragmentLossType",
]
- def write(self, processing_results: List[ProcessingResult]):
+ def write(self, processing_results: list[ProcessingResult]):
"""Write multiple processing results to file."""
writer = csv.DictWriter(
self._file_object, fieldnames=self.field_names, delimiter="\t", lineterminator="\n"
@@ -394,7 +395,7 @@ def write(self, processing_results: List[ProcessingResult]):
for result in processing_results:
self._write_result(result, writer)
- def _write_result(self, result: ProcessingResult, writer: csv.DictWriter):
+ def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): # type: ignore[ty:invalid-method-override]
"""Write single processing result to file."""
# Only write results with predictions
if result.predicted_intensity is None:
@@ -404,7 +405,7 @@ def _write_result(self, result: ProcessingResult, writer: csv.DictWriter):
writer.writerow({**psm_info, **fragment_info})
@staticmethod
- def _process_psm(psm: PSM) -> Dict[str, Any]:
+ def _process_psm(psm: PSM) -> dict[str, Any]:
"""Process PSM to Spectronaut format."""
return {
"ModifiedPeptide": _peptidoform_str_without_charge(psm.peptidoform),
@@ -417,23 +418,23 @@ def _process_psm(psm: PSM) -> Dict[str, Any]:
}
@staticmethod
- def _yield_fragment_info(result: ProcessingResult) -> Generator[Dict[str, Any], None, None]:
+ def _yield_fragment_info(result: ProcessingResult) -> Generator[dict[str, Any], None, None]:
"""Yield fragment information for a processing result."""
# Normalize intensities
intensities = {
ion_type: _unlogarithmize(intensities)
- for ion_type, intensities in result.predicted_intensity.items()
+ for ion_type, intensities in result.predicted_intensity.items() # type: ignore[ty:unresolved-attribute]
}
max_intensity = max(itertools.chain(*intensities.values()))
intensities = {
ion_type: _basepeak_normalize(intensities[ion_type], basepeak=max_intensity)
for ion_type in intensities
}
- for ion_type in result.predicted_intensity:
+ for ion_type in result.predicted_intensity: # type: ignore[ty:not-iterable]
fragment_type = ion_type[0].lower()
fragment_charge = ion_type[1:] if len(ion_type) > 1 else "1"
for ion_index, (intensity, mz) in enumerate(
- zip(intensities[ion_type], result.theoretical_mz[ion_type])
+ zip(intensities[ion_type], result.theoretical_mz[ion_type]) # type: ignore[ty:invalid-argument-type, ty:not-subscriptable]
):
yield {
"RelativeFragmentIntensity": f"{intensity:.8f}",
@@ -468,7 +469,7 @@ class Bibliospec(_Writer):
"ion-mobility",
]
- def __init__(self, filename: Union[str, PathLike], write_mode: str = "w"):
+ def __init__(self, filename: str | PathLike, write_mode: str = "w"):
super().__init__(filename, write_mode)
self.ssl_file = self.filename.with_suffix(self.ssl_suffix)
self.ms2_file = self.filename.with_suffix(self.ms2_suffix)
@@ -514,7 +515,7 @@ def _ms2_file_object(self):
self.open()
return self._open_ms2_file
- def write(self, processing_results: List[ProcessingResult]):
+ def write(self, processing_results: list[ProcessingResult]):
"""Write multiple processing results to file."""
# Create CSV writer
ssl_dict_writer = csv.DictWriter(
@@ -553,7 +554,7 @@ def _write_result(
modified_sequence: str,
scan_number: int,
writer: csv.DictWriter,
- ):
+ ): # type: ignore[ty:invalid-method-override]
"""Write single processing result to files."""
self._write_result_to_ssl(result, modified_sequence, scan_number, writer)
self._write_result_to_ms2(result, modified_sequence, scan_number)
@@ -584,8 +585,8 @@ def _write_result_to_ms2(
):
"""Write single processing result to the MS2 file."""
predicted_spectrum = result.as_spectra()[0]
- intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4
- peaks = zip(predicted_spectrum.mz, intensity_normalized)
+ intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute]
+ peaks = zip(predicted_spectrum.mz, intensity_normalized) # type: ignore[ty:unresolved-attribute]
# Header
lines = [
@@ -607,8 +608,8 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str:
"""Format modified sequence as string for Spectronaut."""
modification_dict = defaultdict(list)
for term, position in [("n_term", 0), ("c_term", len(peptidoform) - 1)]:
- if peptidoform.properties[term]:
- modification_dict[position].extend(peptidoform.properties[term])
+ if peptidoform.properties[term]: # type: ignore[ty:invalid-key]
+ modification_dict[position].extend(peptidoform.properties[term]) # type: ignore[ty:invalid-key]
for position, (_, mods) in enumerate(peptidoform.parsed_sequence):
if mods:
modification_dict[position].extend(mods)
@@ -620,7 +621,7 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str:
)
@staticmethod
- def _get_last_ssl_scan_number(ssl_file: Union[str, PathLike, StringIO]):
+ def _get_last_ssl_scan_number(ssl_file: str | PathLike | StringIO):
"""Read scan number of last line in a Bibliospec SSL file."""
if isinstance(ssl_file, StringIO):
ssl_file.seek(0)
@@ -652,7 +653,7 @@ def open(self):
self._open_file = self.filename.unlink(missing_ok=True)
self._open_file = dlib.open_sqlite(self.filename)
- def write(self, processing_results: List[ProcessingResult]):
+ def write(self, processing_results: list[ProcessingResult]):
"""Write MS2PIP predictions to a DLIB SQLite file."""
connection = self._file_object
dlib.metadata.create_all(connection.engine)
@@ -667,13 +668,13 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str:
"""Format modified sequence as string for DLIB."""
# Sum all sequential mass shifts for each position
masses = [
- sum(mod.mass for mod in mods) if mods else 0 for _, mods in peptidoform.parsed_sequence
+ sum(mod.mass for mod in mods) if mods else 0 for _, mods in peptidoform.parsed_sequence # type: ignore[ty:unresolved-attribute]
]
# Add N- and C-terminal modifications
for term, position in [("n_term", 0), ("c_term", len(peptidoform) - 1)]:
- if peptidoform.properties[term]:
- masses[position] += sum(mod.mass for mod in peptidoform.properties[term])
+ if peptidoform.properties[term]: # type: ignore[ty:invalid-key]
+ masses[position] += sum(mod.mass for mod in peptidoform.properties[term]) # type: ignore[ty:invalid-key]
# Format modified sequence
return "".join(
@@ -700,9 +701,9 @@ def _write_metadata(connection: Connection):
@staticmethod
def _write_entries(
- processing_results: List[ProcessingResult],
+ processing_results: list[ProcessingResult],
connection: Connection,
- output_filename: Union[str, PathLike],
+ output_filename: str | PathLike,
):
"""Write spectra to DLIB SQLite file."""
with connection.begin():
@@ -711,8 +712,8 @@ def _write_entries(
raise ValueError("Retention time required to write DLIB file.")
spectrum = result.as_spectra()[0]
- intensity_normalized = _basepeak_normalize(spectrum.intensity) * 1e4
- n_peaks = len(spectrum.mz)
+ intensity_normalized = _basepeak_normalize(spectrum.intensity) * 1e4 # type: ignore[ty:unresolved-attribute]
+ n_peaks = len(spectrum.mz) # type: ignore[ty:unresolved-attribute]
connection.execute(
dlib.Entry.insert().values(
@@ -724,7 +725,7 @@ def _write_entries(
RTInSeconds=result.psm.retention_time,
Score=0,
MassEncodedLength=n_peaks,
- MassArray=spectrum.mz.tolist(),
+ MassArray=spectrum.mz.tolist(), # type: ignore[ty:unresolved-attribute]
IntensityEncodedLength=n_peaks,
IntensityArray=intensity_normalized.tolist(),
SourceFile=str(output_filename),
@@ -732,7 +733,7 @@ def _write_entries(
)
@staticmethod
- def _write_peptide_to_protein(results: List[ProcessingResult], connection: Connection):
+ def _write_peptide_to_protein(results: list[ProcessingResult], connection: Connection):
"""Write peptide-to-protein mappings to DLIB SQLite file."""
peptide_to_proteins = {
(result.psm.peptidoform.sequence, protein)
@@ -780,12 +781,12 @@ def _peptidoform_str_without_charge(peptidoform: Peptidoform) -> str:
return re.sub(r"\/\d+$", "", str(peptidoform))
-def _unlogarithmize(intensities: np.array) -> np.array:
+def _unlogarithmize(intensities: np.array) -> np.array: # type: ignore[ty:invalid-type-form]
"""Undo logarithmic transformation of intensities."""
return (2**intensities) - 0.001
-def _basepeak_normalize(intensities: np.array, basepeak: Optional[float] = None) -> np.array:
+def _basepeak_normalize(intensities: np.array, basepeak: float | None = None) -> np.array: # type: ignore[ty:invalid-type-form]
"""Normalize intensities to most intense peak."""
if not basepeak:
basepeak = intensities.max()
diff --git a/pyproject.toml b/pyproject.toml
index 002ca21e..61b4df8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,14 +24,12 @@ authors = [
classifiers = [
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
- "Operating System :: POSIX :: Linux",
- "Operating System :: MacOS :: MacOS X",
- "Operating System :: Microsoft :: Windows",
+ "Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Development Status :: 5 - Production/Stable",
]
-requires-python = ">=3.9"
+requires-python = ">=3.11"
dependencies = [
"numpy>=1.25,<3",
"pandas>=1,<3",
@@ -45,12 +43,14 @@ dependencies = [
"pydantic>=2",
"werkzeug>=2",
"psm_utils>=1.0",
- "ms2rescore-rs>=0.5.0a0,<2",
+ "ms2rescore-rs>=0.5.0a1,<2",
]
-
[project.optional-dependencies]
plotting = ["matplotlib>=3.0", "spectrum-utils>=0.4"]
-dev = ["black", "isort>5", "pytest"]
+
+[dependency-groups]
+notebook = ["ipykernel", "ipywidgets", "pip"]
+dev = ["pytest", "ruff", "ty"]
docs = [
"sphinx",
"numpydoc>=1,<2",
@@ -74,7 +74,7 @@ publication = "https://doi.org/10.1093/nar/gkad335/"
ms2pip = "ms2pip.__main__:main"
[build-system]
-requires = ["setuptools", "cython", "numpy>=2.0"]
+requires = ["setuptools"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
@@ -82,19 +82,8 @@ include = ["ms2pip*"]
[tool.black]
line-length = 99
-target-version = ['py310']
+target-version = ['py311']
[tool.ruff]
line-length = 99
-target-version = 'py310'
-
-[tool.cibuildwheel]
-# Keep Python targets explicit so CI updates don't start publishing untested wheels.
-build = "cp3{10,11,12,13}-manylinux_x86_64 cp3{10,11,12,13}-win_amd64 cp3{10,11,12,13}-macosx_{x86_64,arm64}"
-test-command = "ms2pip --help"
-
-# Prevent building from source for packages with complex C/C++/Rust dependencies
-environment = { PIP_ONLY_BINARY = "pyarrow,pandas,numpy,lxml,xgboost,scipy,ms2rescore-rs" }
-
-[tool.cibuildwheel.macos]
-before-all = "brew install libomp"
+target-version = 'py311'
diff --git a/setup.py b/setup.py
deleted file mode 100644
index b389ab08..00000000
--- a/setup.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import os
-import platform
-from glob import glob
-
-import numpy
-from Cython.Distutils import build_ext
-from setuptools import setup
-from setuptools.extension import Extension
-
-
-def _get_version():
- with open("ms2pip/__init__.py") as f:
- for line in f:
- if line.startswith("__version__"):
- return line.split("=")[1].strip().strip('"').strip("'")
-
-
-to_remove = [
- "ms2pip/_cython_modules/ms2pip_pyx.c*",
- "ms2pip/_cython_modules/ms2pip_pyx.so",
-]
-_ = [[os.remove(f) for f in glob(pat)] for pat in to_remove]
-
-# Large machine-written C model files require optimization to be disabled
-compile_args = {
- "Linux": [
- "-O0",
- "-fno-var-tracking",
- "-Wno-unused-result",
- "-Wno-cpp",
- "-Wno-unused-function",
- ],
- "Darwin": [
- "-O0",
- ],
- "Windows": [
- "/Od",
- "/DEBUG",
- "/GL-",
- "/bigobj",
- "/wd4244",
- ],
-}
-
-extensions = [
- Extension(
- "ms2pip._cython_modules.ms2pip_pyx",
- sources=["ms2pip/_cython_modules/ms2pip_pyx.pyx"] + glob("ms2pip/_models_c/*/*.c"),
- extra_compile_args=compile_args[platform.system()],
- )
-]
-
-setup(
- version=_get_version(),
- ext_modules=extensions,
- include_dirs=[numpy.get_include()],
- cmdclass={"build_ext": build_ext},
-)
diff --git a/tests/test_core.py b/tests/test_core.py
index 771521f9..5f90dea2 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,22 +1,10 @@
import numpy as np
-from psm_utils import PSM, Peptidoform
-import pandas as pd
+from psm_utils import PSM, PSMList, Peptidoform
-from ms2pip.core import get_training_data, predict_single
+from ms2pip.core import predict_batch, predict_library, predict_single
from ms2pip.result import ProcessingResult
-def _test_get_training_data():
- expected_df = pd.read_feather("tests/test_data/massivekb_selected_500.feather")
- output_df = get_training_data(
- "tests/test_data/massivekb_selected_500.peprec",
- "tests/test_data/massivekb_selected_500.mgf",
- model="HCD",
- ms2_tolerance=0.02,
- processes=1
- )
- pd.testing.assert_frame_equal(expected_df, output_df)
-
def test_predict_single():
pep = Peptidoform("ACDE/2")
result = predict_single(pep)
@@ -25,8 +13,8 @@ def test_predict_single():
psm_index=0,
psm=PSM(peptidoform=pep, spectrum_id=0),
theoretical_mz={
- "b": np.array([72.04435, 175.05354, 290.08047], dtype=np.float32),
- "y": np.array([148.0604, 263.0873, 366.0965], dtype=np.float32),
+ "b": np.array([72.04439, 175.05357, 290.0805], dtype=np.float32),
+ "y": np.array([148.06044, 263.08737, 366.09656], dtype=np.float32),
},
predicted_intensity={
"b": np.array([-9.14031, -7.6102686, -7.746709], dtype=np.float32),
@@ -41,8 +29,65 @@ def test_predict_single():
assert result.psm == expected.psm
np.testing.assert_array_almost_equal(result.theoretical_mz["b"], expected.theoretical_mz["b"])
np.testing.assert_array_almost_equal(result.theoretical_mz["y"], expected.theoretical_mz["y"])
- np.testing.assert_array_almost_equal(result.predicted_intensity["b"], expected.predicted_intensity["b"])
- np.testing.assert_array_almost_equal(result.predicted_intensity["y"], expected.predicted_intensity["y"])
+ np.testing.assert_array_almost_equal(
+ result.predicted_intensity["b"], expected.predicted_intensity["b"]
+ )
+ np.testing.assert_array_almost_equal(
+ result.predicted_intensity["y"], expected.predicted_intensity["y"]
+ )
assert result.observed_intensity == expected.observed_intensity
assert result.correlation == expected.correlation
assert result.feature_vectors == expected.feature_vectors
+
+
+def test_predict_single_modified():
+ result = predict_single("AC[+57.0215]M[+15.9949]DEK/2")
+
+ assert result.predicted_intensity is not None
+ assert result.theoretical_mz is not None
+ assert set(result.theoretical_mz.keys()) == {"b", "y"}
+ # 6 residues → 5 cleavage sites
+ assert len(result.theoretical_mz["b"]) == 5
+ assert len(result.theoretical_mz["y"]) == 5
+ assert len(result.predicted_intensity["b"]) == 5
+ # m/z values should be positive and increasing for b-ions
+ assert all(result.theoretical_mz["b"] > 0)
+ assert all(np.diff(result.theoretical_mz["b"]) > 0)
+
+
+def test_predict_batch():
+ psm_list = PSMList(
+ psm_list=[
+ PSM(peptidoform=Peptidoform("ACDE/2"), spectrum_id=0),
+ PSM(peptidoform=Peptidoform("PEPTIDEK/3"), spectrum_id=1),
+ PSM(peptidoform=Peptidoform("AAAAAAA/2"), spectrum_id=2),
+ ]
+ )
+ results = predict_batch(psm_list)
+
+ assert len(results) == 3
+ for i, result in enumerate(results):
+ assert result.psm_index == i
+ assert result.predicted_intensity is not None
+ assert result.theoretical_mz is not None
+ assert set(result.theoretical_mz.keys()) == {"b", "y"}
+ assert result.feature_vectors is None
+
+ # Check correct number of ions per peptide
+ assert len(results[0].theoretical_mz["b"]) == 3 # ACDE: 4 residues → 3
+ assert len(results[1].theoretical_mz["b"]) == 7 # PEPTIDEK: 8 residues → 7
+ assert len(results[2].theoretical_mz["b"]) == 6 # AAAAAAA: 7 residues → 6
+
+
+def test_predict_library():
+ batches = list(
+ predict_library(fasta_file="tests/test_data/test.fasta", batch_size=100)
+ )
+
+ assert len(batches) >= 1
+ for batch in batches:
+ assert isinstance(batch, list)
+ for result in batch:
+ assert isinstance(result, ProcessingResult)
+ assert result.predicted_intensity is not None
+ assert result.theoretical_mz is not None
diff --git a/tests/test_correlate.py b/tests/test_correlate.py
new file mode 100644
index 00000000..6056c2ab
--- /dev/null
+++ b/tests/test_correlate.py
@@ -0,0 +1,158 @@
+import numpy as np
+import pytest
+from psm_utils import PSM, Peptidoform
+from ms2rescore_rs import AnnotatedMS2Spectrum, FragmentAnnotation, MS2Spectrum, Precursor
+
+from ms2pip.core import correlate_preloaded, correlate_single
+from ms2pip.spectrum import ObservedSpectrum
+
+
+def _make_observed_spectrum(peptidoform="ACDEFK/2"):
+ """Create an ObservedSpectrum with realistic peaks for testing."""
+ return ObservedSpectrum(
+ mz=np.array(
+ [72.044, 147.113, 175.054, 276.167, 290.081, 389.251, 488.319, 566.192],
+ dtype=np.float32,
+ ),
+ intensity=np.array(
+ [50.0, 100.0, 80.0, 200.0, 60.0, 150.0, 300.0, 40.0],
+ dtype=np.float32,
+ ),
+ identifier="test_spectrum",
+ peptidoform=peptidoform,
+ precursor_mz=341.65,
+ precursor_charge=2,
+ retention_time=100.0,
+ )
+
+
+def test_correlate_single():
+ obs = _make_observed_spectrum()
+ result = correlate_single(obs, ms2_tolerance=0.02, model="HCD")
+
+ assert result.predicted_intensity is not None
+ assert result.observed_intensity is not None
+ assert result.correlation is not None
+ assert not np.isnan(result.correlation)
+ assert set(result.predicted_intensity.keys()) == {"b", "y"}
+ assert set(result.observed_intensity.keys()) == {"b", "y"}
+
+
+def test_correlate_single_ppm_tolerance():
+ obs = _make_observed_spectrum()
+ result = correlate_single(obs, ms2_tolerance=20.0, ms2_tolerance_mode="ppm", model="HCD")
+
+ assert result.predicted_intensity is not None
+ assert result.observed_intensity is not None
+ assert result.correlation is not None
+
+
+def test_correlate_single_requires_peptidoform():
+ obs = ObservedSpectrum(
+ mz=np.array([100.0], dtype=np.float32),
+ intensity=np.array([1.0], dtype=np.float32),
+ identifier="test",
+ )
+ with pytest.raises(ValueError, match="Peptidoform must be set"):
+ correlate_single(obs)
+
+
+def test_correlate_preloaded_annotated():
+ ann_spec = AnnotatedMS2Spectrum(
+ identifier="test_spectrum",
+ mz=[175.054, 276.167, 488.319],
+ intensity=[100.0, 200.0, 300.0],
+ precursor=Precursor(mz=341.65, charge=2, rt=100.0),
+ peak_annotations=[
+ [FragmentAnnotation(series="b", position=2, charge=1)],
+ [FragmentAnnotation(series="y", position=4, charge=1)],
+ [],
+ ],
+ )
+ psm = PSM(
+ peptidoform=Peptidoform("ACDEFK/2"),
+ spectrum_id="test_spectrum",
+ spectrum=ann_spec,
+ )
+
+ results = correlate_preloaded([psm], model="HCD")
+
+ assert len(results) == 1
+ result = results[0]
+ assert result.predicted_intensity is not None
+ assert result.observed_intensity is not None
+ assert set(result.observed_intensity.keys()) == {"b", "y"}
+ # Annotated b2 position should have non-floor intensity
+ floor = np.log2(0.001)
+ assert result.observed_intensity["b"][1] > floor
+
+
+def test_correlate_preloaded_raw():
+ raw_spec = MS2Spectrum(
+ identifier="test_spectrum",
+ mz=[175.054, 276.167, 488.319],
+ intensity=[100.0, 200.0, 300.0],
+ precursor=Precursor(mz=341.65, charge=2, rt=100.0),
+ )
+ psm = PSM(
+ peptidoform=Peptidoform("ACDEFK/2"),
+ spectrum_id="test_spectrum",
+ spectrum=raw_spec,
+ )
+
+ results = correlate_preloaded(
+ [psm], model="HCD", ms2_tolerance=0.02, ms2_tolerance_mode="Da"
+ )
+
+ assert len(results) == 1
+ assert results[0].predicted_intensity is not None
+ assert results[0].observed_intensity is not None
+
+
+def test_correlate_preloaded_ppm_tolerance():
+ raw_spec = MS2Spectrum(
+ identifier="test_spectrum",
+ mz=[175.054, 276.167],
+ intensity=[100.0, 200.0],
+ precursor=Precursor(mz=341.65, charge=2, rt=100.0),
+ )
+ psm = PSM(
+ peptidoform=Peptidoform("ACDEFK/2"),
+ spectrum_id="test_spectrum",
+ spectrum=raw_spec,
+ )
+
+ results = correlate_preloaded(
+ [psm], model="HCD", ms2_tolerance=20.0, ms2_tolerance_mode="ppm"
+ )
+
+ assert len(results) == 1
+ assert results[0].observed_intensity is not None
+
+
+def test_correlate_preloaded_invalid_spectrum():
+ psm = PSM(
+ peptidoform=Peptidoform("ACDEFK/2"),
+ spectrum_id="test",
+ )
+ with pytest.raises(ValueError, match="MS2Spectrum or AnnotatedMS2Spectrum"):
+ correlate_preloaded([psm])
+
+
+def test_correlate_preloaded_multiple_psms():
+ psms = []
+ for i, pep in enumerate(["ACDEFK/2", "PEPTIDEK/3"]):
+ spec = MS2Spectrum(
+ identifier=f"spec_{i}",
+ mz=[175.054, 276.167],
+ intensity=[100.0, 200.0],
+ precursor=Precursor(mz=400.0, charge=2, rt=float(i * 10)),
+ )
+ psms.append(PSM(peptidoform=Peptidoform(pep), spectrum_id=f"spec_{i}", spectrum=spec))
+
+ results = correlate_preloaded(psms, model="HCD")
+
+ assert len(results) == 2
+ for result in results:
+ assert result.predicted_intensity is not None
+ assert result.observed_intensity is not None
diff --git a/tests/test_correlation.py b/tests/test_correlation.py
new file mode 100644
index 00000000..62eaaaaa
--- /dev/null
+++ b/tests/test_correlation.py
@@ -0,0 +1,55 @@
+import numpy as np
+
+from ms2pip.correlation import ms2pip_pearson, spectral_angle
+
+
+def test_ms2pip_pearson():
+ true = np.array([100.0, 200.0, 50.0, 300.0])
+ pred = np.array([90.0, 210.0, 40.0, 310.0])
+
+ corr = ms2pip_pearson(true, pred)
+
+ assert isinstance(corr, float)
+ assert 0.9 < corr <= 1.0
+
+
+def test_ms2pip_pearson_identical():
+ arr = np.array([100.0, 200.0, 50.0, 300.0])
+ corr = ms2pip_pearson(arr, arr)
+
+ assert abs(corr - 1.0) < 1e-6
+
+
+def test_ms2pip_pearson_anticorrelated():
+ true = np.array([100.0, 200.0, 300.0])
+ pred = np.array([300.0, 200.0, 100.0])
+
+ corr = ms2pip_pearson(true, pred)
+
+ assert corr < 0
+
+
+def test_spectral_angle():
+ true = np.array([100.0, 200.0, 50.0, 300.0])
+ pred = np.array([90.0, 210.0, 40.0, 310.0])
+
+ sa = spectral_angle(true, pred)
+
+ assert isinstance(sa, float)
+ assert 0.0 < sa <= 1.0
+
+
+def test_spectral_angle_identical():
+ arr = np.array([100.0, 200.0, 50.0, 300.0])
+ sa = spectral_angle(arr, arr)
+
+ assert abs(sa - 1.0) < 1e-6
+
+
+def test_spectral_angle_orthogonal():
+ true = np.array([1.0, 0.0])
+ pred = np.array([0.0, 1.0])
+
+ sa = spectral_angle(true, pred)
+
+ assert sa < 0.5
diff --git a/tests/test_encoder.py b/tests/test_encoder.py
deleted file mode 100644
index 5feb5f96..00000000
--- a/tests/test_encoder.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import pytest
-from psm_utils import Peptidoform, PSM, PSMList
-
-from ms2pip._utils.encoder import Encoder
-
-
-class TestEncoder:
- def test_from_peptidoform(self):
- test_cases = [
- # Peptidoform, {(target, label): (amino_acid, amino_acid_id, mass_shift)}
- ("ACDEK", {}),
- ("AC[+57.021464]DEK", {("C", "+57.021464"): ("C", 1, 57.021464)}),
- ("AC[U:4]", {("C", "UNIMOD:4"): ("C", 1, 57.021464)}),
- ("AC[formula:H3C2NO]", {("C", "Formula:H3C2NO"): ("C", 1, 57.021464)}),
- ("[Acetyl]-ACDE", {("n_term", "Acetyl"): ("n_term", -1, 42.010565)}),
- ("ACDE-[Amidated]", {("c_term", "Amidated"): ("c_term", -2, -0.984016)}),
- (
- "AC[+57.021464]DE-[Amidated]",
- {
- ("C", "+57.021464"): ("C", 1, 57.021464),
- ("c_term", "Amidated"): ("c_term", -2, -0.984016),
- },
- ),
- (
- "[Acetyl]-AC[+57.021464]DE",
- {
- ("n_term", "Acetyl"): ("n_term", -1, 42.010565),
- ("C", "+57.021464"): ("C", 1, 57.021464),
- },
- ),
- ]
-
- for peptidoform, expected_mods in test_cases:
- encoder = Encoder.from_peptidoform(Peptidoform(peptidoform))
- for key, modification in encoder.modifications.items():
- for item_key, expected_item in zip(
- ["amino_acid", "amino_acid_id", "mass_shift"], expected_mods[key]
- ):
- if isinstance(expected_item, float):
- assert modification[item_key] == pytest.approx(expected_item)
- else:
- assert modification[item_key] == expected_item
-
- def test_from_psm_list(self):
- psm_list = PSMList(psm_list=[
- PSM(peptidoform="AC[+57.021464]DEK", spectrum_id=0),
- PSM(peptidoform="AC[U:4]", spectrum_id=1),
- PSM(peptidoform="AC[formula:H3C2NO]", spectrum_id=2),
- PSM(peptidoform="[Acetyl]-ACDE", spectrum_id=3),
- PSM(peptidoform="ACDE-[Amidated]",spectrum_id= 4)
- ])
- expected = {
- ("C", "+57.021464"): {
- "mod_id": 38,
- "mass_shift": 57.021464,
- "amino_acid": "C",
- "amino_acid_id": 1,
- },
- ("C", "UNIMOD:4"): {
- "mod_id": 39,
- "mass_shift": 57.021464,
- "amino_acid": "C",
- "amino_acid_id": 1,
- },
- ("C", "Formula:H3C2NO"): {
- "mod_id": 40,
- "mass_shift": 57.02146372057,
- "amino_acid": "C",
- "amino_acid_id": 1,
- },
- ("n_term", "Acetyl"): {
- "mod_id": 41,
- "mass_shift": 42.010565,
- "amino_acid": "n_term",
- "amino_acid_id": -1,
- },
- ("c_term", "Amidated"): {
- "mod_id": 42,
- "mass_shift": -0.984016,
- "amino_acid": "c_term",
- "amino_acid_id": -2,
- },
- }
-
- encoder = Encoder.from_psm_list(psm_list)
- for modification_key, modification_dict in encoder.modifications.items():
- for item_key, expected_item in expected[modification_key].items():
- if isinstance(expected_item, float):
- assert modification_dict[item_key] == pytest.approx(expected_item)
- else:
- assert modification_dict[item_key] == expected_item
diff --git a/tests/test_spectrum_processing.py b/tests/test_spectrum_processing.py
new file mode 100644
index 00000000..2a174b48
--- /dev/null
+++ b/tests/test_spectrum_processing.py
@@ -0,0 +1,138 @@
+import numpy as np
+from psm_utils import PSM, Peptidoform
+
+from ms2pip._spectrum_processing import (
+ annotate_spectrum,
+ proforma_to_mass_shift,
+ targets_from_annotations,
+)
+
+
+def test_targets_from_annotations_basic():
+ """Test basic target extraction from annotations."""
+ # 4-residue peptide → 3 cleavage sites
+ peak_annotations = [
+ [("b", 1, 1)], # peak 0 matches b1
+ [], # peak 1 unmatched
+ [("y", 2, 1)], # peak 2 matches y2
+ ]
+ intensity = np.array([5.0, 3.0, 8.0], dtype=np.float32)
+
+ targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4)
+
+ floor = np.float32(np.log2(0.001))
+ assert targets["b"][0] == 5.0 # b1 at position 0
+ assert targets["b"][1] == floor # b2 unmatched
+ assert targets["b"][2] == floor # b3 unmatched
+ assert targets["y"][0] == floor # y1 unmatched
+ assert targets["y"][1] == 8.0 # y2 at position 1
+ assert targets["y"][2] == floor # y3 unmatched
+
+
+def test_targets_from_annotations_charge2():
+ """Test that charge-2 annotations map to b2/y2 ion types."""
+ peak_annotations = [
+ [("b", 1, 2)], # charge 2 → maps to "b2"
+ ]
+ intensity = np.array([10.0], dtype=np.float32)
+
+ targets = targets_from_annotations(
+ peak_annotations, intensity, ["b", "y", "b2", "y2"], seq_len=4
+ )
+
+ assert targets["b2"][0] == 10.0
+ floor = np.float32(np.log2(0.001))
+ assert targets["b"][0] == floor # charge-1 b is not matched
+
+
+def test_targets_from_annotations_highest_intensity():
+ """Test that the highest intensity is kept when multiple peaks match."""
+ peak_annotations = [
+ [("b", 1, 1)], # peak 0: b1 with intensity 3.0
+ [("b", 1, 1)], # peak 1: b1 with intensity 7.0
+ ]
+ intensity = np.array([3.0, 7.0], dtype=np.float32)
+
+ targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4)
+
+ assert targets["b"][0] == 7.0 # highest wins
+
+
+def test_targets_from_annotations_ignores_unknown_ion_types():
+ """Test that annotations for ion types not in the target list are ignored."""
+ peak_annotations = [
+ [("c", 1, 1)], # c-ion not in requested types
+ ]
+ intensity = np.array([10.0], dtype=np.float32)
+
+ targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4)
+
+ floor = np.float32(np.log2(0.001))
+ assert all(v == floor for v in targets["b"])
+ assert all(v == floor for v in targets["y"])
+
+
+def test_targets_from_annotations_fragment_annotation_objects():
+ """Test that FragmentAnnotation objects (not tuples) also work."""
+ from ms2rescore_rs import FragmentAnnotation
+
+ peak_annotations = [
+ [FragmentAnnotation(series="b", position=1, charge=1)],
+ ]
+ intensity = np.array([5.0], dtype=np.float32)
+
+ targets = targets_from_annotations(peak_annotations, intensity, ["b", "y"], seq_len=4)
+
+ assert targets["b"][0] == 5.0
+
+
+def test_annotate_spectrum():
+ """Test that annotate_spectrum returns annotations for matching peaks."""
+ spectrum = __import__("ms2pip.spectrum", fromlist=["ObservedSpectrum"]).ObservedSpectrum(
+ mz=np.array([72.044, 175.054, 290.081], dtype=np.float32),
+ intensity=np.array([100.0, 200.0, 300.0], dtype=np.float32),
+ identifier="test",
+ precursor_mz=250.0,
+ precursor_charge=2,
+ retention_time=100.0,
+ )
+ psm = PSM(peptidoform=Peptidoform("ACDE/2"), spectrum_id="test")
+
+ annotations = annotate_spectrum(spectrum, psm, "HCD", 0.02, "Da")
+
+ # Should return one list of annotations per peak
+ assert len(annotations) == 3
+ # At least some peaks should be annotated (b1 at ~72.044, b2 at ~175.054)
+ annotated_peaks = [i for i, anns in enumerate(annotations) if len(anns) > 0]
+ assert len(annotated_peaks) > 0
+
+
+def test_proforma_to_mass_shift_unmodified():
+ result = proforma_to_mass_shift(Peptidoform("PEPTIDE/2"))
+ assert result == "PEPTIDE/2"
+
+
+def test_proforma_to_mass_shift_unimod_names():
+ result = proforma_to_mass_shift(Peptidoform("PEPTC[UNIMOD:Carbamidomethyl]M[UNIMOD:Oxidation]IDE/2"))
+ assert "[+" in result
+ assert "Carbamidomethyl" not in result
+ assert "Oxidation" not in result
+ assert result.startswith("PEPTC[+57.0215]M[+15.9949]")
+ assert result.endswith("/2")
+
+
+def test_proforma_to_mass_shift_nterm():
+ result = proforma_to_mass_shift(Peptidoform("[UNIMOD:Acetyl]-PEPTIDE/2"))
+ assert result.startswith("[+42.0106]-PEPTIDE")
+ assert result.endswith("/2")
+
+
+def test_proforma_to_mass_shift_already_mass_shift():
+ result = proforma_to_mass_shift(Peptidoform("PEPTC[+57.0215]IDE/2"))
+ assert result == "PEPTC[+57.0215]IDE/2"
+
+
+def test_proforma_to_mass_shift_no_charge():
+ result = proforma_to_mass_shift(Peptidoform("PEPTIDE"))
+ assert result == "PEPTIDE"
+ assert "/" not in result