From 6986d74b8c1bbf5bf1e9f7c3558a79bfd6b83305 Mon Sep 17 00:00:00 2001 From: julianu Date: Mon, 28 Oct 2024 08:52:56 +0000 Subject: [PATCH 1/2] extracting queries without dependencies and added gitignore --- .gitignore | 176 ++++++++++++++++++++++++++ bin/generate_queries_from_mgf_mzml.py | 87 +++++++++---- 2 files changed, 238 insertions(+), 25 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..de24a29 --- /dev/null +++ b/.gitignore @@ -0,0 +1,176 @@ +## cmake +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps +CMakeUserPresets.json + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/bin/generate_queries_from_mgf_mzml.py b/bin/generate_queries_from_mgf_mzml.py index f51388d..201f03a 100755 --- a/bin/generate_queries_from_mgf_mzml.py +++ b/bin/generate_queries_from_mgf_mzml.py @@ -2,7 +2,7 @@ import argparse -import pyopenms +import re # Constants HYDROGEN_MONO_MASS = 1.007825035 @@ -71,32 +71,69 @@ def parse_mgf(in_file, ppm): def parse_mzml(mzml_file, ppm): ''' Read all MS2 precursors from mzML ''' - # Load MZML - exp = pyopenms.MSExperiment() - pyopenms.MzMLFile().load(mzml_file, exp) - entries = [] - for spectrum in exp.getSpectra(): - if spectrum.getMSLevel() == 2: - precs = spectrum.getPrecursors() - if len(precs) != 1: - raise Exception("Unexpected number of precursors in a MS2-Spectrum") - else: - # Get mz and Charge - pepmass = precs[0].getMZ() - charge = precs[0].getCharge() - - # Convert peptide mass to Da - da = (float(pepmass) * float(charge)) - (HYDROGEN_MONO_MASS * float(charge)) - da = da - WATER_MASS # Subsract H2O-Mass, due to Protein-Graphs not encoding the water mass - - # Get lower and upper limit - lower = da - (da / 1000000) * ppm - upper = da + (da / 1000000) * ppm - - # Append to entries - entries.append((lower, upper)) + with open(mzml_file, "r") as in_file: + # Save all queries in entries + entries = [] + + # Iterate linewise + in_spectrum = False + in_precursor = False + pepmass = -1 + charge = 0 + for line in in_file: + # Set in_entry, since we are in a MS2-Spectrum-Entry + if line.strip().startswith(" 0) and (charge != 0): + # If yes: save the query: + # Convert peptide mass to Da + da = (float(pepmass) * float(charge)) - (HYDROGEN_MONO_MASS * float(charge)) + da = da - WATER_MASS # Subtract H2O-Mass, as Protein-Graphs don't encode the water mass + + # Get lower and upper limit + lower = da - (da / 1000000) * args.ppm + upper = da + (da / 1000000) * args.ppm + + # Append to entries + entries.append((lower, upper)) + elif (pepmass > 0): + print("no charge: " + pepmass) + + in_precursor = False + pepmass = -1 + charge = 0 + + + if line.strip().startswith(" Date: Mon, 28 Oct 2024 08:54:45 +0000 Subject: [PATCH 2/2] pyopenms does not seem to be necessary --- compile_and_setup_depencies.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/compile_and_setup_depencies.sh b/compile_and_setup_depencies.sh index 6e81ebe..8aa4e50 100755 --- a/compile_and_setup_depencies.sh +++ b/compile_and_setup_depencies.sh @@ -86,9 +86,6 @@ pip install lxml pip install git+https://github.com/mafreitas/tdf2mzml # Needed to convert Bruker to mgf pip install alphatims -# Needed to convert Bruker to mgf -pip install pyopenms - ##### Make all files in bin executable (excluding sub-directories) to be visible by processes in Nextflow