From a2b2da201e03f61a20e1adc700aa63697ccf234c Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:37:12 +0000 Subject: [PATCH 01/12] setup dvc in repo --- .devcontainer/devcontainer.json | 1 + .dvc/.gitignore | 3 +++ .dvc/config | 7 +++++++ .dvcignore | 3 +++ tests/data/.gitignore | 1 + tests/data/derivatives-infant.dvc | 6 ++++++ 6 files changed, 21 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 tests/data/.gitignore create mode 100644 tests/data/derivatives-infant.dvc diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b290e090..45b4c94a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,6 +4,7 @@ "remoteUser": "gitpod", "runArgs": ["--privileged"], + "postCreateCommand": "bash -c 'conda install -c conda-forge dvc dvc-gdrive && dvc --version'", // Configure tool-specific properties. "customizations": { // Configure properties specific to VS Code. diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..448fa06b --- /dev/null +++ b/.dvc/config @@ -0,0 +1,7 @@ +[core] + remote = storage +['remote "storage"'] + url = gdrive://1f5Lw8-HRvX_QzNyygYvhM8S9aBoR-aS- + gdrive_client_id = 58977874598-7k0o50klkpujhjpll5960o4bac1c5u9g.apps.googleusercontent.com + gdrive_client_secret = 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 + gdrive_use_service_account = true diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/tests/data/.gitignore b/tests/data/.gitignore new file mode 100644 index 00000000..af6746bf --- /dev/null +++ b/tests/data/.gitignore @@ -0,0 +1 @@ +/derivatives-infant diff --git a/tests/data/derivatives-infant.dvc b/tests/data/derivatives-infant.dvc new file mode 100644 index 00000000..af64e4ef --- /dev/null +++ b/tests/data/derivatives-infant.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 69c8723df1438101d77d1b87e58c8280.dir + size: 348426446 + nfiles: 13 + hash: md5 + path: derivatives-infant From cac8d6abe6b6e9ea6b2d5bf45a0b4ca53fd46607 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:37:53 +0000 Subject: [PATCH 02/12] add nf-boost as plugin to reduce memory usage in test case --- nextflow.config | 6 ++++++ nextflow_schema.json | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/nextflow.config b/nextflow.config index f9825475..475048fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -268,6 +268,7 @@ params { // Boilerplate options outdir = null publish_dir_mode = 'copy' + cleanup = false email = null email_on_fail = null plaintext_email = false @@ -548,6 +549,11 @@ manifest { // Nextflow plugins plugins { id 'nf-schema@2.3.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-boost' // Cleaning up intermediate file while the pipeline is running. +} + +boost { + cleanup = params.cleanup } validation { diff --git a/nextflow_schema.json b/nextflow_schema.json index e1399bf4..cd52dcd8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1467,6 +1467,14 @@ "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, + "cleanup": { + "type": "boolean", + "default": false, + "description": "Remove intermediate files after pipeline completion.", + "help_text": "If set to true, the pipeline will remove all intermediate files after the pipeline has completed. This can save disk space but may make debugging more difficult and option -resume won't be available. Use with caution", + "fa_icon": "fas fa-copy", + "hidden": true + }, "lean_output": { "type": "boolean", "description": "Do not copy intermediate files to output directory.", From 12f84fef2c9b1f68db1240043c7b6b81b11cce2a Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:38:45 +0000 Subject: [PATCH 03/12] set up sample test case with data from dvc --- .github/workflows/ci.yml | 10 +++++++ conf/base.config | 2 +- nf-test.config | 2 +- tests/nextflow.config | 4 +++ tests/run_connectomics.nf.test | 44 +++++++++++++++++++++++++++++ tests/run_connectomics.nf.test.snap | 10 +++++++ 6 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 tests/run_connectomics.nf.test create mode 100644 tests/run_connectomics.nf.test.snap diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 72aadfde..9c5e9418 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,6 +36,7 @@ jobs: - "docker" - "singularity" test_name: + - "run_connectomics.nf.test" - "chained.nf.test" - "connectomics.nf.test" - "tracking.nf.test" @@ -82,6 +83,15 @@ jobs: mkdir -p $NXF_SINGULARITY_CACHEDIR mkdir -p $NXF_SINGULARITY_LIBRARYDIR + - name: Set up DVC + uses: iterative/setup-dvc@v1 + + - name: Fetch test dataset + env: | + GDRIVE_CREDENTIALS_DATA: ${{ secrets.DVC_KEY }} + run: | + dvc pull + - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" run: | nf-test test ${GITHUB_WORKSPACE}/tests/${{ matrix.test_name }} --ci --profile ${{ matrix.profile }} diff --git a/conf/base.config b/conf/base.config index 4ee07a14..10192df8 100644 --- a/conf/base.config +++ b/conf/base.config @@ -32,7 +32,7 @@ process { } withLabel:process_medium { cpus = { 4 * task.attempt * (executor.name == 'slurm' ? 2 : 1) } - memory = { 8.GB * task.attempt * (executor.name == 'slurm' ? 2 : 1) } + memory = { 10.GB * task.attempt * (executor.name == 'slurm' ? 2 : 1) } time = { 8.h * task.attempt * (executor.name == 'slurm' ? 2 : 1) } } withLabel:process_high { diff --git a/nf-test.config b/nf-test.config index 0c8518a1..0e40da9f 100644 --- a/nf-test.config +++ b/nf-test.config @@ -7,7 +7,7 @@ config { withTrace true autoSort false ignore "modules/**", "subworkflows/**" - options "-dump-channels -stub-run" + options "-dump-channels" plugins { load "nft-utils@0.0.3" } diff --git a/tests/nextflow.config b/tests/nextflow.config index 207cbf1c..e08a163e 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -3,6 +3,10 @@ Nextflow config file for running tests ======================================================================================== */ +params { + outdir = "output/" + publish_dir_mode = "copy" +} process { withName: '.*' { diff --git a/tests/run_connectomics.nf.test b/tests/run_connectomics.nf.test new file mode 100644 index 00000000..675bc6c6 --- /dev/null +++ b/tests/run_connectomics.nf.test @@ -0,0 +1,44 @@ +nextflow_pipeline { + + name "Test nf-pediatric -profile connectomics,infant" + script "../main.nf" + + test("nf-pediatric -profile connectomics,infant") { + + when { + params { + + params.input_deriv = "$projectDir/tests/data/derivatives-infant/" + params.outdir = "$outputDir" + + params.connectomics = true + + params.infant = true + + params.commit_para_diff = "1.2E-3" + params.commit_iso_diff = "2.0E-3" + params.decompose_min_len = 10 + params.decompose_outlier_threshold = 0.4 + + params.cleanup = true + } + } + + then { + // stable name: All files + folders in ${params.outdir}/ with a stable name. + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + assertAll( + { assert workflow.success }, + { assert snapshot( + // Number of successfully completed tasks + workflow.trace.succeeded().size(), + // Remove the nextflow version from the versions.yml because we test it using different nextflow versions. + removeNextflowVersion("$outputDir/pipeline_info/nf-pediatric_software_mqc_versions.yml"), + // All stable name. + stable_name + ).md5().match() + } + ) + } + } +} diff --git a/tests/run_connectomics.nf.test.snap b/tests/run_connectomics.nf.test.snap new file mode 100644 index 00000000..35ac84b8 --- /dev/null +++ b/tests/run_connectomics.nf.test.snap @@ -0,0 +1,10 @@ +{ + "nf-pediatric -profile connectomics,infant": { + "content": "4e3209e3c6e93e9901a525e77ce460fb", + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-02-17T20:34:19.897718" + } +} \ No newline at end of file From 4c21d3baa59fb564d122da0593ca466680c75b57 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:48:23 +0000 Subject: [PATCH 04/12] fix typo --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9c5e9418..a30e92e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -87,8 +87,8 @@ jobs: uses: iterative/setup-dvc@v1 - name: Fetch test dataset - env: | - GDRIVE_CREDENTIALS_DATA: ${{ secrets.DVC_KEY }} + env: + GDRIVE_CREDENTIALS_DATA: ${{ secrets.DVC_KEY }} run: | dvc pull From ed02b8b6cf6bb0c34731790fd2d85cf33c0c590e Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:48:41 +0000 Subject: [PATCH 05/12] fix linting --- .prettierignore | 1 + nextflow.config | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.prettierignore b/.prettierignore index edd29f01..5eee3045 100644 --- a/.prettierignore +++ b/.prettierignore @@ -11,3 +11,4 @@ testing* *.pyc bin/ ro-crate-metadata.json +*.dvc diff --git a/nextflow.config b/nextflow.config index 475048fa..56da57a4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -549,11 +549,11 @@ manifest { // Nextflow plugins plugins { id 'nf-schema@2.3.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet - id 'nf-boost' // Cleaning up intermediate file while the pipeline is running. + id 'nf-boost@0.4.0' // Cleaning up intermediate file while the pipeline is running. } boost { - cleanup = params.cleanup + cleanup = params.cleanup } validation { From 6067b4dd6df76ad2d61b2e27965b0e5067696a59 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:59:51 +0000 Subject: [PATCH 06/12] fix dvc credentials --- .dvc/config | 2 -- .github/workflows/ci.yml | 11 +++++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.dvc/config b/.dvc/config index 448fa06b..1b6039da 100644 --- a/.dvc/config +++ b/.dvc/config @@ -2,6 +2,4 @@ remote = storage ['remote "storage"'] url = gdrive://1f5Lw8-HRvX_QzNyygYvhM8S9aBoR-aS- - gdrive_client_id = 58977874598-7k0o50klkpujhjpll5960o4bac1c5u9g.apps.googleusercontent.com - gdrive_client_secret = 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 gdrive_use_service_account = true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a30e92e1..dd122ed1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -86,6 +86,17 @@ jobs: - name: Set up DVC uses: iterative/setup-dvc@v1 + - name: Setup Google Drive credentials + env: + GDRIVE_CLIENT_ID: ${{ secrets.GDRIVE_CLIENT_ID }} + GDRIVE_CLIENT_SECRET: ${{ secrets.GDRIVE_CLIENT_SECRET }} + GDRIVE_SERVICE_ACCOUNT_JSON: ${{ secrets.GDRIVE_SERVICE_ACCOUNT_JSON_FILE_PATH }} + run: | + echo "$GDRIVE_SERVICE_ACCOUNT_JSON" > dvc-remote-connections.json + dvc remote modify storage gdrive_client_id $GDRIVE_CLIENT_ID + dvc remote modify storage gdrive_client_secret $GDRIVE_CLIENT_SECRET + dvc remote modify storage gdrive_service_account_json_file_path dvc-remote-connections.json + - name: Fetch test dataset env: GDRIVE_CREDENTIALS_DATA: ${{ secrets.DVC_KEY }} From d9fffe92dd27fa667ac20d7fa74228a82ab26313 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 02:59:59 +0000 Subject: [PATCH 07/12] fix lint --- .editorconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.editorconfig b/.editorconfig index 6d9b74cc..a5eaf1c0 100644 --- a/.editorconfig +++ b/.editorconfig @@ -28,8 +28,8 @@ indent_style = unset [/assets/email*] indent_size = unset -# ignore python and markdown -[*.{py,md}] +# ignore python, markdown, and dvc files +[*.{py,md,dvc}] indent_style = unset # ignore ro-crate metadata files From 656fd27cca73a1fa8a822a0bc504a753140a5f00 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 03:08:23 +0000 Subject: [PATCH 08/12] specify as strings --- .github/workflows/ci.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dd122ed1..fe8ae526 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -93,13 +93,11 @@ jobs: GDRIVE_SERVICE_ACCOUNT_JSON: ${{ secrets.GDRIVE_SERVICE_ACCOUNT_JSON_FILE_PATH }} run: | echo "$GDRIVE_SERVICE_ACCOUNT_JSON" > dvc-remote-connections.json - dvc remote modify storage gdrive_client_id $GDRIVE_CLIENT_ID - dvc remote modify storage gdrive_client_secret $GDRIVE_CLIENT_SECRET + dvc remote modify storage gdrive_client_id "$GDRIVE_CLIENT_ID" + dvc remote modify storage gdrive_client_secret "$GDRIVE_CLIENT_SECRET" dvc remote modify storage gdrive_service_account_json_file_path dvc-remote-connections.json - name: Fetch test dataset - env: - GDRIVE_CREDENTIALS_DATA: ${{ secrets.DVC_KEY }} run: | dvc pull From 7a5d709e0bcbc72aff3791f665f0562b267237e9 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 03:24:21 +0000 Subject: [PATCH 09/12] debugging --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fe8ae526..67e344f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -95,7 +95,7 @@ jobs: echo "$GDRIVE_SERVICE_ACCOUNT_JSON" > dvc-remote-connections.json dvc remote modify storage gdrive_client_id "$GDRIVE_CLIENT_ID" dvc remote modify storage gdrive_client_secret "$GDRIVE_CLIENT_SECRET" - dvc remote modify storage gdrive_service_account_json_file_path dvc-remote-connections.json + dvc remote modify --local storage gdrive_service_account_json_file_path dvc-remote-connections.json - name: Fetch test dataset run: | From ea4c18dae49e82d0c07b0f55b5d1584cf4e7e6a4 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 14:23:41 +0000 Subject: [PATCH 10/12] use environmental variable --- .dvc/.gitignore | 1 + .github/workflows/ci.yml | 13 ++----------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/.dvc/.gitignore b/.dvc/.gitignore index 528f30c7..4445cea1 100644 --- a/.dvc/.gitignore +++ b/.dvc/.gitignore @@ -1,3 +1,4 @@ /config.local /tmp /cache +nf-pediatric-test-data-* diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 67e344f5..f02563bd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -86,18 +86,9 @@ jobs: - name: Set up DVC uses: iterative/setup-dvc@v1 - - name: Setup Google Drive credentials + - name: Pull data with DVC env: - GDRIVE_CLIENT_ID: ${{ secrets.GDRIVE_CLIENT_ID }} - GDRIVE_CLIENT_SECRET: ${{ secrets.GDRIVE_CLIENT_SECRET }} - GDRIVE_SERVICE_ACCOUNT_JSON: ${{ secrets.GDRIVE_SERVICE_ACCOUNT_JSON_FILE_PATH }} - run: | - echo "$GDRIVE_SERVICE_ACCOUNT_JSON" > dvc-remote-connections.json - dvc remote modify storage gdrive_client_id "$GDRIVE_CLIENT_ID" - dvc remote modify storage gdrive_client_secret "$GDRIVE_CLIENT_SECRET" - dvc remote modify --local storage gdrive_service_account_json_file_path dvc-remote-connections.json - - - name: Fetch test dataset + GDRIVE_CREDENTIALS_DATA: ${{ secrets.GDRIVE_SERVICE_ACCOUNT_JSON_FILE_PATH }} run: | dvc pull From 269ce3c17189ff6e2fc2ef993594ab6f67122bef Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 14:37:37 +0000 Subject: [PATCH 11/12] debugging --- .github/workflows/ci.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f02563bd..cd5b728a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,12 +37,12 @@ jobs: - "singularity" test_name: - "run_connectomics.nf.test" - - "chained.nf.test" - - "connectomics.nf.test" - - "tracking.nf.test" - - "freesurfer.nf.test" - - "infantseg.nf.test" - - "multisubjects.nf.test" + #- "chained.nf.test" + #- "connectomics.nf.test" + #- "tracking.nf.test" + #- "freesurfer.nf.test" + #- "infantseg.nf.test" + #- "multisubjects.nf.test" isMaster: - ${{ github.base_ref == 'master' }} # Exclude conda and singularity on dev @@ -90,6 +90,8 @@ jobs: env: GDRIVE_CREDENTIALS_DATA: ${{ secrets.GDRIVE_SERVICE_ACCOUNT_JSON_FILE_PATH }} run: | + echo $GDRIVE_CREDENTIALS_DATA > gdrive-credentials.json + dvc remote modify --local storage gdrive_service_account_json_file_path $(realpath gdrive-credentials.json) dvc pull - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" From dbff8c92de98713fcf3e18092a24ae85c2f5bc76 Mon Sep 17 00:00:00 2001 From: Anthony Gagnon Date: Tue, 18 Feb 2025 15:20:37 +0000 Subject: [PATCH 12/12] replace echo by printf for correct json formatting --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd5b728a..5d9a3251 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -90,7 +90,7 @@ jobs: env: GDRIVE_CREDENTIALS_DATA: ${{ secrets.GDRIVE_SERVICE_ACCOUNT_JSON_FILE_PATH }} run: | - echo $GDRIVE_CREDENTIALS_DATA > gdrive-credentials.json + printf '%s' "$GDRIVE_CREDENTIALS_DATA" > gdrive-credentials.json dvc remote modify --local storage gdrive_service_account_json_file_path $(realpath gdrive-credentials.json) dvc pull