62 changes: 0 additions & 62 deletions .github/workflows/ci.yml
@@ -66,68 +66,6 @@ jobs:
# If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
# If this is not a PR, then DO fail CI if the Codecov upload errors.
fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
test-slurm:
if: false
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
fail-fast: false
matrix:
version:
# Please note: You must specify the full Julia version number (major.minor.patch).
# This is because the value here will be directly interpolated into a download URL.
# - '1.2.0' # minimum Julia version supported in Project.toml
- '1.6.7' # previous LTS
- '1.10.7' # current LTS
- '1.11.2' # currently the latest stable release
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Print Docker version
run: |
docker --version
docker version
# This next bit of code is taken from:
# https://github.com/kleinhenz/SlurmClusterManager.jl
# Original author: Joseph Kleinhenz
# License: MIT
- name: Setup Slurm inside Docker
run: |
docker version
docker compose version
docker build --build-arg "JULIA_VERSION=${MATRIX_JULIA_VERSION:?}" -t slurm-cluster-julia -f ci/Dockerfile .
docker compose -f ci/docker-compose.yml up -d
docker ps
env:
MATRIX_JULIA_VERSION: ${{matrix.version}}
- name: Print some information for debugging purposes
run: |
docker exec -t slurmctld pwd
docker exec -t slurmctld ls -la
docker exec -t slurmctld ls -la HTCondorClusterManager
- name: Instantiate package
run: docker exec -t slurmctld julia --project=HTCondorClusterManager -e 'import Pkg; @show Base.active_project(); Pkg.instantiate(); Pkg.status()'
- name: Run tests without a Slurm allocation
run: docker exec -t slurmctld julia --project=HTCondorClusterManager -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
- name: Run tests inside salloc
run: docker exec -t slurmctld salloc -t 00:10:00 -n 2 julia --project=HTCondorClusterManager -e 'import Pkg; Pkg.test(; test_args=["slurm"], coverage=true)'
- name: Run tests inside sbatch
run: docker exec -t slurmctld HTCondorClusterManager/ci/run_my_sbatch.sh
- run: find . -type f -name '*.cov'
- name: Copy .cov files out of the Docker container
run: docker exec slurmctld /bin/bash -c 'cd /home/docker/HTCondorClusterManager && tar -cf - src/*.cov' | tar -xvf -
- run: find . -type f -name '*.cov'
# - run: find . -type f -name '*.cov' -exec cat {} \;
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v5
with:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
# If this PR is from a fork, then do NOT fail CI if the Codecov upload errors.
# If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
# If this is not a PR, then DO fail CI if the Codecov upload errors.
fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
example-pull-gcr:
runs-on: ubuntu-latest
timeout-minutes: 20
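For reference, the fail_ci_if_error expression kept above (and deleted again along with the test-slurm job) only tolerates Codecov upload failures when the run is a pull request coming from a fork. A minimal Julia sketch of the same boolean logic follows; the repository names are placeholders, not values from this PR.

# Hypothetical stand-ins for the github.* context values used in the workflow expression.
event_name          = "pull_request"
repository          = "owner/HTCondorClusterManager.jl"
head_repo_full_name = "some-fork/HTCondorClusterManager.jl"

# Mirrors: github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name
fail_ci_if_error = event_name != "pull_request" || repository == head_repo_full_name

println(fail_ci_if_error)  # false here, because the simulated PR comes from a fork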
21 changes: 0 additions & 21 deletions ci/Dockerfile

This file was deleted.

48 changes: 0 additions & 48 deletions ci/docker-compose.yml

This file was deleted.

14 changes: 0 additions & 14 deletions ci/my_sbatch.sh

This file was deleted.

14 changes: 0 additions & 14 deletions ci/run_my_sbatch.sh

This file was deleted.

70 changes: 0 additions & 70 deletions docs/sge.md

This file was deleted.

18 changes: 0 additions & 18 deletions slurm_test.jl

This file was deleted.

18 changes: 6 additions & 12 deletions src/HTCondorClusterManager.jl
@@ -1,23 +1,17 @@
module HTCondorClusterManager

using Distributed
using Sockets
using Pkg
import Distributed
import Sockets
import Pkg

using Distributed: launch, manage, kill, init_worker, connect

export launch, manage, kill, init_worker, connect
import Distributed: launch, manage, kill, init_worker, connect


worker_cookie() = begin Distributed.init_multi(); cluster_cookie() end
worker_arg() = `--worker=$(worker_cookie())`


# PBS doesn't have the same semantics as SGE wrt to file accumulate,
# a different solution will have to be found
include("qsub.jl")
include("scyld.jl")
include("condor.jl")
include("slurm.jl")
include("affinity.jl")
include("elastic.jl")

end
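For context, moving from using Distributed plus an export list to the explicit import Distributed: launch, manage, kill, init_worker, connect above is what lets the package add methods to Distributed's own generic functions, as the qualified definitions in src/condor.jl below do. A minimal sketch of that pattern, not part of this PR (the MyManager type and the empty launch body are illustrative only):

import Distributed

# Illustrative manager type; the real one in this package is HTCManager in src/condor.jl.
struct MyManager <: Distributed.ClusterManager
    np::Int
end

# Writing the name fully qualified (Distributed.launch) extends the existing generic
# function instead of defining a new, unrelated launch in this module.
function Distributed.launch(manager::MyManager, params::Dict,
                            instances_arr::Array, c::Condition)
    # A real implementation would start manager.np workers and push a
    # Distributed.WorkerConfig for each onto instances_arr before notifying.
    notify(c)
end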
52 changes: 0 additions & 52 deletions src/affinity.jl

This file was deleted.

12 changes: 6 additions & 6 deletions src/condor.jl
@@ -2,7 +2,7 @@

export HTCManager, addprocs_htc

struct HTCManager <: ClusterManager
struct HTCManager <: Distributed.ClusterManager
np::Integer
end

@@ -51,7 +51,7 @@ function condor_script(portnum::Integer, np::Integer, params::Dict)
"$tdir/$jobname.sub"
end

function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Condition)
function Distributed.launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Condition)
let
mgr_desc = "HTCondor"
msg = "The $(mgr_desc) functionality in ClusterManagers.jl is currently not actively maintained. " *
@@ -63,7 +63,7 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
end
try
portnum = rand(8000:9000)
portnum, server = listenany(ip"0.0.0.0", portnum)
portnum, server = listenany(Distributed.ip"0.0.0.0", portnum)
np = manager.np

script = condor_script(portnum, np, params)
@@ -76,7 +76,7 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond

for i=1:np
conn = accept(server)
config = WorkerConfig()
config = Distributed.WorkerConfig()

config.io = conn

@@ -92,12 +92,12 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
end
end

function kill(manager::HTCManager, id::Int64, config::WorkerConfig)
function Distributed.kill(manager::HTCManager, id::Int64, config::Distributed.WorkerConfig)
remotecall(exit,id)
close(config.io)
end

function manage(manager::HTCManager, id::Integer, config::WorkerConfig, op::Symbol)
function Distributed.manage(manager::HTCManager, id::Integer, config::Distributed.WorkerConfig, op::Symbol)
if op == :finalize
if !isnothing(config.io)
close(config.io)
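For reference, a usage sketch of the manager this file defines, assuming addprocs_htc(np) still wraps Distributed.addprocs(HTCManager(np)) as it did in ClusterManagers.jl; the worker count and the pmap call are illustrative, not taken from this PR:

import Distributed
using HTCondorClusterManager  # exports HTCManager and addprocs_htc

# Ask HTCondor for 4 worker processes; this blocks until the workers connect back.
pids = addprocs_htc(4)

# Use the workers like any other Distributed workers, then release them.
squares = Distributed.pmap(x -> x^2, 1:8)
Distributed.rmprocs(pids)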