Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions docs/source/AdministratorGuide/Resources/computingelements.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,25 +57,22 @@ of the *ComputingElement* is located inside the corresponding site section in th
# Site administrative domain
LCG
{
# Site section
# Site section. This is the DIRAC's site name.
LCG.CNAF.it
{
# Site name
# Alternative site name (e.g. site name in GOC DB)
Name = CNAF

# List of valid CEs on the site
CE = ce01.infn.it, ce02.infn.it

# Section describing each CE
CEs
{
# Specific CE description section
# Specific CE description section. This site name is unique.
ce01.infn.it
{
# Type of the CE
# Type of the CE. "HTCondorCE" and "AREX" and "SSH" are the most common types.
CEType = HTCondorCE

# Section to describe various queue in the CE
# Section to describe various (logical) queues in the CE.
Queues
{
long
Expand All @@ -93,7 +90,6 @@ of the *ComputingElement* is located inside the corresponding site section in th

This is the general structure in which specific CE descriptions are inserted.
The CE configuration is part of the general DIRAC configuration
It can be placed in the general Configuration Service or in the local configuration of the DIRAC installation.
Examples of the configuration can be found in the :ref:`full_configuration_example`, in the *Resources/Computing* section.
You can find the options of a specific CE in the code documentation: :mod:`DIRAC.Resources.Computing`.

Expand Down
23 changes: 18 additions & 5 deletions docs/source/UserGuide/Tutorials/JobManagementAdvanced/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,16 +335,15 @@ Jobs that can (or should) run using more than 1 processor should be described as
using the "setNumberOfProcessors" method of the API::

j = Job()
j.setCPUTime(500)
j.setExecutable('echo',arguments='hello')
j.setExecutable('ls',arguments='-l')
j.setExecutable('echo', arguments='hello again')
j.setName('MP test')
...
j.setNumberOfProcessors(16)

Calling ``Job().setNumberOfProcessors()``, with a value bigger than 1,
will translate into adding also the "MultiProcessor" tag to the job description.

``Job().setNumberOfProcessors()`` takes at most 3 arguments, in this order:
``numberOfProcessors`` is the exact number of requested processors, ``minNumberOfProcessors`` is the minimum allowed, ``maxNumberOfProcessors`` the maximum.

Users can specify in the job descriptions NumberOfProcessors and WholeNode parameters, e.g.::

NumberOfProcessors = 16;
Expand All @@ -356,6 +355,20 @@ This will be translated internally into 16Processors and WholeNode tags.
This would allow resources (WN's) to put flexibly requirements on jobs to be taken, for example, avoiding single-core jobs on a multi-core nodes.


Setting memory limits
@@@@@@@@@@@@@@@@@@@@@

Jobs can (and probably should) set RAM limits.
using the "setRAMRequirements" method of the API::

j = Job()
...
j.setRAMRequirements(2500, 4000)

Calling ``Job().setRAMRequirements()`` takes 2 values, where the first is the minimum required amount of RAM (in MB) that the job requests.
The second value instead specifies the limit that should not be surpassed.


Submitting jobs with specifc requirements (e.g. GPU)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

Expand Down
1 change: 1 addition & 0 deletions src/DIRAC/Core/Utilities/CGroups2.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def systemCall(self, *args, **kwargs):
if "ceParameters" in kwargs:
if cpuLimit := kwargs["ceParameters"].get("CPULimit", None):
cores = float(cpuLimit)
# MemoryLimitMB should be the job upper limit
if memoryMB := int(kwargs["ceParameters"].get("MemoryLimitMB", 0)):
memory = memoryMB * 1024 * 1024
if kwargs["ceParameters"].get("MemoryNoSwap", "no").lower() in ("yes", "true"):
Expand Down
79 changes: 62 additions & 17 deletions src/DIRAC/Interfaces/API/Job.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
"""
Job Base Class
Job Base Class

This class provides generic job definition functionality suitable for any VO.
This class provides generic job definition functionality suitable for any VO.

Helper functions are documented with example usage for the DIRAC API. An example
script (for a simple executable) would be::
Helper functions are documented with example usage for the DIRAC API. An example
script (for a simple executable) would be::

from DIRAC.Interfaces.API.Dirac import Dirac
from DIRAC.Interfaces.API.Job import Job
from DIRAC.Interfaces.API.Dirac import Dirac
from DIRAC.Interfaces.API.Job import Job

j = Job()
j.setCPUTime(500)
j.setExecutable('/bin/echo hello')
j.setExecutable('yourPythonScript.py')
j.setExecutable('/bin/echo hello again')
j.setName('MyJobName')
j = Job()
j.setCPUTime(500)
j.setExecutable('/bin/echo hello')
j.setExecutable('yourPythonScript.py')
j.setExecutable('/bin/echo hello again')
j.setName('MyJobName')

dirac = Dirac()
jobID = dirac.submitJob(j)
print 'Submission Result: ',jobID
dirac = Dirac()
jobID = dirac.submitJob(j)
print 'Submission Result: ',jobID

Note that several executables can be provided and wil be executed sequentially.
Note that several executables can be provided and wil be executed sequentially.
"""

import os
import re
import shlex
Expand Down Expand Up @@ -517,6 +518,50 @@ def setDestination(self, destination):
return S_OK()

#############################################################################
def setRAMRequirements(self, ramRequired: int = 0, maxRAM: int = 0):
"""Helper function.
Specify the RAM requirements for the job, in MB. 0 (default) means no specific requirements.

Example usage:

>>> job = Job()
>>> job.setRAMRequirements(ramRequired=2000)
means that the job needs at least 2 GBs of RAM to work. This is taken into consideration at job's matching time.
The job definition does not specify an upper limit.
From a user's point of view this is fine (normally, not for admins).

>>> job.setRAMRequirements(ramRequired=500, maxRAM=3800)
means that the job needs 500 MBs of RAM to work. 3.8 GBs will then be the upper limit for CG2 limits.

>>> job.setRAMRequirements(ramRequired=3200, maxRAM=3200)
means that we should match this job if there is at least 3.2 available GBs of run. At the same time, CG2 will not allow to use more than that.

>>> job.setRAMRequirements(maxRAM=4000)
means that the job does not set a min amount of RAM (so can match--run "everywhere"), but the 4 GBs will then be the upper limit for CG2 limits.

>>> job.setRAMRequirements(ramRequired=8000, maxRAM=4000)
Makes no sense, an error will be raised
"""
if ramRequired and maxRAM and ramRequired > maxRAM:
return self._reportError("Invalid settings, ramRequired is higher than maxRAM")

if ramRequired:
self._addParameter(
self.workflow,
"MinRAM",
"JDL",
ramRequired,
"MBs of RAM requested",
)
if maxRAM:
self._addParameter(
self.workflow,
"MaxRAM",
"JDL",
maxRAM,
"Max MBs of RAM to be used",
)

def setNumberOfProcessors(self, numberOfProcessors=None, minNumberOfProcessors=None, maxNumberOfProcessors=None):
"""Helper function to set the number of processors required by the job.

Expand Down Expand Up @@ -709,7 +754,7 @@ def setTag(self, tags):
Example usage:

>>> job = Job()
>>> job.setTag( ['WholeNode','8GBMemory'] )
>>> job.setTag( ['WholeNode'] )

:param tags: single tag string or a list of tags
:type tags: str or python:list
Expand Down
69 changes: 37 additions & 32 deletions src/DIRAC/Resources/Computing/ComputingElement.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
""" The Computing Element class is a base class for all the various
types CEs. It serves several purposes:
"""The Computing Element class is a base class for all the various
types CEs. It serves several purposes:

- collects general CE related parameters to generate CE description
for the job matching
- provides logic for evaluation of the number of available CPU slots
- provides logic for the proxy renewal while executing jobs
- collects general CE related parameters to generate CE description
for the job matching
- provides logic for evaluation of the number of available CPU slots
- provides logic for the proxy renewal while executing jobs

The CE parameters are collected from the following sources, in hierarchy
descending order:
The CE parameters are collected from the following sources, in hierarchy
descending order:

- parameters provided through setParameters() method of the class
- parameters in /LocalSite configuration section
- parameters in /LocalSite/<ceName>/ResourceDict configuration section
- parameters in /LocalSite/ResourceDict configuration section
- parameters in /LocalSite/<ceName> configuration section
- parameters in /Resources/Computing/<ceName> configuration section
- parameters in /Resources/Computing/CEDefaults configuration section
- parameters provided through setParameters() method of the class
- parameters in /LocalSite configuration section
- parameters in /LocalSite/<ceName>/ResourceDict configuration section
- parameters in /LocalSite/ResourceDict configuration section
- parameters in /LocalSite/<ceName> configuration section
- parameters in /Resources/Computing/<ceName> configuration section
- parameters in /Resources/Computing/CEDefaults configuration section

The ComputingElement objects are usually instantiated with the help of
ComputingElementFactory.
The ComputingElement objects are usually instantiated with the help of
ComputingElementFactory.

The ComputingElement class can be considered abstract. 3 kinds of abstract ComputingElements
can be distinguished from it:
The ComputingElement class can be considered abstract. 3 kinds of abstract ComputingElements
can be distinguished from it:

- Remote ComputingElement: includes methods to interact with a remote ComputingElement
(e.g. HtCondorCEComputingElement, AREXComputingElement).
- Inner ComputingElement: includes methods to locally interact with an underlying worker node.
It is worth noting that an Inner ComputingElement provides synchronous submission
(the submission of a job is blocking the execution until its completion). It deals with one job at a time.
- Inner Pool ComputingElement: includes methods to locally interact with Inner ComputingElements asynchronously.
It can manage a pool of jobs running simultaneously.
- Remote ComputingElement: includes methods to interact with a remote ComputingElement
(e.g. HtCondorCEComputingElement, AREXComputingElement).
- Inner ComputingElement: includes methods to locally interact with an underlying worker node.
It is worth noting that an Inner ComputingElement provides synchronous submission
(the submission of a job is blocking the execution until its completion). It deals with one job at a time.
- Inner Pool ComputingElement: includes methods to locally interact with Inner ComputingElements asynchronously.
It can manage a pool of jobs running simultaneously.

To configure the use of Tokens for CEs:
To configure the use of Tokens for CEs:

* the CE is able to receive any token. Validation: 'Tag = Token' should be included in the CE parameters.
* the CE is able to receive VO-specifc tokens. Validation: 'Tag = Token:<VO>' should be included in the CE parameters.
* the CE is able to receive any token. Validation: 'Tag = Token' should be included in the CE parameters.
* the CE is able to receive VO-specifc tokens. Validation: 'Tag = Token:<VO>' should be included in the CE parameters.

"""

Expand All @@ -50,6 +50,7 @@
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import (
getNumberOfGPUs,
getNumberOfProcessors,
getAvailableRAM,
)

INTEGER_PARAMETERS = ["CPUTime", "NumberOfProcessors", "NumberOfPayloadProcessors", "MaxRAM"]
Expand Down Expand Up @@ -211,12 +212,14 @@ def setParameters(self, ceOptions):
generalCEDict.update(self.ceParameters)
self.ceParameters = generalCEDict

# If NumberOfProcessors/GPUs is present in the description but is equal to zero
# If NumberOfProcessors/GPUs/MaxRAM is present in the description but is equal to zero
# interpret it as needing local evaluation
if self.ceParameters.get("NumberOfProcessors", -1) == 0:
if int(self.ceParameters.get("NumberOfProcessors", -1)) == 0:
self.ceParameters["NumberOfProcessors"] = getNumberOfProcessors()
if self.ceParameters.get("NumberOfGPUs", -1) == 0:
if int(self.ceParameters.get("NumberOfGPUs", -1)) == 0:
self.ceParameters["NumberOfGPUs"] = getNumberOfGPUs()
if int(self.ceParameters.get("MaxRAM", 0)) == 0:
self.ceParameters["MaxRAM"] = getAvailableRAM()

for key in ceOptions:
if key in INTEGER_PARAMETERS:
Expand Down Expand Up @@ -252,6 +255,7 @@ def available(self):
runningJobs = result["RunningJobs"]
waitingJobs = result["WaitingJobs"]
availableProcessors = result.get("AvailableProcessors")

ceInfoDict = dict(result)

maxTotalJobs = int(self.ceParameters.get("MaxTotalJobs", 0))
Expand Down Expand Up @@ -404,6 +408,7 @@ def getDescription(self):
result = self.getCEStatus()
if result["OK"]:
ceDict["NumberOfProcessors"] = result.get("AvailableProcessors", result.get("NumberOfProcessors", 1))
ceDict["MaxRAM"] = result.get("AvailableRAM", result.get("MaxRAM", 1024))
else:
self.log.error(
"Failure getting CE status", "(we keep going without the number of waiting and running pilots/jobs)"
Expand Down
3 changes: 3 additions & 0 deletions src/DIRAC/Resources/Computing/InProcessComputingElement.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(self, ceUniqueID):
self.runningJobs = 0

self.processors = int(self.ceParameters.get("NumberOfProcessors", 1))
self.maxRAM = int(self.ceParameters.get("MaxRAM", 0))
self.ceParameters["MaxTotalJobs"] = 1

def submitJob(self, executableFile, proxy=None, inputs=None, **kwargs):
Expand Down Expand Up @@ -118,4 +119,6 @@ def getCEStatus(self):
result["WaitingJobs"] = 0
# processors
result["AvailableProcessors"] = self.processors
# RAM
result["AvailableRAM"] = self.maxRAM
return result
Loading
Loading