Skip to content

Commit f498935

Browse files
committed
[Edited] Fix minor bug in the main function
1 parent d836b31 commit f498935

File tree

1 file changed

+24
-116
lines changed

1 file changed

+24
-116
lines changed

ci/submit-job.py

Lines changed: 24 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -5,167 +5,78 @@
55
import sys
66
import time
77
from datetime import datetime
8-
98
import boto3
109
from botocore.compat import total_seconds
1110
from botocore.config import Config
12-
13-
14-
# AWS Batch job definition and job queue for each base CI job type.
job_type_info = {
    'ci-cpu': {
        'job_definition': 'd2l-ci-cpu-builder:2',
        'job_queue': 'D2L-CI-CPU',
    },
    'ci-cpu-push': {
        'job_definition': 'd2l-ci-cpu-builder-push:7',
        'job_queue': 'D2L-CI-CPU',
    },
    'ci-cpu-release': {
        'job_definition': 'd2l-ci-cpu-builder-release:1',
        'job_queue': 'D2L-CI-CPU',
    },
    'ci-gpu-torch': {
        'job_definition': 'd2l-ci-zh-gpu-torch:1',
        'job_queue': 'D2L-CI-GPU',
    },
    'ci-gpu-tf': {
        'job_definition': 'd2l-ci-zh-gpu-tf:1',
        'job_queue': 'D2L-CI-GPU',
    },
    'ci-gpu-mxnet': {
        'job_definition': 'd2l-ci-zh-gpu-mxnet:1',
        'job_queue': 'D2L-CI-GPU',
    },
    'ci-gpu-paddle': {
        'job_definition': 'd2l-ci-zh-gpu-paddle:1',
        'job_queue': 'D2L-CI-GPU',
    },
}

# Create push and release job types for GPUs with the same definitions.
# The key list is snapshotted first so the dict is not resized while being
# iterated; each new key refers to the very same entry object as its base type.
for job_type in [key for key in job_type_info if key.startswith('ci-gpu')]:
    job_type_info.update(
        {job_type + suffix: job_type_info[job_type] for suffix in ('-push', '-release')}
    )
5116
# Command-line interface. ArgumentDefaultsHelpFormatter appends each option's
# default value to its --help text.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# AWS connection settings.
parser.add_argument(
    '--profile',
    help='profile name of aws account.',
    type=str,
    default=None,
)
parser.add_argument(
    '--region',
    help='Default region when creating new connections',
    type=str,
    default='us-west-2',
)

# What to submit: job name, job type, and the source to check out.
parser.add_argument(
    '--name',
    help='name of the job',
    type=str,
    default='d2l-ci',
)
parser.add_argument(
    '--job-type',
    help='type of job to submit.',
    type=str,
    choices=job_type_info.keys(),
    default='ci-cpu',
)
parser.add_argument(
    '--source-ref',
    help='ref in d2l-zh main github. e.g. master, refs/pull/500/head',
    type=str,
    default='master',
)
parser.add_argument(
    '--work-dir',
    help='working directory inside the repo. e.g. scripts/preprocess',
    type=str,
    default='.',
)

# Output handling. Note that --saved-output defaults to the *string* 'None',
# not the None object; it is forwarded verbatim as a job parameter.
parser.add_argument(
    '--saved-output',
    help='output to be saved, relative to working directory. it can be either a single file or a directory',
    type=str,
    default='None',
)
# The default save path embeds the timestamp at which this module was loaded.
parser.add_argument(
    '--save-path',
    help='s3 path where files are saved.',
    type=str,
    default='batch/temp/{}'.format(datetime.now().isoformat()),
)

# Command to run and repository details.
parser.add_argument(
    '--command',
    help='command to run',
    type=str,
    default='git rev-parse HEAD | tee stdout.log',
)
parser.add_argument(
    '--remote',
    help='git repo address. https://github.com/d2l-ai/d2l-zh',
    type=str,
    default='https://github.com/d2l-ai/d2l-zh',
)
parser.add_argument(
    '--safe-to-use-script',
    help='whether the script changes from the actor is safe. We assume it is safe if the actor has write permission to our repo',
    action='store_true',
)
parser.add_argument(
    '--original-repo',
    help='name of the repo',
    type=str,
    default='d2l-zh',
)

# Waiting behaviour and the per-attempt timeout.
parser.add_argument(
    '--wait',
    help='block wait until the job completes. Non-zero exit code if job fails.',
    action='store_true',
)
parser.add_argument(
    '--timeout',
    help='job timeout in seconds',
    default=7200,
    type=int,
)
85-
86-
8731
# Parse the command line at module load; main() reads the module-level `args`.
args = parser.parse_args()

# One boto3 session for the chosen profile/region, shared by both clients.
session = boto3.Session(profile_name=args.profile, region_name=args.region)

# Both clients use the same botocore retry configuration (max_attempts=20).
config = Config(retries={'max_attempts': 20})

# `batch` submits and describes jobs; `cloudwatch` (the 'logs' service) is
# used by printLogs() to fetch log events.
batch = session.client(service_name='batch', config=config)
cloudwatch = session.client(service_name='logs', config=config)
9635

97-
9836
def printLogs(logGroupName, logStreamName, startTime):
    """Print every event of a log stream from ``startTime`` onward.

    Events are fetched page by page through the module-level ``cloudwatch``
    client, reading from the head of the stream, and each one is printed as
    ``[<UTC ISO timestamp with milliseconds>Z] <message>``.

    Args:
        logGroupName: Name of the log group that holds the stream.
        logStreamName: Name of the log stream to read.
        startTime: Earliest event timestamp to fetch, in milliseconds since
            the Unix epoch (event timestamps are divided by 1000 before being
            converted to a datetime).

    Returns:
        The timestamp of the last event printed, or ``startTime - 1`` when no
        event was returned at all.
    """
    request = {
        'logGroupName': logGroupName,
        'logStreamName': logStreamName,
        'startTime': startTime,
        'startFromHead': True,
    }
    lastTimestamp = startTime - 1

    while True:
        page = cloudwatch.get_log_events(**request)

        for event in page['events']:
            lastTimestamp = event['timestamp']
            stamp = datetime.utcfromtimestamp(lastTimestamp / 1000.0).isoformat()
            # isoformat() omits the fraction when microseconds are zero, so pad
            # with '.000' and cut to 23 characters to always show milliseconds.
            print('[{}] {}'.format((stamp + '.000')[:23] + 'Z', event['message']))

        token = page['nextForwardToken']
        # Stop when there is no forward token, or when the service hands back
        # the same token that was just sent (no further pages to read).
        if not token or request.get('nextToken') == token:
            return lastTimestamp
        request['nextToken'] = token
11952

120-
12153
def nowInMillis():
    """Return the current time as milliseconds since the Unix epoch.

    The value is truncated to whole seconds before being scaled, so the
    result is always a multiple of 1000. That second-level resolution is
    kept deliberately so callers see the same values as before.

    Uses ``time.time()`` instead of the deprecated ``datetime.utcnow()``
    (deprecated since Python 3.12), which also removes the need for the
    ``botocore.compat.total_seconds`` shim here.

    Returns:
        int: Whole seconds since 1970-01-01 UTC, multiplied by 1000.
    """
    return int(time.time()) * 1000
12457

125-
12658
def main():
59+
'''"""Auto-generated docstring for function 'main'."""'''
12760
spin = ['-', '/', '|', '\\', '-', '/', '|', '\\']
12861
logGroupName = '/aws/batch/job'
129-
130-
jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules
62+
jobName = re.sub('[^A-Za-z0-9_\\-]', '', args.name)[:128]
13163
jobType = args.job_type
13264
jobQueue = job_type_info[jobType]['job_queue']
13365
jobDefinition = job_type_info[jobType]['job_definition']
13466
wait = args.wait
135-
13667
safe_to_use_script = 'False'
13768
if args.safe_to_use_script:
13869
safe_to_use_script = 'True'
139-
140-
parameters = {
141-
'SOURCE_REF': args.source_ref,
142-
'WORK_DIR': args.work_dir,
143-
'SAVED_OUTPUT': args.saved_output,
144-
'SAVE_PATH': args.save_path,
145-
'COMMAND': f"\"{args.command}\"", # wrap command with double quotation mark, so that batch can treat it as a single command
146-
'REMOTE': args.remote,
147-
'SAFE_TO_USE_SCRIPT': safe_to_use_script,
148-
'ORIGINAL_REPO': args.original_repo
149-
}
150-
kwargs = dict(
151-
jobName=jobName,
152-
jobQueue=jobQueue,
153-
jobDefinition=jobDefinition,
154-
parameters=parameters,
155-
)
70+
parameters = {'SOURCE_REF': args.source_ref, 'WORK_DIR': args.work_dir, 'SAVED_OUTPUT': args.saved_output, 'SAVE_PATH': args.save_path, 'COMMAND': f'"{args.command}"', 'REMOTE': args.remote, 'SAFE_TO_USE_SCRIPT': safe_to_use_script, 'ORIGINAL_REPO': args.original_repo}
71+
kwargs = dict(jobName=jobName, jobQueue=jobQueue, jobDefinition=jobDefinition, parameters=parameters)
15672
if args.timeout is not None:
15773
kwargs['timeout'] = {'attemptDurationSeconds': args.timeout}
15874
submitJobResponse = batch.submit_job(**kwargs)
159-
16075
jobId = submitJobResponse['jobId']
161-
162-
# Export Batch_JobID to Github Actions Environment Variable
16376
with open(os.environ['GITHUB_ENV'], 'a') as f:
16477
f.write(f'Batch_JobID={jobId}\n')
16578
os.environ['batch_jobid'] = jobId
166-
16779
print('Submitted job [{} - {}] to the job queue [{}]'.format(jobName, jobId, jobQueue))
168-
16980
spinner = 0
17081
running = False
17182
status_set = set()
@@ -181,7 +92,6 @@ def main():
18192
print('=' * 80)
18293
print('Job [{} - {}] {}'.format(jobName, jobId, status))
18394
sys.exit(status == 'FAILED')
184-
18595
elif status == 'RUNNING':
18696
logStreamName = describeJobsResponse['jobs'][0]['container']['logStreamName']
18797
if not running:
@@ -193,10 +103,8 @@ def main():
193103
startTime = printLogs(logGroupName, logStreamName, startTime) + 1
194104
elif status not in status_set:
195105
status_set.add(status)
196-
print('\rJob [%s - %s] is %-9s... %s' % (jobName, jobId, status, spin[spinner % len(spin)]),)
106+
print('\rJob [%s - %s] is %-9s... %s' % (jobName, jobId, status, spin[spinner % len(spin)]))
197107
sys.stdout.flush()
198108
spinner += 1
199-
200-
201109
if __name__ == '__main__':
202-
main()
110+
main()

0 commit comments

Comments
 (0)