From 44d3d934671454ec479809e47c728f8b89d38eec Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Tue, 16 Sep 2025 13:58:01 +0800 Subject: [PATCH 01/12] fix the environment error in copilot chat deployment template (#80) Co-authored-by: Rui Gao --- src/copilot-chat/deploy/copilot-chat-deployment.yaml.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template index d80c259e..0285894b 100644 --- a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template +++ b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template @@ -74,7 +74,7 @@ spec: - name: AGENT_HOST value: {{ cluster_cfg["copilot-chat"]["agent-host"] }} - name: RESTSERVER_URL - value: {{ cluster_cfg["copilot-chat"]["rest-server"]["url"] }} + value: {{ cluster_cfg["rest-server"]["url"] }} - name: COPILOT_VALID_VCS value: {{ cluster_cfg["copilot-chat"]["valid-vcs"] }} - name: ENVIRONMENT From 9b313b1d02d01722cd2ea923ed13fb404382a03e Mon Sep 17 00:00:00 2001 From: zhogu <57975490+zhogu@users.noreply.github.com> Date: Wed, 17 Sep 2025 10:36:36 +0800 Subject: [PATCH 02/12] change rest-server request and job-server request to internal url (#79) --- src/model-proxy/deploy/model-proxy.yaml.template | 3 +++ src/model-proxy/src/proxy/model_server.go | 15 ++++++++------- src/model-proxy/src/proxy/proxy.go | 4 +++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/model-proxy/deploy/model-proxy.yaml.template b/src/model-proxy/deploy/model-proxy.yaml.template index f54e67ab..19fd93fd 100644 --- a/src/model-proxy/deploy/model-proxy.yaml.template +++ b/src/model-proxy/deploy/model-proxy.yaml.template @@ -29,6 +29,9 @@ spec: - "--retry={{ cluster_cfg['model-proxy']['retry'] }}" - "--modelkey={{ cluster_cfg['model-proxy']['modelkey'] }}" - "--logdir=/usr/local/ltp/model-proxy/logs" + env: + - name: REST_SERVER_URI + value: {{ cluster_cfg["rest-server"]["uri"] }} 
volumeMounts: {%- if cluster_cfg['model-proxy']['log_pvc'] %} - name: model-proxy-log-storage diff --git a/src/model-proxy/src/proxy/model_server.go b/src/model-proxy/src/proxy/model_server.go index a2ff5027..f00c113c 100644 --- a/src/model-proxy/src/proxy/model_server.go +++ b/src/model-proxy/src/proxy/model_server.go @@ -9,6 +9,7 @@ import ( "io" "log" "net/http" + "os" "strings" "time" ) @@ -16,10 +17,6 @@ import ( // target job tag to identify model serving jobs const TARGET_JOB_TAG = "model-serving" -// REST server and Job server path segments in the URL -const REST_SERVER_PATH = "rest-server" -const JOB_SERVER_PATH = "job-server" - var httpClient = &http.Client{Timeout: 120 * time.Second} // ListModelServingJobs returns a list of model serving jobs with the given request @@ -128,7 +125,6 @@ func GetJobServerUrl(restServerUrl string, restServerToken string, jobId string) return "", fmt.Errorf("no taskRoles found for job %s", jobId) } - jobServerPath := strings.Replace(restServerUrl, REST_SERVER_PATH, JOB_SERVER_PATH, 1) // Pick first role, first taskStatus for _, role := range details.TaskRoles { if len(role.TaskStatuses) == 0 { @@ -143,7 +139,8 @@ func GetJobServerUrl(restServerUrl string, restServerToken string, jobId string) if !ok || port == "" { return "", fmt.Errorf("no http port found for job %s", jobId) } - jobServerUrl := fmt.Sprintf("%s/%s:%s", jobServerPath, ts.ContainerIp, port) + // return the internal url + jobServerUrl := fmt.Sprintf("http://%s:%s", ts.ContainerIp, port) return jobServerUrl, nil } @@ -233,7 +230,11 @@ func GetJobModelsMapping(req *http.Request, modelToken string) (map[string][]str if req == nil || req.Host == "" { return mapping, fmt.Errorf("invalid request or empty host") } - restBase := fmt.Sprintf("https://%s/rest-server", req.Host) + // get rest server base url from the os environment variable + restBase := os.Getenv("REST_SERVER_URI") + if restBase == "" { + return mapping, fmt.Errorf("REST_SERVER_URI environment variable 
is not set") + } restServerToken := req.Header.Get("Authorization") jobIDs, err := ListModelServingJobs(restBase, restServerToken) diff --git a/src/model-proxy/src/proxy/proxy.go b/src/model-proxy/src/proxy/proxy.go index b1f50164..31234357 100644 --- a/src/model-proxy/src/proxy/proxy.go +++ b/src/model-proxy/src/proxy/proxy.go @@ -75,9 +75,10 @@ func NewProxyHandler(config *types.Config) *ProxyHandler { // ReverseProxyHandler act as a reverse proxy, it will redirect the request to the destination website and return the response func (ph *ProxyHandler) ReverseProxyHandler(w http.ResponseWriter, r *http.Request) (string, []string, bool) { - + log.Printf("[*] receive a request: %s %s\n", r.Method, r.URL.String()) // handle /healthz if r.URL.Path == "/healthz" { + log.Printf("[*] receive a healthz request from %s\n", r.RemoteAddr) w.WriteHeader(http.StatusOK) if _, err := w.Write([]byte("ok")); err != nil { log.Printf("[-] Error: failed to write healthz response: %v\n", err) @@ -87,6 +88,7 @@ func (ph *ProxyHandler) ReverseProxyHandler(w http.ResponseWriter, r *http.Reque // handle /v1/models if r.URL.Path == "/v1/models" { + log.Printf("[*] receive a models list request from %s\n", r.RemoteAddr) model2Url, err := GetJobModelsMapping(r, ph.authenticator.modelKey) if err != nil { log.Printf("[-] Error: failed to get models mapping: %v\n", err) From e0330a875c399e2c483052d6afbf30a06d53b93e Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Wed, 17 Sep 2025 19:24:45 +0800 Subject: [PATCH 03/12] fix bug: rest server uri config key mismatch (#83) --- src/copilot-chat/deploy/copilot-chat-deployment.yaml.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template index 0285894b..5c4d89ee 100644 --- a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template +++ 
b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template @@ -74,7 +74,7 @@ spec: - name: AGENT_HOST value: {{ cluster_cfg["copilot-chat"]["agent-host"] }} - name: RESTSERVER_URL - value: {{ cluster_cfg["rest-server"]["url"] }} + value: {{ cluster_cfg["rest-server"]["uri"] }} - name: COPILOT_VALID_VCS value: {{ cluster_cfg["copilot-chat"]["valid-vcs"] }} - name: ENVIRONMENT From 6dc09546178729a01c5102eb97b182d44f18067a Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Thu, 18 Sep 2025 13:10:03 +0800 Subject: [PATCH 04/12] fix bug: missing CLUSTER_ID env (#85) --- src/copilot-chat/deploy/copilot-chat-deployment.yaml.template | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template index 5c4d89ee..25dfcd5b 100644 --- a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template +++ b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template @@ -89,6 +89,8 @@ spec: value: {{ cluster_cfg["copilot-chat"]["collect-dst-kusto-cluster-url"] }} - name: COLLECT_DST_KUSTO_DATABASE_NAME value: {{ cluster_cfg["copilot-chat"]["collect-dst-kusto-database-name"] }} + - name: CLUSTER_ID + value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }} ports: - containerPort: {{ cluster_cfg["copilot-chat"]["agent-port"] | default("50000") }} hostPort: {{ cluster_cfg["copilot-chat"]["agent-port"] | default("50000") }} From 8eefc92cf64330097e6168b112a34819434e58d0 Mon Sep 17 00:00:00 2001 From: zhogu <57975490+zhogu@users.noreply.github.com> Date: Thu, 25 Sep 2025 18:29:34 +0800 Subject: [PATCH 05/12] CICD: Add github workflow to build and deploy changed services (#75) * add cicd yaml * add deploy all and skip alert-manager * update * fix * update * update * update * fix * update * update * test * update * update * update * update * update * update * update * update * update * update * update * update * update * update * 
update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * remove deploy all --- .github/workflows/build-deploy-changes.yaml | 172 ++++++++++++++++++++ build/pai_build.py | 36 ++++ 2 files changed, 208 insertions(+) create mode 100644 .github/workflows/build-deploy-changes.yaml diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml new file mode 100644 index 00000000..ef0ac13a --- /dev/null +++ b/.github/workflows/build-deploy-changes.yaml @@ -0,0 +1,172 @@ +name: Build & Deploy Changed Services + +permissions: + packages: write + contents: read + +on: + push: + branches: [main, dev, 'release/*'] + pull_request: + branches: [main, dev, 'release/*'] + +env: + TAG: ${{ github.run_number }} + +jobs: + build: + name: Build and Deploy + runs-on: [self-hosted, paicicd] + timeout-minutes: 120 + environment: auto-test + container: + image: ubuntu:latest + volumes: + - /var/run/docker.sock:/var/run/docker.sock + steps: + - name: Install git + run: | + DEBIAN_FRONTEND=noninteractive apt update + DEBIAN_FRONTEND=noninteractive apt install -y git + + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: false + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.ref_name }} + + - name: Get Changed Folders (Services) + id: changes + run: | + git config --global --add safe.directory "$GITHUB_WORKSPACE" + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "Pull request detected" + # Fetch the merge base to get only PR changes + git fetch origin ${{ github.event.pull_request.base.ref }} --depth=50 + base_sha=$(git merge-base origin/${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.sha }}) + head_sha="${{ 
github.event.pull_request.head.sha }}" + else + base_sha="${{ github.event.before }}" + head_sha="${{ github.sha }}" + fi + + echo "Comparing $base_sha...$head_sha" + changed_files=$(git diff --name-only "$base_sha" "$head_sha") + echo "Changed files: $changed_files" + + # extract service folders under src/, skip alert-manager + folders=$(echo "$changed_files" | grep '^src/' \ + | grep -v 'alert-manager' \ + | awk -F'/' '{print $2}' \ + | sort -u | tr '\n' ' ') + echo "Changed folders: $folders" + + # export as output for next steps + echo "folders=$folders" >> $GITHUB_OUTPUT + + - name: Check if folders are empty + id: check + run: | + if [ -z "${{ steps.changes.outputs.folders }}" ]; then + echo "has_changed=false" >> $GITHUB_OUTPUT + else + echo "has_changed=true" >> $GITHUB_OUTPUT + fi + + - name: Install Package + if: steps.check.outputs.has_changed == 'true' + run: | + DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip docker-cli ca-certificates curl apt-transport-https lsb-release gnupg parallel + curl -sL https://aka.ms/InstallAzureCLIDeb | bash + + - name: Install python libs + if: steps.check.outputs.has_changed == 'true' + run: python -m pip install --break-system-packages pyyaml jinja2 paramiko etcd3 protobuf==3.20.3 kubernetes gitpython + + - name: Decode and unzip config file + if: steps.check.outputs.has_changed == 'true' + run: | + echo "${{ secrets.CONFIG_FILE_B64 }}" | base64 -d > config.zip + mkdir -p $GITHUB_WORKSPACE/config + unzip -o config.zip -d $GITHUB_WORKSPACE/config + ls -l $GITHUB_WORKSPACE/config + + - name: Arrange Config Files + if: steps.check.outputs.has_changed == 'true' + run: | + rm -rf /tmp/auth-configuration + mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/ + ls -l /tmp/auth-configuration + + - name: Build Images of Changed Services + if: steps.check.outputs.has_changed == 'true' + run: | + changed_services="${{ steps.changes.outputs.folders }}" + echo "Building: $changed_services" + 
$GITHUB_WORKSPACE/build/pai_build.py build \ + -c $GITHUB_WORKSPACE/config/cluster-configuration \ + -s $changed_services + + - name: Push Images of Changed Services to ACR + if: steps.check.outputs.has_changed == 'true' + run: | + changed_services="${{ steps.changes.outputs.folders }}" + echo "Pushing: $changed_services" + $GITHUB_WORKSPACE/build/pai_build.py push \ + -c $GITHUB_WORKSPACE/config/cluster-configuration \ + -s $changed_services + + - name: Push Images of Changed Service to GHCR + if: steps.check.outputs.has_changed == 'true' + run: | + changed_services="${{ steps.changes.outputs.folders }}" + echo "Pushing: $changed_services" + $GITHUB_WORKSPACE/build/pai_build.py push \ + -c $GITHUB_WORKSPACE/config/cluster-configuration \ + -s $changed_services \ + --docker-registry ghcr.io \ + --docker-namespace ${GITHUB_REPOSITORY_OWNER} \ + --docker-username ${{ github.actor }} \ + --docker-password ${{ secrets.GITHUB_TOKEN }} + + - name: Azure CLI get credentials and deploy + if: steps.check.outputs.has_changed == 'true' + run: | + az version + az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }} + az aks install-cli + az aks get-credentials \ + --resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \ + --name ${{ secrets.KUBERNETES_CLUSTER }} \ + --overwrite-existing + kubelogin convert-kubeconfig -l azurecli + kubectl config use-context ${{ secrets.KUBERNETES_CLUSTER }} + echo "${{ secrets.PAI_CLUSTER_NAME }}" > cluster_id + echo "Stopping changed pai services \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..." + $GITHUB_WORKSPACE/paictl.py service stop -n ${{ steps.changes.outputs.folders }} < cluster_id + echo "Pushing config to cluster \"${{ secrets.PAI_CLUSTER_NAME }}\" ..." + $GITHUB_WORKSPACE/paictl.py config push -m service -p $GITHUB_WORKSPACE/config/cluster-configuration < cluster_id + echo "Starting to update \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..." 
+ $GITHUB_WORKSPACE/paictl.py service start -n ${{ steps.changes.outputs.folders }} < cluster_id + kubectl get pod + kubectl get service + + test: + name: Test rest-server + needs: build + runs-on: [self-hosted, paicicd] + environment: auto-test + steps: + - name: Test rest-server + run: | + echo "Testing rest-server ${{ secrets.PAI_WEB_URL }}/rest-server/api/v2/info" + curl ${{ secrets.PAI_WEB_URL }}/rest-server/api/v2/info + echo "Checking virtual cluster status..." + vc_info=$(curl -H "Authorization: Bearer ${{ secrets.PAI_WEB_TOKEN }}" -s ${{ secrets.PAI_WEB_URL }}/rest-server/api/v2/virtual-clusters) + if [ $? -ne 0 ]; then + echo "Failed to access virtual cluster API" + exit 1 + fi + echo "Virtual cluster info: $vc_info" + diff --git a/build/pai_build.py b/build/pai_build.py index 3cc6fed6..033f5d17 100755 --- a/build/pai_build.py +++ b/build/pai_build.py @@ -120,10 +120,46 @@ def main(): nargs='+', help="The service list that contains corresponding images you want to push" ) + push_parser.add_argument( + '--docker-registry', + type=str, + help="The docker registry you want to push to, which will override the config file" + ) + push_parser.add_argument( + "--docker-namespace", + type=str, + help="The docker namespace you want to push to, which will override the config file if '--docker-registry' is also set" + ) + push_parser.add_argument( + '--docker-username', + type=str, + help="The docker username you want to use for authentication, which will override the config file if '--docker-registry' is also set" + ) + push_parser.add_argument( + '--docker-password', + type=str, + help="The docker password you want to use for authentication, which will override the config file if '--docker-registry' is also set" + ) + push_parser.add_argument( + "--docker-tag", + type=str, + help="The docker tag you want to push to, which will override the config file if '--docker-registry' is also set" + ) push_parser.set_defaults(func=push_image) args = parser.parse_args() 
config_model = load_build_config(args.config) + if hasattr(args, 'docker_registry') and args.docker_registry is not None: + config_model['dockerRegistryInfo']['dockerRegistryDomain'] = args.docker_registry + if args.docker_namespace is not None: + config_model['dockerRegistryInfo']['dockerNameSpace'] = args.docker_namespace + if args.docker_username is not None: + config_model['dockerRegistryInfo']['dockerUserName'] = args.docker_username + if args.docker_password is not None: + config_model['dockerRegistryInfo']['dockerPassword'] = args.docker_password + if args.docker_tag is not None: + config_model['dockerRegistryInfo']['dockerTag'] = args.docker_tag + args.func(args, config_model) endtime = datetime.datetime.now() From eadcd9efbe7481edbbc5c9f8d591f7594d0db526 Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Fri, 26 Sep 2025 11:44:35 +0800 Subject: [PATCH 06/12] Data cleanup: Copilot, remove unused prompt (#88) * update: copilot readme for clarity * cleanup: unnecessary data * fix typo: prompt to summarize ltp_human_intervention questions * remove: unused knowledge from prompt * fix: typo * Update src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_rejection.txt Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/copilot-chat/config/copilot-chat.md | 4 +- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 2 +- .../ltp/gen_result_summary_dashboard.txt | 2 +- .../prompts/ltp/gen_result_summary_doc.txt | 2 +- .../prompts/ltp/gen_result_summary_human.txt | 4 +- .../ltp/gen_result_summary_metadata.txt | 8 +- .../ltp/gen_result_summary_metrics.txt | 2 +- .../ltp/gen_result_summary_rejection.txt | 4 +- .../prompts/ltp/ltp_documentation.txt | 23 + .../ltp/ltp_documentation_20250624.txt | 764 ------------------ 10 files changed, 35 insertions(+), 780 deletions(-) create mode 100644 
src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation.txt delete mode 100644 src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation_20250624.txt diff --git a/src/copilot-chat/config/copilot-chat.md b/src/copilot-chat/config/copilot-chat.md index f74e35e5..3e961467 100644 --- a/src/copilot-chat/config/copilot-chat.md +++ b/src/copilot-chat/config/copilot-chat.md @@ -80,10 +80,10 @@ These metrics provide a comprehensive view of both correctness and efficiency ac | Category | Count | Support Level | Classification Accuracy | Query Generation Accuracy | Answer Generation Accuracy | Mean Duration (s) | Max Duration (s) | Words per Second | |----------------------|-------|------------------|-------------------------|---------------------------|----------------------------|-------------------|------------------|------------------| | Human Intervention | 17 | Full | 0.76 | NA | TBD | 13.83 | 17.74 | 8.33 | -| User Manual | 28 | Limited (link unavailable) | 0.96 | NA | TBD | 13.72 | 23.73 | 21.00 | +| User Manual | 28 | Limited (empty content) | 0.96 | NA | TBD | 13.72 | 23.73 | 21.00 | | Auto Rejection | 3 | Full | 1 | NA | TBD | 8.84 | 10.70 | 13.15 | | Cluster Job Metrics | 19 | Not supported | (0.89) | TBD | TBD | (16.34) | (26.95) | (12.42) | -| Job Metadata | 1 | Full | 1 | TBD | TBD | 19.91 | 19.91 | 13.25 | +| Job Metadata | 1 | Not supported | 1 | TBD | TBD | 19.91 | 19.91 | 13.25 | | Dashboard | 36 | Full | 1 | 0.84 | TBD | 16.9 | 30.2 | 17.6 | ### Questions to Tryout diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index 8b74ea8d..0805d731 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -128,7 +128,7 @@ def query_metadata(question: str, help_msg, skip_summary: bool = False): def query_user_manual(question: str, help_msg): """Query user manual.""" # read documentation - documentation = 
get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'ltp_documentation_20250624.txt')) + documentation = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'ltp_documentation.txt')) ltp_doc = {'lucia training platform documentation': documentation} # generate answer diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_dashboard.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_dashboard.txt index 68b3e00a..0ff869e9 100644 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_dashboard.txt +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_dashboard.txt @@ -1,5 +1,5 @@ Your task is to generate a markdown format answer based on the result from a database, to answer user's question. -Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question orinted, and the reader of the answer is leadership or executive. +Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question oriented, and the reader of the answer is leadership or executive. Output of the answer: Generate an answer to user's question. diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_doc.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_doc.txt index af230935..2324c4b2 100644 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_doc.txt +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_doc.txt @@ -1,5 +1,5 @@ Your task is to generate a markdown format answer based on the result from a database, to answer user's question. -Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question orinted, and the reader of the answer is leadership or executive. +Output the answer using markdown format. Keep the generated answer focused and concise. 
the answer should be question oriented, and the reader of the answer is leadership or executive. Output of the answer: Generate an answer to user's question. diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_human.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_human.txt index 58b74403..81ac3b85 100644 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_human.txt +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_human.txt @@ -1,9 +1,9 @@ Your task is to generate a markdown format answer based on the result from a database, to answer user's question. -Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question orinted, and the reader of the answer is leadership or executive. +Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question oriented, and the reader of the answer is leadership or executive. The expression of the answer should be like a conversation, no need to use title expressions like: 'Response to User's Question'. Output of the answer: Explain that you understand user's question and this request needs a Lucia Training Platform admin team member's attention. -Explain why user's specific request requires human intevention, this should be question orinted, not general. +Explain why user's specific request requires human intervention, this should be question oriented, not general. 
Direct user to Lucia Training Platform admin team member's attention with this method: please reach out to the support team via the [Lucia Training Platform Team Group - User Feedback Channel](https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fteams.microsoft.com%2Fl%2Fchannel%2F19%253AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%2540thread.tacv2%2FUser%2520Feedback%3FgroupId%3D656a4831-e31d-41fd-9ce0-6384a5156c74&data=05%7C02%7Cltpadmin%40microsoft.com%7C61fb499588384c27482b08ddc9c0ce53%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638888552112734295%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=Bw540k7Du1e1a2E6JbOGfguVCaTmhPgNJys7FEkImJo%3D&reserved=0). If you are not a member of this channel, please refer to [Platform Issue Handling](https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Feng.ms%2Fdocs%2Fcloud-ai-platform%2Fazure-core%2Fazure-specialized%2Fhpcai%2Fazure-hpc%2Flucia-platform-team-documentation%2Fluciatrainingplatform%2Fusermanual%2Ftroubleshooting&data=05%7C02%7Cltpadmin%40microsoft.com%7C61fb499588384c27482b08ddc9c0ce53%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638888552112819092%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=wdTLVCG2bQoiGqU8UauXHUIxT4NkehRHVlxya%2Fl5oSQ%3D&reserved=0) for how to join. diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metadata.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metadata.txt index 23e0e5ae..1b736a29 100644 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metadata.txt +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metadata.txt @@ -1,5 +1,5 @@ Your task is to generate a markdown format answer based on the result from a database, to answer user's question. -Output the answer using markdown format. 
Keep the generated answer focused and concise. the answer should be question orinted, and the reader of the answer is leadership or executive. +Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question oriented, and the reader of the answer is leadership or executive. Output of the answer: Generate an answer to user's question. @@ -8,10 +8,6 @@ If necessary, generate a suggestion, this is optional, only do this if you think Decide which data is necessary to support the summary, analysis and suggestion, reorganize only the necessary data into several markdown tables and present them in markdown format. Provide a reference about how to query more details about the metadata -knowledge #1: -method for user to manually query the job metadata: -query = f"restserver/jobs?offset=0&limit=49999&withTotalCount=true&order=completionTime" -api = f"api/v1/openpai/cluster/we/{query}" -knowledge #2: +knowledge #1: the jobs are indexed using this names: username~jobname, if the result only covers the first 1000 jobs, you still need to perform some analysis for example which user submitted the most jobs. diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metrics.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metrics.txt index af230935..2324c4b2 100644 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metrics.txt +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_metrics.txt @@ -1,5 +1,5 @@ Your task is to generate a markdown format answer based on the result from a database, to answer user's question. -Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question orinted, and the reader of the answer is leadership or executive. +Output the answer using markdown format. Keep the generated answer focused and concise. 
the answer should be question oriented, and the reader of the answer is leadership or executive. Output of the answer: Generate an answer to user's question. diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_rejection.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_rejection.txt index b54b9191..176e88f7 100644 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_rejection.txt +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/gen_result_summary_rejection.txt @@ -1,5 +1,5 @@ Your task is to generate a markdown format answer based on the result from a database, to answer user's question. -Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question orinted, and the reader of the answer is leadership or executive. +Output the answer using markdown format. Keep the generated answer focused and concise. the answer should be question oriented, and the reader of the answer is leadership or executive. The expression of the answer should be like a conversation, no need to use title expressions like: 'Response to User's Question'. @@ -7,4 +7,4 @@ Output of the answer: Explain that you understand user's question. Explain that what user request is not supported by Lucia Training Platform by design. The design decision is a result after carefully balancing for optimal performance and user experience. -If necessary, mention that the development team do accept user's new feature proposals, to contact the development team, user can send an eamil to ltpadmin@microsoft.com with a title [New Feature Request]. \ No newline at end of file +If necessary, mention that the development team does accept user's new feature proposals, to contact the development team, user can send an email to ltpadmin@microsoft.com with a title [New Feature Request]. 
\ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation.txt new file mode 100644 index 00000000..4c6bd263 --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation.txt @@ -0,0 +1,23 @@ +# Lucia Training Platform User Tutorial + +Welcome to the Lucia Training Platform Documentation! This documentation provides an overview and tutorial of the Lucia Training Platform. + +## Introduction + +The Lucia Training Platform is a multi-tenant AI platform designed to efficiently train deep learning models. It is designed to efficiently address reliability issues, optimize resource management, and incorporate state-of-the-art software optimizations to achieve high training efficiency. + + +## Table of Contents + +### User Manual + +- [Quick Start](./UserManual/quickstart.md): A guide to help you quickly get started with the platform and submit a "hello world" job. +- [How to Use Docker Image](./UserManual/docker-images.md): Instructions on using Docker images in your jobs. +- [How to Manage Data and Code](./UserManual/use-data.md): Guidance on managing data and code within your jobs. +- [How to Write Job Configuration](./UserManual/job-config.md): Detailed instructions for configuring jobs and distributed jobs. +- [Job Priorities](./UserManual/job-priorities.md): Explanation of job priority types and how to submit jobs with specific priorities. +- [VC Allocation](./UserManual/vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. +- [Notification and Monitoring](./UserManual/notification.md): Information on the platform's notification and monitoring features. +- [Troubleshooting](./UserManual/troubleshooting.md): A troubleshooting guide for common issues and how to get platform support. 
+- [Email Templates, UserGroup Admin](./UserManual/email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. +- [Email Templates, Lucia Training Platform Admin](./UserManual/email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify about VC allocation and assignment changes. diff --git a/src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation_20250624.txt b/src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation_20250624.txt deleted file mode 100644 index 02430f20..00000000 --- a/src/copilot-chat/src/copilot_agent/prompts/ltp/ltp_documentation_20250624.txt +++ /dev/null @@ -1,764 +0,0 @@ -# Lucia Training Platform User Tutorial - -Welcome to the Lucia Training Platform Documentation! This documentation provides an overview and tutorial of the Lucia Training Platform. - -## Introduction - -The Lucia Training Platform is a multi-tenant AI platform designed to efficiently train deep learning models. It is designed to efficiently address reliability issues, optimize resource management, and incorporate state-of-the-art software optimizations to achieve high training efficiency. - - -## Table of Contents - -### User Manual - -- [Quick Start](./UserManual/quickstart.md): A guide to help you quickly get started with the platform and submit a "hello world" job. -- [How to Use Docker Image](./UserManual/docker-images.md): Instructions on using Docker images in your jobs. -- [How to Manage Data and Code](./UserManual/use-data.md): Guidance on managing data and code within your jobs. -- [How to Write Job Configuration](./UserManual/job-config.md): Detailed instructions for configuring jobs and distributed jobs. -- [Job Priorities](./UserManual/job-priorities.md): Explanation of job priority types and how to submit jobs with specific priorities. 
-- [VC Allocation](./UserManual/vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. -- [Notification and Monitoring](./UserManual/notification.md): Information on the platform's notification and monitoring features. -- [Troubleshooting](./UserManual/troubleshooting.md): A troubleshooting guide for common issues and how to get platform support. -- [Email Templates, UserGroup Admin](./UserManual/email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. -- [Email Templates, Lucia Training Platform Admin](./UserManual/email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify about VC allocation and assignment changes. - -# How to Manage Data and Code - -## Use Code - -To access user code, first upload the code to a private Git repository, then use the Git token to clone the repository. The token can be defined in the `secret` section in [How to Use Advanced Job Settings](./job-config.md). - -## Use Self-Manage Data - -### Data from Public Website - -To download data from public websites, use the `wget` command within your job configuration. - -### Data from Azure Blob Storage - -To access data from Azure Blob Storage, you can either: - -- Install `blobfuse` to mount the storage. -- Use `azcopy` to copy the data onto the job storage, with the SAS token, which can be defined in the `secret` section in [How to Use Advanced Job Settings](./job-config.md#parameters-and-secrets). - -## Platform-Assisted Data Management - -### Onboard Private Azure Blob Storage - -To onboard your private storage, follow these steps: - -- Assign the Storage Blob Data Contributor role to the platform's managed identities in your storage account. 
Refer to the detailed instructions in [Grant access to the storage account](https://learn.microsoft.com/en-us/entra/identity-platform/multi-service-web-app-access-storage?tabs=azure-portal%2Cprogramming-language-csharp#grant-access-to-the-storage-account). Make sure you assign the proper role to all below listed platform's managed identities: - - principal ID: `9a37a8cf-a478-4b1d-b8f9-eadbc248bec6` - - principal ID: `d909f402-f702-42cb-8887-2c0f85e4f17d` - - principal ID: `7405b64d-a4a7-4449-a2f6-8c6e24efbf7b` -- Request your UserGroup Admin to contact the Lucia Training Platform Admin to integrate your storage into your UserGroup. - - Please use the [Request for Integrating Private Azure Storage Blob](email-templates/email-templates-user.md#request-for-integrating-private-azure-storage-blob) email template. - - Please include both contacts in the "To" field: - - Lucia Training Platform Admin Group ([ltp-admin-alert@microsoft.com](mailto:ltp-admin-alert@microsoft.com)) - - Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**Note: Private data storage is accessible to all users within the same UserGroup. Other UserGroups will not have access.** - -### Use Private Storage in Job - -To use private storage in your jobs, specify the storage names in the `storageConfigNames` section of the `extras` part in your job configuration file: - -```yaml -extras: - storageConfigNames: - - blob-- -``` - -The corresponding storage will be automatically mounted to the `/mnt/blob--` folder after the job is launched. - -# Notification and Monitoring - -## Job Status Notification - -Notifications will be sent to the user's email address when the job starts, finishes, or fails. 
- -## Job Behavior Monitoring and Notification - -The platform will monitor the job behavior, take resource management actions, and send a notification email to the user when the job behavior is abnormal, such as: -- The job runs for a long time without active usage -- The job consumes excessive resources unnecessarily -- The job encounters an unexpected failure, etc. - -Notifications will be sent to the job owner's email address, detailing the action taken by the platform and the reason for it. - -**Note: This is an experimental feature. Please submit any question or concern regarding to this feature to [Lucia Training Platform Team Group - User Feedback Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join.** - -# How to Use Advanced Job Settings - -This section covers the following topics: - -- Parameters and Secrets -- Distributed Settings and Examples -- Job Exit Specifications, Retry Policies, and Completion Policies - -## Parameters and Secrets - -### Parameters -It is common to train models with different parameters. We support parameter definition and reference, which provide a flexible way of training and comparing models. You can define your parameters in the Parameters section and reference them by using `<% $parameters.paramKey %>` in commands. - -![Sample Image](./pictures/use-parameters.png) - -### Secrets - -In some cases, it may be necessary to define secret information such as passwords, tokens, etc. You can use the Secrets section for the definition and reference these secrets in commands by `<% $secrets.secretKey %>`. 
The usage is the same as parameters except that secrets will not be displayed or recorded. - -## Distributed Settings - -### How to Enable InfiniBand Between Nodes - -To enable the InfiniBand between nodes, add `infiniband: true` under extraContainerOptions in the config. -``` -... -taskRoles: - $taskRole: - extraContainerOptions: - infiniband: true - ... -... -``` - -### How to Ensure Different Nodes Can Be SSH Connected and Time Sync - -#### Job SSH -To ensure SSH connectivity across all nodes in the job, add the following SSH plugin to the configuration file: -``` -... -extras: - com.microsoft.pai.runtimeplugin: - - plugin: ssh - parameters: - jobssh: true - sshbarrier: true -``` - -#### SSH Barrier -If you want to synchronize all nodes during job running, such as initializing the distributed torch launch after all nodes finish downloading data, add the following command inside the `commands` section of the task role with a timeout in seconds. -``` -taskRoles: - taskrole: - ... - commands: - - echo 'Downloading Data' - - bash /usr/local/pai/runtime.d/barrier --timeout=500 - - echo 'Start Training' - - ... -``` - -## Environment Variables for Distributed Jobs - -The Lucia Training Platform predefines several environment variables to mange distributed jobs, such as how to connect to the master node, the number of worker nodes, and the worker index. Use these variables in distributed job configuration. 
- -| **Category** | **Environment Variable Name** | **Description** | -|-----------------------|-----------------------------------------------------------------|---------------------------------------------------------------------------------| -| **Task role level** | `PAI_TASK_ROLE_COUNT` | Total number of different task roles in the config file | -| | `PAI_TASK_ROLE_LIST` | Comma-separated list of all task role names in the config file | -| | `PAI_TASK_ROLE_TASK_COUNT_$taskRole` | Task(node) count of the specific task role | -| | `PAI_HOST_IP_$taskRole_$taskIndex` | Host IP for task `taskIndex` in `taskRole` | -| | `PAI_PORT_LIST_$taskRole_$taskIndex_$portType` | `portType` port list for task `taskIndex` in `taskRole` | -| | `PAI_RESOURCE_$taskRole` | Resource requirement for the task role in `"gpuNumber,cpuNumber,memMB,shmMB"` format | -| **Current task role** | `PAI_CURRENT_TASK_ROLE_NAME` | `taskRole.name` of the current task role | -| **Current task** | `PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX` | Index of the current task in current task role (starting from 0) | - -- `$taskRole`: the name of taskrole which defined as key in the taskRoles. Please change it to the real name you defined in the config to use the env. - -- `$taskIndex`: the num index of the node in the current `$taskRole`. From 0 to `$PAI_TASK_ROLE_TASK_COUNT_$taskRole` - 1. Please change it to the real index number to use the env. - -### How to Set Node Name - -If you need a node name, such as for mpirun, you can use `$taskrole-$taskindex` as the node name. For example, - -``` -mpirun -np 32 -H taskrole-0:8,taskrole-1:8,taskrole-2:8,taskrole-3:8 --allow-run-as-root ... -``` - -### Distributed Job Examples - -Learn how to set up and run distributed jobs across multiple nodes. This section includes examples: -1. 
[Distributed train with PyTorch](https://microsoftapc.sharepoint.com/:u:/t/LuciaTrainingPlatform/Ef85jIBRRFVMrLqfnBc0IbwBqrLPfs0ffbOdHye_lhLNiA?e=GyTl5I): This example demonstrates how to run a distributed training job with PyTorch with the torchrun and environment variables. -2. [Distributed nccl test with mpirun](https://microsoftapc.sharepoint.com/:u:/t/LuciaTrainingPlatform/EaQWdk0-taREnBfw5pqVxVwByQtmVco8Cf5pjN_c0Uh64Q?e=LDkHJo): This example demonstrates how to run a distributed NCCL test with MPI. - -## Job Exit Specifications - -The Job Exit Specifications define the conditions under which a job exits, primarily governed by the **Completion Policy** and **Retry Policy**. These policies are managed by two key settings: `jobRetryCount` and the `completion` as shown in the sample configuration: - -``` -jobRetryCount: 0 -taskRoles: - taskrole: - completion: - minFailedInstances: 1 - minSucceededInstances: -1 -``` - -A job consists of multiple tasks, where each task represents a single instance within a task role. Understanding these settings is essential for managing job execution behavior. - -### Completion Policy - -The *completion policy* defines the conditions under which a job is considered completed. It includes two key parameters: `minFailedInstances` and `minSucceededInstances`. - - - `minFailedInstances`: this parameter specifies the number of failed tasks required to mark the entire job as failed. - - Valid values: -1 or any value greater than or equal to 1. - - If set to -1, the job will always succeed regardless of any task failures. - - Default value: 1, meaning that a single failed task will cause the entire job to fail. - - - `minSucceededInstances`: this parameter specifies the number of successfully completed tasks required to mark the entire job as successful. - - Valid values: -1 or any value greater than or equal to 1. 
- - If set to -1, the job will succeed only when all tasks are completed **with exit code 0**, the `minFailedInstances` condition is not triggered. - - Any value ≥ 1: The job succeeds if the specified number of tasks succeed, **and the other tasks will be stopped**. - - Default value: -1. - -### Retry Policy - -The *retry policy* governs whether a job should be retried. A job will be retried if the following conditions are met: - -- The job does not succeed after satisfying the *completion policy*. -- The job fails due to an unknown error. -- The `jobRetryCount` is greater than 0. - -To increase the number of retries, set the `jobRetryCount` to a higher value. - -# User Manual - -- [Quick Start](./quickstart.md): A guide to help you quickly get started with the platform and submit a "Hello World" job. -- [How to Use Docker Image](./docker-images.md): Instructions for using Docker images in your jobs. -- [How to Manage Data and Code](./use-data.md): Guidance on managing data and code within your jobs. -- [How to Write Job Config](./job-config.md): Detailed instructions for configuring jobs and distributed settings. -- [Job Priorities](./job-priorities.md): An explanation of job priority types and how to submit jobs with specific priorities. -- [VC Allocation](./vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. -- [Notification and Monitoring](./notification.md): Information on the platform's notification and monitoring features. -- [Email Templates, UserGroup Admin](./email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. -- [Email Templates, Lucia Training Platform Admin](./email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify users about VC allocation and assignment changes. 
- -# How to Use Docker Images - -## Access to Docker Images - -### Public Docker Images - -Public images on Docker Hub can be pulled directly after the job is submitted. - -**Note:** The Docker Hub registry has rate limitations and may cause compliance issues. We highly recommend pulling images from the Azure Container Registry (ACR). In the future, we may disable pulling images directly from Docker Hub. - -### Private Docker Images - -To use private Docker images, you need to assign the `acrPull` permission to the platform's managed identities in your Azure Container Registry (ACR). Make sure you assign the proper role to all below listed platform's managed identities: -- principal ID: `b956019c-3be9-47e5-ae4d-18bc2c599a0b` -- principal ID: `0604b3f1-d040-4545-a9e0-2f4b4f8cabd3` -- principal ID: `3b8d9c3b-6820-45a5-a346-94d13210a9ba` - -Refer to the detailed instructions in [Grant the identity permissions to access other Azure resources](https://learn.microsoft.com/en-us/azure/container-registry/container-registry-tasks-authentication-managed-identity#3-grant-the-identity-permissions-to-access-other-azure-resources). - -Once the permission is granted, the private image from your ACR can be pulled automatically after the job is submitted. - -## Set Docker Image in Job Configuration - -To use a Docker image in a job, first define the Docker image in the `prerequisites` section of the job configuration file: - -```yaml -prerequisites: - - type: dockerimage - uri: 'nvcr.io/nvidia/pytorch:24.03-py3' # image URL - name: docker_image0 -``` - -Then set the `dockerImage` in the `taskRoles` section of the job configuration file: - -```yaml -taskRoles: - taskrole: - dockerImage: docker_image0 -``` - -# Job Priorities - -Lucia Training Platform supports three types of job priorities: - -- **prod**: Production Priority Jobs. - - Users can submit a production priority job only to the VC (Virtual Cluster) assigned to the UserGroup they belong to. 
- - Upon receiving a *prod* job, Lucia Training Platform may terminate other running non-production priority jobs if available resources are insufficient to fulfill the requirement of the received production priority job. -- **default**: Default Priority Jobs. - - Users can submit a default priority job only to the VC assigned to the UserGroup they belong to. - - All default priority jobs enter a queue and are served on a First-Come-First-Serve basis. - - A running default job can only be terminated by a production priority job. -- **oppo**: Opportunity Priority Jobs. - - Users can submit an opportunity priority job only to the VC assigned to the UserGroup they belong to. - - If resources within the requested VC are insufficient, the submitted opportunity job may be allocated to any idle resource with the same SKU across the entire cluster, including other VCs. This assignment is not transparent to the user, meaning they will not be informed about the specific origin of the allocated resources. - - A running opportunity priority job may be terminated by either a default priority job or a production priority job due to resource utilization status. - -# Submitting a Job with Specific Priority - -To submit a job with a production or opportunity priority, add `jobPriorityClass: $PriorityValue` to the job configuration file. Accepted values are *prod*, *oppo*. Below is an example you can reference. - -```ymal -extras: - hivedScheduler: - jobPriorityClass: $PriorityValue -``` - -To submis a job with a default priority, simply remove the `jobPriorityClass` from the job configuration file. - -## Submitting a Production Priority Job - -By default, users are allowed to submit jobs only with *default* and *oppo* priorities. Submission of jobs with *prod* priority is restricted. - -To submit a job with *prod* priority, users must contact their UserGroup Admin. The UserGroup Admin must request permission from the Lucia Training Platform Admin. 
When sending the email, please adhere to the following practices: -- Please use the [Request for Production Priority Job Submission Approval](email-templates/email-templates-user.md#request-for-production-priority-job-submission-approval) email template. -- Please include both contacts in the "To" field: - - Lucia Training Platform Admin Group ([ltp-admin-alert@microsoft.com](mailto:ltp-admin-alert@microsoft.com)) - - Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) -- The request must specify the duration for which the user needs access to submit the *prod* job, and the access will automatically expire after the specified duration. - -The Lucia Training Platform Admin will provide a decision (approval or rejection) regarding the request. If approved, the UserGroup Admin is responsible for disseminating the decision to the relevant users. Users in the specified UserGroup will be permitted to submit production priority jobs after receiving the approval notification before expiration. - -## Recommendations for Job Priority Selection - -For an optimal job submission experience, consider the following recommendations: - -### For Preemptible or Short-Running Debugging Jobs: -- **Submit with Opportunity Priority to VC Assigned to You:** This setting will automatically utilize available resources from other VCs and will retry automatically if preempted. - -### For Critical Long-Running Jobs: -- **Submit with Default Priority:** If resources are sufficient, submit your job with the default priority to ensure uninterrupted execution. -- **Contact UserGroup Admin:** If resources are insufficient, reach out to your UserGroup Admin related to the target VC. They can assist in scheduling other jobs with *oppo* priority or help you apply *prod* priority to ensure your job is not interrupted or preempted. 
-# VC Allocation - -This section provides guidelines for UserGroup Admins to manage VC (Virtual Cluster) allocation requests and VC assignment change notifications. - -## VC Allocation Requests - -UserGroup Admins can perform specific management requests related to the VC. The accepted types of VC allocation requests are `Access VC`, `Depart VC`, `Create VC`, and `Delete VC`. To proceed, please follow the steps below: - -- The Admin should create a Security Group with email enabled on [idweb](https://idweb.microsoft.com/IdentityManagement/aspx/groups/MyGroups.aspx), and mention the name of the Security Group in the *UserGroup* field while making a VC Allocation request. The Lucia Training Platform manages access based on UserGroups rather than individual user accounts. -- Send an request email to Lucia Training Platform Admin. - - Please using the [Request for VC Allocation](email-templates/email-templates-user.md#request-for-vc-allocation) email template. - - Please include both contacts in the "To" field: - - Lucia Training Platform Admin Group ([ltp-admin-alert@microsoft.com](mailto:ltp-admin-alert@microsoft.com)) - - Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) -- Allow up to 24 hours for Lucia Training Platform Admin to make a decision and provide an estimated time to complete your request. -- Once a decision is made, an acknowledgment email will be sent to the UserGroup Admin who requested the allocation, sharing the decision and estimated time of completion. -- A completion email will be sent back to the UserGroup Admin when the request is completed. -- The UserGroup Admin is required to broadcast the request completion result to all affected users. - -## VC Assignment Change Notifications - -All existing users associated with the affected VC will receive a notification email from [Lucia Training Platform Admin](mailto:ltpadmin@microsoft.com) whenever a `VC Assignment` or a `VC Re-assignment` event is triggered. 
- -The notification emails will specify the exact date, time, and timezone when the change will take effect. Additionally, they will list the VC names related to these changes and the UserGroup names that will be affected. - -If any user has questions or issues, please do not reply to the email, as the mailbox is not monitored. Instead, we encourage all users to raise their concerns or questions to the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. -# Quick Start - -## Submit a Hello World Job - -A job in the Lucia Training Platform defines how to execute code(s) and command(s) in specified environment(s). It can be run on a single node or in a distributed manner. - -**Step 0**. To access the Lucia Training Platform, your Microsoft account must be added to a UserGroup associated with the VC (Virtual Cluster) based on your requirements. If you know the specific UserGroup, contact the UserGroup Admin directly. If you are unsure, consult with your manager or program manager. They will guide you to the correct UserGroup Admin responsible for specific VC. - -Proceed with the following steps only after you have been successfully added to a Lucia Training Platform UserGroup. - -**Step 1**. Login to Lucia Training Platform (you can get the link from your UserGroup Admin). - -**Step 2**. Click `Submit Job` on the left pane, then click `Single Job` to reach the Job submission page. - -**Step 3**. 
Click the `Edit YAML` button at the bottom-right of the page, and paste the contents of [hello-world-job.yaml](https://microsoftapc.sharepoint.com/:u:/t/LuciaTrainingPlatform/ETgawzdHDz5MhWKbFwEafsABMKYwWBKcO9Gwb9xsGgBrZA?e=XcNctq). - -**Step 4**. Select your VC (Virtual Cluster), and set the job name. - -**Step 5**. Define the `task role name`, `commands` to run, and `resouce SKU` requirements. They will be automatically updated in the config file. -- `task role name`: What is a taskrole? For single server jobs, there should be only one task role. For some distributed jobs, there may be multiple task roles. For example, when TensorFlow is used to run distributed jobs, it has two roles: the parameter server and the worker. -**Note: The task role name will be used in the platform environment variables shown in the right-hand box, which can be referenced inside `commands`.** -- `commands`: the commands will be run on each node parallelly, it can be written like a bash script. -**Note: Please DO NOT use # for comments or \ for line continuation in the command box, as these symbols may break the syntax.** -- `Instances`: Total node count in the current task role. -- `SKU count`: The number of GPUs per node in the current task role. When `Instance` = 1: the SKU count can range from 1 to the maximum number of GPUs in a single node. When `Instance` > 1: it should always be set to the maximum number of GPUs in a single node. - -**Step 6.**. Click the `submit` button to submit the job. - -![Sample Image](./pictures/job-submission.png) - -**Here is a demo video for the job submission: [helloworld.gif](https://microsoftapc.sharepoint.com/:i:/t/LuciaTrainingPlatform/EZPy9TQ_rx9EhqcRW79VuhkBA5T7v8wwSTLXnL4L73vZLw?e=FXMWQ3)** - -![Sample Image](./pictures/demo.gif) - -## Browse Stdout, Stderr, Full logs - -Click the `Stdout` and `Stderr` buttons on the job detail page to view the stdout and stderr logs for a job. 
To view a merged log, click `...` on the right and select `Stdout + Stderr`. - -![Sample Image](./pictures/log.png) - -## Debug and SSH into a Running Job - -By default, SSH access to a running job is not allowed. If you have special requirements, please contact the Lucia Training Platform Admin via [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. - -# Troubleshooting Guide for Lucia Training Platform - -## Platform Issue Handling - -If you have any questions or issues while using the platform, please reach out to the platform support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel]. - -- Join the **Lucia Training Platform User Feedback** team through the code: `zdvi2c9` by Teams App -> Teams -> New Items -> Join team -> Join a team with a code. -- Submit your question or issue in the [**User Feedback** channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). The support team will respond to your inquiry as soon as possible. - - -## Common Issues -1. **Job is always in `Waiting` status** - - Check if the IP address of each task index is present on the job detail page. If yes, then the resource has been allocated; if not, then there are not enough resources to allocate the job, you can then check the available resources on the home page. 
- - If the IP addresses are present, but the job is still in `Waiting` status, please click `Go to Job Event Page` on the job detail page to check the job event log for image pulling errors or other errors. - -2. **Cannot see the latest VC setting on the home page** - - Log out and log in again to refresh the user information. - -3. **Multi-Nodes throughput is very slow** - - Check if the `infiniband` is enabled in the job config. Please refer to the [How to enable Infiniband between nodes](./job-config.md#how-to-enable-infiniband-between-nodes) section for more details. - -# Email Templates, for *UserGroup Admin* - -## Request for VC Allocation - ---- - -**Subject:** Request for VC Allocation - -**From:** UserGroup Admin - -**To:** Lucia Training Platform Admin Group ([ltp-admin-alert@microsoft.com](mailto:ltp-admin-alert@microsoft.com)); Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**Body:** - -Hello Lucia Training Platform Admin, - -I am writing to request approval for **VC allocation**. Please find the details of the request below: - -**Allocation Type:** enum(Access VC, Depart VC, Create VC, Delete VC) - -**UserGroup:** [UserGroup name(s) who requested this allocation] - -**VC:** [VC name(s) targeted by this allocation action] - -**Quota:** [GPU model, number of GPUs requested, e.g. 512 MI300x. Must be filled if the Allocation Type is Create VC; can be left blank for Access VC, Depart VC, Delete VC requests] - -**Justification:** [The UserGroup Admin must provide a justification if the Allocation Type is either Access VC or Create VC] - -**Effective Date and Time:** [Date and time when the requester plans for this allocation to take place. (YYYY-MM-DD, HH:MM:SS, Timezone)] - -Thank you for your attention to this request. 
- -Best regards, - -Your Name - ---- - -## Request for Production Priority Job Submission Approval - ---- - -**Subject:** Request for Production Priority Job Submission Approval - -**From:** UserGroup Admin - -**To:** Lucia Training Platform Admin Group ([ltp-admin-alert@microsoft.com](mailto:ltp-admin-alert@microsoft.com)); Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**Body:** - -Hello Lucia Training Platform Admin, - -I am writing to request your approval for a **production priority job submission**. Below are the details of the request: - -**UserGroup:** [UserGroup name whose member will submit production priority jobs] - -**VC:** [VC name these production priority jobs will be submitted to] - -**Duration:** [Specify the duration needed for the production job submission. Note that access will expire after 1 day by default unless otherwise specified.] - -**Justification:** [Provide a brief description of the usage purpose, urgency, or any other relevant information] - -Thank you for considering this request. - -Best regards, - -Your Name - ---- - -## Request for Integrating Private Azure Storage Blob - ---- - -**Subject:** Request for Integrating Private Azure Storage Blob - -**From:** UserGroup Admin - -**To:** Lucia Training Platform Admin Group ([ltp-admin-alert@microsoft.com](mailto:ltp-admin-alert@microsoft.com)); Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**Body:** - -Hello Lucia Training Platform Admin, - -I am writing to request integrating the below Azure Storage Blob to the requested VC, for data access purpose. - -**Resource Group Name:** [The resource group name where the Blob Storage Account was created from] - -**Blob URL:** [The full URL to the Blob, e,g. https://.blob.core.windows.net//] - -**VC:** [VC name(s) targeted by this allocation action] - -Thank you for your attention to this request. 
- -Best regards, - -Your Name - ---- - -# Email Templates, for *Lucia Training Platform Admin* - -**IMPORTANT**: Always include the Lucia Training Platform Admin email address in the "To" field to ensure all admins are updated so as to prevent conflicting replies. - -## Responses to: Production Job Submission Request - -### Approved - ---- - -**Subject:** Approved - Production Job Submission Request - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup Admin, Lucia Training Platform Admin - -**Body:** - -Hello UserGroup Admin, - -This is to inform you the completion of the Production Job Submission request you made. - -**Decision:** Approved - -**Effective Date and Time:** [Date and time when users can start to submit production priority jobs. (YYYY-MM-DD, HH:MM:SS, Timezone)] - -**Duration:** [Duration before users lose the permission to submit production priority jobs. The default duration is 1 day.] - -If you have any questions or issues while using the Lucia Training Platform, please reach out to the support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. 
- -Best regards, - -Lucia Training Platform Admin - ---- - -### Rejected - ---- - -**Subject:** Rejected - Production Job Submission Request - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup Admin, Lucia Training Platform Admin - -**Body:** - -Hello UserGroup Admin, - -This is to inform you the completion of the Production Job Submission request you made. - -**Decision:** Rejected - -**Details:** [Please clearly state the reason for rejection. Direct them to applicable LTP resources to facilitate their work.] - -If you have any questions or issues while using the Lucia Training Platform, please reach out to the support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. - -Best regards, - -Lucia Training Platform Admin - ---- - -## Responses to: VC Allocation Request - -### Acknowledgment - ---- - -**Subject:** Acknowledgment of VC Allocation Request - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup, UserGroup Admin, Lucia Training Platform Admin - -**Body:** - -Hello UserGroup Admin, - -Your VC allocation request has been received. Below are the details of your request as processed by the Lucia Training Platform Admin: - -**Allocation Type:** enum(Access VC, Depart VC, Create VC, Delete VC) - -**Decision:** enum(Approved, Rejected, Under Review). If the decision is Rejected, please provide a brief explanation to the UserGroup Admin. 
- -**Estimated Time of Completion:** [Expected completion date and time (YYYY-MM-DD, HH:MM:SS, Timezone)] - -We appreciate your patience and understanding. - -Best regards, - -Lucia Training Platform Admin - ---- - -### Completion - ---- - -**Subject:** Completion of VC Allocation Request - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup, UserGroup Admin, Lucia Training Platform Admin - -**Body:** - -Hello UserGroup Admin, - -We are pleased to inform you that your VC allocation request has been successfully completed. - -**Platform URL:** [the exact url] - -**Allocation Type:** enum(Access VC, Depart VC, Create VC, Delete VC) - -**Quota:** [The assigned GPU model and the number of GPUs, e.g., 512 MI300x. This field is only required if the original request type was Create VC. For other VC Allocation request types, leave it blank.] - -**Status:** Completed - -**Effective Date and Time:** [Date and time when the change takes effect. (YYYY-MM-DD, HH:MM:SS, Timezone)] - -If you have any questions or issues while using the Lucia Training Platform, please reach out to the support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. 
- -Best regards, - -Lucia Training Platform Admin - ---- - -## Responses to: Request for Integrating Private Azure Storage Blob - -### Completion - ---- - -**Subject:** Completion of Integrating Private Azure Storage Blob - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup, UserGroup Admin, Lucia Training Platform Admin - -**Body:** - -Hello UserGroup Admin, - -We are pleased to inform you that your Blob integration request has been successfully completed. - -**Resource Group Name:** [The resource group name where the Blob Storage Account was created from] - -**Blob URL:** [The full URL to the Blob, e,g. https://.blob.core.windows.net//] - -**VC:** [VC name(s) targeted by this allocation action] - -Please note that all users associated with the UserGroup related to the above VC can access the content of this Blob. Users not associated with this UserGroup do not have access. - -If you have any questions or issues while using the Lucia Training Platform, please reach out to the support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. 
- -Best regards, - -Lucia Training Platform Admin - ---- - -## Platform Notification - -### VC Assignment Change - ---- - -**Subject:** Notification of VC Assignment Change - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup(s), UserGroup Admin(s), Lucia Training Platform Admin - -**Body:** - -Dear Users, - -This is to notify you of the following upcoming VC assignment change: - -**Effective Date and Time:** [Date and time when this assignment will take effect. (YYYY-MM-DD, HH:MM:SS, Timezone)] - -**Platform URL:** [the exact url] - -**VC:** [VC name] - -**Quota:** [GPU model, number of GPUs being assigned, e.g. 512 MI300x.] - -**Assignment Type:** Assignment - -**User Groups Added:** [UserGroup names being added] - -For any questions or concerns regarding this assignment, please email it to your UserGroup Admin and Lucia Training Platform Admin. If you have any questions or issues while using the Lucia Training Platform, please contact the support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. 
- -Best regards, - -Lucia Training Platform Admin - ---- - -### VC Re-Assignment Change - ---- - -**Subject:** Notification of VC Re-Assignment Change - -**From:** Lucia Training Platform Admin ([ltpadmin@microsoft.com](mailto:ltpadmin@microsoft.com)) - -**To:** UserGroup(s), UserGroup Admin(s), Lucia Training Platform Admin - -**Body:** - -Dear Users, - -This is to notify you of the following upcoming VC re-assignment change: - -**Effective Date and Time:** [Date and time when this re-assignment will take effect. (YYYY-MM-DD, HH:MM:SS, Timezone)] - -**Platform URL:** [the exact url] - -**VC:** [VC name] - -**Quota:** [GPU model, number of GPUs being re-assigned, e.g. 512 MI300x.] - -**Assignment Type:** Re-Assignment - -**User Group Removed:** [UserGroup names being removed] - -**User Group Added:** [UserGroup names being added] - -**Impact:** -- Users in Removed UserGroups: Please save your jobs and log out before the Effective Date and Time. -- Users in Added UserGroups: Log out and log back in after the Effective Date and Time to ensure resource availability. - -We appreciate your understanding and cooperation. These changes are aimed at enhancing overall system efficiency and providing a better experience for everyone. - -For any questions or concerns regarding this re-assignment, please email it to your UserGroup Admin and Lucia Training Platform Admin. If you have any questions or issues while using the Lucia Training Platform, please contact the support team via the [**Lucia Training Platform** Team Group - **User Feedback** Channel](https://teams.microsoft.com/l/channel/19%3AlrUjYbE4bhxd5hG34dJkRXEdSJF02WrcpEXayX58OdQ1%40thread.tacv2/User%20Feedback?groupId=656a4831-e31d-41fd-9ce0-6384a5156c74). 
If you are not a member of this channel, please refer to [Platform Issue Handling](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-specialized/hpcai/azure-hpc/lucia-platform-team-documentation/luciatrainingplatform/usermanual/troubleshooting) for how to join. - -Best regards, - -Lucia Training Platform Admin - ---- - From 4302fe76ec0d54e13a140253a0b254393b8ed4f7 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Sat, 27 Sep 2025 23:16:11 -0700 Subject: [PATCH 07/12] Fix gpu and default eth detection in job exporter (#81) Fix GPU and default Ethernet interface detection in job exporter. --- .../build/moneo-gpu-exporter_entrypoint.sh | 4 ++-- .../src/worker/exporters/node_exporter.py | 20 ++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh b/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh index 997bbc34..9798fd96 100755 --- a/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh +++ b/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh @@ -1,13 +1,13 @@ #!/bin/bash -if lspci | grep -qi 'Advanced Micro Devices'; then +if lsmod | grep -qi amdgpu; then echo "AMD Graphics Card Detected." # Launches AMD RDC Daemon nohup /opt/rocm/bin/rdcd -u /dev/null 2>&1 & echo "rdc Daemon Started!" python3 /Moneo/src/worker/exporters/amd_exporter.py & echo "AMD Exporter Started!" -elif lspci | grep -qi 'NVIDIA'; then +elif lsmod | grep -qi nvidia; then echo "NVIDIA Graphics card detected." 
python3 /update-dcgm.py # Launches NVIDIA DCGM Daemon diff --git a/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py b/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py index 382a29ec..b0ee72af 100644 --- a/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py +++ b/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py @@ -23,6 +23,7 @@ import signal import logging import argparse +import netifaces from base_exporter import BaseExporter import subprocess import shlex @@ -61,6 +62,19 @@ def shell_cmd(cmd, timeout): return result.decode() +def get_default_iface(): + try: + g = netifaces.gateways() + default = g.get('default', {}) + gw4 = default.get(netifaces.AF_INET) + if gw4: + _, iface = gw4[:2] + return iface + return None + except Exception: + return None + + class NodeExporter(BaseExporter): '''Example custom node exporter''' @@ -103,7 +117,7 @@ def init_gauges(self): def collect(self, field_name): # noqa: C901 '''Custom collection Method''' value = None - if 'net' in field_name: + if 'net' in field_name and config['ethernet_device']: cmd = "grep '{}' ".format(config['ethernet_device']) + self.config['fieldFiles'][field_name] val = None if 'net_rx' in field_name: @@ -322,7 +336,7 @@ def init_config(job_id, port=None, ethernet_device='eth0', interval=30): # initalize field specific config parameters for field_name in FIELD_LIST: config['sample_timestamp'][field_name] = datetime.now() - timedelta(seconds=5) - if 'net' in field_name: + if 'net' in field_name and config['ethernet_device']: config['fieldFiles'][field_name] = '/proc/net/dev' # initialize counter, this will ensure a initial value is present # to calculate bandwidth @@ -451,7 +465,7 @@ def main(): "-e", "--ethernet_device", type=str, - default='eth0', + default=get_default_iface(), help='Ethernet device to monitor') parser.add_argument( "-i", From b5d8a630b5f2dc23bba67609b21644ff84adb5d7 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Sun, 28 Sep 2025 
14:17:18 +0800 Subject: [PATCH 08/12] Bootstrap - add blobfuse checking and updating for VMSS (#63) This PR adds version checking and conditional updating for blobfuse2 in the VMSS installation script. The change ensures that blobfuse2 is upgraded to at least version 2.5.0 if an older version is already installed. Key changes: Added version detection for the installed blobfuse2 package Implemented conditional upgrade logic based on minimum version requirement --- contrib/aks/scripts/install-fuse.sh | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/contrib/aks/scripts/install-fuse.sh b/contrib/aks/scripts/install-fuse.sh index 3976f51c..974c79b6 100644 --- a/contrib/aks/scripts/install-fuse.sh +++ b/contrib/aks/scripts/install-fuse.sh @@ -6,4 +6,26 @@ set -xe DEBIAN_FRONTEND=noninteractive apt-get update -y -DEBIAN_FRONTEND=noninteractive apt-get install libfuse3-dev fuse3 blobfuse2 -y || echo "Failed to install fuse" \ No newline at end of file +DEBIAN_FRONTEND=noninteractive apt-get install libfuse3-dev fuse3 blobfuse2 -y || echo "Failed to install fuse" + +# Check if blobfuse2 is installed +if ! command -v blobfuse2 >/dev/null 2>&1; then + echo "blobfuse2 is not installed. Exiting." + exit 1 +fi + +INSTALLED_VERSION=$(blobfuse2 --version | grep -oP '\d+\.\d+\.\d+') +REQUIRED_VERSION="2.5.0" + +# Check if version extraction succeeded +if [ -z "$INSTALLED_VERSION" ]; then + echo "Failed to extract blobfuse2 version. Exiting." 
+ exit 1 +fi + +if dpkg --compare-versions "$INSTALLED_VERSION" "lt" "$REQUIRED_VERSION"; then + echo "Updating blobfuse2 to a version newer than $REQUIRED_VERSION" + DEBIAN_FRONTEND=noninteractive apt-get install --only-upgrade blobfuse2 -y || echo "Failed to update blobfuse2" +else + echo "blobfuse2 is already up-to-date (version $INSTALLED_VERSION)" +fi \ No newline at end of file From 3fa75331229266dab07b6e5a5b39b456652ec8c4 Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:20:18 +0800 Subject: [PATCH 09/12] Improvement: Copilot: Separate embedding model url config (#90) * improve separate embedding model url setting env from llm endpoint, for flexible configuration when using sglang interface add default value for COPILOT_EMBEDDING_URL --- src/copilot-chat/config/copilot-chat.yaml | 1 + .../deploy/copilot-chat-deployment.yaml.template | 2 ++ src/copilot-chat/src/copilot_agent/utils/llmsession.py | 5 +++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/copilot-chat/config/copilot-chat.yaml b/src/copilot-chat/config/copilot-chat.yaml index 38bf7f38..2adb9e0b 100644 --- a/src/copilot-chat/config/copilot-chat.yaml +++ b/src/copilot-chat/config/copilot-chat.yaml @@ -29,6 +29,7 @@ powerbi-key: "" openai-api-key: "" llm-provider: "azure" llm-endpoint: "https://endpoint.openai.com" +embedding-url: "" llm-model: "gpt-4o" llm-version: "2025-01-01-preview" embedding-model: "text-embedding-ada-002" diff --git a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template index 25dfcd5b..194014c7 100644 --- a/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template +++ b/src/copilot-chat/deploy/copilot-chat-deployment.yaml.template @@ -65,6 +65,8 @@ spec: value: {{ cluster_cfg["copilot-chat"]["llm-provider"] }} - name: COPILOT_LLM_ENDPOINT value: {{ cluster_cfg["copilot-chat"]["llm-endpoint"] }} + - name: COPILOT_EMBEDDING_URL + 
value: {{ cluster_cfg["copilot-chat"]["embedding-url"] | default(cluster_cfg["copilot-chat"]["llm-endpoint"], true) }} - name: COPILOT_LLM_MODEL value: {{ cluster_cfg["copilot-chat"]["llm-model"] }} - name: COPILOT_LLM_VERSION diff --git a/src/copilot-chat/src/copilot_agent/utils/llmsession.py b/src/copilot-chat/src/copilot_agent/utils/llmsession.py index ad8b93fd..69269937 100644 --- a/src/copilot-chat/src/copilot_agent/utils/llmsession.py +++ b/src/copilot-chat/src/copilot_agent/utils/llmsession.py @@ -18,6 +18,7 @@ def __init__(self): self.azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY") self.openai_api_key = os.environ.get("OPENAI_API_KEY") self.endpoint = os.environ.get("COPILOT_LLM_ENDPOINT") + self.embedding_url = os.environ.get("COPILOT_EMBEDDING_URL") self.model_name = os.environ.get("COPILOT_LLM_MODEL") self.model_version = os.environ.get("COPILOT_LLM_VERSION") self.embedding_model_name = os.environ.get("COPILOT_EMBEDDING_MODEL") @@ -27,7 +28,7 @@ def __init__(self): api_key=self.openai_api_key ) self.embedding_model = openai.OpenAI( - base_url=self.endpoint, + base_url=self.embedding_url, api_key=self.openai_api_key ) elif self.provider == "azure": @@ -37,7 +38,7 @@ def __init__(self): api_version=self.model_version ) self.embedding_model = openai.AzureOpenAI( - azure_endpoint=self.endpoint, + azure_endpoint=self.embedding_url, api_key=self.azure_api_key, api_version=self.model_version ) From 90d9b7f055f19eb332d50feca17cefaad30f9dc8 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 29 Sep 2025 03:46:02 -0700 Subject: [PATCH 10/12] Update /etc/hosts for dns records (#87) Update /etc/hosts for dns records in cluster configuration. 
--- .../deploy/configmap-create.sh | 4 ++ src/cluster-configuration/deploy/refresh.sh | 9 ++- .../deploy/start.sh.template | 3 + .../deploy/stop.sh.template | 5 ++ .../deploy/write-etc-hosts.yaml | 57 +++++++++++++++++++ 5 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 src/cluster-configuration/deploy/write-etc-hosts.yaml diff --git a/src/cluster-configuration/deploy/configmap-create.sh b/src/cluster-configuration/deploy/configmap-create.sh index fd1d9e48..e96cbfc1 100755 --- a/src/cluster-configuration/deploy/configmap-create.sh +++ b/src/cluster-configuration/deploy/configmap-create.sh @@ -22,3 +22,7 @@ kubectl create configmap docker-credentials --from-file=docker-credentials/ --d kubectl create configmap gpu-configuration --from-file=gpu-configuration/ --dry-run=client -o yaml | kubectl apply --overwrite=true -f - || exit $? kubectl create configmap pai-version --from-file=../../../version/PAI.VERSION --dry-run=client -o yaml | kubectl apply --overwrite=true -f - || exit $? kubectl create configmap k8s-version --from-file=../../../version/K8S.VERSION --dry-run=client -o yaml | kubectl apply --overwrite=true -f - || exit $? + +kubectl get nodes -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\t"}{.metadata.name}{"\n"}{end}' \ + | kubectl create configmap k8s-etc-hosts --from-file=k8s-etc-hosts.txt=/dev/stdin --dry-run=client -o yaml \ + | kubectl apply -f - || exit $? diff --git a/src/cluster-configuration/deploy/refresh.sh b/src/cluster-configuration/deploy/refresh.sh index 19bdc37f..34c33732 100644 --- a/src/cluster-configuration/deploy/refresh.sh +++ b/src/cluster-configuration/deploy/refresh.sh @@ -29,4 +29,11 @@ kubectl create configmap docker-credentials --from-file=docker-credentials/ --dr echo "refresh gpu-configuration" kubectl create configmap gpu-configuration --from-file=gpu-configuration/ --dry-run=client -o yaml | kubectl apply -f - || exit $? 
-popd > /dev/null \ No newline at end of file +echo "refresh k8s-etc-hosts" +kubectl get nodes -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\t"}{.metadata.name}{"\n"}{end}' \ + | kubectl create configmap k8s-etc-hosts --from-file=k8s-etc-hosts.txt=/dev/stdin --dry-run=client -o yaml \ + | kubectl apply -f - || exit $? +kubectl delete job write-etc-hosts --ignore-not-found +kubectl apply -f write-etc-hosts.yaml || exit $? + +popd > /dev/null diff --git a/src/cluster-configuration/deploy/start.sh.template b/src/cluster-configuration/deploy/start.sh.template index c9e2bb8c..216d8696 100644 --- a/src/cluster-configuration/deploy/start.sh.template +++ b/src/cluster-configuration/deploy/start.sh.template @@ -35,6 +35,9 @@ rm -rf secret-system.yaml # Create priorityClass for PAI daemon kubectl apply --overwrite=true -f priority-class.yaml || exit $? +# Amend /etc/hosts for master node +kubectl apply -f write-etc-hosts.yaml || exit $? + # Add `pai-master`, `pai-worker`, `pai-storage` label to corresponding nodes and remove irrelant labels ( {%- for host in cluster_cfg['layout']['machine-list'] %} diff --git a/src/cluster-configuration/deploy/stop.sh.template b/src/cluster-configuration/deploy/stop.sh.template index 70f8cc98..ff95d79e 100644 --- a/src/cluster-configuration/deploy/stop.sh.template +++ b/src/cluster-configuration/deploy/stop.sh.template @@ -39,6 +39,11 @@ if kubectl get configmap | grep -q "k8s-version"; then kubectl delete configmap k8s-version || exit $? fi +if kubectl get configmap | grep -q "k8s-etc-hosts"; then + kubectl delete configmap k8s-etc-hosts || exit $? +fi +kubectl delete job write-etc-hosts --ignore-not-found || exit $? + if kubectl get secret | grep -q "{{ cluster_cfg['cluster']['docker-registry']['secret-name'] }}"; then kubectl delete secret {{ cluster_cfg['cluster']['docker-registry']['secret-name'] }} || exit $? 
fi diff --git a/src/cluster-configuration/deploy/write-etc-hosts.yaml b/src/cluster-configuration/deploy/write-etc-hosts.yaml new file mode 100644 index 00000000..1977c5d8 --- /dev/null +++ b/src/cluster-configuration/deploy/write-etc-hosts.yaml @@ -0,0 +1,57 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: write-etc-hosts +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + nodeSelector: + pai-master: "true" + volumes: + - name: host-etc + hostPath: + path: /etc + type: Directory + - name: hosts-config + configMap: + name: k8s-etc-hosts + items: + - key: k8s-etc-hosts.txt + path: k8s-etc-hosts.txt + containers: + - name: writer + image: alpine:3.18 + securityContext: + runAsUser: 0 + privileged: true + command: + - sh + - -c + - | + set -euo pipefail + MARKER_START="# BEGIN K8S NODES (managed)" + MARKER_END="# END K8S NODES (managed)" + HOSTS="/host/etc/hosts" + BACKUP="${HOSTS}.k8s.bak.$(date +%s)" + TMP="$(mktemp)" + echo "Backing up $HOSTS -> $BACKUP" + cp "$HOSTS" "$BACKUP" + if grep -qF "$MARKER_START" "$HOSTS"; then + awk "/$MARKER_START/{exit} {print}" "$HOSTS" > "$TMP" + else + cat "$HOSTS" > "$TMP" + fi + echo "$MARKER_START" >> "$TMP" + cat /etc/k8s-etc-hosts.txt >> "$TMP" + echo "$MARKER_END" >> "$TMP" + cp "$TMP" "$HOSTS" + rm -f "$TMP" + echo "Wrote managed block into $HOSTS; backup at $BACKUP" + volumeMounts: + - name: host-etc + mountPath: /host/etc + - name: hosts-config + mountPath: /etc/k8s-etc-hosts.txt + subPath: k8s-etc-hosts.txt From 9b5a9889127d1f74c44209027769573bd11917d3 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 30 Sep 2025 10:22:59 +0800 Subject: [PATCH 11/12] AlertManager: Add alert logging webhook to replace logging with admin-email (#91) **Description** Add alert logging webhook to replace logging with admin-email implementation to prevent the alert parser fail to capture alerts due to email feature not enable. 
**Major Change** - Added new /alert-handler/log-alerts webhook endpoint with dedicated controller - Updated alert parsing regex pattern to match new log format - Configured logging webhook for all alert receivers in AlertManager --- .../alert-manager-configmap.yaml.template | 30 ++++++++++++ .../src/alert-handler/controllers/log.js | 46 +++++++++++++++++++ .../src/alert-handler/routes/actions.js | 9 +++- .../src/alert-parser/utils/alert_util.py | 4 +- 4 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 src/alert-manager/src/alert-handler/controllers/log.js diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template index 4c768e4c..4b30d0ee 100644 --- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template @@ -157,9 +157,13 @@ data: - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin' send_resolved: true {% endif %} + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false - name: pai-cert-expiration-checker webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'cert-expiration-checker' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-group/?template=cert-expiration' send_resolved: false @@ -167,6 +171,8 @@ data: - name: pai-cluster-usage webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'email-group' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ 
cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-group/?template=cluster-usage' send_resolved: false @@ -174,6 +180,8 @@ data: - name: pai-job-status-change webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'email-user' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-user/?template=job-status-change' send_resolved: false @@ -183,6 +191,8 @@ data: - name: pai-prod-job-status-change webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'email-group' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-group/?template=job-status-change' send_resolved: false @@ -190,6 +200,8 @@ data: - name: pai-unvalidate-nodes webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin' send_resolved: false @@ -197,6 +209,8 @@ data: - name: pai-cordon-nodes webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes' send_resolved: false @@ -208,6 +222,8 @@ data: - name: pai-uncordon-nodes webhook_configs: + - url: 'http://localhost:{{ 
cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'uncordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/uncordon-nodes' send_resolved: false @@ -219,6 +235,8 @@ data: - name: pai-reboot-nodes webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'reboot-nodes' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/reboot-nodes' send_resolved: false @@ -230,6 +248,8 @@ data: - name: pai-drain-nodes webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'drain-nodes' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/drain-nodes' send_resolved: false @@ -241,6 +261,8 @@ data: - name: pai-abnormal-job-email webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'email-user' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-user/?template=kill-low-efficiency-job-alert' send_resolved: false @@ -249,6 +271,8 @@ data: {% endif %} - name: pai-abnormal-job-terminate webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'stop-jobs' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] 
}}/alert-handler/stop-jobs' http_config: @@ -258,6 +282,8 @@ data: - name: pai-adjust-user-quota webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'set-quota' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/set-quotas' send_resolved: false @@ -267,6 +293,8 @@ data: - name: pai-adjust-user-quota-email webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if 'email-user' in cluster_cfg["alert-manager"]["actions-available"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-user/?template=adjust-user-quota' send_resolved: false @@ -277,6 +305,8 @@ data: {% for receiver in cluster_cfg["alert-manager"]["customized-receivers"] %} - name: {{ receiver.name}} webhook_configs: + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/log-alerts' + send_resolved: false {% if (receiver["actions"]["email-admin"] is defined) and ('email-admin' in cluster_cfg["alert-manager"]["actions-available"]) %} {% set template = receiver["actions"]["email-admin"]["template"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin/?template={{ template }}' diff --git a/src/alert-manager/src/alert-handler/controllers/log.js b/src/alert-manager/src/alert-handler/controllers/log.js new file mode 100644 index 00000000..b4079830 --- /dev/null +++ b/src/alert-manager/src/alert-handler/controllers/log.js @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation +// under the MIT license. 
+ +const logger = require('@alert-handler/common/logger'); + +// log alerts +const logAlerts = async (req, res) => { + logger.info( + 'alert-handler received `log-alerts` post request from alert-manager.', + ); + + const timestamp = new Date().toISOString(); + // Log each alert with detailed information + if (req.body.alerts && Array.isArray(req.body.alerts)) { + req.body.alerts.forEach(alert => { + const summary = alert.annotations?.summary || 'No summary available'; + const labels = JSON.stringify(alert.labels); + const annotations = alert.annotations && Object.keys(alert.annotations).length > 0 + ? JSON.stringify(alert.annotations) + : 'No annotations available'; + + logger.info( + `[${timestamp}] alert-handler received alerts: Alertname: ${alert.labels?.alertname}, Severity: ${alert.labels?.severity}, Summary: ${summary}, Labels: ${labels}, Annotations: ${annotations}` + ); + }); + } else { + logger.warn('No alerts found in request body or alerts is not an array'); + } + + try { + res.status(200).json({ + message: 'alert-handler successfully logged alerts', + count: req.body.alerts ? 
req.body.alerts.length : 0 + }); + } catch (error) { + logger.error('Failed to log alerts:', error); + res.status(500).json({ + message: 'alert-handler failed to log alerts', + }); + } +}; + +// module exports +module.exports = { + logAlerts, +}; diff --git a/src/alert-manager/src/alert-handler/routes/actions.js b/src/alert-manager/src/alert-handler/routes/actions.js index 49feacf5..31d8549f 100644 --- a/src/alert-manager/src/alert-handler/routes/actions.js +++ b/src/alert-manager/src/alert-handler/routes/actions.js @@ -19,7 +19,8 @@ const express = require('express'); const emailController = require('@alert-handler/controllers/mail'); const jobController = require('@alert-handler/controllers/job'); const nodeController = require('@alert-handler/controllers/node'); -const userController = require('@alert-handler/controllers/user') +const userController = require('@alert-handler/controllers/user'); +const logController = require('@alert-handler/controllers/log'); const router = express.Router(); @@ -81,4 +82,10 @@ router /** POST /alert-handler/set-quota */ .post(userController.setQuotas); +// log +router + .route('/alert-handler/log-alerts') + /** POST /alert-handler/log-alerts */ + .post(logController.logAlerts); + module.exports = router; diff --git a/src/alert-manager/src/alert-parser/utils/alert_util.py b/src/alert-manager/src/alert-parser/utils/alert_util.py index 3344a078..f2c83d48 100644 --- a/src/alert-manager/src/alert-parser/utils/alert_util.py +++ b/src/alert-manager/src/alert-parser/utils/alert_util.py @@ -26,7 +26,7 @@ class AlertParser: @staticmethod def parse_message(log): """Parse a single alert log message""" - pattern = r"\[(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)\] alert-handler send alert to admin with alerts: Alertname: (?P[^,]+), Severity: (?P[^,]+), Summary: (?P.+), Labels: (?P\{.*?\}), Annotations: (?P.*?)" + pattern = r"\[(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)\] alert-handler received alerts: Alertname: (?P[^,]+), Severity: 
(?P[^,]+), Summary: (?P.+), Labels: (?P\{.*?\}), Annotations: (?P.*?)" match = re.search(pattern, log) if match: timestamp = match.group("timestamp") @@ -90,7 +90,7 @@ def fetch_logs(self, end_time_stamp, time_offset): query = ( f"ContainerLogV2| " f'where ContainerName contains "alerthandler" | ' - f'where LogMessage contains "alert-handler send alert to admin with alerts" and ' + f'where LogMessage contains "alert-handler received alerts" and ' f'LogMessage !contains "NodeFilesystemUsage" and LogMessage !contains "NodeGpuCountChanged" and LogMessage !contains "NodeUnschedulable" | ' f"where TimeGenerated between(datetime({start_time})..datetime({end_time})) | " f"project TimeGenerated, PodName, LogMessage | " From f116016f4babd20f32d947c4281993af9b935896 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 30 Sep 2025 15:10:05 +0800 Subject: [PATCH 12/12] Docs - Add release notes (#92) __Description__ Add LTP release note. __Major Revision__ - Add release note for v1.3 v1.2 v1.1 v1.0 --- .../blog/2025-04-30-release-1-0.md | 32 ++++++++++++ .../blog/2025-06-20-release-1-1.md | 30 +++++++++++ .../blog/2025-08-11-release-1-2.md | 40 +++++++++++++++ .../blog/2025-09-30-release-1-3.md | 50 +++++++++++++++++++ .../manual/admin}/cert_update_readme.md | 0 .../manual/admin}/service_setup_readme.md | 0 6 files changed, 152 insertions(+) create mode 100644 docs/LuciaTrainingPlatform/blog/2025-04-30-release-1-0.md create mode 100644 docs/LuciaTrainingPlatform/blog/2025-06-20-release-1-1.md create mode 100644 docs/LuciaTrainingPlatform/blog/2025-08-11-release-1-2.md create mode 100644 docs/LuciaTrainingPlatform/blog/2025-09-30-release-1-3.md rename docs/{ltp => LuciaTrainingPlatform/manual/admin}/cert_update_readme.md (100%) rename docs/{ltp => LuciaTrainingPlatform/manual/admin}/service_setup_readme.md (100%) diff --git a/docs/LuciaTrainingPlatform/blog/2025-04-30-release-1-0.md b/docs/LuciaTrainingPlatform/blog/2025-04-30-release-1-0.md new file mode 100644 index 
00000000..e519adc8 --- /dev/null +++ b/docs/LuciaTrainingPlatform/blog/2025-04-30-release-1-0.md @@ -0,0 +1,32 @@ +--- +slug: release-ltp-v1.0 +title: Releasing Lucia Training Platform v1.0 +author: Lucia Training Platform Team +tags: [ltp, announcement, release] +--- + +We are pleased to announce the official release of **Lucia Training Platform v1.0.0**! + +## Lucia Training Platform v1.0.0 Release Notes + +This inaugural release establishes Lucia Training Platform as a comprehensive AI platform solution, built on the foundation of OpenPAI with significant enhancements and customizations for enterprise AI workloads. + +## Platform Features & Stability +- Updated Virtual Machine Scale Set deployment scripts with MI300 GPU support and kubelet bug fixes +- Fixed launch order issues between AMD device plugin and AMDGPU module loading +- Fixed local disk mounting into containers for high-speed data loading +- Implemented priority restrictions for production jobs to ensure resource allocation +- Automated daily backup of user logs to blob storage with cordon trigger functionality +- Updated OpenPAI-runtime image to resolve SSH crashes in large-scale training jobs +- Added refresh API to clean storage cache when new Persistent Volumes (PV) or Persistent Volume Claims (PVC) are added +- Implemented automated email notifications for production jobs to specific user groups + +## Job Reliability & Monitoring +- Implemented automatic detection metrics and rules for AMD GPU issues during runtime +- Enabled job execution on specific cordoned nodes for admin management +- Automated node cordoning and uncordoning with single node validation +- Added support for monitoring count of per-VC available/used nodes in Prometheus + +## User Experience +- Complete revision of the homepage with acknowledgment of OpenPAI's great contribution +- Updated all titles and references from OpenPAI to Lucia Training Platform (LTP) throughout the web portal diff --git 
a/docs/LuciaTrainingPlatform/blog/2025-06-20-release-1-1.md b/docs/LuciaTrainingPlatform/blog/2025-06-20-release-1-1.md new file mode 100644 index 00000000..2faa4862 --- /dev/null +++ b/docs/LuciaTrainingPlatform/blog/2025-06-20-release-1-1.md @@ -0,0 +1,30 @@ +--- +slug: release-ltp-v1.1 +title: Releasing Lucia Training Platform v1.1 +author: Lucia Training Platform Team +tags: [ltp, announcement, release] +--- + +We are pleased to announce the official release of **Lucia Training Platform v1.1.0**! + +## Lucia Training Platform v1.1.0 Release Notes + +This release introduces new inference capabilities, enhanced stability improvements, comprehensive monitoring systems, and significant security enhancements. + +## Platform Features & Stability +- Added support for inference job submission +- Added prototype user interface with webportal plugin + +## Job Reliability & Monitoring +- Automated Azure VM recycling and validation processing workflows +- Automated pipeline for submitting ICM tickets for unhealthy Azure VMs. 
+- Kusto database implementation for action status tracking, node status monitoring, and job status analytics + +## User Experience +- Enhanced dashboard with comprehensive platform performance metrics + +## Security +- Forced upgrades of operating system, Linux, and Python packages to address security vulnerabilities +- Updated Golang and Node.js packages to latest secure versions +- Disabled and replaced unapproved registries (non-ACR/MCR) on LTP platform +- Disabled SSH access for all users to enhance security posture diff --git a/docs/LuciaTrainingPlatform/blog/2025-08-11-release-1-2.md b/docs/LuciaTrainingPlatform/blog/2025-08-11-release-1-2.md new file mode 100644 index 00000000..ab302b52 --- /dev/null +++ b/docs/LuciaTrainingPlatform/blog/2025-08-11-release-1-2.md @@ -0,0 +1,40 @@ +--- +slug: release-ltp-v1.2 +title: Releasing Lucia Training Platform v1.2 +author: Lucia Training Platform Team +tags: [ltp, announcement, release] +--- + +We are pleased to announce the official release of **Lucia Training Platform v1.2.0**! + +## Lucia Training Platform v1.2.0 Release Notes + +This release introduces significant new features, enhanced reliability monitoring, improved user experience, and strengthened security measures. 
+ +## Platform Features & Stability +- Virtual Cluster administrators can now stop jobs in their own VC +- Enhanced inference job interface with external IP gateway support +- Portal displays only active clusters for improved user experience +- Enhanced job execution capabilities with Docker support within jobs +- Resolved CUDA version mismatch issues causing job-exporter crashes +- Fixed configuration refresh issues when updating user settings +- Resolved blob mount failures and Azure copy token issues + +## Local Storage +- Local storage service with user API interface implementation +- Integration with node recycling processes + +## Job Reliability & Monitoring +- Initial automatic node failure detection module design and implementation +- Enhanced job monitoring kusto data pipeline with summary and reaction time tracking +- Proactive alerting email for certificate expiration management + +## User Experience +- Added webportal plugin integration for Copilot functionality +- Initial backend support for Copilot features +- Enhanced dashboard with comprehensive platform metrics +- Added Mean Time Between Incidents (MTBI) tracking for virtual machines and nodes in dashboard + +## Security +- Updates to address security vulnerabilities in container images +- Kubernetes version upgrade for enhanced security and performance diff --git a/docs/LuciaTrainingPlatform/blog/2025-09-30-release-1-3.md b/docs/LuciaTrainingPlatform/blog/2025-09-30-release-1-3.md new file mode 100644 index 00000000..8200bbfd --- /dev/null +++ b/docs/LuciaTrainingPlatform/blog/2025-09-30-release-1-3.md @@ -0,0 +1,50 @@ +--- +slug: release-ltp-v1.3 +title: Releasing Lucia Training Platform v1.3 +author: Lucia Training Platform Team +tags: [ltp, announcement, release] +--- + +We are pleased to announce the official release of **Lucia Training Platform v1.3.0**! 
+ +## Lucia Training Platform v1.3.0 Release Notes + +This release brings significant improvements across platform stability, inference capabilities, user experience, and security enhancements. + +## Platform Features & Stability +- Migrated PostgreSQL database to Azure disk for improved performance +- Upgraded blobfuse version to support non-empty cache folder +- Integrated ssh-proxy and utilization reporter services +- Added local disk support for Prometheus deployment +- Added H200 GPU support for VMSS provisioning scripts +- Updated Kubernetes deployment scripts for Kubespray compatibility on Bare Metal Machines +- CI/CD: Added GitHub workflow to build and deploy changed services +- Fixed GPU and default eth detection in job exporter +- Updated /etc/hosts for DNS records +- Added alert logging webhook in alert manager + +## Inference Plugin +- Added support for output streaming in web portal +- Implemented long-context thinking output folding support +- Deployed inference model proxy service for improved model serving +- Messages now display as markdown and copy as plain text + +## Tools +- Added comprehensive administration tools for LTP service management + +## Security +- Upgraded Dockerfiles with latest system updates +- Added Docker-in-Docker (dind) support for Webportal + +## User Experience +- Code refactoring for improved maintainability +- Integrated Copilot SGLANG/OpenAI Interface backend support +- Added Copilot support for Dashboard Metrics visualization +- Implemented User Feedback Loop design and functionality +- Added VC based user group membership authentication + +## License +- Added Microsoft license headers to examples directory +- Added Microsoft license headers to contrib directory +- Added Microsoft license headers to deployment directory +- Added Microsoft license headers to src directory diff --git a/docs/ltp/cert_update_readme.md b/docs/LuciaTrainingPlatform/manual/admin/cert_update_readme.md similarity index 100% rename from 
docs/ltp/cert_update_readme.md rename to docs/LuciaTrainingPlatform/manual/admin/cert_update_readme.md diff --git a/docs/ltp/service_setup_readme.md b/docs/LuciaTrainingPlatform/manual/admin/service_setup_readme.md similarity index 100% rename from docs/ltp/service_setup_readme.md rename to docs/LuciaTrainingPlatform/manual/admin/service_setup_readme.md