1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.DS_Store
10 changes: 6 additions & 4 deletions README.md
@@ -1,6 +1,6 @@
## SageMaker Studio Custom Image Samples
# SageMaker Studio Custom Image Samples

### Overview
## Overview

This repository contains examples of Docker images that are valid custom images for KernelGateway Apps in SageMaker Studio. These custom images enable you to bring your own packages, files, and kernels for use with notebooks, terminals, and interactive consoles within SageMaker Studio.

@@ -13,11 +13,13 @@ This repository contains examples of Docker images that are valid custom images
- [rapids-image](examples/rapids-image) - This example uses the official rapids.ai image from Docker Hub. Use it with a GPU instance in Studio.
- [scala-image](examples/scala-image) - This example adds a Scala kernel based on [Almond Scala Kernel](https://almond.sh/).
- [tf2.3-image](examples/tf23-image) - This example uses the official TensorFlow 2.3 image from Docker Hub and demonstrates bundling custom files along with the image.
- [spark-image](examples/spark-image) - This example provides a Spark image for interactive PySpark development in SageMaker Studio.

#### One-time setup

All examples require a one-time setup to create an ECR repository:

```
```bash
REGION=<aws-region>
aws --region ${REGION} ecr create-repository \
--repository-name smstudio-custom
Expand All @@ -29,4 +31,4 @@ See [DEVELOPMENT.md](DEVELOPMENT.md)

### License

This sample code is licensed under the MIT-0 License. See the LICENSE file.
This sample code is licensed under the MIT-0 License. See the LICENSE file.
1 change: 1 addition & 0 deletions examples/.gitignore
@@ -0,0 +1 @@
.DS_Store
38 changes: 38 additions & 0 deletions examples/echo-kernel-image/Dockerfile.orig
@@ -0,0 +1,38 @@
FROM python:3.6

ARG NB_USER="sagemaker-user"
ARG NB_UID="1000"
ARG NB_GID="100"


######################
# OVERVIEW
# 1. Creates the `sagemaker-user` user with UID/GID 1000/100.
# 2. Ensures this user can `sudo` by default.
# 3. Installs the echo kernel and its dependencies from PyPI.
# 4. Makes the default shell `bash`. This improves the experience inside a Jupyter terminal, as Jupyter otherwise defaults to `sh`.
######################

# Setup the "sagemaker-user" user with root privileges.
RUN \
apt-get update && \
apt-get install -y sudo && \
useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \
chmod g+w /etc/passwd && \
echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
# Prevent apt-get cache from being persisted to this layer.
rm -rf /var/lib/apt/lists/*

# Install and configure the kernel.
RUN \
pip install echo_kernel \
# These are dependencies of echo_kernel but the version on PyPI is old and doesn't declare them correctly.
jupyter_client IPython ipykernel && \
    # This ensures that the kernelspec.json is installed in the location expected by Jupyter/KernelGateway.
python -m echo_kernel.install --sys-prefix

# Make the default shell bash (vs "sh") for a better Jupyter terminal UX
ENV SHELL=/bin/bash

USER $NB_UID

165 changes: 165 additions & 0 deletions examples/spark-image/Dockerfile
@@ -0,0 +1,165 @@
FROM ubuntu:18.04

ARG NB_USER="sagemaker-user"
ARG NB_UID="1000"
ARG NB_GID="100"

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Setup the "sagemaker-user" user with root privileges.
RUN apt-get update && \
apt-get install -y sudo && \
useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \
chmod g+w /etc/passwd && \
echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
# Prevent apt-get cache from being persisted to this layer.
apt-get clean && rm -rf /var/lib/apt/lists/* && \
echo "en_US.UTF-8 UTF-8" > /etc/locale.gen

USER $NB_UID

# Make the default shell bash (vs "sh") for a better Jupyter terminal UX
ENV SHELL=/bin/bash \
NB_USER=$NB_USER \
NB_UID=$NB_UID \
NB_GID=$NB_GID \
LC_ALL=en_US.UTF-8 \
LANG=en_US.UTF-8 \
LANGUAGE=en_US.UTF-8 \
HOME=/home/$NB_USER \
MINICONDA_VERSION=4.6.14 \
CONDA_VERSION=4.6.14 \
MINICONDA_MD5=718259965f234088d785cad1fbd7de03 \
CONDA_DIR=/opt/conda \
PATH=$CONDA_DIR/bin:${PATH}


USER root
RUN apt-get update --yes
# COPY ./apt-packages.txt /root/apt-packages.txt
# RUN xargs -a /root/apt-packages.txt apt-get install -y --no-install-recommends

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
wget \
curl \
ca-certificates \
sudo \
locales \
fonts-liberation \
run-one && \
apt-get clean && rm -rf /var/lib/apt/lists/* && \
echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \
locale-gen

RUN mkdir -p $CONDA_DIR && \
chown -R $NB_USER:$NB_GID $CONDA_DIR && \
# Fix for devtools https://github.com/conda-forge/r-devtools-feedstock/issues/4
ln -s /bin/tar /bin/gtar

# Copy a script that we will use to correct permissions after running certain commands
COPY fix-permissions /usr/local/bin/fix-permissions
RUN chmod a+rx /usr/local/bin/fix-permissions
COPY fix-host-settings /usr/local/bin/fix-host-settings
RUN chmod a+rx /usr/local/bin/fix-host-settings

RUN echo "auth requisite pam_deny.so" >> /etc/pam.d/su && \
sed -i.bak -e 's/^%admin/#%admin/' /etc/sudoers && \
sed -i.bak -e 's/^%sudo/#%sudo/' /etc/sudoers && \
usermod -G root ${NB_USER} && \
mkdir -p "${CONDA_DIR}" && \
chown "${NB_USER}:${NB_GID}" "${CONDA_DIR}" && \
chmod g+w /etc/passwd && \
fix-permissions "${HOME}" && \
fix-permissions "${CONDA_DIR}"

USER ${NB_UID}
ARG PYTHON_VERSION=3.6.14
ENV PATH=$CONDA_DIR/bin:${PATH}
WORKDIR /tmp

# Install conda via Miniconda
RUN curl --silent --show-error --output miniconda-installer.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \
echo "${MINICONDA_MD5} *miniconda-installer.sh" | md5sum -c - && \
/bin/bash miniconda-installer.sh -f -b -p $CONDA_DIR && \
rm miniconda-installer.sh && \
conda config --system --prepend channels conda-forge && \
conda config --system --set auto_update_conda false && \
conda config --system --set show_channel_urls true && \
conda config --system --set pip_interop_enabled true && \
conda install --quiet --yes conda="${CONDA_VERSION%.*}.*" && \
conda update --all --quiet --yes && \
conda clean --all -f -y && \
rm -rf /home/$NB_USER/.cache/yarn

RUN conda install --quiet --yes \
tini \
boto3 \
'awscli>=1.18' \
sagemaker_pyspark \
'pyspark==2.4.0' \
'notebook=6.4.0' \
'jupyterhub=1.4.1' \
'jupyterlab=3.0.16' && \
conda clean --all -f -y && \
npm cache clean --force && \
jupyter notebook --generate-config && \
jupyter lab clean && \
rm -rf "/home/${NB_USER}/.cache/yarn" && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"

EXPOSE 8888

# Copy local files as late as possible to avoid cache busting
COPY start.sh start-notebook.sh start-singleuser.sh /usr/local/bin/
# Currently need to have both jupyter_notebook_config and jupyter_server_config to support classic and lab
COPY jupyter_notebook_config.py /etc/jupyter/

# Fix permissions on /etc/jupyter as root
USER root

# Prepare upgrade to JupyterLab V3.0 #1205
RUN sed -re "s/c.NotebookApp/c.ServerApp/g" \
/etc/jupyter/jupyter_notebook_config.py > /etc/jupyter/jupyter_server_config.py && \
fix-permissions /etc/jupyter/

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="2.4.0"
ARG hadoop_version="2.7"
ARG spark_checksum="5F4184E0FE7E5C8AE67F5E6BC5DEEE881051CC712E9FF8AEDDF3529724C00E402C94BB75561DD9517A372F06C1FCB78DC7AE65DCBD4C156B3BA4D8E267EC2936"
ARG openjdk_version="8"

ENV APACHE_SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${openjdk_version}-jre-headless" \
ca-certificates-java && \
apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp
RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"

WORKDIR /usr/local

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PATH="${PATH}:${SPARK_HOME}/bin"

RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark && \
    # Add a link in the before_notebook hook so that PYTHONPATH is sourced automatically
mkdir -p /usr/local/bin/before-notebook.d && \
ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

USER ${NB_UID}

WORKDIR $HOME
67 changes: 67 additions & 0 deletions examples/spark-image/README.md
@@ -0,0 +1,67 @@
# Spark Image

## Overview

This image provides a Spark kernel as a custom image for SageMaker Studio. It can be used for interactive Spark development in Python, including reading data from and writing data to Amazon S3 buckets.

The image is based on Spark 2.4.0, Hadoop 2.7, and OpenJDK 8. It also includes the latest version of sagemaker_pyspark (1.4.2), which provides the aws-hadoop and other dependent JARs needed to work with AWS services such as Amazon Simple Storage Service (Amazon S3).

The example notebook (pyspark-kernel-file-read) shows how to use different credential providers when making calls to Amazon S3.
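
As an illustration, the sketch below shows one way a notebook running on this kernel could configure a Spark session with the JARs bundled in `sagemaker_pyspark` and read a CSV file from Amazon S3 over `s3a`. The bucket, object key, and credential provider are placeholders and assumptions, not part of this example's notebook.

```python
# Minimal sketch (hypothetical bucket and key): put the JARs shipped with
# sagemaker_pyspark on the classpath and read a CSV file from Amazon S3 via s3a.
import sagemaker_pyspark
from pyspark.sql import SparkSession

# Colon-separated classpath containing the aws-hadoop and SageMaker Spark JARs.
classpath = ":".join(sagemaker_pyspark.classpath_jars())

spark = (
    SparkSession.builder
    .appName("s3a-read-example")
    .config("spark.driver.extraClassPath", classpath)
    # Assumption: credentials come from the Studio execution role via the
    # container credentials provider; adjust this to your environment.
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

# Replace with your own bucket and key.
df = spark.read.csv("s3a://<your-bucket>/path/to/input.csv", header=True)
df.show(5)
```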

### Building the image

Build the Docker image and push to Amazon ECR.

```bash
# Modify these as required. The Docker registry endpoint for your region can be found at https://docs.aws.amazon.com/general/latest/gr/ecr.html#ecr-docker-endpoints
REGION=<aws-region>
ACCOUNT_ID=<account-id>


# Build the image
IMAGE_NAME=spark-kernel
aws --region ${REGION} ecr get-login-password | docker login --username AWS --password-stdin ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom
docker build . -t ${IMAGE_NAME} -t ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
```

```bash
docker push ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
```

### Using with SageMaker Studio

Create a SageMaker Image with the image in ECR.

```bash
# Role in your account to be used for the SageMaker Image
ROLE_ARN=<role-arn>

aws --region ${REGION} sagemaker create-image \
--image-name ${IMAGE_NAME} \
--role-arn ${ROLE_ARN}

aws --region ${REGION} sagemaker create-image-version \
--image-name ${IMAGE_NAME} \
--base-image "${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}"

# Verify that the image version was created successfully. Do NOT proceed if the image version is in the CREATE_FAILED state or in any state other than CREATED.
aws --region ${REGION} sagemaker describe-image-version --image-name ${IMAGE_NAME}
```

Create an AppImageConfig for this image.

```bash
aws --region ${REGION} sagemaker create-app-image-config --cli-input-json file://app-image-config-input.json
```

Create a Domain, providing the SageMaker Image and AppImageConfig as part of the Domain creation. Replace the placeholders for the VPC ID, Subnet IDs, and Execution Role in `create-domain-input.json`.

```bash
aws --region ${REGION} sagemaker create-domain --cli-input-json file://create-domain-input.json
```

If you have an existing Domain, you can use the `update-domain` command instead (see the sketch of its input file below).

```bash
aws --region ${REGION} sagemaker update-domain --cli-input-json file://update-domain-input.json
```
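
The `update-domain-input.json` file referenced above is not shown in this diff; a minimal sketch of what it might contain, with `<domain-id>` as a placeholder, is:

```json
{
    "DomainId": "<domain-id>",
    "DefaultUserSettings": {
        "KernelGatewayAppSettings": {
            "CustomImages": [
                {
                    "ImageName": "spark-kernel",
                    "AppImageConfigName": "custom-spark-image-config"
                }
            ]
        }
    }
}
```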
16 changes: 16 additions & 0 deletions examples/spark-image/app-image-config-input.json
@@ -0,0 +1,16 @@
{
"AppImageConfigName": "custom-spark-image-config",
"KernelGatewayImageConfig": {
"KernelSpecs": [
{
"Name": "python3",
"DisplayName": "spark"
}
],
"FileSystemConfig": {
"MountPath": "/home/sagemaker-user",
"DefaultUid": 1000,
"DefaultGid": 100
}
}
}
19 changes: 19 additions & 0 deletions examples/spark-image/create-domain-input.json
@@ -0,0 +1,19 @@
{
"DomainName": "domain-with-spark-kernel-image",
"VpcId": "<vpc-id>",
"SubnetIds": [
"<subnet-ids>"
],
"DefaultUserSettings": {
"ExecutionRole": "<execution-role>",
"KernelGatewayAppSettings": {
"CustomImages": [
{
"ImageName": "spark-kernel",
"AppImageConfigName": "custom-spark-image-config"
}
]
}
},
"AuthMode": "IAM"
}
4 changes: 4 additions & 0 deletions examples/spark-image/fix-host-settings
@@ -0,0 +1,4 @@
#!/bin/sh
# Make /etc/hosts writable, add a 127.0.0.1 entry for the container hostname
# if one is not already present, and print the resulting file.
sudo -i chmod 777 /etc/hosts > /dev/null 2>&1
if ! grep -qi "$HOSTNAME" /etc/hosts; then echo "127.0.0.1 ${HOSTNAME}" >> /etc/hosts; fi
cat /etc/hosts
35 changes: 35 additions & 0 deletions examples/spark-image/fix-permissions
@@ -0,0 +1,35 @@
#!/bin/bash
# set permissions on a directory
# after any installation, if a directory needs to be (human) user-writable,
# run this script on it.
# It will make everything in the directory owned by the group ${NB_GID}
# and writable by that group.
# Deployments that want to set a specific user id can preserve permissions
# by adding the `--group-add users` line to `docker run`.

# uses find to avoid touching files that already have the right permissions,
# which would cause massive image explosion

# right permissions are:
# group=${NB_GID}
# AND permissions include group rwX (directory-execute)
# AND directories have setuid,setgid bits set
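#
# Example usage (as in the Dockerfile above):
#   fix-permissions "${CONDA_DIR}" "/home/${NB_USER}"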

set -e

for d in "$@"; do
find "${d}" \
! \( \
-group "${NB_GID}" \
-a -perm -g+rwX \
\) \
-exec chgrp "${NB_GID}" {} \; \
-exec chmod g+rwX {} \;
# setuid, setgid *on directories only*
find "${d}" \
\( \
-type d \
-a ! -perm -6000 \
\) \
-exec chmod +6000 {} \;
done