1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.DS_Store
10 changes: 6 additions & 4 deletions README.md
@@ -1,6 +1,6 @@
## SageMaker Studio Custom Image Samples
# SageMaker Studio Custom Image Samples

### Overview
## Overview

This repository contains examples of Docker images that are valid custom images for KernelGateway Apps in SageMaker Studio. These custom images enable you to bring your own packages, files, and kernels for use with notebooks, terminals, and interactive consoles within SageMaker Studio.

@@ -13,11 +13,13 @@ This repository contains examples of Docker images that are valid custom images
- [rapids-image](examples/rapids-image) - This example uses the official rapids.ai image from Docker Hub. Use it with a GPU instance in Studio.
- [scala-image](examples/scala-image) - This example adds a Scala kernel based on [Almond Scala Kernel](https://almond.sh/).
- [tf2.3-image](examples/tf23-image) - This example uses the official TensorFlow 2.3 image from Docker Hub and demonstrates bundling custom files along with the image.
- [spark-image](examples/spark-image) - This example provides a Spark image for interactive PySpark development in SageMaker Studio.

#### One-time setup

All examples require a one-time setup to create an ECR repository:

```
```bash
REGION=<aws-region>
aws --region ${REGION} ecr create-repository \
--repository-name smstudio-custom
Expand All @@ -29,4 +31,4 @@ See [DEVELOPMENT.md](DEVELOPMENT.md)

### License

This sample code is licensed under the MIT-0 License. See the LICENSE file.
This sample code is licensed under the MIT-0 License. See the LICENSE file.
1 change: 1 addition & 0 deletions examples/.gitignore
@@ -0,0 +1 @@
.DS_Store
38 changes: 38 additions & 0 deletions examples/echo-kernel-image/Dockerfile.orig
@@ -0,0 +1,38 @@
FROM python:3.6

ARG NB_USER="sagemaker-user"
ARG NB_UID="1000"
ARG NB_GID="100"


######################
# OVERVIEW
# 1. Creates the `sagemaker-user` user with UID/GID 1000/100.
# 2. Ensures this user can `sudo` by default.
# 3. Installs the echo kernel and its dependencies from PyPI.
# 4. Makes the default shell `bash`. This improves the experience inside a Jupyter terminal, as Jupyter otherwise defaults to `sh`.
######################

# Setup the "sagemaker-user" user with root privileges.
RUN \
apt-get update && \
apt-get install -y sudo && \
useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \
chmod g+w /etc/passwd && \
echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
# Prevent apt-get cache from being persisted to this layer.
rm -rf /var/lib/apt/lists/*

# Install and configure the kernel.
RUN \
pip install echo_kernel \
# These are dependencies of echo_kernel but the version on PyPI is old and doesn't declare them correctly.
jupyter_client IPython ipykernel && \
    # This ensures that the kernelspec.json is installed in the location expected by Jupyter/KernelGateway.
python -m echo_kernel.install --sys-prefix

# Make the default shell bash (vs "sh") for a better Jupyter terminal UX
ENV SHELL=/bin/bash

USER $NB_UID

165 changes: 165 additions & 0 deletions examples/spark-image/Dockerfile
@@ -0,0 +1,165 @@
FROM ubuntu:18.04

ARG NB_USER="sagemaker-user"
ARG NB_UID="1000"
ARG NB_GID="100"

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Setup the "sagemaker-user" user with root privileges.
RUN apt-get update && \
apt-get install -y sudo && \
useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \
chmod g+w /etc/passwd && \
echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
# Prevent apt-get cache from being persisted to this layer.
apt-get clean && rm -rf /var/lib/apt/lists/* && \
echo "en_US.UTF-8 UTF-8" > /etc/locale.gen

USER $NB_UID

# Make the default shell bash (vs "sh") for a better Jupyter terminal UX
ENV SHELL=/bin/bash \
NB_USER=$NB_USER \
NB_UID=$NB_UID \
NB_GID=$NB_GID \
LC_ALL=en_US.UTF-8 \
LANG=en_US.UTF-8 \
LANGUAGE=en_US.UTF-8 \
HOME=/home/$NB_USER \
MINICONDA_VERSION=4.6.14 \
CONDA_VERSION=4.6.14 \
MINICONDA_MD5=718259965f234088d785cad1fbd7de03 \
CONDA_DIR=/opt/conda \
PATH=$CONDA_DIR/bin:${PATH}


USER root
RUN apt-get update --yes
# COPY ./apt-packages.txt /root/apt-packages.txt
# RUN xargs -a /root/apt-packages.txt apt-get install -y --no-install-recommends

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
wget \
curl \
ca-certificates \
sudo \
locales \
fonts-liberation \
run-one && \
apt-get clean && rm -rf /var/lib/apt/lists/* && \
echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \
locale-gen

RUN mkdir -p $CONDA_DIR && \
chown -R $NB_USER:$NB_GID $CONDA_DIR && \
# Fix for devtools https://github.com/conda-forge/r-devtools-feedstock/issues/4
ln -s /bin/tar /bin/gtar

# Copy a script that we will use to correct permissions after running certain commands
COPY fix-permissions /usr/local/bin/fix-permissions
RUN chmod a+rx /usr/local/bin/fix-permissions
COPY fix-host-settings /usr/local/bin/fix-host-settings
RUN chmod a+rx /usr/local/bin/fix-host-settings

RUN echo "auth requisite pam_deny.so" >> /etc/pam.d/su && \
sed -i.bak -e 's/^%admin/#%admin/' /etc/sudoers && \
sed -i.bak -e 's/^%sudo/#%sudo/' /etc/sudoers && \
usermod -G root ${NB_USER} && \
mkdir -p "${CONDA_DIR}" && \
chown "${NB_USER}:${NB_GID}" "${CONDA_DIR}" && \
chmod g+w /etc/passwd && \
fix-permissions "${HOME}" && \
fix-permissions "${CONDA_DIR}"

USER ${NB_UID}
ARG PYTHON_VERSION=3.6.14
ENV PATH=$CONDA_DIR/bin:${PATH}
WORKDIR /tmp

# Install conda via Miniconda
RUN curl --silent --show-error --output miniconda-installer.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \
echo "${MINICONDA_MD5} *miniconda-installer.sh" | md5sum -c - && \
/bin/bash miniconda-installer.sh -f -b -p $CONDA_DIR && \
rm miniconda-installer.sh && \
conda config --system --prepend channels conda-forge && \
conda config --system --set auto_update_conda false && \
conda config --system --set show_channel_urls true && \
conda config --system --set pip_interop_enabled true && \
conda install --quiet --yes conda="${CONDA_VERSION%.*}.*" && \
conda update --all --quiet --yes && \
conda clean --all -f -y && \
rm -rf /home/$NB_USER/.cache/yarn

RUN conda install --quiet --yes \
tini \
boto3 \
'awscli>=1.18' \
sagemaker_pyspark \
'pyspark==2.4.0' \
'notebook=6.4.0' \
'jupyterhub=1.4.1' \
'jupyterlab=3.0.16' && \
conda clean --all -f -y && \
npm cache clean --force && \
jupyter notebook --generate-config && \
jupyter lab clean && \
rm -rf "/home/${NB_USER}/.cache/yarn" && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"

EXPOSE 8888

# Copy local files as late as possible to avoid cache busting
COPY start.sh start-notebook.sh start-singleuser.sh /usr/local/bin/
# Currently need to have both jupyter_notebook_config and jupyter_server_config to support classic and lab
COPY jupyter_notebook_config.py /etc/jupyter/

# Fix permissions on /etc/jupyter as root
USER root

# Prepare upgrade to JupyterLab V3.0 #1205
RUN sed -re "s/c.NotebookApp/c.ServerApp/g" \
/etc/jupyter/jupyter_notebook_config.py > /etc/jupyter/jupyter_server_config.py && \
fix-permissions /etc/jupyter/

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="2.4.0"
ARG hadoop_version="2.7"
ARG spark_checksum="5F4184E0FE7E5C8AE67F5E6BC5DEEE881051CC712E9FF8AEDDF3529724C00E402C94BB75561DD9517A372F06C1FCB78DC7AE65DCBD4C156B3BA4D8E267EC2936"
ARG openjdk_version="8"

ENV APACHE_SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${openjdk_version}-jre-headless" \
ca-certificates-java && \
apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp
RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"

WORKDIR /usr/local

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PATH="${PATH}:${SPARK_HOME}/bin"

RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark && \
    # Add a link in the before_notebook hook so that PYTHONPATH is sourced automatically
mkdir -p /usr/local/bin/before-notebook.d && \
ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

USER ${NB_UID}

WORKDIR $HOME
67 changes: 67 additions & 0 deletions examples/spark-image/README.md
@@ -0,0 +1,67 @@
# Spark Image

## Overview

This image provides a Spark kernel as a custom image for SageMaker Studio. It can be used for interactive Spark development in Python, including reading data from and writing data to Amazon S3 buckets.

The image is based on Spark 2.4.0, Hadoop 2.7, and OpenJDK 8. It also includes the latest version of sagemaker_pyspark (1.4.2), which provides the aws-hadoop and other dependent JARs needed to work with AWS services such as Amazon Simple Storage Service (Amazon S3).

The example notebook (pyspark-kernel-file-read) shows how to use different credential providers when making calls to Amazon S3.
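
As an illustration, the sketch below shows one way a notebook running on this kernel could configure a Spark session with the JARs bundled in `sagemaker_pyspark` and read a CSV file from Amazon S3 over `s3a`. The bucket, object key, and credential provider are placeholders and assumptions, not part of this example's notebook.

```python
# Minimal sketch (hypothetical bucket and key): put the JARs shipped with
# sagemaker_pyspark on the classpath and read a CSV file from Amazon S3 via s3a.
import sagemaker_pyspark
from pyspark.sql import SparkSession

# Colon-separated classpath containing the aws-hadoop and SageMaker Spark JARs.
classpath = ":".join(sagemaker_pyspark.classpath_jars())

spark = (
    SparkSession.builder
    .appName("s3a-read-example")
    .config("spark.driver.extraClassPath", classpath)
    # Assumption: credentials come from the Studio execution role via the
    # container credentials provider; adjust this to your environment.
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

# Replace with your own bucket and key.
df = spark.read.csv("s3a://<your-bucket>/path/to/input.csv", header=True)
df.show(5)
```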

### Building the image

Build the Docker image and push to Amazon ECR.

```bash
# Modify these as required. The Docker registry endpoint for your region can be found at https://docs.aws.amazon.com/general/latest/gr/ecr.html#ecr-docker-endpoints
REGION=<aws-region>
ACCOUNT_ID=<account-id>


# Build the image
IMAGE_NAME=spark-kernel
aws --region ${REGION} ecr get-login-password | docker login --username AWS --password-stdin ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom
docker build . -t ${IMAGE_NAME} -t ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
```

```bash
docker push ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
```

### Using with SageMaker Studio

Create a SageMaker Image with the image in ECR.

```bash
# Role in your account to be used for the SageMaker Image
ROLE_ARN=<role-arn>

aws --region ${REGION} sagemaker create-image \
--image-name ${IMAGE_NAME} \
--role-arn ${ROLE_ARN}

aws --region ${REGION} sagemaker create-image-version \
--image-name ${IMAGE_NAME} \
--base-image "${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}"

# Verify that the image version was created successfully. Do NOT proceed if the image version is in the CREATE_FAILED state or in any state other than CREATED.
aws --region ${REGION} sagemaker describe-image-version --image-name ${IMAGE_NAME}
```

Create an AppImageConfig for this image.

```bash
aws --region ${REGION} sagemaker create-app-image-config --cli-input-json file://app-image-config-input.json
```

Create a Domain, providing the SageMaker Image and AppImageConfig as part of the Domain creation. Replace the placeholders for the VPC ID, Subnet IDs, and Execution Role in `create-domain-input.json`.

```bash
aws --region ${REGION} sagemaker create-domain --cli-input-json file://create-domain-input.json
```

If you have an existing Domain, you can use the `update-domain` command instead (see the sketch of its input file below).

```bash
aws --region ${REGION} sagemaker update-domain --cli-input-json file://update-domain-input.json
```
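
The `update-domain-input.json` file referenced above is not shown in this diff; a minimal sketch of what it might contain, with `<domain-id>` as a placeholder, is:

```json
{
    "DomainId": "<domain-id>",
    "DefaultUserSettings": {
        "KernelGatewayAppSettings": {
            "CustomImages": [
                {
                    "ImageName": "spark-kernel",
                    "AppImageConfigName": "custom-spark-image-config"
                }
            ]
        }
    }
}
```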
16 changes: 16 additions & 0 deletions examples/spark-image/app-image-config-input.json
@@ -0,0 +1,16 @@
{
"AppImageConfigName": "custom-spark-image-config",
"KernelGatewayImageConfig": {
"KernelSpecs": [
{
"Name": "python3",
"DisplayName": "spark"
}
],
"FileSystemConfig": {
"MountPath": "/home/sagemaker-user",
"DefaultUid": 1000,
"DefaultGid": 100
}
}
}
19 changes: 19 additions & 0 deletions examples/spark-image/create-domain-input.json
@@ -0,0 +1,19 @@
{
"DomainName": "domain-with-spark-kernel-image",
"VpcId": "<vpc-id>",
"SubnetIds": [
"<subnet-ids>"
],
"DefaultUserSettings": {
"ExecutionRole": "<execution-role>",
"KernelGatewayAppSettings": {
"CustomImages": [
{
"ImageName": "spark-kernel",
"AppImageConfigName": "custom-spark-image-config"
}
]
}
},
"AuthMode": "IAM"
}
4 changes: 4 additions & 0 deletions examples/spark-image/fix-host-settings
@@ -0,0 +1,4 @@
#!/bin/sh
# Make /etc/hosts writable, add a 127.0.0.1 entry for the container hostname
# if one is not already present, and print the resulting file.
sudo -i chmod 777 /etc/hosts > /dev/null 2>&1
if ! grep -qi "$HOSTNAME" /etc/hosts; then echo "127.0.0.1 ${HOSTNAME}" >> /etc/hosts; fi
cat /etc/hosts
35 changes: 35 additions & 0 deletions examples/spark-image/fix-permissions
@@ -0,0 +1,35 @@
#!/bin/bash
# set permissions on a directory
# after any installation, if a directory needs to be (human) user-writable,
# run this script on it.
# It will make everything in the directory owned by the group ${NB_GID}
# and writable by that group.
# Deployments that want to set a specific user id can preserve permissions
# by adding the `--group-add users` line to `docker run`.

# uses find to avoid touching files that already have the right permissions,
# which would cause massive image explosion

# right permissions are:
# group=${NB_GID}
# AND permissions include group rwX (directory-execute)
# AND directories have setuid,setgid bits set
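#
# Example usage (as in the Dockerfile above):
#   fix-permissions "${CONDA_DIR}" "/home/${NB_USER}"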

set -e

for d in "$@"; do
find "${d}" \
! \( \
-group "${NB_GID}" \
-a -perm -g+rwX \
\) \
-exec chgrp "${NB_GID}" {} \; \
-exec chmod g+rwX {} \;
# setuid, setgid *on directories only*
find "${d}" \
\( \
-type d \
-a ! -perm -6000 \
\) \
-exec chmod +6000 {} \;
done