diff --git a/.github/workflows/mkdocs-release.yml b/.github/workflows/mkdocs-release.yml index 6c39a58f7..310e97d3f 100644 --- a/.github/workflows/mkdocs-release.yml +++ b/.github/workflows/mkdocs-release.yml @@ -3,6 +3,9 @@ name: mkdocs-release on: push: branches: [branch-*\.*] + repository_dispatch: + types: + - trigger-rebuild concurrency: group: ${{ github.workflow }} @@ -13,25 +16,93 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - name: Extract branch name (push) + if: ${{ github.event_name == 'push' }} + run: echo "BRANCH=${GITHUB_REF#refs/heads/}" >> "$GITHUB_ENV" + + - name: Extract branch name (repository_dispatch) + if: ${{ github.event_name == 'repository_dispatch' }} + run: echo "BRANCH=${{ github.event.client_payload.branch }}" >> "$GITHUB_ENV" + + - name: Extract version from branch name + run: echo "HOPSWORKS_VERSION=${BRANCH#branch-}" >> "$GITHUB_ENV" + + - name: Is latest release? + id: is-latest + uses: actions/github-script@v8 + with: + script: | + const branches_url = context.payload.repository.branches_url + const branches_url_new = branches_url.replace("{/branch}", "") + const result = await github.request(branches_url_new) + const names = result.data.map(branch => branch.name) + const versions = names.filter(name => name.startsWith('branch-')).map(name => name.replace('branch-', '')) + const minorLength = Math.max(...versions.map(v => v.split('.')[1].length)) + const convertVersionToNumber = (version) => { + const parts = version.split('.').map(Number) + return parts[0]*10**minorLength + parts[1] + } + + return Math.max(...versions.map(convertVersionToNumber)) === convertVersionToNumber(process.env.HOPSWORKS_VERSION) + + - name: Checkout main repo + uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ env.BRANCH }} + + - name: Checkout the API repo + uses: actions/checkout@v4 + with: + repository: logicalclocks/hopsworks-api + ref: ${{ env.BRANCH }} + path: hopsworks-api + + - name: Cache local Maven repository + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Set up JDK 8 + uses: actions/setup-java@v5 + with: + java-version: "8" + distribution: "adopt" + + - name: Build javadoc documentation + working-directory: hopsworks-api/java + run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../../docs/javadoc - uses: actions/setup-python@v5 with: python-version: "3.10" - - name: Install ubuntu dependencies - run: sudo apt update && sudo apt-get install -y libxml2-dev libxslt-dev + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + activate-environment: true + working-directory: hopsworks-api/python - - name: install deps - run: pip3 install -r requirements-docs.txt + - name: Install Python API dependencies + run: uv sync --extra dev --group docs --project hopsworks-api/python - - name: setup git + - name: Install Python dependencies + run: uv pip install -r requirements-docs.txt + + - name: Install Ubuntu dependencies + run: sudo apt update && sudo apt-get install -y libxml2-dev libxslt-dev + + - name: Setup git for mike run: | git config --global user.name Mike git config --global user.email mike@docs.hopsworks.ai - # Put this back and increment version when cutting a new release branch - # - name: mike deploy docs - # run: mike deploy 3.0 latest -u --push + - name: Deploy the docs with mike + run: mike deploy ${HOPSWORKS_VERSION} --push + + - name: Update latest 
docs if needed + if: ${{ steps.is-latest.outputs.result == 'true' }} + run: mike alias ${HOPSWORKS_VERSION} latest --update-aliases --push diff --git a/.github/workflows/mkdocs-test.yml b/.github/workflows/mkdocs-test.yml index 5811f6a6d..a3ac88325 100644 --- a/.github/workflows/mkdocs-test.yml +++ b/.github/workflows/mkdocs-test.yml @@ -12,25 +12,59 @@ jobs: with: fetch-depth: 0 + - name: Checkout the API repo + uses: actions/checkout@v4 + with: + repository: logicalclocks/hopsworks-api + ref: ${{ github.base_ref }} + path: hopsworks-api + + - name: Markdownlint + uses: DavidAnson/markdownlint-cli2-action@v21 + with: + globs: '**/*.md' + + - name: Cache local Maven repository + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Set up JDK 8 + uses: actions/setup-java@v5 + with: + java-version: "8" + distribution: "adopt" + + - name: Build javadoc documentation + working-directory: hopsworks-api/java + run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../../docs/javadoc + - uses: actions/setup-python@v5 with: python-version: "3.10" - - name: Install ubuntu dependencies - run: sudo apt update && sudo apt-get install -y libxml2-dev libxslt-dev + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + activate-environment: true + working-directory: hopsworks-api/python - - name: install deps - run: pip3 install -r requirements-docs.txt + - name: Install Python API dependencies + run: uv sync --extra dev --group docs --project hopsworks-api/python - - name: setup git - run: | - git config --global user.name Mike - git config --global user.email mike@docs.hopsworks.ai + - name: Install Python dependencies + run: uv pip install -r requirements-docs.txt + + - name: Install Ubuntu dependencies + run: sudo apt update && sudo apt-get install -y libxml2-dev libxslt-dev - - name: test broken links + - name: Check for broken links run: | # run the server - mkdocs serve > /dev/null 2>&1 & + mkdocs serve > /dev/null 2>&1 & SERVER_PID=$! echo "mk server in PID $SERVER_PID" # Give enough time for deployment @@ -41,5 +75,10 @@ jobs: # If ok just kill the server kill -9 $SERVER_PID - - name: mike deploy docs + - name: Setup git for mike + run: | + git config --global user.name Mike + git config --global user.email mike@docs.hopsworks.ai + + - name: Generate the docs with mike run: mike deploy 3.2-SNAPSHOT dev -u diff --git a/.gitignore b/.gitignore index d90b8f4b0..5814a6d71 100644 --- a/.gitignore +++ b/.gitignore @@ -128,3 +128,4 @@ target/ # Mac .DS_Store +/temp_dir diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 000000000..6160b7f2b --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,8 @@ +MD041: false +MD013: false +MD033: false +MD045: false +MD046: false +MD052: false +MD004: + style: dash diff --git a/README.md b/README.md index 185f84e56..f4db48d35 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,69 @@ -# Documentation landing page +# Hopsworks Documentation -This is the source of the landing page for https://docs.hopsworks.ai +This is the source of the Hopsworks Documentation published at . ## Build instructions -### Step 1: Setup python environment +We use `mkdocs` together with [`mike`]((https://github.com/jimporter/mike/) for versioning to build the documentation. 
+We also use these two main mkdocs plugins: [`mkdocstrings`](https://mkdocstrings.github.io/) and [its Python handler](https://mkdocstrings.github.io/python/), and [`mkdocs-material`](https://squidfunk.github.io/mkdocs-material/) as the theme. -Create a python 3.10 environment, using a python environment manager of your own choosing. For example `virtualenv` or `anaconda`. +**Background about `mike`:** +`mike` builds the documentation and commits it as a new directory to the `gh-pages` branch. +Each directory corresponds to one version of the documentation. +Additionally, `mike` maintains a JSON file in the root of `gh-pages` with the mappings of versions/aliases for each of the directories available. +With aliases, you can define extra names like `dev` or `latest` to indicate stable and unstable releases. -### Step 2 +### Versioning on docs.hopsworks.ai + +On docs.hopsworks.ai we implement the following versioning scheme: + +- the latest release: rendered with the full current version, e.g. **4.4 [latest]**, with the `latest` alias to indicate that this is the latest stable release. +- previous stable releases: rendered without alias, e.g. **4.3**. + +### Step 1 -Clone this repository +Clone this repository: ```bash git clone https://github.com/logicalclocks/logicalclocks.github.io.git ``` -### Step 3 - -Install the required dependencies to build the documentation in the python environment created in the previous step. +### Step 2 -**Note that {PY_ENV} is the path to your python environment.** +Create a python virtual environment to build the documentation: ```bash -cd logicalclocks.github.io -{PY_ENV}/bin/pip3 install -r requirements-docs.txt +uv venv +uv pip install -r requirements-docs.txt +# Install hopsworks-api for gathering docstrings for the API reference +uv pip install git+https://github.com/logicalclocks/hopsworks-api.git@main#subdirectory=python ``` -### Step 4 +Alternatively, you can just activate the virtual environment you use for development of `hopsworks-api` (obtained via `uv sync`); this is the way it is done in the actions. +Namely, in `.github/workflows/mkdocs-release.yml` and `.github/workflows/mkdocs-test.yml`, the `hopsworks-api` repo is cloned, and its uv virtual environment is used with the `dev` extra and all development groups. -Use mkdocs to build the documentation and serve it locally +A callback is set in `hopsworks-api` GitHub Actions, which triggers `.github/workflows/mkdocs-release.yml` on any pushes to release branches (that is, `branch-x.x`). + +### Step 3 + +Build and serve the docs using mike. ```bash -{PY_ENV}/bin/mkdocs serve +# Use the current version instead of 4.4: +mike deploy 4.4 latest --update-aliases +# Next, serve the docs to access them locally: +mike serve ``` -The documentation should now be available locally on the following URL: http://127.0.0.1:8000/ +**Important**: The first time you serve the docs, you have to choose a default version, as follows: + +```bash +mike set-default latest +``` ## Adding new pages -The `mkdocs.yml` file of this repository defines the pages to show in the navigation. +The `mkdocs.yml` file of this repository defines the pages to show in the navigation. After adding your new page in the docs folder, you also need to add it to this file for it to show up in the navigation. 
## Checking links @@ -56,4 +80,4 @@ linkchecker http://127.0.0.1:8000/ # If ok just kill the server kill -9 $SERVER_PID -``` \ No newline at end of file +``` diff --git a/docs/concepts/dev/inside.md b/docs/concepts/dev/inside.md index 9013246b7..5be75710c 100644 --- a/docs/concepts/dev/inside.md +++ b/docs/concepts/dev/inside.md @@ -1,34 +1,46 @@ -Hopsworks provides a complete self-service development environment for feature engineering and model training. You can develop programs as Jupyter notebooks or jobs, customize the bundled FTI (feature, training and inference pipeline) python environments, you can manage your source code with Git, and you can orchestrate jobs with Airflow. - +Hopsworks provides a complete self-service development environment for feature engineering and model training. +You can develop programs as Jupyter notebooks or jobs, customize the bundled FTI (feature, training and inference pipeline) python environments, you can manage your source code with Git, and you can orchestrate jobs with Airflow. + +Hopsworks Development Environment ### Jupyter Notebooks -Hopsworks provides a Jupyter notebook development environment for programs written in Python, Spark, Flink, and SparkSQL. You can also develop in your IDE (PyCharm, IntelliJ, etc), test locally, and then run your programs as Jobs in Hopsworks. Jupyter notebooks can also be run as Jobs. +Hopsworks provides a Jupyter notebook development environment for programs written in Python, Spark, Flink, and SparkSQL. +You can also develop in your IDE (PyCharm, IntelliJ, etc), test locally, and then run your programs as Jobs in Hopsworks. +Jupyter notebooks can also be run as Jobs. ### Source Code Control -Hopsworks provides source code control support using Git (GitHub, GitLab or BitBucket). You can securely checkout code into your project and commit and push updates to your code to your source code repository. +Hopsworks provides source code control support using Git (GitHub, GitLab or BitBucket). +You can securely check out code into your project and commit and push updates to your code to your source code repository. ### FTI Pipeline Environments -Hopsworks postulates that building ML systems following the FTI pipeline architecture is best practice. This architecture consists of three independently developed and operated ML pipelines: +Hopsworks postulates that building ML systems following the FTI pipeline architecture is best practice. +This architecture consists of three independently developed and operated ML pipelines: -* Feature pipeline: takes as input raw data that it transforms into features (and labels) -* Training pipeline: takes as input features (and labels) and outputs a trained model -* Inference pipeline: takes new feature data and a trained model and makes predictions +- Feature pipeline: takes as input raw data that it transforms into features (and labels) +- Training pipeline: takes as input features (and labels) and outputs a trained model +- Inference pipeline: takes new feature data and a trained model and makes predictions -In order to facilitate the development of these pipelines Hopsworks bundles several python environments containing necessary dependencies. Each of these environments may then also be customized further by cloning it and installing additional dependencies from PyPi, Conda channels, Wheel files, GitHub repos or a custom Dockerfile. 
Internal compute such as Jobs and Jupyter is run in one of these environments and changes are applied transparently when you install new libraries using our APIs. That is, there is no need to write a Dockerfile, users install libraries directly in one or more of the environments. You can setup custom development and production environments by creating separate projects or creating multiple clones of an environment within the same project. +In order to facilitate the development of these pipelines, Hopsworks bundles several python environments containing necessary dependencies. +Each of these environments may then also be customized further by cloning it and installing additional dependencies from PyPI, Conda channels, Wheel files, GitHub repos or a custom Dockerfile. +Internal compute such as Jobs and Jupyter is run in one of these environments and changes are applied transparently when you install new libraries using our APIs. +That is, there is no need to write a Dockerfile; users install libraries directly in one or more of the environments. +You can set up custom development and production environments by creating separate projects or creating multiple clones of an environment within the same project. ### Jobs -In Hopsworks, a Job is a schedulable program that is allocated compute and memory resources. You can run a Job in Hopsworks: +In Hopsworks, a Job is a schedulable program that is allocated compute and memory resources. +You can run a Job in Hopsworks: -* From the UI -* Programmatically with the Hopsworks SDK (Python, Java) or REST API -* From Airflow programs (either inside our outside Hopsworks) -* From your IDE using a plugin ([PyCharm/IntelliJ plugin](https://plugins.jetbrains.com/plugin/15537-hopsworks)) +- From the UI +- Programmatically with the Hopsworks SDK (Python, Java) or REST API +- From Airflow programs (either inside or outside Hopsworks) +- From your IDE using a plugin ([PyCharm/IntelliJ plugin](https://plugins.jetbrains.com/plugin/15537-hopsworks)) ### Orchestration -Airflow comes out-of-the box with Hopsworks, but you can also use an external Airflow cluster (with the Hopsworks Job operator) if you have one. Airflow can be used to schedule the execution of Jobs, individually or as part of Airflow DAGs. \ No newline at end of file +Airflow comes out of the box with Hopsworks, but you can also use an external Airflow cluster (with the Hopsworks Job operator) if you have one. +Airflow can be used to schedule the execution of Jobs, individually or as part of Airflow DAGs. diff --git a/docs/concepts/dev/outside.md b/docs/concepts/dev/outside.md index 53825dcac..15f7c868c 100644 --- a/docs/concepts/dev/outside.md +++ b/docs/concepts/dev/outside.md @@ -1,5 +1,8 @@ -You can write programs that use Hopsworks in any [Python, Spark, PySpark, or Flink environment](../../user_guides/integrations/index.md). Hopsworks also running SQL queries to compute features in external data warehouses. The Feature Store can also be queried with SQL. +You can write programs that use Hopsworks in any [Python, Spark, PySpark, or Flink environment](../../user_guides/integrations/index.md). +Hopsworks also supports running SQL queries to compute features in external data warehouses. +The Feature Store can also be queried with SQL. -There is REST API for Hopsworks that can be used with a valid API key, generated in Hopsworks. However, it is often easier to develop your programs against SDKs available in Python and Java/Scala for HSFS, in Python for HSML, and in Python for the Hopsworks API. 
+There is REST API for Hopsworks that can be used with a valid API key, generated in Hopsworks. +However, it is often easier to develop your programs against SDKs available in Python and Java/Scala for HSFS, in Python for HSML, and in Python for the Hopsworks API. diff --git a/docs/concepts/fs/feature_group/external_fg.md b/docs/concepts/fs/feature_group/external_fg.md index 7d260b816..01fd2119d 100644 --- a/docs/concepts/fs/feature_group/external_fg.md +++ b/docs/concepts/fs/feature_group/external_fg.md @@ -1,6 +1,9 @@ -External feature groups are offline feature groups where their data is stored in an external table. An external table requires a data source, defined with the Connector API (or more typically in the user interface), to enable HSFS to retrieve data from the external table. An external feature group doesn't allow for offline data ingestion or modification; instead, it includes a user-defined SQL string for retrieving data. You can also perform SQL operations, including projections, aggregations, and so on. The SQL query is executed on-demand when HSFS retrieves data from the external Feature Group, for example, when creating training data using features in the external table. +External feature groups are offline feature groups where their data is stored in an external table. +An external table requires a data source, defined with the Connector API (or more typically in the user interface), to enable HSFS to retrieve data from the external table. +An external feature group doesn't allow for offline data ingestion or modification; instead, it includes a user-defined SQL string for retrieving data. +You can also perform SQL operations, including projections, aggregations, and so on. +The SQL query is executed on-demand when HSFS retrieves data from the external Feature Group, for example, when creating training data using features in the external table. In the image below, we can see that HSFS currently supports a large number of data sources, including any JDBC-enabled source, Snowflake, Data Lake, Redshift, BigQuery, S3, ADLS, GCS, RDS, and Kafka - diff --git a/docs/concepts/fs/feature_group/feature_monitoring.md b/docs/concepts/fs/feature_group/feature_monitoring.md index f2a4c16a2..b8a05c7b0 100644 --- a/docs/concepts/fs/feature_group/feature_monitoring.md +++ b/docs/concepts/fs/feature_group/feature_monitoring.md @@ -8,7 +8,8 @@ HSFS supports monitoring features on your Feature Group by: ## Scheduled Statistics -After creating a Feature Group in HSFS, you can setup statistics monitoring to compute statistics over one or more features on a scheduled basis. Statistics are computed on the whole or a subset of feature data (i.e., detection window) already inserted into the Feature Group. +After creating a Feature Group in HSFS, you can setup statistics monitoring to compute statistics over one or more features on a scheduled basis. +Statistics are computed on the whole or a subset of feature data (i.e., detection window) already inserted into the Feature Group. ## Statistics Comparison @@ -16,5 +17,3 @@ In addition to scheduled statistics, you can enable the comparison of statistics !!! info "Feature Monitoring Guide" More information can be found in the [Feature monitoring guide](../../../user_guides/fs/feature_monitoring/index.md). 
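To make the external feature group flow described above concrete, here is a minimal, hedged sketch of mounting an external table through a pre-configured data source using the HSFS Python API. The connector name, the SQL query, and the exact method and parameter names (`get_storage_connector`, `create_external_feature_group`) are illustrative assumptions and may differ between Hopsworks versions:

```python
# Hedged sketch only: object names and some signatures are assumptions.
import hopsworks

project = hopsworks.login()          # authenticate, e.g. with an API key
fs = project.get_feature_store()

# A data source (storage connector) previously defined in the Hopsworks UI
snowflake = fs.get_storage_connector("snowflake_sales")  # hypothetical name

# The SQL is executed on demand whenever HSFS reads from this feature group
sales_fg = fs.create_external_feature_group(
    name="sales_by_region",
    version=1,
    query="SELECT region, SUM(amount) AS total_amount FROM sales GROUP BY region",
    storage_connector=snowflake,
    primary_key=["region"],
)
sales_fg.save()  # registers metadata only; no data is ingested into Hopsworks
```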
- - diff --git a/docs/concepts/fs/feature_group/feature_pipelines.md b/docs/concepts/fs/feature_group/feature_pipelines.md index 9c99d5a59..18ddb465f 100644 --- a/docs/concepts/fs/feature_group/feature_pipelines.md +++ b/docs/concepts/fs/feature_group/feature_pipelines.md @@ -1,45 +1,71 @@ -A feature pipeline is a program that orchestrates the execution of a dataflow graph of data validation, aggregation, dimensionality reduction, transformation, and other feature engineering steps on input data to create and/or update feature data. With HSFS, you can write feature pipelines in different languages as shown in the figure below. +A feature pipeline is a program that orchestrates the execution of a dataflow graph of data validation, aggregation, dimensionality reduction, transformation, and other feature engineering steps on input data to create and/or update feature data. +With HSFS, you can write feature pipelines in different languages as shown in the figure below. ### Data Sources -Your feature pipeline needs to connect to some (external) data source to read the data to be processed. Python, Spark, and Flink have connectors to a huge number of different data sources, while SQL feature pipelines are often restricted to a single data source (for example, your connector to SnowFlake only runs SQL on SnowFlake). SparkSQL, in contrast, can be used over tables that originate in different data sources. + +Your feature pipeline needs to connect to some (external) data source to read the data to be processed. +Python, Spark, and Flink have connectors to a huge number of different data sources, while SQL feature pipelines are often restricted to a single data source (for example, your connector to SnowFlake only runs SQL on SnowFlake). +SparkSQL, in contrast, can be used over tables that originate in different data sources. ### Data Validation -In order to be able to train and serve models that you can rely on, you need clean, high quality features. Data validation operations include removing bad data, removing or imputing missing values, and identifying problems such as feature shift. HSFS supports Great Expectations to specify data validation rules that are executed in the client before features are written to the Feature Store. The validation results are collected and shown in Hopsworks. + +In order to be able to train and serve models that you can rely on, you need clean, high quality features. +Data validation operations include removing bad data, removing or imputing missing values, and identifying problems such as feature shift. +HSFS supports Great Expectations to specify data validation rules that are executed in the client before features are written to the Feature Store. +The validation results are collected and shown in Hopsworks. ### Aggregations -Aggregations are used to summarize large datasets into more concise, signal-rich features. Popular aggregations include count(), sum(), mean(), median(), stddev(), min(), and max(). These aggregations produce a single number (a numerical feature) that captures information about a potentially large dataset. Both numerical and categorical features are often transformed before being used to train or serve models. +Aggregations are used to summarize large datasets into more concise, signal-rich features. +Popular aggregations include count(), sum(), mean(), median(), stddev(), min(), and max(). +These aggregations produce a single number (a numerical feature) that captures information about a potentially large dataset. 
+Both numerical and categorical features are often transformed before being used to train or serve models. ### Dimensionality Reduction -If input data is impractically large or if it has a significant amount of redundancy, it can often be transformed into a reduced set of features with dimensionality reduction (often called feature extraction). Popular dimensionality algorithms include embedding algorithms, PCA, and TSNE. + +If input data is impractically large or if it has a significant amount of redundancy, it can often be transformed into a reduced set of features with dimensionality reduction (often called feature extraction). +Popular dimensionality reduction algorithms include embedding algorithms, PCA, and t-SNE. ### Transformations -Transformations are covered in more detail in [training/inference pipelines](../feature_view/training_inference_pipelines.md), as transformations typically happen after the feature store. If you store transformed features in feature groups, the feature data is no longer useful for EDA (as it near to impossible for Data Scientists to understand the transformed values). It also makes it impossible for inference pipelines to log untransformed feature values and predictions for an operational model. There is one use case for storing transformed features in feature groups - when you need to have ultra low latency when reading precomputed features (and online transformations when reading features add too much latency for your use case). The figure below shows to include transformations in your feature pipelines. + +Transformations are covered in more detail in [training/inference pipelines](../feature_view/training_inference_pipelines.md), as transformations typically happen after the feature store. +If you store transformed features in feature groups, the feature data is no longer useful for EDA (as it is near impossible for Data Scientists to understand the transformed values). +It also makes it impossible for inference pipelines to log untransformed feature values and predictions for an operational model. +There is one use case for storing transformed features in feature groups - when you need to have ultra low latency when reading precomputed features (and online transformations when reading features add too much latency for your use case). +The figure below shows how to include transformations in your feature pipelines. ### Feature Engineering in Python -Python is the most widely used framework for feature engineering due to its extensive library support for aggregations (Pandas/Polars), data validation (Great Expectations), and dimensionality reduction (embeddings, PCA), and transformations (in Scikit-Learn, TensorFlow, PyTorch). Python also supports open-source feature engineering frameworks used for automated feature engineering, such as [featuretools](https://www.featuretools.com/) that supports relational and temporal sources. +Python is the most widely used framework for feature engineering due to its extensive library support for aggregations (Pandas/Polars), data validation (Great Expectations), dimensionality reduction (embeddings, PCA), and transformations (in Scikit-Learn, TensorFlow, PyTorch). +Python also supports open-source feature engineering frameworks used for automated feature engineering, such as [featuretools](https://www.featuretools.com/), which supports relational and temporal sources. 
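As a concrete companion to the aggregation and Python feature engineering sections above, the following is a small, self-contained sketch of a feature pipeline step that turns raw transactions into per-customer features with Pandas; the column names are made up for illustration:

```python
import pandas as pd

# Illustrative raw input; in practice this would be read from a data source
raw = pd.DataFrame({
    "customer_id": [1, 1, 2, 2, 2],
    "amount": [10.0, 25.0, 5.0, 7.5, 12.0],
    "ts": pd.to_datetime(["2024-01-01", "2024-01-03", "2024-01-02",
                          "2024-01-04", "2024-01-05"]),
})

# One row per entity (customer_id), one column per aggregation
features = raw.groupby("customer_id").agg(
    purchase_count=("amount", "count"),
    total_spent=("amount", "sum"),
    avg_spent=("amount", "mean"),
    last_purchase=("ts", "max"),
).reset_index()
```

The resulting DataFrame can then be inserted into a feature group, keeping features in the untransformed, reusable form discussed above.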
### Feature Engineering in Spark/PySpark -Spark is popular as a feature engineering framework as it can scale to process larger volumes of data than Python, and provides native support for aggregations, and it supports many of the same data validation (Great Expectations), and dimensionality reduction algorithms (embeddings, PCA) as Python. Spark also has native support for transformations, which are useful for analytical models (batch scoring), but less useful for operational models, where online transformations are required, and Spark environments are less common. Online model serving environments typically only support online transformations in Python. +Spark is popular as a feature engineering framework as it can scale to process larger volumes of data than Python, and provides native support for aggregations, and it supports many of the same data validation (Great Expectations), and dimensionality reduction algorithms (embeddings, PCA) as Python. +Spark also has native support for transformations, which are useful for analytical models (batch scoring), but less useful for operational models, where online transformations are required, and Spark environments are less common. +Online model serving environments typically only support online transformations in Python. ### Feature Engineering in SQL -SQL has grown in popularity for performing heavy lifting in feature pipelines - computing aggregates on data - when the input data already resides in a data warehouse. Data warehouses also support data validation, for example, through Great Expectations in DBT. However, SQL is not mature as a platform for transformations and dimensionality reductions, where UDFs are applied row-wise. +SQL has grown in popularity for performing heavy lifting in feature pipelines - computing aggregates on data - when the input data already resides in a data warehouse. +Data warehouses also support data validation, for example, through Great Expectations in DBT. +However, SQL is not mature as a platform for transformations and dimensionality reductions, where UDFs are applied row-wise. You can do aggregation in SQL for data in your data warehouse or database. - ### Feature Engineering in Flink -Apache Flink is a powerful and flexible framework for stateful feature computation operations over unbounded and bounded data streams. It is used for feature engineering when you need very fresh features computed in real-time. Flink provides a rich set of operators and functions such as time windows and aggregation operations that can be applied to keyed and/or global window streams. Flink’s stateful operations allow users to maintain and update state across multiple data records or events, which is particularly useful for feature engineering tasks such as sessionization and/or maintaining rolling aggregates over a sliding window of data. -Flink feature engineering pipelines are supported in Java/Scala only. +Apache Flink is a powerful and flexible framework for stateful feature computation operations over unbounded and bounded data streams. +It is used for feature engineering when you need very fresh features computed in real-time. +Flink provides a rich set of operators and functions such as time windows and aggregation operations that can be applied to keyed and/or global window streams. 
+Flink’s stateful operations allow users to maintain and update state across multiple data records or events, which is particularly useful for feature engineering tasks such as sessionization and/or maintaining rolling aggregates over a sliding window of data. +Flink feature engineering pipelines are supported in Java/Scala only. ### Feature Engineering in Beam -Beam feature engineering pipelines are supported in Java/Scala only. \ No newline at end of file + +Beam feature engineering pipelines are supported in Java/Scala only. diff --git a/docs/concepts/fs/feature_group/fg_overview.md b/docs/concepts/fs/feature_group/fg_overview.md index d7a9311af..277ae5dee 100644 --- a/docs/concepts/fs/feature_group/fg_overview.md +++ b/docs/concepts/fs/feature_group/fg_overview.md @@ -1,10 +1,13 @@ -As a programmer, you can consider a feature, in machine learning, to be a variable associated with some entity that contains a value that is useful for helping train a model to solve a prediction problem. That is, the feature is just a variable with predictive power for a machine learning problem, or task. +As a programmer, you can consider a feature, in machine learning, to be a variable associated with some entity that contains a value that is useful for helping train a model to solve a prediction problem. +That is, the feature is just a variable with predictive power for a machine learning problem, or task. -A feature group is a table of features, where each feature group has a primary key, and optionally an event_time column (indicating when the features in that row were observed), and a partition key. Collectively, they are referred to as columns. The partition key determines how to layout the feature group rows on disk such that you can efficiently query the data using queries with the partition key. For example, if your partition key is the day and you have hundreds of days worth of data, with a partition key, you can query the day for only a given day or a range of days, and only the data for those days will be read from disk. +A feature group is a table of features, where each feature group has a primary key, and optionally an event_time column (indicating when the features in that row were observed), and a partition key. +Collectively, they are referred to as columns. +The partition key determines how to layout the feature group rows on disk such that you can efficiently query the data using queries with the partition key. +For example, if your partition key is the day and you have hundreds of days worth of data, with a partition key, you can query the day for only a given day or a range of days, and only the data for those days will be read from disk. - ### Online and offline Storage Feature groups can be stored in a low-latency "online" database and/or in low cost, high throughput "offline" storage, typically a data lake or data warehouse. @@ -13,10 +16,13 @@ Feature groups can be stored in a low-latency "online" database and/or in low co #### Online Storage -The online store stores only the latest values of features for a feature group. It is used to serve pre-computed features to models at runtime. +The online store stores only the latest values of features for a feature group. +It is used to serve pre-computed features to models at runtime. #### Offline Storage -The offline store stores the historical values of features for a feature group so that it may store much more data than the online store. 
Offline feature groups are used, typically, to create training data for models, but also to retrieve data for batch scoring of models. +The offline store stores the historical values of features for a feature group so that it may store much more data than the online store. +Offline feature groups are used, typically, to create training data for models, but also to retrieve data for batch scoring of models. -In most cases, offline data is stored in Hopsworks, but through the implementation of data sources, it can reside in an external file system. The externally stored data can be managed by Hopsworks by defining ordinary feature groups or it can be used for reading only by defining [External Feature Group](external_fg.md). \ No newline at end of file +In most cases, offline data is stored in Hopsworks, but through the implementation of data sources, it can reside in an external file system. +The externally stored data can be managed by Hopsworks by defining ordinary feature groups or it can be used for reading only by defining [External Feature Group](external_fg.md). diff --git a/docs/concepts/fs/feature_group/fg_statistics.md b/docs/concepts/fs/feature_group/fg_statistics.md index a1c368ab7..d4a29679b 100644 --- a/docs/concepts/fs/feature_group/fg_statistics.md +++ b/docs/concepts/fs/feature_group/fg_statistics.md @@ -1,22 +1,23 @@ HSFS supports monitoring, validation, and alerting for features: - - transparently compute statistics over features on writing to a feature group; - - validation of data written to feature groups using Great Expectations - - alerting users when there was a problem writing or update features. +- transparently compute statistics over features on writing to a feature group; +- validation of data written to feature groups using Great Expectations +- alerting users when there was a problem writing or update features. ### Statistics -When you create a Feature Group in HSFS, you can configure it to compute statistics over the features inserted into the Feature Group by setting the `statistics_config` dict parameter, see [Feature Group Statistics](../../../../user_guides/fs/feature_group/statistics/) for details. Every time you write to the Feature Group, new statistics will be computed over all of the data in the Feature Group. - +When you create a Feature Group in HSFS, you can configure it to compute statistics over the features inserted into the Feature Group by setting the `statistics_config` dict parameter, see [Feature Group Statistics](../../../user_guides/fs/feature_group/statistics.md) for details. +Every time you write to the Feature Group, new statistics will be computed over all of the data in the Feature Group. ### Data Validation -You can define expectation suites in Great Expectations and associate them with feature groups. When you write to a feature group, the expectations are executed, then you can define a policy on the feature group for what to do if any expectation fails. +You can define expectation suites in Great Expectations and associate them with feature groups. +When you write to a feature group, the expectations are executed, then you can define a policy on the feature group for what to do if any expectation fails. - - ### Alerting -HSFS also supports alerts, that can be triggered when there are problems in your feature pipelines, for example, when a write fails due to an error or a failed expectation. You can send alerts to different alerting endpoints, such as email or Slack, that can be configured in the Hopsworks UI. 
For example, you can send a slack message if features being written to a feature group are missing some input data. +HSFS also supports alerts, that can be triggered when there are problems in your feature pipelines, for example, when a write fails due to an error or a failed expectation. +You can send alerts to different alerting endpoints, such as email or Slack, that can be configured in the Hopsworks UI. +For example, you can send a slack message if features being written to a feature group are missing some input data. diff --git a/docs/concepts/fs/feature_group/on_demand_feature.md b/docs/concepts/fs/feature_group/on_demand_feature.md index d919e6e87..787751654 100644 --- a/docs/concepts/fs/feature_group/on_demand_feature.md +++ b/docs/concepts/fs/feature_group/on_demand_feature.md @@ -4,9 +4,11 @@ description: On-demand feature computation. # On-demand features -Features are defined as on-demand when their value cannot be pre-computed beforehand, rather they need to be computed in real-time during inference. This is achieved by implementing the on-demand features as a Python function in a Python module. Also ensure that the same version of the Python module is installed in both the feature and inference pipelines. +Features are defined as on-demand when their value cannot be pre-computed beforehand, rather they need to be computed in real-time during inference. +This is achieved by implementing the on-demand features as a Python function in a Python module. +Also ensure that the same version of the Python module is installed in both the feature and inference pipelines. -In the image below shows an example of a housing price model that demonstrates how to implement an on-demand feature, a zip code (or post code) that is computed using longitude/latitude parameters. In your online application, longitude and latitude are provided as parameters to the application, and the same python function used to calculate the zip code in the feature pipeline is used to compute the zip code in the Online Inference pipeline. +In the image below shows an example of a housing price model that demonstrates how to implement an on-demand feature, a zip code (or post code) that is computed using longitude/latitude parameters. +In your online application, longitude and latitude are provided as parameters to the application, and the same python function used to calculate the zip code in the feature pipeline is used to compute the zip code in the Online Inference pipeline. - diff --git a/docs/concepts/fs/feature_group/spine_group.md b/docs/concepts/fs/feature_group/spine_group.md index 3ac7919b7..cc22625c6 100644 --- a/docs/concepts/fs/feature_group/spine_group.md +++ b/docs/concepts/fs/feature_group/spine_group.md @@ -8,10 +8,12 @@ It is possible to maintain labels or prediction events among the regular feature with a feature pipeline updating the labels at a specific cadence. Often times, however, it is more convenient to provide the training events or entities in a Dataframe when reading -feature data from the feature store through a feature view. We call such a Dataframe a Spine as it is the structure around which +feature data from the feature store through a feature view. +We call such a Dataframe a Spine as it is the structure around which the training data or batch data is built. In order to retrieve the correct feature values for the entities in the Dataframe, using -a point-in-time correct join, some additional metadata apart from the Dataframe schema is necessary. 
Namely, the information about which +a point-in-time correct join, some additional metadata apart from the Dataframe schema is necessary. +Namely, the information about which columns define the **primary key**, and which column indicates the **event time** at which the label was valid. The spine Dataframe together with this additional metadata is what we call a **Spine Group**. diff --git a/docs/concepts/fs/feature_group/versioning.md b/docs/concepts/fs/feature_group/versioning.md index c0bd1cfd3..a537d9842 100644 --- a/docs/concepts/fs/feature_group/versioning.md +++ b/docs/concepts/fs/feature_group/versioning.md @@ -2,11 +2,14 @@ See here for informat ### Schema Versioning -The schema of feature groups is versioned. If you make a breaking change to the schema of a feature group, you need to increment the version of the feature group, and then backfill the new feature group. A breaking schema change is when you: +The schema of feature groups is versioned. +If you make a breaking change to the schema of a feature group, you need to increment the version of the feature group, and then backfill the new feature group. +A breaking schema change is when you: - - drop a column from the schema - - add a new feature without any default value for the new feature - - change how a feature is computed, such that, for training models, the data for the old feature is not compatible with the data for the new feature. For example, if you have an embedding as a feature and change the algorithm to compute that embedding, you probably should not mix feature values computed with the old embedding model with feature values computed with the new embedding model. +- drop a column from the schema +- add a new feature without any default value for the new feature +- change how a feature is computed, such that, for training models, the data for the old feature is not compatible with the data for the new feature. + For example, if you have an embedding as a feature and change the algorithm to compute that embedding, you probably should not mix feature values computed with the old embedding model with feature values computed with the new embedding model. @@ -15,4 +18,3 @@ The schema of feature groups is versioned. If you make a breaking change to the Data Versioning of a feature group involves tracking updates to the feature group, so that you can recover the state of the feature group at a given point-in-time in the past. - diff --git a/docs/concepts/fs/feature_group/write_apis.md b/docs/concepts/fs/feature_group/write_apis.md index 0565ee2b8..6c0ed6cef 100644 --- a/docs/concepts/fs/feature_group/write_apis.md +++ b/docs/concepts/fs/feature_group/write_apis.md @@ -2,32 +2,35 @@ You write to feature groups, and read from feature views. There are 3 APIs for writing to feature groups, as shown in the table below: -| | Stream API | Batch API | Connector API | -|---|---|---|---| -| Python | X | - | - | +| | Stream API | Batch API | Connector API | +| --- | --- | --- | --- | +| Python | X | - | - | | Spark | X | X | - | -| Flink | X | - | - | +| Flink | X | - | - | | External Table | - | - | X | - ## Stream API The Stream API is the only API for Python and Flink clients, and is the preferred API for Spark, as it ensures consistent features between offline and online feature stores. -The Stream API first writes data to be ingested to a Kafka topic, and then Hopsworks ensures that the data is synchronized to the Online and Offline Feature Groups through the OnlineFS service and Hudi DeltaStreamer jobs, respectively. 
The data in the feature groups is guaranteed to arrive at-most-once, through idempotent writes to the online feature group (only the latest values of features are stored there, and duplicates in Kafka only cause idempotent updates) and duplicate removal by Apache Hudi for the offline feature group. +The Stream API first writes data to be ingested to a Kafka topic, and then Hopsworks ensures that the data is synchronized to the Online and Offline Feature Groups through the OnlineFS service and Hudi DeltaStreamer jobs, respectively. +The data in the feature groups is guaranteed to arrive at-most-once, through idempotent writes to the online feature group (only the latest values of features are stored there, and duplicates in Kafka only cause idempotent updates) and duplicate removal by Apache Hudi for the offline feature group. - ## Batch API -For very large updates to feature groups, such as when you are backfilling large amounts of data to an offline feature group, it is often preferential to write directly to the Hudi tables in Hopsworks, instead of via Kafka - thus reducing write amplification. Spark clients can write directly to Hudi tables on Hopsworks with Hopsworks libraries and certificates using a HDFS API. This requires network connectivity between the Spark clients and the datanodes in Hopsworks. +For very large updates to feature groups, such as when you are backfilling large amounts of data to an offline feature group, it is often preferential to write directly to the Hudi tables in Hopsworks, instead of via Kafka - thus reducing write amplification. +Spark clients can write directly to Hudi tables on Hopsworks with Hopsworks libraries and certificates using a HDFS API. +This requires network connectivity between the Spark clients and the datanodes in Hopsworks. - ## Connector API -Hopsworks supports external tables as feature groups. You can mount a table from an external database as an offline feature group using the Connector API - you create an external table using the connector. This enables you to use features from your external data source (Snowflake, Redshift, Delta Lake, etc) as you would any feature in an offline feature group in Hopsworks. You can, for example, join features from different feature groups (external or not) together to create feature views and training data for models. +Hopsworks supports external tables as feature groups. +You can mount a table from an external database as an offline feature group using the Connector API - you create an external table using the connector. +This enables you to use features from your external data source (Snowflake, Redshift, Delta Lake, etc) as you would any feature in an offline feature group in Hopsworks. +You can, for example, join features from different feature groups (external or not) together to create feature views and training data for models. diff --git a/docs/concepts/fs/feature_view/feature_monitoring.md b/docs/concepts/fs/feature_view/feature_monitoring.md index f3ce7764e..2dce5e4bf 100644 --- a/docs/concepts/fs/feature_view/feature_monitoring.md +++ b/docs/concepts/fs/feature_view/feature_monitoring.md @@ -8,7 +8,8 @@ HSFS supports monitoring features on your Feature View by: ## Scheduled Statistics -After creating a Feature View in HSFS, you can setup statistics monitoring to compute statistics over one or more features on a scheduled basis. Statistics are computed on the whole or a subset of feature data (i.e., detection window) using the Feature View query. 
+After creating a Feature View in HSFS, you can setup statistics monitoring to compute statistics over one or more features on a scheduled basis. +Statistics are computed on the whole or a subset of feature data (i.e., detection window) using the Feature View query. ## Statistics Comparison @@ -16,5 +17,3 @@ In addition to scheduled statistics, you can enable the comparison of statistics !!! info "Feature Monitoring Guide" More information can be found in the [Feature monitoring guide](../../../user_guides/fs/feature_monitoring/index.md). - - diff --git a/docs/concepts/fs/feature_view/fv_overview.md b/docs/concepts/fs/feature_view/fv_overview.md index a15ff877a..a7afa2b82 100644 --- a/docs/concepts/fs/feature_view/fv_overview.md +++ b/docs/concepts/fs/feature_view/fv_overview.md @@ -1,17 +1,24 @@ -A feature view is a logical view over (or interface to) a set of features that may come from different feature groups. You create a feature view by joining together features from existing feature groups. In the illustration below, we can see that features are joined together from the two feature groups: seller_delivery_time_monthly and the seller_reviews_quarterly. You can also see that features in the feature view inherit not only the feature type from their feature groups, but also whether they are the primary key and/or the event_time. The image also includes transformation functions that are applied to individual features. Transformation functions are a part of the feature types included in the feature view. That is, a feature in a feature view is not only defined by its data type (int, string, etc) or its feature type (categorical, numerical, embedding), but also by its transformation. +A feature view is a logical view over (or interface to) a set of features that may come from different feature groups. +You create a feature view by joining together features from existing feature groups. +In the illustration below, we can see that features are joined together from the two feature groups: seller_delivery_time_monthly and the seller_reviews_quarterly. +You can also see that features in the feature view inherit not only the feature type from their feature groups, but also whether they are the primary key and/or the event_time. +The image also includes transformation functions that are applied to individual features. +Transformation functions are a part of the feature types included in the feature view. +That is, a feature in a feature view is not only defined by its data type (int, string, etc) or its feature type (categorical, numerical, embedding), but also by its transformation. - Feature views can also include: - - the label for the supervised ML problem - - transformation functions that should be applied to specified features consistently between training and serving - - the ability to create training data - - the ability to retrieve a feature vector with the most recent feature values +- the label for the supervised ML problem +- transformation functions that should be applied to specified features consistently between training and serving +- the ability to create training data +- the ability to retrieve a feature vector with the most recent feature values In the flow chart below, we can see the decisions that can be taken when creating (1) a feature view, and (2) creating training data with the feature view. 
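As a rough illustration of the seller example above, the sketch below joins features from the two feature groups and registers a feature view. The feature group names follow the illustration, while the feature names, the label, and the `fs` handle (a feature store object obtained via the HSFS Python API) are assumptions:

```python
# Hedged sketch: feature/label names are hypothetical; API details may vary by version.
deliveries = fs.get_feature_group("seller_delivery_time_monthly", version=1)
reviews = fs.get_feature_group("seller_reviews_quarterly", version=1)

# Join features from both feature groups into a single query
query = deliveries.select_all().join(reviews.select(["avg_rating", "num_reviews"]))

# Register the feature view; labels (and transformation functions) are optional
seller_fv = fs.create_feature_view(
    name="seller_quality",
    version=1,
    query=query,
    labels=["late_delivery"],  # hypothetical label column from the delivery feature group
)
```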
-We can see here how the feature view is a representation for a model in the feature store - the same feature view is used to retrieve feature vectors for operational model that was created with training data from this feature view. As such, you can see that the most common use case for creating a feature view is to define the features that will be used in a model. In this way, feature views enable features from different feature groups to be reused across different models, and if features are stored untransformed in feature groups, they become even more reusable, as different feature views can apply different transformations to the same feature. +We can see here how the feature view is a representation for a model in the feature store - the same feature view is used to retrieve feature vectors for operational model that was created with training data from this feature view. +As such, you can see that the most common use case for creating a feature view is to define the features that will be used in a model. +In this way, feature views enable features from different feature groups to be reused across different models, and if features are stored untransformed in feature groups, they become even more reusable, as different feature views can apply different transformations to the same feature. diff --git a/docs/concepts/fs/feature_view/offline_api.md b/docs/concepts/fs/feature_view/offline_api.md index da8cd561a..a36c84a89 100644 --- a/docs/concepts/fs/feature_view/offline_api.md +++ b/docs/concepts/fs/feature_view/offline_api.md @@ -1,60 +1,75 @@ The feature view provides an *Offline API* for - * creating training data - * creating batch (scoring) data +- creating training data +- creating batch (scoring) data ## Training Data -Training data is created using a feature view. You can create training data as either: +Training data is created using a feature view. +You can create training data as either: - - in-memory Pandas/Polars DataFrames, useful when you have a small amount of training data; - - materialized training data in files, in a file format of your choice (such as .tfrecord, .csv, or .parquet). +- in-memory Pandas/Polars DataFrames, useful when you have a small amount of training data; +- materialized training data in files, in a file format of your choice (such as .tfrecord, .csv, or .parquet). You can apply filters when creating training data from a feature view: - - start-time and end-time, for example, to create the train-set from an earlier time range, and the test-set from a later (unseen) time range; - - feature value features, for example, only train a model on customers from a particular country. - -Note that filters are not applied when retrieving feature vectors using feature views, as we only look up features for a specific entity, like a customer. In this case, the application should know that predictions for this customer should be made on the model trained on customers in USA, for example. +- start-time and end-time, for example, to create the train-set from an earlier time range, and the test-set from a later (unseen) time range; +- feature value features, for example, only train a model on customers from a particular country. +Note that filters are not applied when retrieving feature vectors using feature views, as we only look up features for a specific entity, like a customer. +In this case, the application should know that predictions for this customer should be made on the model trained on customers in USA, for example. 
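The Offline API described above can be sketched roughly as follows, assuming a `feature_view` object like the one created earlier; the method names (`train_test_split`, `create_training_data`) follow the HSFS Python API, but exact signatures and return values may vary between versions:

```python
from datetime import datetime

# In-memory training data with a random train/test split (hedged sketch)
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

# Materialized training data restricted to a time range, written as files
td_version, job = feature_view.create_training_data(
    start_time=datetime(2023, 1, 1),
    end_time=datetime(2023, 12, 31),
    data_format="parquet",
    description="training data up to the end of 2023",
)
```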
### Point-in-time Correct Training Data -When you create training data from features in different feature groups, it is possible that the feature groups are updated at different cadences. For example, maybe one feature group is updated hourly, while another feature group is updated daily. It is very complex to write code that joins features together from such feature groups and ensures there is no data leakage in the resultant training data. HSFS hides this complexity by performing the point-in-time JOIN transparently, similar to the illustration below: - +When you create training data from features in different feature groups, it is possible that the feature groups are updated at different cadences. +For example, maybe one feature group is updated hourly, while another feature group is updated daily. +It is very complex to write code that joins features together from such feature groups and ensures there is no data leakage in the resultant training data. +HSFS hides this complexity by performing the point-in-time JOIN transparently, similar to the illustration below: -HSFS uses the event_time columns on both feature groups to determine the most recent (but not newer) feature values that are joined together with the feature values from the feature group containing the label. That is, the features in the feature group containing the label are the observation times for the features in the resulting training data, and we want feature values from the other feature groups that have the most recent timestamps, but not newer than the timestamp in the label-containing feature group. +HSFS uses the event_time columns on both feature groups to determine the most recent (but not newer) feature values that are joined together with the feature values from the feature group containing the label. +That is, the features in the feature group containing the label are the observation times for the features in the resulting training data, and we want feature values from the other feature groups that have the most recent timestamps, but not newer than the timestamp in the label-containing feature group. -#### Spine Dataframes +#### Spine Groups -The left side of the point-in-time join is typically the set of training entities/primary key values for which the relevant features need to be retrieved. This left side of the join can also be replaced by a [spine group](../feature_group/spine_group.md). +The left side of the point-in-time join is typically the set of training entities/primary key values for which the relevant features need to be retrieved. +This left side of the join can also be replaced by a [spine group](../feature_group/spine_group.md). When using feature groups also so save labels/prediction targets, it can happen that you end up with the same entity multiple times in the training dataset depending on the cadence at which the label group was updated and the length of the event time interval -that is being used to generate the training dataset. This can lead to bias in the training dataset and should be avoided. To avoid this kind of situation, users can either narrow down the event time interval during training dataset creation or use a spine -in order to precisely define the entities to be included in the training dataset. This is just one example where spines are helpful. +that is being used to generate the training dataset. +This can lead to bias in the training dataset and should be avoided. 
+To avoid this kind of situation, users can either narrow down the event time interval during training dataset creation or use a spine in order to precisely define the entities to be included in the training dataset.
+This is just one example where spines are helpful.
### Splitting Training Data
-You can create random train/validation/test splits of your training data using the HSFS API. You can also time-based splits with the HSFS API.
+You can create random train/validation/test splits of your training data using the HSFS API.
+You can also create time-based splits with the HSFS API.
### Evaluation Sets
-Test data can also be split into evaluation sets to help evaluate a model for potential bias. First, you have to identify the classes of samples that could be at risk of bias, and generate *evaluation sets* from your unseen test set - one evaluation set for each group of samples at risk of bias. For example, if you have a feature group of users, where one of the features is gender, and you want to evaluate the risk of bias due to gender, you can use filters to generate 3 evaluation sets from your test set - one for male, female, and non-binary. Then you score your model against all 3 evaluation sets to ensure that the prediction performance is comparable and non-biased across all 3 gender.
+Test data can also be split into evaluation sets to help evaluate a model for potential bias.
+First, you have to identify the classes of samples that could be at risk of bias, and generate *evaluation sets* from your unseen test set - one evaluation set for each group of samples at risk of bias.
+For example, if you have a feature group of users, where one of the features is gender, and you want to evaluate the risk of bias due to gender, you can use filters to generate 3 evaluation sets from your test set - one for male, female, and non-binary.
+Then you score your model against all 3 evaluation sets to ensure that the prediction performance is comparable and non-biased across all 3 genders.
## Batch (Scoring) Data
-Batch data for scoring models is created using a feature view. Similar to training data, you can create batch data as either:
+Batch data for scoring models is created using a feature view.
+Similar to training data, you can create batch data as either:
- - in-memory Pandas/Polars DataFrames, useful when you have a small amount of data to score;
- - materialized data in files, in a file format of your choice (such as .tfrecord, .csv, or .parquet)
+- in-memory Pandas/Polars DataFrames, useful when you have a small amount of data to score;
+- materialized data in files, in a file format of your choice (such as .tfrecord, .csv, or .parquet)
-Batch data requires specification of a `start_time` for the start of the batch scoring data. You can also specify the `end_time` (default is the current date).
+Batch data requires specification of a `start_time` for the start of the batch scoring data.
+You can also specify the `end_time` (default is the current date).
### Spine Dataframes
-Similar to training dataset generation, it might be helpful to specify a spine when retrieving features for batch inference. The only difference in this case is that the spine dataframe doesn't
+Similar to training dataset generation, it might be helpful to specify a spine when retrieving features for batch inference.
+The only difference in this case is that the spine dataframe doesn't
need to contain the label, as this will be the output of the inference pipeline.
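+
+A minimal sketch, reusing the `feature_view` object from the training data example above (argument names and accepted time formats may vary between HSFS versions):
+
+```python
+# Batch (scoring) data for a given time window, returned as a DataFrame
+batch_df = feature_view.get_batch_data(
+    start_time="2023-07-01",
+    end_time="2023-07-31",
+)
+# For feature views built on a spine group, recent HSFS versions also accept a
+# spine DataFrame here, pinning down exactly which entities should be scored.
+```
+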
A typical use case is the handling of opt-ins, where certain customers have to be excluded from an inference pipeline due to a missing marketing opt-in. diff --git a/docs/concepts/fs/feature_view/online_api.md b/docs/concepts/fs/feature_view/online_api.md index d531a4944..d891eb05a 100644 --- a/docs/concepts/fs/feature_view/online_api.md +++ b/docs/concepts/fs/feature_view/online_api.md @@ -1,4 +1,6 @@ -The Feature View provides an Online API to return an individual feature vector, or a batch of feature vectors, containing the latest feature values. To retrieve a feature vector, a client needs to provide the primary key(s) for the feature groups backing the feature view. For example, if you have `customer_profile` and `customer_purchases` Feature Groups both with `customer_id` as a primary key, and a Feature View made up from features from both Feature Groups, then, you would use `customer_id` to retrieve a feature vector using the Feature View object. +The Feature View provides an Online API to return an individual feature vector, or a batch of feature vectors, containing the latest feature values. +To retrieve a feature vector, a client needs to provide the primary key(s) for the feature groups backing the feature view. +For example, if you have `customer_profile` and `customer_purchases` Feature Groups both with `customer_id` as a primary key, and a Feature View made up from features from both Feature Groups, then, you would use `customer_id` to retrieve a feature vector using the Feature View object. ## Feature Vectors @@ -6,7 +8,8 @@ A feature vector is a row of features (without the primary key(s) and event time -It may be the case that for any given feature vector, not all features will come pre-engineered from the feature store. Some features will be provided by the client (or at least the raw data to compute the feature will come from the client). We call these 'passed' features and, similar to precomputed features from the feature store, they can also be transformed by the HSFS client in the method: - -* feature_view.get_feature_vector(entry, passed_features={...}) +It may be the case that for any given feature vector, not all features will come pre-engineered from the feature store. +Some features will be provided by the client (or at least the raw data to compute the feature will come from the client). +We call these 'passed' features and, similar to precomputed features from the feature store, they can also be transformed by the HSFS client in the method: +- feature_view.get_feature_vector(entry, passed_features={...}) diff --git a/docs/concepts/fs/feature_view/statistics.md b/docs/concepts/fs/feature_view/statistics.md index 9f610de9d..9fc44a129 100644 --- a/docs/concepts/fs/feature_view/statistics.md +++ b/docs/concepts/fs/feature_view/statistics.md @@ -1,5 +1,5 @@ The feature view does not contain any statistics, as it is simply an interface consisting of a number of features and any transformation functions applied to those features. -However, training data can have descriptive statistics over it computed by HSFS. Descriptive statistics for training data is important for model monitoring, as it can enable model monitoring. If you compute the same descriptive statistics over windows of input features to models, you can help determine when there is a significant change in the distribution of an input feature, so-called feature shift. - - +However, training data can have descriptive statistics over it computed by HSFS. 
+Descriptive statistics for training data are important because they enable model monitoring.
+If you compute the same descriptive statistics over windows of input features to models, you can determine when there is a significant change in the distribution of an input feature, so-called feature shift.
diff --git a/docs/concepts/fs/feature_view/training_inference_pipelines.md b/docs/concepts/fs/feature_view/training_inference_pipelines.md
index 8fe90ee52..e9896bc1a 100644
--- a/docs/concepts/fs/feature_view/training_inference_pipelines.md
+++ b/docs/concepts/fs/feature_view/training_inference_pipelines.md
@@ -1,27 +1,40 @@
-A *training pipeline* is a program that orchestrates the training of a machine learning model. For supervised machine learning, a training pipeline requires both features and labels, and these can typically be retrieved from the feature store as either in-memory Pandas/Polars DataFrames or read as training data files, created from the feature store. An *inference pipeline* is a program that takes user input, optionally enriches it with features from the feature store, and builds a feature vector (or batch of feature vectors) with with it uses a model to make a prediction.
-
+A *training pipeline* is a program that orchestrates the training of a machine learning model.
+For supervised machine learning, a training pipeline requires both features and labels, and these can typically be retrieved from the feature store as either in-memory Pandas/Polars DataFrames or read as training data files, created from the feature store.
+An *inference pipeline* is a program that takes user input, optionally enriches it with features from the feature store, and builds a feature vector (or batch of feature vectors) with which it makes a prediction using a model.
## Transformations
-Feature transformations are mathematical operations that change feature values with the goal of improving model convergence or performance properties. Transformation functions take as input a single value (or small number of values), they often require state (such as the mean value of a feature to normalize the input), and they output a single value or a list of values.
+
+Feature transformations are mathematical operations that change feature values with the goal of improving model convergence or performance properties.
+Transformation functions take a single value (or a small number of values) as input, often require state (such as the mean value of a feature to normalize the input), and output a single value or a list of values.
## Training Serving Skew
-It is crucial that the transformations performed when creating features (for training or serving) are consistent - use the same code - to avoid training/serving skew. In the image below, you can see that transformations happen after the Feature Store, but that the implementation of the transformation functions need to be consistent between the training and inference pipelines.
+It is crucial that the transformations performed when creating features (for training or serving) are consistent - use the same code - to avoid training/serving skew.
+In the image below, you can see that transformations happen after the Feature Store, but that the implementation of the transformation functions needs to be consistent between the training and inference pipelines.
-There are 3 main approaches to prevent training/serving skew that we support in Hopsworks.
These are (1) perform transformations in models, (2) perform transformations in pipelines (sklearn, TF, PyTorch) and use the model registry to save the transformation pipeline so that the same transformation is used in your inference pipeline, and (3) use HSFS transformations, defined as UDFs in Python. - +There are 3 main approaches to prevent training/serving skew that we support in Hopsworks. +These are (1) perform transformations in models, (2) perform transformations in pipelines (sklearn, TF, PyTorch) and use the model registry to save the transformation pipeline so that the same transformation is used in your inference pipeline, and (3) use HSFS transformations, defined as UDFs in Python. ### Transformations as Pre-Processing Layers in Models -Transformation functions can be implemented as preprocessing steps within a model. For example, you can write a transformation function as a pre-processing layer in Keras/TensorFlow. When you save the model, the preprocessing steps will also be saved as part of the model. Any state required to compute the transformation, such as the arithmetic mean of a numerical feature in the train set, is also stored with the function, enabling consistent transformations during inference. When data preprocessing is part of the model, users can just send the untransformed feature values to the model and the model itself will apply any transformation functions as preprocessing layers (such as encoding categorical variables or normalizing numerical variables). - +Transformation functions can be implemented as preprocessing steps within a model. +For example, you can write a transformation function as a pre-processing layer in Keras/TensorFlow. +When you save the model, the preprocessing steps will also be saved as part of the model. +Any state required to compute the transformation, such as the arithmetic mean of a numerical feature in the train set, is also stored with the function, enabling consistent transformations during inference. When data preprocessing is part of the model, users can just send the untransformed feature values to the model and the model itself will apply any transformation functions as preprocessing layers (such as encoding categorical variables or normalizing numerical variables). ### Transformation Pipelines in Scikit-Learn/TensorFlow/PyTorch -You have to save your transformation pipeline (serialize the object or the parameters) and make sure you apply exactly the same transformations in your inference pipeline. This means you should version the transformations. In Hopsworks, you can store the transformations with your versioned models in the Model Registry, helping you to ensure the same transformation pipeline is applied to both training/serving for the same model version. +You have to save your transformation pipeline (serialize the object or the parameters) and make sure you apply exactly the same transformations in your inference pipeline. +This means you should version the transformations. +In Hopsworks, you can store the transformations with your versioned models in the Model Registry, helping you to ensure the same transformation pipeline is applied to both training/serving for the same model version. ### Transformations as Python UDFs in HSFS -Hopsworks feature store also supports consistent transformation functions by enabling a Python UDF, that implements a transformation, to be attached a to feature in a feature view. 
When training data is created with a feature view or when a feature vector is retrieved from a feature view, HSFS ensures that any transformation functions defined over any features will be applied before returning feature values. You can use built-in transformation objects in HSFS or write your own custom transformation functions as Python UDFs. The benefit of this approach is that transformations are applied consistently when creating training data and when retrieving feature data from the online feature store. Transformations no longer need to be included in either your training pipeline or inference pipeline, as they are applied transparently when creating training data and retrieving feature vectors. Hopsworks uses Spark to create training data as files, and any transformation functions for features are executed as Python UDFs in Spark - enabling transformation functions to be applied on large volumes of data and removing potentially CPU-intensive transformations from training pipelines.
+Hopsworks feature store also supports consistent transformation functions by enabling a Python UDF that implements a transformation to be attached to a feature in a feature view.
+When training data is created with a feature view or when a feature vector is retrieved from a feature view, HSFS ensures that any transformation functions defined over any features will be applied before returning feature values.
+You can use built-in transformation objects in HSFS or write your own custom transformation functions as Python UDFs.
+The benefit of this approach is that transformations are applied consistently when creating training data and when retrieving feature data from the online feature store.
+Transformations no longer need to be included in either your training pipeline or inference pipeline, as they are applied transparently when creating training data and retrieving feature vectors.
+Hopsworks uses Spark to create training data as files, and any transformation functions for features are executed as Python UDFs in Spark - enabling transformation functions to be applied on large volumes of data and removing potentially CPU-intensive transformations from training pipelines.
diff --git a/docs/concepts/fs/feature_view/versioning.md b/docs/concepts/fs/feature_view/versioning.md
index 0f475c9a5..1236cfea2 100644
--- a/docs/concepts/fs/feature_view/versioning.md
+++ b/docs/concepts/fs/feature_view/versioning.md
@@ -1,5 +1,6 @@
Feature views are interfaces, and if there is a change in the interface (the types of the features, the transformations applied to the features), then you need to change the version, to prevent breaking existing clients.
-Training datasets are associated with a specific feature view version. Training data also has its own version number (along with the version of its parent feature view). For example, online transformation functions often need training data statistics (e.g., normalizing a numerical feature requires you to divide the feature value by the mean value for that feature in the training dataset). As many training datasets can be created from a feature view, when you initialize the feature view you need to tell it which version of the training data to use - `feature_view.init(1)` means use version 1 of the training data for this feature view.
-
-
+Training datasets are associated with a specific feature view version.
+Training data also has its own version number (along with the version of its parent feature view).
+For example, online transformation functions often need training data statistics (e.g., normalizing a numerical feature requires you to divide the feature value by the mean value for that feature in the training dataset). +As many training datasets can be created from a feature view, when you initialize the feature view you need to tell it which version of the training data to use - `feature_view.init(1)` means use version 1 of the training data for this feature view. diff --git a/docs/concepts/fs/index.md b/docs/concepts/fs/index.md index d29561ef8..40c8b34a7 100644 --- a/docs/concepts/fs/index.md +++ b/docs/concepts/fs/index.md @@ -1,26 +1,31 @@ + ## What is Hopsworks Feature Store? -Hopsworks and its Feature Store are an open source data-intensive AI platform used for the development and operation of machine learning models at scale. The Hopsworks Feature Store provides the HSFS API to enable clients to write features to feature groups in the feature store, and to read features from feature views - either through a low latency Online API to retrieve pre-computed features for operational models or through a high throughput, latency insensitive Offline API, used to create training data and to retrieve batch data for scoring. +Hopsworks and its Feature Store are an open source data-intensive AI platform used for the development and operation of machine learning models at scale. +The Hopsworks Feature Store provides the HSFS API to enable clients to write features to feature groups in the feature store, and to read features from feature views - either through a low latency Online API to retrieve pre-computed features for operational models or through a high throughput, latency insensitive Offline API, used to create training data and to retrieve batch data for scoring. -##HSFS API - +## HSFS API The HSFS (Hopsworks Feature Store) API is how you, as a developer, will use the feature store. The HSFS API helps simplify some of the problems that feature stores address including: - - consistent features for training and serving - - centralized, secure access to features - - point-in-time JOINs of features to create training data with no data leakage - - easier connection and backfilling of features from external data sources - - use of external tables as features - - transparent computation of statistics and usage data for features. +- consistent features for training and serving +- centralized, secure access to features +- point-in-time JOINs of features to create training data with no data leakage +- easier connection and backfilling of features from external data sources +- use of external tables as features +- transparent computation of statistics and usage data for features. -## Write to feature groups, read from feature views. -You write to feature groups with a feature pipeline program. The program can be written in Python, Spark, Flink, or SQL. +## Write to feature groups, read from feature views -You read from views on top of the feature groups, called feature views. That is, a feature view does not store feature data, but is a logical grouping of features. Typically, you define a feature view because you want to train/deploy a model with exactly those features in the feature view. Feature views enable the reuse of feature data from different feature groups across different models. +You write to feature groups with a feature pipeline program. +The program can be written in Python, Spark, Flink, or SQL. +You read from views on top of the feature groups, called feature views. 
+That is, a feature view does not store feature data, but is a logical grouping of features. +Typically, you define a feature view because you want to train/deploy a model with exactly those features in the feature view. +Feature views enable the reuse of feature data from different feature groups across different models. diff --git a/docs/concepts/hopsworks.md b/docs/concepts/hopsworks.md index ca25831cb..fffc95efb 100644 --- a/docs/concepts/hopsworks.md +++ b/docs/concepts/hopsworks.md @@ -1,24 +1,35 @@ Hopsworks is a **modular** MLOps platform with: - - a feature store (available as standalone) - - model registry and model serving based on KServe - - vector database based on OpenSearch - - a data science and data engineering platform +- a feature store (available as standalone) +- model registry and model serving based on KServe +- vector database based on OpenSearch +- a data science and data engineering platform ## Standalone Feature Store + Hopsworks was the first open-source and first enterprise feature store for ML. You can use Hopsworks as a standalone feature store with the HSFS API. ## Model Management -Hopsworks includes support for model management, with model deployments using [the KServe framework](https://github.com/kserve/kserve) and a model registry designed for KServe. Hopsworks logs all inference requests to Kafka to enable easy monitoring of deployed models, and provides model metrics with grafana/prometheus. + +Hopsworks includes support for model management, with model deployments using [the KServe framework](https://github.com/kserve/kserve) and a model registry designed for KServe. +Hopsworks logs all inference requests to Kafka to enable easy monitoring of deployed models, and provides model metrics with grafana/prometheus. ## Vector DB -Hopsworks provides a vector database (or embedding store) based on [OpenSearch kNN](https://opensearch.org/docs/latest/search-plugins/knn/index/) ([FAISS](https://ai.facebook.com/tools/faiss/) and [nmslib](https://github.com/nmslib/nmslib)). Hopsworks Vector DB includes out-of-the-box support for authentication, access control, filtering, backup-and-restore, and horizontal scalability. Hopsworks' Feature Store and vector DB are often used together to build scalable recommender systems, such as ranking-and-retrieval for real-time recommendations. + +Hopsworks provides a vector database (or embedding store) based on [OpenSearch kNN](https://opensearch.org/docs/latest/search-plugins/knn/index/) ([FAISS](https://ai.facebook.com/tools/faiss/) and [nmslib](https://github.com/nmslib/nmslib)). +Hopsworks Vector DB includes out-of-the-box support for authentication, access control, filtering, backup-and-restore, and horizontal scalability. +Hopsworks' Feature Store and vector DB are often used together to build scalable recommender systems, such as ranking-and-retrieval for real-time recommendations. ## Governance -Hopsworks provides a data-mesh architecture for managing ML assets and teams, with multi-tenant projects. Not unlike a GitHub repository, a project is a sandbox containing team members, data, and ML assets. In Hopsworks, all ML assets (features, models, training data) are versioned, taggable, lineage-tracked, and support free-text search. Data can be also be securely shared between projects. + +Hopsworks provides a data-mesh architecture for managing ML assets and teams, with multi-tenant projects. +Not unlike a GitHub repository, a project is a sandbox containing team members, data, and ML assets. 
+In Hopsworks, all ML assets (features, models, training data) are versioned, taggable, lineage-tracked, and support free-text search. +Data can be also be securely shared between projects. ## Data Science Platform -You can develop feature engineering, model training and inference pipelines in Hopsworks. There is support for version control (GitHub, GitLab, BitBucket), Jupyter notebooks, a shared distributed file system, many bundled modular project python environments for managing python dependencies without needing to write Dockerfiles, jobs (Python, Spark, Flink), and workflow orchestration with Airflow. +You can develop feature engineering, model training and inference pipelines in Hopsworks. +There is support for version control (GitHub, GitLab, BitBucket), Jupyter notebooks, a shared distributed file system, many bundled modular project python environments for managing python dependencies without needing to write Dockerfiles, jobs (Python, Spark, Flink), and workflow orchestration with Airflow. diff --git a/docs/concepts/mlops/bi_tools.md b/docs/concepts/mlops/bi_tools.md index bb54257e1..e7f60fb82 100644 --- a/docs/concepts/mlops/bi_tools.md +++ b/docs/concepts/mlops/bi_tools.md @@ -1,3 +1,4 @@ The Hopsworks Feature Store is based on an offline data store, queryable via an Apache Hive API, and an online data store, queryable via a MySQL Server API. -Given that Feature Groups in Hopsworks have well-defined schemas, features in the Hopsworks Feature Store can be analyzed and reports can be generated from them using any BI Tools that include connectors for MySQL (JDBC) and Apache Hive (2-way TLS required). One platform we use with customers is [Apache Superset](https://superset.apache.org/), as it can be configured alongside Hopsworks to provide BI Tooling capabilities. +Given that Feature Groups in Hopsworks have well-defined schemas, features in the Hopsworks Feature Store can be analyzed and reports can be generated from them using any BI Tools that include connectors for MySQL (JDBC) and Apache Hive (2-way TLS required). +One platform we use with customers is [Apache Superset](https://superset.apache.org/), as it can be configured alongside Hopsworks to provide BI Tooling capabilities. diff --git a/docs/concepts/mlops/data_transformations.md b/docs/concepts/mlops/data_transformations.md index 170743a46..474e8bb6e 100644 --- a/docs/concepts/mlops/data_transformations.md +++ b/docs/concepts/mlops/data_transformations.md @@ -1,49 +1,83 @@ # Data Transformations -[Data transformations](https://www.hopsworks.ai/dictionary/data-transformation) are integral to all AI applications. Data transformations produce new features that can enhance the performance of an AI application. However, [not all transformations in an AI application are equivalent](https://www.hopsworks.ai/post/a-taxonomy-for-data-transformations-in-ai-systems). +[Data transformations](https://www.hopsworks.ai/dictionary/data-transformation) are integral to all AI applications. +Data transformations produce new features that can enhance the performance of an AI application. +However, [not all transformations in an AI application are equivalent](https://www.hopsworks.ai/post/a-taxonomy-for-data-transformations-in-ai-systems). -Transformations like binning and aggregations typically create reusable features, while transformations like one-hot encoding, scaling and normalization often produce model-specific features. 
Additionally, in real-time AI systems, some features can only be computed during inference when the request is received, as they need request-time parameters to be computed. +Transformations like binning and aggregations typically create reusable features, while transformations like one-hot encoding, scaling and normalization often produce model-specific features. +Additionally, in real-time AI systems, some features can only be computed during inference when the request is received, as they need request-time parameters to be computed. ![Types of features](../../assets/images/concepts/mlops/transformation-features.jpg) -This classification of features can be used to create a taxonomy for data transformation that would apply to any scalable and modular AI system that aims to reuse features. The taxonomy helps identify which classes of data transformation can cause [online-offline](https://www.hopsworks.ai/dictionary/online-offline-feature-skew) skews in AI systems, allowing for their prevention. Hopsworks provides support for a feature view abstraction as well as model-dependent transformations and on-demand transformations to prevent online-offline skew. +This classification of features can be used to create a taxonomy for data transformation that would apply to any scalable and modular AI system that aims to reuse features. +The taxonomy helps identify which classes of data transformation can cause [online-offline](https://www.hopsworks.ai/dictionary/online-offline-feature-skew) skews in AI systems, allowing for their prevention. +Hopsworks provides support for a feature view abstraction as well as model-dependent transformations and on-demand transformations to prevent online-offline skew. ## Data Transformation Taxonomy for AI Systems -Transformation functions in an AI system can be classified into three types based on the nature of the input features they generate: [model-independent](https://www.hopsworks.ai/dictionary/model-independent-transformations), [model-dependent](https://www.hopsworks.ai/dictionary/model-dependent-transformations), and [on-demand](https://www.hopsworks.ai/dictionary/on-demand-transformation) transformations. +Transformation functions in an AI system can be classified into three types based on the nature of the input features they generate: [model-independent](https://www.hopsworks.ai/dictionary/model-independent-transformations), [model-dependent](https://www.hopsworks.ai/dictionary/model-dependent-transformations), and [on-demand](https://www.hopsworks.ai/dictionary/on-demand-transformation) transformations. ![Types of transformations](../../assets/images/concepts/mlops/taxonomy-transformations.jpg) -**Model-independent transformations** create reusable features that can be utilized across one or more machine-learning models. These transformations include techniques such as grouped aggregations (e.g., minimum, maximum, or average of a variable), windowed aggregations (e.g., the number of clicks per day), and binning to generate categorical variables. Since the data produced by model-independent transformations are reusable, these features can be stored in a feature store. - -**Model-dependent transformations** generate features specific to one model. These include transformations that are unique to a particular model or are parameterized by the training dataset, making them model-specific. For instance, text tokenization is a transformation required by all large language models (LLMs) but each LLM has their own (unique) tokenizer. 
Other transformations, such as encoding categorical variables in a numerical representation or scaling/normalizing/standardizing numerical variables to enhance the performance of gradient-based models, are parameterized by the training dataset. Consequently, the features produced are applicable only to the model trained using that specific training dataset. Since these features are not reusable, there is no need to store them in a feature store. Also, storing encoded features in a feature store leads to write amplification, as every time feature values are written to a feature group, all existing rows in the feature group have to be re-encoded (and creation of a training dataset using a subset or rows in the feature group becomes impossible as they cannot be re-encoded). - -**On-demand transformations** are exclusive to [real-time AI systems](https://www.hopsworks.ai/dictionary/real-time-machine-learning), where predictions must be generated in real time based on incoming prediction requests. On-demand transformations compute on-demand features, which usually require at least one input parameter that is only available in a prediction request for their computation. These transformations can also combine request-time parameters with precomputed features from feature stores. Some examples include generating *zip_codes* from latitude and longitude received in the prediction request or calculating the *time_since_last_transaction* from a transaction request. The on-demand features produced can also be computed and [backfilled](https://www.hopsworks.ai/dictionary/backfill-features) into a feature store when the necessary historical data required for their computation becomes available. Backfilling on-demand features into the feature store eliminates the need to recompute them when creating training data. On-demand transformations are typically also model-independent transformations (model-dependent transformations can be applied after the on-demand transformation). - - +**Model-independent transformations** create reusable features that can be utilized across one or more machine-learning models. +These transformations include techniques such as grouped aggregations (e.g., minimum, maximum, or average of a variable), windowed aggregations (e.g., the number of clicks per day), and binning to generate categorical variables. +Since the data produced by model-independent transformations are reusable, these features can be stored in a feature store. + +**Model-dependent transformations** generate features specific to one model. +These include transformations that are unique to a particular model or are parameterized by the training dataset, making them model-specific. +For instance, text tokenization is a transformation required by all large language models (LLMs) but each LLM has their own (unique) tokenizer. +Other transformations, such as encoding categorical variables in a numerical representation or scaling/normalizing/standardizing numerical variables to enhance the performance of gradient-based models, are parameterized by the training dataset. +Consequently, the features produced are applicable only to the model trained using that specific training dataset. +Since these features are not reusable, there is no need to store them in a feature store. 
+Also, storing encoded features in a feature store leads to write amplification, as every time feature values are written to a feature group, all existing rows in the feature group have to be re-encoded (and creation of a training dataset using a subset or rows in the feature group becomes impossible as they cannot be re-encoded). + +**On-demand transformations** are exclusive to [real-time AI systems](https://www.hopsworks.ai/dictionary/real-time-machine-learning), where predictions must be generated in real time based on incoming prediction requests. +On-demand transformations compute on-demand features, which usually require at least one input parameter that is only available in a prediction request for their computation. +These transformations can also combine request-time parameters with precomputed features from feature stores. +Some examples include generating *zip_codes* from latitude and longitude received in the prediction request or calculating the *time_since_last_transaction* from a transaction request. +The on-demand features produced can also be computed and [backfilled](https://www.hopsworks.ai/dictionary/backfill-features) into a feature store when the necessary historical data required for their computation becomes available. +Backfilling on-demand features into the feature store eliminates the need to recompute them when creating training data. +On-demand transformations are typically also model-independent transformations (model-dependent transformations can be applied after the on-demand transformation). Each of these transformations is employed within specific areas in a modular AI system and can be illustrated using the figure below. ![Types of transformations in modular AI Pipeline](../../assets/images/concepts/mlops/transformation-in-modular-AI-pipeline.jpg) -Model-independent transformations are utilized exclusively in areas where new and historical data arrives, typically within feature pipelines. Model-dependent transformations are necessary during the creation of training data, in training programs and must also be consistently applied in inference programs prior to making predictions. On-demand transformations are primarily employed in online inference programs, though they can also be integrated into feature engineering programs to backfill data into the feature store. +Model-independent transformations are utilized exclusively in areas where new and historical data arrives, typically within feature pipelines. +Model-dependent transformations are necessary during the creation of training data, in training programs and must also be consistently applied in inference programs prior to making predictions. +On-demand transformations are primarily employed in online inference programs, though they can also be integrated into feature engineering programs to backfill data into the feature store. -The presence of model-dependent and on-demand transformations across different modules in a modular AI system introduces the potential for online-offline skew. Hopsworks provides support for model-dependent transformations and on-demand transformations to easily create modular skew-free AI pipelines. +The presence of model-dependent and on-demand transformations across different modules in a modular AI system introduces the potential for online-offline skew. +Hopsworks provides support for model-dependent transformations and on-demand transformations to easily create modular skew-free AI pipelines. 
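+
+To make the taxonomy concrete, a schematic sketch of the three kinds of transformation in plain Python/pandas (the column and function names are purely illustrative and not tied to any particular API):
+
+```python
+import pandas as pd
+
+# Model-independent: a reusable aggregation, computed in a feature pipeline
+def avg_transaction_amount(transactions: pd.DataFrame) -> pd.DataFrame:
+    return (
+        transactions.groupby("customer_id")["amount"]
+        .mean()
+        .rename("avg_amount")
+        .reset_index()
+    )
+
+# Model-dependent: parameterized by training-dataset statistics (min-max scaling)
+def min_max_scale(value: float, train_min: float, train_max: float) -> float:
+    return (value - train_min) / (train_max - train_min)
+
+# On-demand: requires a request-time parameter only available at inference time
+def time_since_last_transaction(request_time, last_transaction_time) -> float:
+    return (request_time - last_transaction_time).total_seconds()
+```
+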
## Hopsworks and the Data Transformation Taxonomy ![Data transformations Hopsworks](../../assets/images/concepts/mlops/data-transformations-hopsworks.jpg) -In Hopsworks, an AI system is typically decomposed into different [AI pipelines](https://www.hopsworks.ai/dictionary/ai-pipelines) and usually falls into either a [feature pipeline](https://www.hopsworks.ai/dictionary/feature-pipeline), a [training pipeline](https://www.hopsworks.ai/dictionary/training-pipeline), or an [inference pipeline](https://www.hopsworks.ai/dictionary/inference-pipeline). - -Hopsworks stores reusable feature data, created by model-independent transformations within the feature pipeline, into [feature groups](../fs/feature_group/fg_overview.md) (tables containing feature data in both offline and online stores). Model-independent transformations in Hopsworks can be performed using a wide range of commonly used data engineering tools and the generated features can be seamlessly inserted into feature groups. The figure below illustrates the different software tools supported by Hopsworks for creating reusable features through model-independent transformations. +In Hopsworks, an AI system is typically decomposed into different [AI pipelines](https://www.hopsworks.ai/dictionary/ai-pipelines) and usually falls into either a [feature pipeline](https://www.hopsworks.ai/dictionary/feature-pipeline), a [training pipeline](https://www.hopsworks.ai/dictionary/training-pipeline), or an [inference pipeline](https://www.hopsworks.ai/dictionary/inference-pipeline). +Hopsworks stores reusable feature data, created by model-independent transformations within the feature pipeline, into [feature groups](../fs/feature_group/fg_overview.md) (tables containing feature data in both offline and online stores). +Model-independent transformations in Hopsworks can be performed using a wide range of commonly used data engineering tools and the generated features can be seamlessly inserted into feature groups. +The figure below illustrates the different software tools supported by Hopsworks for creating reusable features through model-independent transformations. ![Supported feature engineering tools](../../assets/images/concepts/mlops/supported-feature-engineering-tools.jpg) -Additionally, Hopsworks provides a simple Python API to [create custom transformation functions](../../user_guides/fs/transformation_functions.md) as either Python or Pandas User-Defined Functions (UDFs). Pandas UDFs enable the vectorized execution of transformation functions, offering significantly higher throughput compared to Python UDFs for large volumes of data. They can also be scaled out across workers in a Spark program, allowing for scalability from gigabytes (GBs) to terabytes (TBs) or more. However, Python UDFs can be much faster for small volumes of data, such as in the case of online inference. - -Transformation functions defined in Hopsworks can then be attached to feature groups to [create on-demand transformation](../../user_guides/fs/feature_group/on_demand_transformations.md). On-demand transformations in feature groups are executed automatically whenever data is inserted into them to compute and backfill the on-demand features into the feature group. Backfilling on-demand features removes the need to recompute them while creating training and batch data. - -Hopsworks also provides a powerful abstraction known as [feature views](../fs/feature_view/fv_overview.md), which enables feature reuse and prevents skew between training and inference pipelines. 
A feature view is a meta-data-only selection of features, created from potentially different feature groups. It includes the input and output schema required for a model. This means that a feature view describes not only the input features but also the output targets, along with any helper columns necessary for training or inference of the model. This allows feature views to create consistent snapshots of data for both training and inference of a model. Additionally feature views, also compute and save statistics for the training datasets they create.
-Hopsworks supports attaching transformations functions to feature views to [create model-dependent transformations](../../user_guides/fs/feature_view/model-dependent-transformations.md) that have no online-offline skew. These transformations get access to the same training dataset statistics during both training and inference ensuring their consistency. Additionally, feature views through lineage get access to the on-demand transformation used to create on-demand features if any are selected during the creation of the feature view. This allows for the computation of on-demand features in real-time during online-inference.
\ No newline at end of file
+Additionally, Hopsworks provides a simple Python API to [create custom transformation functions](../../user_guides/fs/transformation_functions.md) as either Python or Pandas User-Defined Functions (UDFs).
+Pandas UDFs enable the vectorized execution of transformation functions, offering significantly higher throughput compared to Python UDFs for large volumes of data.
+They can also be scaled out across workers in a Spark program, allowing for scalability from gigabytes (GBs) to terabytes (TBs) or more.
+However, Python UDFs can be much faster for small volumes of data, such as in the case of online inference.
+
+Transformation functions defined in Hopsworks can then be attached to feature groups to [create on-demand transformations](../../user_guides/fs/feature_group/on_demand_transformations.md).
+On-demand transformations in feature groups are executed automatically whenever data is inserted into them to compute and backfill the on-demand features into the feature group.
+Backfilling on-demand features removes the need to recompute them while creating training and batch data.
+
+Hopsworks also provides a powerful abstraction known as [feature views](../fs/feature_view/fv_overview.md), which enables feature reuse and prevents skew between training and inference pipelines.
+A feature view is a meta-data-only selection of features, created from potentially different feature groups.
+It includes the input and output schema required for a model.
+This means that a feature view describes not only the input features but also the output targets, along with any helper columns necessary for training or inference of the model.
+This allows feature views to create consistent snapshots of data for both training and inference of a model.
+Additionally, feature views also compute and save statistics for the training datasets they create.
+
+Hopsworks supports attaching transformation functions to feature views to [create model-dependent transformations](../../user_guides/fs/feature_view/model-dependent-transformations.md) that have no online-offline skew.
+These transformations get access to the same training dataset statistics during both training and inference, ensuring their consistency.
+Additionally, through lineage, feature views get access to the on-demand transformations used to create on-demand features, if any are selected during the creation of the feature view.
+This allows for the computation of on-demand features in real time during online inference.
diff --git a/docs/concepts/mlops/mlops.md b/docs/concepts/mlops/mlops.md
deleted file mode 100644
index dbf1edef7..000000000
--- a/docs/concepts/mlops/mlops.md
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
-
-
-
diff --git a/docs/concepts/mlops/opensearch.md b/docs/concepts/mlops/opensearch.md
index 731580d4e..bee5225f1 100644
--- a/docs/concepts/mlops/opensearch.md
+++ b/docs/concepts/mlops/opensearch.md
@@ -1,5 +1,6 @@
Hopsworks includes OpenSearch as a multi-tenant service in projects. OpenSearch provides vector database capabilities through its k-NN plugin, that supports the FAISS and nsmlib embedding indexes.
-Through Hopsworks, OpenSearch also provides enterprise capabilities, including authentication and access control to indexes (an index can be private to a Hopsworks project), filtering, scalability, high availability, and disaster recovery support. To learn how Opensearch empowers vector similar search in Hopsworks, you can see [this guide](../../user_guides/fs/vector_similarity_search.md).
+Through Hopsworks, OpenSearch also provides enterprise capabilities, including authentication and access control to indexes (an index can be private to a Hopsworks project), filtering, scalability, high availability, and disaster recovery support.
+To learn how OpenSearch enables vector similarity search in Hopsworks, see [this guide](../../user_guides/fs/vector_similarity_search.md).
diff --git a/docs/concepts/mlops/prediction_services.md b/docs/concepts/mlops/prediction_services.md
index f62ff9b78..f964cd6ab 100644
--- a/docs/concepts/mlops/prediction_services.md
+++ b/docs/concepts/mlops/prediction_services.md
@@ -2,10 +2,10 @@ A prediction service is an end-to-end analytical or operational machine learning
A prediction service consists of the following components:
-* feature pipeline(s),
-* training pipeline,
-* inference pipeline (for either batch predictions or online predictions)
-* a sink for predictions - either a store or a user-interface.
+- feature pipeline(s),
+- training pipeline,
+- inference pipeline (for either batch predictions or online predictions)
+- a sink for predictions - either a store or a user-interface.
## Analytical ML
@@ -15,13 +15,18 @@ In the analytical ML figure below, we can see an analytical prediction service,
## Operational ML
-In the operational ML figure below, we can see an operational prediction service, where feature pipelines update the feature store with new feature data, running at some schedule (e.g., streaming, hourly, daily), and the operational service sends prediction requests to a model deployed on KServe via its secured Istio endpoint. A deployed model on KServer handles the prediction request by first retrieving pre-computed features from the feature store for the given request, and then building a feature vector that is scored by the model. The prediction result is returned to the client (the operational service). KServe logs both the feature values and the prediction results back to Hopsworks for further analysis and to help create new training data.
+In the operational ML figure below, we can see an operational prediction service, where feature pipelines update the feature store with new feature data, running at some schedule (e.g., streaming, hourly, daily), and the operational service sends prediction requests to a model deployed on KServe via its secured Istio endpoint.
+A deployed model on KServe handles the prediction request by first retrieving pre-computed features from the feature store for the given request, and then building a feature vector that is scored by the model.
+The prediction result is returned to the client (the operational service).
+KServe logs both the feature values and the prediction results back to Hopsworks for further analysis and to help create new training data.
-
## MLOps Flywheel
-Once you have built your analytical or operational ML system, the MLOps flywheel is the path to building a self-managing system that automatically collects and processes feature logs, prediction logs, and outcomes to help create new training data for models. This enables a ML flywheel where new training data and insights are generated from your operational or analytical ML service, by feeding logs back into the feature store. More training data enables the training of better models, and with better models, you should hopefully improve you operational/batch services, so that you attract more clients, who in turn produce more data for training models. And, thus, the ML flywheel is bootstrapped and leads to a virtuous cycle of more data leading to better models and more models leading to more users, who produce more data, and so on.
+Once you have built your analytical or operational ML system, the MLOps flywheel is the path to building a self-managing system that automatically collects and processes feature logs, prediction logs, and outcomes to help create new training data for models.
+This enables an ML flywheel where new training data and insights are generated from your operational or analytical ML service, by feeding logs back into the feature store.
+More training data enables the training of better models, and with better models, you should hopefully improve your operational/batch services, so that you attract more clients, who in turn produce more data for training models.
+And thus, the ML flywheel is bootstrapped and leads to a virtuous cycle of more data leading to better models and more models leading to more users, who produce more data, and so on.
diff --git a/docs/concepts/mlops/registry.md b/docs/concepts/mlops/registry.md
index f21838ba3..60ae63450 100644
--- a/docs/concepts/mlops/registry.md
+++ b/docs/concepts/mlops/registry.md
@@ -1,22 +1,29 @@
-Hopsworks Model Registry is designed with specific support for KServe and MLOps, through versioning. It enables developers to publish, test, monitor, govern and share models for collaboration with other teams. The model registry is where developers publish their models during the experimentation phase. The model registry can also be used to share models with the team and stakeholders.
+Hopsworks Model Registry is designed with specific support for KServe and MLOps, through versioning.
+It enables developers to publish, test, monitor, govern and share models for collaboration with other teams.
+The model registry is where developers publish their models during the experimentation phase.
+The model registry can also be used to share models with the team and stakeholders.
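+
+As an illustration, a minimal sketch of publishing a trained model from a training pipeline using the Hopsworks Python API (the model name, metrics, and paths are placeholders; exact arguments may differ between versions):
+
+```python
+import hopsworks
+
+project = hopsworks.login()
+mr = project.get_model_registry()
+
+# Register an exported model directory together with its evaluation metrics,
+# so it can be reviewed, shared with the team, and later deployed.
+model = mr.python.create_model(
+    name="fraud_detection_model",
+    metrics={"accuracy": 0.92},
+    description="Example model registered from a training pipeline",
+)
+model.save("/path/to/exported_model_dir")
+```
+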
-Like other project-based multi-tenant services in Hopsworks, a model registry is private to a project. That means you can easily add a development, staging, and production model registry to a cluster, and implement CI/CD processes for transitioning a model from development to staging to production. +Like other project-based multi-tenant services in Hopsworks, a model registry is private to a project. +That means you can easily add a development, staging, and production model registry to a cluster, and implement CI/CD processes for transitioning a model from development to staging to production. The model registry for KServe's capability are shown in the diagram below: -The model registry centralizes model management, enabling models to be securely accessed and governed. Models are more than just the model itself - the registry also stores sample data for testing, configuration information, provenance information, environment variables, links to the code used to generate the model, the model version, and tags/descriptions). When you save a model, you can also save model metrics with the model, enabling users to understand, for example, performance of the model on test (or unseen) data. +The model registry centralizes model management, enabling models to be securely accessed and governed. +Models are more than just the model itself - the registry also stores sample data for testing, configuration information, provenance information, environment variables, links to the code used to generate the model, the model version, and tags/descriptions). +When you save a model, you can also save model metrics with the model, enabling users to understand, for example, performance of the model on test (or unseen) data. ## Model Package + A ML model consists of a number of different components in a model package: - - Model Input/Output Schema - - Model artifacts - - Model version information - - Model format (based on the ML framework used to train the model - e.g., .pkl or .tb files) -You can also optionally include in your packaged model: - - Sample data (used to test the model in KServe) - - The source notebook/program/experiment used to create the model +- Model Input/Output Schema +- Model artifacts +- Model version information +- Model format (based on the ML framework used to train the model - e.g., .pkl or .tb files) +You can also optionally include in your packaged model: +- Sample data (used to test the model in KServe) +- The source notebook/program/experiment used to create the model diff --git a/docs/concepts/mlops/serving.md b/docs/concepts/mlops/serving.md index 6a49a49c2..37c3d4c35 100644 --- a/docs/concepts/mlops/serving.md +++ b/docs/concepts/mlops/serving.md @@ -1,9 +1,11 @@ -In Hopsworks, you can easily deploy models from the model registry in KServe or in Docker containers (for Hopsworks Community). KServe is the defacto open-source framework for model serving on Kubernetes. You can deploy models in either programs, using the HSML library, or in the UI. A KServe model deployment can include the following components: +In Hopsworks, you can easily deploy models from the model registry in KServe or in Docker containers (for Hopsworks Community). +KServe is the defacto open-source framework for model serving on Kubernetes. +You can deploy models in either programs, using the HSML library, or in the UI. 
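+
+For example, a deployment can be created programmatically from a registered model (a minimal sketch; the model and deployment names are illustrative and defaults may differ between versions):
+
+```python
+import hopsworks
+
+project = hopsworks.login()
+mr = project.get_model_registry()
+
+# Fetch a registered model and create a KServe deployment with default settings
+model = mr.get_model("fraud_detection_model", version=1)
+deployment = model.deploy(name="frauddetection")
+deployment.start()
+```
+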
+A KServe model deployment can include the following components:
**`Transformer`**
: A ^^pre-processing^^ and ^^post-processing^^ component that can transform model inputs before predictions are made, and predictions before these are delivered back to the client.
-
**`Predictor`**
@@ -19,11 +21,12 @@ In Hopsworks, you can easily deploy models from the model registry in KServe or
**`Istio Model Endpoint`**
-: You can publish a model over ^^REST(HTTP)^^ or ^^gRPC^^ using a Hopsworks API key. API keys have scopes to ensure the principle of least privilege access control to resources managed by Hopsworks.
+: You can publish a model over ^^REST(HTTP)^^ or ^^gRPC^^ using a Hopsworks API key.
+  API keys have scopes to ensure the principle of least privilege access control to resources managed by Hopsworks.
Models deployed on KServe in Hopsworks can be easily integrated with the Hopsworks Feature Store using either a Transformer or Predictor Python script, that builds the predictor's input feature vector using the application input and pre-computed features from the Feature Store.
!!! info "Model Serving Guide"
-    More information can be found in the [Model Serving guide](../../user_guides/mlops/serving/index.md).
\ No newline at end of file
+    More information can be found in the [Model Serving guide](../../user_guides/mlops/serving/index.md).
diff --git a/docs/concepts/mlops/training.md b/docs/concepts/mlops/training.md
index 517cda91f..96c9ac1e1 100644
--- a/docs/concepts/mlops/training.md
+++ b/docs/concepts/mlops/training.md
@@ -1,8 +1,11 @@
-Hopsworks supports running model training pipelines on any Python environment, whether on an external Python client or on a Hopsworks cluster. The outputs of a training pipeline are typically experiment results, including logs, and possibly a trained model. You can plugin your own experimentation tracking platform or model registry, or you can use Hopsworks.
+Hopsworks supports running model training pipelines on any Python environment, whether on an external Python client or on a Hopsworks cluster.
+The outputs of a training pipeline are typically experiment results, including logs, and possibly a trained model.
+You can plug in your own experimentation tracking platform or model registry, or you can use Hopsworks.
### Training Pipelines on Hopsworks
-If you train models with Hopsworks, you can setup CI/CD pipelines as shown below, where the experiments are tracked by Hopsworks, and any model created is published to a model registry.
+If you train models with Hopsworks, you can set up CI/CD pipelines as shown below, where the experiments are tracked by Hopsworks, and any model created is published to a model registry.
+Each project has its own private model registry, so when you are working in a development project, you typically publish models to your project's private development registry, and if all model validation tests pass, and the model performance is good enough, the same training pipeline can be submitted via a CI/CD pipeline (e.g., GitHub push request) to a staging project, and the same procedure can be repeated to push the training pipeline to a production project. diff --git a/docs/concepts/projects/cicd.md b/docs/concepts/projects/cicd.md index 584d4492e..29ed26af8 100644 --- a/docs/concepts/projects/cicd.md +++ b/docs/concepts/projects/cicd.md @@ -1,24 +1,24 @@ You can setup traditional development, staging, and production environment in Hopsworks using Projects. A project enables you provide access control for the different environments - just like a GitHub repository, owners of projects can add and remove members of projects and assign different roles to project members - the "data owner" role can write to feature store, while a "data scientist" can only read from the feature store and create training data. +## Dev, Staging, Prod -##Dev, Staging, Prod You can create dev, staging, and prod projects - either on the same cluster, but mostly commonly, with production on its own cluster: -##Versioning +## Versioning Hopsworks supports the versioning of ML assets, including: -* Feature Groups: the version of its schema - breaking schema changes require a new version and backfilling the new version; -* Feature Views: the version of its schema, and breaking schema changes only require a new version; -* Models: the version of a model; -* Deployments: the version of the deployment of a model - a model with the same version can be found in >1 deployment. +- Feature Groups: the version of its schema - breaking schema changes require a new version and backfilling the new version; +- Feature Views: the version of its schema, and breaking schema changes only require a new version; +- Models: the version of a model; +- Deployments: the version of the deployment of a model - a model with the same version can be found in >1 deployment. +## Pytest for feature logic and feature pipeline tests -##Pytest for feature logic and feature pipeline tests - -Pytest and Great Expectations can be used for testing feature pipelines. Pytest is used to test feature logic and for end-to-end feature pipeline tests, while Great Expectations is used for data validation tests. +Pytest and Great Expectations can be used for testing feature pipelines. +Pytest is used to test feature logic and for end-to-end feature pipeline tests, while Great Expectations is used for data validation tests. Here, we can see how a feature pipeline test uses sample data to compute features and validate they have been written successfully, first to a development feature store, and then they can be pushed to a staging feature store, before finally being promoted to production. diff --git a/docs/concepts/projects/governance.md b/docs/concepts/projects/governance.md index d687ecf68..035d05708 100644 --- a/docs/concepts/projects/governance.md +++ b/docs/concepts/projects/governance.md @@ -1,16 +1,24 @@ -Hopsworks provides project-level multi-tenancy, a data mesh enabling technology. Think of it as a GitHub repository for your teams and ML assets. More specifically, a project is a sandbox for team members, ML assets (features, training data, models, vector database, model deployments), and optionally feature pipelines and training pipelines. 
The ML assets can only be accessed by project members, and there is role-based access control (RBAC) for project members within a project. +Hopsworks provides project-level multi-tenancy, a data mesh enabling technology. +Think of it as a GitHub repository for your teams and ML assets. +More specifically, a project is a sandbox for team members, ML assets (features, training data, models, vector database, model deployments), and optionally feature pipelines and training pipelines. +The ML assets can only be accessed by project members, and there is role-based access control (RBAC) for project members within a project. -## Dev/Staging/Prod for Data -Projects enable you to define development, staging, and even production projects on the same cluster. Often, companies deploy production projects on dedicated clusters, but development projects and staging projects on a shared cluster. This way, projects can be easily used to implement CI/CD workflows. +## Dev/Staging/Prod for Data +Projects enable you to define development, staging, and even production projects on the same cluster. +Often, companies deploy production projects on dedicated clusters, but development projects and staging projects on a shared cluster. +This way, projects can be easily used to implement CI/CD workflows. ## Data Mesh of Feature Stores -Projects enable you to move beyond the traditional dev/staging/prod ownership model for data. Different teams or lines of business can have their own private feature stores, you can mix them with a group-wide feature store, and feature stores can be securely shared between teams/organizations. Effectively, you can have decentralized ownership of feature stores, with domain-specific projects, and each project managing its own feature pipelines. Hopsworks provides data/feature sharing support between these self-service projects. +Projects enable you to move beyond the traditional dev/staging/prod ownership model for data. +Different teams or lines of business can have their own private feature stores, you can mix them with a group-wide feature store, and feature stores can be securely shared between teams/organizations. +Effectively, you can have decentralized ownership of feature stores, with domain-specific projects, and each project managing its own feature pipelines. +Hopsworks provides data/feature sharing support between these self-service projects. ## Audit Logs with REST API -Hopsworks stores audit logs for all calls on its REST API in its file system, HopsFS. The audit log can be used to analyze the historical usage of services by users. - +Hopsworks stores audit logs for all calls on its REST API in its file system, HopsFS. +The audit log can be used to analyze the historical usage of services by users. diff --git a/docs/concepts/projects/search.md b/docs/concepts/projects/search.md index 174abd199..4750bde12 100644 --- a/docs/concepts/projects/search.md +++ b/docs/concepts/projects/search.md @@ -6,28 +6,36 @@ description: "Documentation on the Hopsworks capabilities to discover machine-le Hopsworks supports free-text search to discover machine-learning assets: -* features -* feature groups -* feature views -* training data +- features +- feature groups +- feature views +- training data -You can use the search bar at the top of your project to free-text search for the names or descriptions of any ML asset. You can also search using keywords or tags that are attached to an ML asset. 
+You can use the search bar at the top of your project to free-text search for the names or descriptions of any ML asset. +You can also search using keywords or tags that are attached to an ML asset. -You can search for assets within a specific project or across all projects in a Hopsworks deployment, including those you are not a member of. This allows for easier discoverability and reusability of assets within an organization. -To avoid users gaining unauthorized access to data, if a search result is in a project you are **not** a member of, the information displayed is limited to: names, descriptions, tags, asset creator and create date. If the search result is within a project you are a member of, you are also able to inspect recent activities on the asset as well as statistics. +You can search for assets within a specific project or across all projects in a Hopsworks deployment, including those you are not a member of. +This allows for easier discoverability and reusability of assets within an organization. +To avoid users gaining unauthorized access to data, if a search result is in a project you are **not** a member of, the information displayed is limited to: names, descriptions, tags, asset creator and create date. +If the search result is within a project you are a member of, you are also able to inspect recent activities on the asset as well as statistics. ## Tags -A keyword is a single user-defined word attached to an ML asset. Keywords can be used to help it make it easier to find ML assets or understand the context in which they should be used, for example, *PII* could be used to indicate that the ML asset is based on personally identifiable information. - -However, it may be preferable to have a stronger governance framework for ML assets than keywords alone. For this, you can define a *schematized tag*, defining a list of key/value tags along with a type for a value. In the figure below, you can see an example of a schematized tag with two key/value pairs: *pii* of type boolean (indicating if this feature group contains PII data), and *owner* of type string (indicating who the owner of the data in this feature group is). Note there is also a keyword defined for this feature group called *eu_region*, indicating the data has its origins in the EU. +A keyword is a single user-defined word attached to an ML asset. +Keywords can be used to help it make it easier to find ML assets or understand the context in which they should be used, for example, *PII* could be used to indicate that the ML asset is based on personally identifiable information. +However, it may be preferable to have a stronger governance framework for ML assets than keywords alone. +For this, you can define a *schematized tag*, defining a list of key/value tags along with a type for a value. +In the figure below, you can see an example of a schematized tag with two key/value pairs: *pii* of type boolean (indicating if this feature group contains PII data), and *owner* of type string (indicating who the owner of the data in this feature group is). +Note there is also a keyword defined for this feature group called *eu_region*, indicating the data has its origins in the EU. - ## Lineage -Hopsworks tracks the lineage (or provenance) of ML assets automatically for you. You can see what features are used in which feature view or training dataset. You can see what training dataset was used to train a given model. 
For assets that are managed outside of Hopsworks, there is support for the explicit definition of lineage dependencies. +Hopsworks tracks the lineage (or provenance) of ML assets automatically for you. +You can see what features are used in which feature view or training dataset. +You can see what training dataset was used to train a given model. +For assets that are managed outside of Hopsworks, there is support for the explicit definition of lineage dependencies. - \ No newline at end of file + diff --git a/docs/concepts/projects/storage.md b/docs/concepts/projects/storage.md index eb0a4b93e..6d2d29fd9 100644 --- a/docs/concepts/projects/storage.md +++ b/docs/concepts/projects/storage.md @@ -1,15 +1,17 @@ Every project in Hopsworks has its own private assets: - * a Feature Store (including both Online and Offline Stores) - * a Filesystem subtree (all directory and files under /Projects//) - * a Model Registry - * Model Deployments - * Kafka topics - * OpenSearch indexes (including KNN indexes - the vector DB) - * a Hive Database +- a Feature Store (including both Online and Offline Stores) +- a Filesystem subtree (all directory and files under /Projects//) +- a Model Registry +- Model Deployments +- Kafka topics +- OpenSearch indexes (including KNN indexes - the vector DB) +- a Hive Database -Access control to these assets is controlled using project membership ACLs (access-control lists). Users in a project who have a *Data Owner* role have read/write access to these assets. Users in a project who have a *Data Scientist* role have mostly read-only access to these assets, with the exception of the ability to write to well-known directories (Resources, Jupyter, Logs). +Access control to these assets is controlled using project membership ACLs (access-control lists). +Users in a project who have a *Data Owner* role have read/write access to these assets. Users in a project who have a *Data Scientist* role have mostly read-only access to these assets, with the exception of the ability to write to well-known directories (Resources, Jupyter, Logs). -However, it is often desirable to share assets between projects, with read-only, read/write privileges, and to restrict the privileges to specific role (e.g., Data Owners) in the target project. In Hopsworks, you can explicitly share assets between projects without copying the assets. Sharing is managed by ACLs in Hopsworks, see example below: +However, it is often desirable to share assets between projects, with read-only, read/write privileges, and to restrict the privileges to specific role (e.g., Data Owners) in the target project. +In Hopsworks, you can explicitly share assets between projects without copying the assets. 
+Sharing is managed by ACLs in Hopsworks, see example below: - diff --git a/docs/css/custom.css b/docs/css/custom.css index 18236a29f..c09c2be37 100644 --- a/docs/css/custom.css +++ b/docs/css/custom.css @@ -127,3 +127,93 @@ visibility: hidden; } } + +/*******************************************************/ +/* Dark Theme */ +[data-md-color-scheme="slate"] { + --md-hue: 160; + --md-primary-fg-color: #1eb382; + --md-secondary-fg-color: #188a64; + --md-tertiary-fg-color: #0d493550; + --md-quaternary-fg-color: hsla(var(--md-hue), 0%, 14.5%, 1); + --border-radius-variable: 5px; + + --md-default-fg-color: hsla(var(--md-hue), 0%, 90%, 0.82); + --md-default-fg-color--light: hsla(var(--md-hue), 0%, 90%, 0.56); + --md-default-fg-color--lighter: hsla(var(--md-hue), 0%, 90%, 0.32); + --md-default-fg-color--lightest: hsla(var(--md-hue), 0%, 90%, 0.12); + --md-default-bg-color: hsla(var(--md-hue), 0%, 14%, 1); + --md-default-bg-color--light: hsla(var(--md-hue), 0%, 14%, 0.54); + --md-default-bg-color--lighter: hsla(var(--md-hue), 0%, 14%, 0.26); + --md-default-bg-color--lightest: hsla(var(--md-hue), 0%, 14%, 0.07); + + --md-code-fg-color: hsla(var(--md-hue), 0%, 86%, 0.82); + --md-code-bg-color: hsla(var(--md-hue), 0%, 18%, 1); + --md-code-bg-color--light: hsla(var(--md-hue), 0%, 18%, 0.9); + --md-code-bg-color--lighter: hsla(var(--md-hue), 0%, 18%, 0.54); + + --md-footer-bg-color: hsla(var(--md-hue), 0%, 10%, 0.87); + --md-footer-bg-color--dark: hsla(var(--md-hue), 0%, 8%, 1); +} + +[data-md-color-scheme="slate"] p img, +[data-md-color-scheme="slate"] figure img { + background-color: white; + padding: .4rem; + border-radius: var(--border-radius-variable); +} +[data-md-color-scheme="slate"] .marctech_main > div { + display: none; +} +[data-md-color-scheme="slate"] .marctech_main::after { + content: "Switch to the light theme to view the diagram."; +} + +/*******************************************************/ +/* Fix z-index. */ +header.md-header { + z-index: 900 !important; +} +.md-sidebar { + z-index: 1000 !important; +} +.md-overlay { + z-index: 950 !important; +} +.md-search__overlay { + z-index: 1100 !important; +} +.md-search__form { + z-index: 1200 !important; +} +.md-search__output { + z-index: 1100 !important; +} + +/*******************************************************/ +/* Hide repo stats. */ +.md-source__fact--stars, .md-source__fact--forks { + display: none; +} + +/*******************************************************/ +/* Custom styles for syntax highlighting in signatures. */ + +/* Fancier color for operators such as * and |. */ +.doc-signature .o { + color: var(--md-code-hl-special-color); +} + +/* Fancier color for constants such as None, True, and False. */ +.doc-signature .kc { + color: var(--md-code-hl-constant-color); +} + +/*******************************************************/ +/* Customization of mkdocs material theme */ + +/* Mark external links as such. */ +a.external::after, +a.autorefs-external::after { + content: "↗"; +} diff --git a/docs/css/dropdown.css b/docs/css/dropdown.css deleted file mode 100644 index c1c768fa2..000000000 --- a/docs/css/dropdown.css +++ /dev/null @@ -1,55 +0,0 @@ -/* Style The Dropdown Button */ -.dropbtn { - color: white; - border: none; - cursor: pointer; -} - -.md-tabs__list { - contain: inherit; -} -.md-tabs { -overflow: inherit; -} -.md-header { - z-index: 1000 !important; -} - -/* The container
- needed to position the dropdown content */ -.dropdown { - position: absolute; - display: inline-block; -} - -/* Dropdown Content (Hidden by Default) */ -.dropdown-content { - display:none; - font-size: 13px; - position: absolute; - background-color: #f9f9f9; - min-width: 160px; - box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); - z-index: 1000; - border-radius: 2px; - left:-15px; -} - -/* Links inside the dropdown */ -.dropdown-content a { - color: black; - padding: 12px 16px; - text-decoration: none; - display: block; -} - -/* Change color of dropdown links on hover */ -.dropdown-content a:hover {background-color: #f1f1f1} - -/* Show the dropdown menu on hover */ -.dropdown:hover .dropdown-content { - display: block; -} - -/* Change the background color of the dropdown button when the dropdown content is shown */ -.dropdown:hover .dropbtn { -} diff --git a/docs/index.md b/docs/index.md index 9344be312..a75a28311 100644 --- a/docs/index.md +++ b/docs/index.md @@ -133,10 +133,10 @@ pointer-events: initial;
-
+
Write API
@@ -254,37 +254,66 @@ pointer-events: initial; -Hopsworks is a data platform for ML with a Python-centric Feature Store and MLOps capabilities. Hopsworks is a modular platform. You can use it as a standalone Feature Store, you can use it to manage, govern, and serve your models, and you can even use it to develop and operate feature, training and inference pipelines. Hopsworks brings collaboration for ML teams, providing a secure, governed platform for developing, managing, and sharing ML assets - features, models, training data, batch scoring data, logs, and more. +Hopsworks is a data platform for ML with a Python-centric Feature Store and MLOps capabilities. +Hopsworks is a modular platform. +You can use it as a standalone Feature Store, you can use it to manage, govern, and serve your models, and you can even use it to develop and operate feature, training and inference pipelines. +Hopsworks brings collaboration for ML teams, providing a secure, governed platform for developing, managing, and sharing ML assets - features, models, training data, batch scoring data, logs, and more. ## Python-Centric Feature Store -Hopsworks is widely used as a standalone Feature Store. Hopsworks breaks the monolithic model development pipeline into separate feature and training pipelines, enabling both feature reuse and better tested ML assets. You can develop features by building feature pipelines in any Python (or Spark or Flink) environment, either inside or outside Hopsworks. You can use the Python frameworks you are familiar with to build production feature pipelines. You can compute aggregations in Pandas, validate feature data with Great Expectations, reduce your data dimensionality with embeddings and PCA, test your feature logic and features end-to-end with PyTest, and transform your categorical and numerical features with Scikit-Learn, TensorFlow, and PyTorch. You can orchestrate your feature pipelines with your Python framework of choice, including Hopsworks' own Airflow support. + +Hopsworks is widely used as a standalone Feature Store. +Hopsworks breaks the monolithic model development pipeline into separate feature and training pipelines, enabling both feature reuse and better tested ML assets. +You can develop features by building feature pipelines in any Python (or Spark or Flink) environment, either inside or outside Hopsworks. +You can use the Python frameworks you are familiar with to build production feature pipelines. +You can compute aggregations in Pandas, validate feature data with Great Expectations, reduce your data dimensionality with embeddings and PCA, test your feature logic and features end-to-end with PyTest, and transform your categorical and numerical features with Scikit-Learn, TensorFlow, and PyTorch. +You can orchestrate your feature pipelines with your Python framework of choice, including Hopsworks' own Airflow support. ## The Widest Feature Store Capabilities -Hopsworks Feature Store also supports feature pipelines in PySpark, Spark, Flink, and SQL. Offline features can either be stored in Hopsworks, as Hudi tables on object storage, or in external data lakehouses (Snowflake, Databricks, Redshift, BigQuery, any JDBC-enabled platform) via External Feature Groups. Online features are served by [RonDB](https://www.rondb.com), developed by Hopsworks as the lowest latency, highest throughput, highest availability data store for your features. + +Hopsworks Feature Store also supports feature pipelines in PySpark, Spark, Flink, and SQL. 
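+Whichever engine you use, a minimal Python feature pipeline boils down to writing a DataFrame of engineered features to a feature group, roughly as in the sketch below (a hedged illustration: the feature group name, schema, and values are made up, and argument names may differ slightly between library versions):
+
+```python
+import hopsworks
+import pandas as pd
+
+project = hopsworks.login()
+fs = project.get_feature_store()
+
+# Features computed earlier in the pipeline, e.g. with Pandas aggregations.
+df = pd.DataFrame({"customer_id": [1, 2], "avg_basket_7d": [31.5, 12.0]})
+
+fg = fs.get_or_create_feature_group(
+    name="customer_purchases",      # placeholder feature group name
+    version=1,
+    primary_key=["customer_id"],
+    online_enabled=True,
+)
+fg.insert(df)
+```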
+Offline features can either be stored in Hopsworks, as Hudi tables on object storage, or in external data lakehouses (Snowflake, Databricks, Redshift, BigQuery, any JDBC-enabled platform) via External Feature Groups.
+Online features are served by [RonDB](https://www.rondb.com), developed by Hopsworks as the lowest latency, highest throughput, highest availability data store for your features.

 ## MLOps on Hopsworks
-Hopsworks provides model serving capabilities through KServe, with additional support for feature/prediction logging to Kafka (also part of Hopsworks), and secure, low-latency model deployments via Istio. Hopsworks also has a Model Registry for KServe, with support for versioning both models and model assets (such as KServe transformers). Hopsworks also includes a vector database to provide similarity search capabilities for embeddings, based on [OpenSearch](./concepts/mlops/opensearch.md).
+
+Hopsworks provides model serving capabilities through KServe, with additional support for feature/prediction logging to Kafka (also part of Hopsworks), and secure, low-latency model deployments via Istio.
+Hopsworks also has a Model Registry for KServe, with support for versioning both models and model assets (such as KServe transformers).
+Hopsworks also includes a vector database to provide similarity search capabilities for embeddings, based on [OpenSearch](./concepts/mlops/opensearch.md).

 ## Project-based Multi-Tenancy and Team Collaboration
-Hopsworks provides projects as a secure sandbox in which teams can collaborate and share ML assets. Hopsworks' unique multi-tenant project model even enables sensitive data to be stored in a shared cluster, while still providing fine-grained sharing capabilities for ML assets across project boundaries. Projects can be used to structure teams so that they have end-to-end responsibility from raw data to managed features and models. Projects can also be used to create development, staging, and production environments for data teams. All ML assets support versioning, lineage, and provenance provide all Hopsworks users with a complete view of the MLOps life cycle, from feature engineering through model serving.
+
+Hopsworks provides projects as a secure sandbox in which teams can collaborate and share ML assets.
+Hopsworks' unique multi-tenant project model even enables sensitive data to be stored in a shared cluster, while still providing fine-grained sharing capabilities for ML assets across project boundaries.
+Projects can be used to structure teams so that they have end-to-end responsibility from raw data to managed features and models.
+Projects can also be used to create development, staging, and production environments for data teams.
+All ML assets support versioning, lineage, and provenance, providing all Hopsworks users with a complete view of the MLOps life cycle, from feature engineering through model serving.

 ## Development and Operations
-Hopsworks provides a FTI (feature/training/inference) pipeline architecture for ML systems. Each part of the pipeline is defined in a Hopsworks job which corresponds to a Jupyter notebook, a python script or a jar. The production pipelines are then orchestrated with Airflow which is bundled in Hopsworks. Hopsworks provides several python environments that can be used and customized for each part of the FTI pipeline, for example switching between using PyTorch or TensorFlow in the training pipeline.
You can train models on as many GPUs as are installed in a Hopsworks cluster and easily share them among users. You can also run Spark, Spark Streaming, or Flink programs on Hopsworks. JupyterLab is also bundled which can be used to run Python and Spark interactively.
+
+Hopsworks provides an FTI (feature/training/inference) pipeline architecture for ML systems.
+Each part of the pipeline is defined in a Hopsworks job which corresponds to a Jupyter notebook, a Python script, or a jar.
+The production pipelines are then orchestrated with Airflow, which is bundled in Hopsworks.
+Hopsworks provides several Python environments that can be used and customized for each part of the FTI pipeline, for example, switching between using PyTorch or TensorFlow in the training pipeline.
+You can train models on as many GPUs as are installed in a Hopsworks cluster and easily share them among users.
+You can also run Spark, Spark Streaming, or Flink programs on Hopsworks.
+JupyterLab is also bundled and can be used to run Python and Spark interactively.

 ## Available on any Platform
-Hopsworks is available to be installed on a kubernetes cluster in the cloud on AWS, Azure, and GCP, and On-Prem (Ubuntu/Redhat compatible), even in air-gapped data centers. Hopsworks is also available as a serverless platform that manages and serves both your features and models.
+
+Hopsworks can be installed on a Kubernetes cluster in the cloud on AWS, Azure, and GCP, and on-premises (Ubuntu/Red Hat compatible), even in air-gapped data centers.
+Hopsworks is also available as a serverless platform that manages and serves both your features and models.

 ## Join the community
+
 - Ask questions and give us feedback in the [Hopsworks Community](https://community.hopsworks.ai/)
 - Follow us on [Twitter](https://twitter.com/hopsworks)
-- Check out all our latest [product releases](https://github.com/logicalclocks/hopsworks/releases)
+- Check out all our latest [product releases](https://github.com/logicalclocks/hopsworks-api/releases)
 - Join our public [slack-channel](https://join.slack.com/t/public-hopsworks/shared_invite/zt-24fc3hhyq-VBEiN8UZlKsDrrLvtU4NaA )

 ## Contribute
-We are building the most complete and modular ML platform available in the market, and we count on your support to continuously improve Hopsworks. Feel free to [give us suggestions](https://github.com/logicalclocks/hopsworks), [report bugs](https://github.com/logicalclocks/hopsworks/issues) and [add features to our library](https://github.com/logicalclocks/hopsworks-api) anytime.
+
+We are building the most complete and modular ML platform available in the market, and we count on your support to continuously improve Hopsworks.
+Feel free to [add features to our library](https://github.com/logicalclocks/hopsworks-api) and [report bugs](https://github.com/logicalclocks/hopsworks-api/issues) anytime.

 ## Open-Source
-Hopsworks is available under the AGPL-V3 license. In plain English this means that you are free to use Hopsworks and even build paid services on it, but if you modify the source code, you should also release back your changes and any systems built around it as AGPL-V3.
-We're the best at what we do, and we strive to keep the same standard for our community!
-Our many thanks to the contributors of Hopsworks.
+The Hopsworks Python API is available under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0.html).
diff --git a/docs/js/dropdown.js b/docs/js/dropdown.js deleted file mode 100644 index 4f0a7e8a7..000000000 --- a/docs/js/dropdown.js +++ /dev/null @@ -1,3 +0,0 @@ -document.getElementsByClassName("md-tabs__link")[6].style.display = "none"; -document.getElementsByClassName("md-tabs__link")[8].style.display = "none"; - diff --git a/docs/js/inject-api-links.js b/docs/js/inject-api-links.js deleted file mode 100644 index 89082c67d..000000000 --- a/docs/js/inject-api-links.js +++ /dev/null @@ -1,28 +0,0 @@ -window.addEventListener("DOMContentLoaded", function () { - var windowPathNameSplits = window.location.pathname.split("/"); - var majorVersionRegex = new RegExp("(\\d+[.]\\d+)"); - var latestRegex = new RegExp("latest"); - if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/4.0 - URL contains major version - // Version API dropdown - document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/generated/api/login/"; - document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/javadoc"; - } else { // on / docs.hopsworks.api/hopsworks-api/4.0 - if (latestRegex.test(windowPathNameSplits[2]) || latestRegex.test(windowPathNameSplits[1])) { - var majorVersion = "latest"; - } else { - var apiVersion = windowPathNameSplits[2]; - var majorVersion = apiVersion.match(majorVersionRegex)[0]; - } - // Version main navigation - document.getElementsByClassName("md-tabs__link")[0].href = "https://docs.hopsworks.ai/" + majorVersion; - document.getElementsByClassName("md-tabs__link")[1].href = "https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb"; - document.getElementsByClassName("md-tabs__link")[2].href = "https://docs.hopsworks.ai/" + majorVersion + "/tutorials/"; - document.getElementsByClassName("md-tabs__link")[3].href = "https://docs.hopsworks.ai/" + majorVersion + "/concepts/hopsworks/"; - document.getElementsByClassName("md-tabs__link")[4].href = "https://docs.hopsworks.ai/" + majorVersion + "/user_guides/"; - document.getElementsByClassName("md-tabs__link")[5].href = "https://docs.hopsworks.ai/" + majorVersion + "/setup_installation/aws/getting_started/"; - document.getElementsByClassName("md-tabs__link")[6].href = "https://docs.hopsworks.ai/" + majorVersion + "/admin/"; - // Version API dropdown - document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/generated/api/login/"; - document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/javadoc"; - } -}); diff --git a/docs/setup_installation/admin/alert.md b/docs/setup_installation/admin/alert.md index 0857722ce..a5e351da3 100644 --- a/docs/setup_installation/admin/alert.md +++ b/docs/setup_installation/admin/alert.md @@ -1,16 +1,20 @@ # Configure Alerts ## Introduction -Alerts are sent from Hopsworks using Prometheus' + +Alerts are sent from Hopsworks using Prometheus' [Alert manager](https://prometheus.io/docs/alerting/latest/alertmanager/). In order to send alerts we first need to configure the _Alert manager_. ## Prerequisites + Administrator account on a Hopsworks cluster. ### Step 1: Go to alerts configuration + To configure the _Alert manager_ click on your name in the top right corner of the navigation bar and choose -Cluster Settings from the dropdown menu. 
In the Cluster Settings' Alerts tab you can configure the alert +Cluster Settings from the dropdown menu. +In the Cluster Settings' Alerts tab you can configure the alert manager to send alerts via email, slack or pagerduty.
@@ -19,7 +23,9 @@ manager to send alerts via email, slack or pagerduty.
### Step 2: Configure Email Alerts -To send alerts via email you need to configure an SMTP server. Click on the _Configure_ + +To send alerts via email you need to configure an SMTP server. +Click on the _Configure_ button on the left side of the **email** row and fill out the form that pops up.
@@ -34,11 +40,14 @@ button on the left side of the **email** row and fill out the form that pops up. CRAM-MD5, LOGIN or PLAIN. Optionally cluster wide Email alert receivers can be added in _Default receiver emails_. -These receivers will be available to all users when they create event triggered [alerts](../../../user_guides/fs/feature_group/data_validation_best_practices#setup-alerts). +These receivers will be available to all users when they create event triggered [alerts](../../user_guides/fs/feature_group/data_validation_best_practices.md#setup-alerts). ### Step 3: Configure Slack Alerts -Alerts can also be sent via Slack messages. To be able to send Slack messages you first need to configure -a Slack webhook. Click on the _Configure_ button on the left side of the **slack** row and past in your + +Alerts can also be sent via Slack messages. +To be able to send Slack messages you first need to configure +a Slack webhook. +Click on the _Configure_ button on the left side of the **slack** row and past in your [Slack webhook](https://api.slack.com/messaging/webhooks) in _Webhook_.
@@ -47,11 +56,13 @@ a Slack webhook. Click on the _Configure_ button on the left side of the **slack
Optionally cluster wide Slack alert receivers can be added in _Slack channel/user_. -These receivers will be available to all users when they create event triggered [alerts](../../../user_guides/fs/feature_group/data_validation_best_practices/#setup-alerts). +These receivers will be available to all users when they create event triggered [alerts](../../user_guides/fs/feature_group/data_validation_best_practices.md#setup-alerts). ### Step 4: Configure Pagerduty Alerts -Pagerduty is another way you can send alerts from Hopsworks. Click on the _Configure_ button on the left side of -the **pagerduty** row and fill out the form that pops up. + +Pagerduty is another way you can send alerts from Hopsworks. +Click on the _Configure_ button on the left side of +the **pagerduty** row and fill out the form that pops up.
Configure Pagerduty Alerts @@ -66,14 +77,16 @@ By first choosing the PagerDuty integration type: - _global event routing (routing_key)_: when using PagerDuty integration type `Events API v2`. - _service (service_key)_: when using PagerDuty integration type `Prometheus`. -Then adding the Service key/Routing key of the receiver(s). PagerDuty provides -[documentation](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) on how to integrate with +Then adding the Service key/Routing key of the receiver(s). +PagerDuty provides +[documentation](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) on how to integrate with Prometheus' Alert manager. ### Step 5: Configure Webhook Alerts -You can also use webhooks to send alerts. A Webhook Alert is sent as an HTTP POST command with a JSON-encoded parameter payload. -Click on the _Configure_ button on the left side of the **webhook** row and fill out the form that pops up. +You can also use webhooks to send alerts. +A Webhook Alert is sent as an HTTP POST command with a JSON-encoded parameter payload. +Click on the _Configure_ button on the left side of the **webhook** row and fill out the form that pops up.
Configure Webhook Alerts @@ -82,16 +95,18 @@ Click on the _Configure_ button on the left side of the **webhook** row and fill Fill in the unique URL of your Webhook: the endpoint to send HTTP POST requests to. -A global receiver is created when a webhook is configured and can be used by any project in the cluster. +A global receiver is created when a webhook is configured and can be used by any project in the cluster. ### Step 6: Advanced configuration -If you are familiar with Prometheus' [Alert manager](https://prometheus.io/docs/alerting/latest/alertmanager/) + +If you are familiar with Prometheus' [Alert manager](https://prometheus.io/docs/alerting/latest/alertmanager/) you can also configure alerts by editing the _yaml/json_ file directly by going to the advaced page and clicking the edit button. - -The advanced page shows the configuration currently loaded on the alert manager. After editing the configuration it takes some time to propagate changes to the alertmanager. -The reload button can be used to validate the changes made to the configuration. -It will try to load the new configuration to the alertmanager and show any errors that might prevent the configuration from being loaded. +The advanced page shows the configuration currently loaded on the alert manager. +After editing the configuration it takes some time to propagate changes to the alertmanager. + +The reload button can be used to validate the changes made to the configuration. +It will try to load the new configuration to the alertmanager and show any errors that might prevent the configuration from being loaded.
Advanced configuration @@ -100,10 +115,10 @@ It will try to load the new configuration to the alertmanager and show any error !!!warning - If you make any changes to the configuration ensure that the changes are valid by reloading the configuration until the changes are loaded and visible in the advanced page. + If you make any changes to the configuration ensure that the changes are valid by reloading the configuration until the changes are loaded and visible in the advanced page. _Example:_ Adding the yaml snippet shown below in the global section of the alert manager configuration will -have the same effect as creating the SMTP configuration as shown in [section 1](#1-email-alerts) above. +have the same effect as creating the SMTP configuration as shown in [section 1](#step-2-configure-email-alerts) above. ```yaml global: @@ -115,9 +130,9 @@ global: ... ``` -To test the alerts by creating triggers from Jobs and Feature group validations see [Alerts](../../../user_guides/fs/feature_group/data_validation_best_practices/#setup-alerts). +To test the alerts by creating triggers from Jobs and Feature group validations see [Alerts](../../user_guides/fs/feature_group/data_validation_best_practices.md#setup-alerts). -The yaml syntax in the UI is slightly different in that it does not allow double quotes (it will ignore the values but give no error). +The yaml syntax in the UI is slightly different in that it does not allow double quotes (it will ignore the values but give no error). Below is an example configuration, that can be used in the UI, with both email and slack receivers configured for system alerts. ```yaml diff --git a/docs/setup_installation/admin/audit/audit-logs.md b/docs/setup_installation/admin/audit/audit-logs.md index 50a802656..7aa387cf8 100644 --- a/docs/setup_installation/admin/audit/audit-logs.md +++ b/docs/setup_installation/admin/audit/audit-logs.md @@ -1,18 +1,19 @@ # Access Audit Logs - + ## Introduction - -Hopsworks collects audit logs on all URL requests to the application server. These logs are saved in Payara log directory under ```/audit``` by default. - + +Hopsworks collects audit logs on all URL requests to the application server. +These logs are saved in Payara log directory under ```/audit``` by default. + ## Prerequisites -In order to access the audit logs you need the following: - +In order to access the audit logs you need the following: + - Administrator account on the Hopsworks cluster. - SSH access to the Hopsworks cluster with a user in the ```glassfish``` group. - + ## Step 1: Configure Audit logs - + Audit logs can be configured from the _Cluster Settings_ Configuration tab. You can access the _Configuration_ page of your Hopsworks cluster by clicking on your name, in the top right corner, and choosing _Cluster Settings_ from the dropdown menu. @@ -20,10 +21,10 @@ You can access the _Configuration_ page of your Hopsworks cluster by clicking on Audit log configuration
Audit log configuration
- + Type _audit_ in the search box to see the configuration variables associated with audit logs. To edit a configuration variable, you can click on the edit button (:material-pencil:), insert the new value and save changes clicking on the check mark (:material-check:). - + !!! info "Audit logs configuration variables" | Name | Description | @@ -35,12 +36,13 @@ To edit a configuration variable, you can click on the edit button (:material-pe | audit_log_date_format | if io.hops.hopsworks.audit.helper.JSONLogFormatter is used as audit log file type, this will set the date format of the output JSON. The format should be java.text.SimpleDateFormat compatible string. | !!! warning - Hopsworks application needs to be reloaded for any changes to be applied. For doing that, go to the Payara admin panel (```https://:4848```), click on _Applications_ on the side menu and reload the _hopsworks-ear_ application. - + Hopsworks application needs to be reloaded for any changes to be applied. + For doing that, go to the Payara admin panel (```https://:4848```), click on _Applications_ on the side menu and reload the _hopsworks-ear_ application. + ## Step 2: Access the Logs - + To access the audit logs, SSH into the **instance pod** of your Hopsworks cluster and navigate to the path ```/opt/payara/appserver/glassfish/nodes///logs/audit```. - + Audit logs follow the format set in the _audit\_log\_file\_type_ configuration variable. !!! note "Example of audit logs using JSONLogFormatter" @@ -64,7 +66,7 @@ Regardless the format, each line in the audit logs can contain the following var | userAgent | the browser used by the client | | pathInfo | the URL path called by the client | | dateTime | time of the request | - + ## Going Further You can [export audit logs](../audit/export-audit-logs.md) to use them outside Hopsworks. diff --git a/docs/setup_installation/admin/audit/export-audit-logs.md b/docs/setup_installation/admin/audit/export-audit-logs.md index 781937571..32fe0b331 100644 --- a/docs/setup_installation/admin/audit/export-audit-logs.md +++ b/docs/setup_installation/admin/audit/export-audit-logs.md @@ -2,9 +2,10 @@ ## Introduction -Audit logs can be exported to your storage of preference. In case audit logs have not been configured yet in your Hopsworks cluster, please see [Access Audit Logs](../audit/audit-logs.md). +Audit logs can be exported to your storage of preference. +In case audit logs have not been configured yet in your Hopsworks cluster, please see [Access Audit Logs](../audit/audit-logs.md). -!!! note +!!! note As an example, in this guide we will show how to export audit logs to BigQuery using the ```bq``` command-line tool. ## Prerequisites @@ -17,25 +18,26 @@ Create a dataset and a table in [BigQuery](https://cloud.google.com/bigquery/doc The table schema is shown below. 
-``` -fullname mode type description -pathInfo NULLABLE STRING -methodName NULLABLE STRING -caller NULLABLE RECORD -dateTime NULLABLE TIMESTAMP bq-datetime -userAgent NULLABLE STRING -clientIp NULLABLE STRING -outcome NULLABLE STRING -parameters NULLABLE STRING -className NULLABLE STRING -caller.userId NULLABLE STRING -caller.email NULLABLE STRING -caller.username NULLABLE STRING +```plaintext +fullname mode type description +pathInfo NULLABLE STRING +methodName NULLABLE STRING +caller NULLABLE RECORD +dateTime NULLABLE TIMESTAMP bq-datetime +userAgent NULLABLE STRING +clientIp NULLABLE STRING +outcome NULLABLE STRING +parameters NULLABLE STRING +className NULLABLE STRING +caller.userId NULLABLE STRING +caller.email NULLABLE STRING +caller.username NULLABLE STRING ``` ## Step 2: Export Audit Logs to the BigQuery Table -Audit logs can be exported in different formats. For instance, to export audit logs in JSON format set ```audit_log_file_type=io.hops.hopsworks.audit.helper.JSONLogFormatter```. +Audit logs can be exported in different formats. +For instance, to export audit logs in JSON format set ```audit_log_file_type=io.hops.hopsworks.audit.helper.JSONLogFormatter```. !!! info For more information on how to configure the audit log file type see the ```audit_log_file_type``` configuration variable in [Audit logs](../audit/audit-logs.md#step-1-configure-audit-logs). diff --git a/docs/setup_installation/admin/auth.md b/docs/setup_installation/admin/auth.md index 7c6c5b676..bfecdca58 100644 --- a/docs/setup_installation/admin/auth.md +++ b/docs/setup_installation/admin/auth.md @@ -1,43 +1,42 @@ # Authentication Methods ## Introduction -Hopsworks can be configured to use different types of authentication methods. In this guide we will look at the + +Hopsworks can be configured to use different types of authentication methods. +In this guide we will look at the different authentication methods available in Hopsworks. ## Prerequisites + Administrator account on a Hopsworks cluster. ### Step 1: Go to Authentication methods page -To configure Authentication methods click on your name in the top right corner of the navigation bar and choose +To configure Authentication methods click on your name in the top right corner of the navigation bar and choose **Cluster Settings** from the dropdown menu. ### Step 2: Configure Authentication methods + In the **Cluster Settings** _Authentication_ tab you can configure how users authenticate. -1. **TOTP Two-factor Authentication**: can be _disabled_, _optional_ or _mandatory_. If set to mandatory all users are - required to set up two-factor authentication when registering. - - !!! note - - If two-factor is set to _mandatory_ on a cluster with preexisting users all users will need to go through - lost device recovery step to enable two-factor. So consider setting it to _optional_ first and allow users to - enable it before setting it to mandatory. - -2. **OAuth2**: if your organization already have an identity management system compatible with - [OpenID Connect (OIDC)](https://openid.net/connect/) you can configure Hopsworks to use your identity provider - by enabling **OAuth** as shown in the figure below. After enabling OAuth - you can register your identity provider by clicking on **Add Identity Provider** button. See - [Create client](../oauth2/create-client) for details. -3. **LDAP/Kerberos**: if your organization is using LDAP or Kerberos to manage users and services you can configure - Hopsworks to use it as the user management system. 
You can enable LDAP/Kerberos by clicking on the checkbox, - as shown in the figure below, and choosing LDAP or Kerberos. For more information on how to configure LDAP and Kerberos see - [Configure LDAP](../ldap/configure-ldap) and [Configure Kerberos](../ldap/configure-krb). +1. **TOTP Two-factor Authentication**: can be _disabled_, _optional_ or _mandatory_. + If set to mandatory all users are required to set up two-factor authentication when registering. + + !!! note + If two-factor is set to _mandatory_ on a cluster with preexisting users all users will need to go through lost device recovery step to enable two-factor. + So consider setting it to _optional_ first and allow users to enable it before setting it to mandatory. + +2. **OAuth2**: if your organization already have an identity management system compatible with [OpenID Connect (OIDC)](https://openid.net/connect/) you can configure Hopsworks to use your identity provider by enabling **OAuth** as shown in the figure below. + After enabling OAuth you can register your identity provider by clicking on **Add Identity Provider** button. + See [Create client](./oauth2/create-client.md) for details. +3. **LDAP/Kerberos**: if your organization is using LDAP or Kerberos to manage users and services you can configure Hopsworks to use it as the user management system. + You can enable LDAP/Kerberos by clicking on the checkbox, as shown in the figure below, and choosing LDAP or Kerberos. + For more information on how to configure LDAP and Kerberos see [Configure LDAP](./ldap/configure-ldap.md) and [Configure Kerberos](./ldap/configure-krb.md).
Authentication config
Setup Authentication Methods
-In the figure above we see a cluster with Two-factor authentication disabled, OAuth enabled with one registered -identity provider and LDAP authentication enabled. +In the figure above we see a cluster with Two-factor authentication disabled, OAuth enabled with one registered +identity provider and LDAP authentication enabled. diff --git a/docs/setup_installation/admin/configure-project-mapping.md b/docs/setup_installation/admin/configure-project-mapping.md index 4844620b0..063d9fc9d 100644 --- a/docs/setup_installation/admin/configure-project-mapping.md +++ b/docs/setup_installation/admin/configure-project-mapping.md @@ -1,14 +1,14 @@ # Configure group to project mapping - + ## Introduction A group-to-project mapping lets you automatically add all members of a Hopsworks group to a project, eliminating the need to add each user individually. To create a mapping, you simply select a Hopsworks group, choose the project it should be linked to, and assign the role that its members will have within that project. Once a mapping is created, project membership is controlled through Hopsworks group membership. Any updates made to the Hopsworks group—such as adding or removing users—will automatically be reflected in the project membership. For example, if a user is removed from the Hopsworks group, they will also be removed from the corresponding project. - + ## Prerequisites -1. Hopsworks group mapping sync enabled. This can be done by setting the variable ```hw_group_mapping_sync_enabled=true```. -See [Cluster Configuration](../variables.md) on how to change variable values in Hopsworks. +1. Hopsworks group mapping sync enabled. This can be done by setting the variable ```hw_group_mapping_sync_enabled=true```. +See [Cluster Configuration](./variables.md) on how to change variable values in Hopsworks.
Enable Hopsworks mapping @@ -24,19 +24,19 @@ If you can not find the variable ```hw_group_mapping_sync_enabled``` create it b
Create Hopsworks group mapping enabled variable
- + ### Step 1: Create a mapping To create a mapping go to **Cluster Settings** by clicking on your name in the top right corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu. In the _Project mapping_ tab, you can create a new mapping by clicking on _Create new mapping_. - +
Project mapping tab
Project mapping
- + This will take you to the create mapping page shown below
@@ -44,36 +44,36 @@ This will take you to the create mapping page shown below
Create mapping
- + Here you can enter your Hopsworks group and map it to a project from the _Project_ drop down list. You can also choose the _Project role_ users will be assigned when they are added to the project. - + Finally, click on _Create mapping_ and go back to mappings. You should see the newly created mapping(s) as shown below. - +
Project mappings
Project mappings
- + ### Step 2: Edit a mapping - + From the list of mappings click on the edit button (:material-pencil:). This will open a popup that will allow you to change the _group_, _project name_, and _project role_ of a mapping. - +
Edit mapping
Edit mapping
- + !!!Warning Updating a mapping's _group_ or _project name_ will remove all members of the previous group from the project. - + ### Step 3: Delete a mapping - + To delete a mapping click on the delete button. - + !!!Warning Deleting a mapping will remove all members of that group from the project. \ No newline at end of file diff --git a/docs/setup_installation/admin/ha-dr/dr.md b/docs/setup_installation/admin/ha-dr/dr.md index 31c9c4377..f23ca74c2 100644 --- a/docs/setup_installation/admin/ha-dr/dr.md +++ b/docs/setup_installation/admin/ha-dr/dr.md @@ -1,34 +1,45 @@ # Disaster Recovery ## Backup -The state of the Hopsworks cluster is divided into data and metadata and distributed across the different node groups. This section of the guide allows you to take a consistent backup between data in the offline and online feature store as well as the metadata. + +The state of the Hopsworks cluster is divided into data and metadata and distributed across the different node groups. +This section of the guide allows you to take a consistent backup between data in the offline and online feature store as well as the metadata. The following services contain critical state that should be backed up: -* **RonDB**: as mentioned above, the RonDB is used by Hopsworks to store the cluster metadata as well as the data for the online feature store. -* **HopsFS**: HopsFS stores the data for the batch feature store as well as checkpoints and logs for feature engineering applications. +- **RonDB**: as mentioned above, the RonDB is used by Hopsworks to store the cluster metadata as well as the data for the online feature store. +- **HopsFS**: HopsFS stores the data for the batch feature store as well as checkpoints and logs for feature engineering applications. -Backing up service/application metrics and services/applications logs are out of the scope of this guide. By default metrics and logs are rotated after 7 days. Application logs are available on HopsFS when the application has finished and, as such, are backed up with the rest of HopsFS’ data. +Backing up service/application metrics and services/applications logs are out of the scope of this guide. +By default metrics and logs are rotated after 7 days. +Application logs are available on HopsFS when the application has finished and, as such, are backed up with the rest of HopsFS’ data. -Apache Kafka and OpenSearch are additional services maintaining state. The OpenSearch metadata can be reconstructed from the metadata stored on RonDB. +Apache Kafka and OpenSearch are additional services maintaining state. +The OpenSearch metadata can be reconstructed from the metadata stored on RonDB. -Apache Kafka is used in Hopsworks to store the in-flight data that is on its way to the online feature store. In the event of a total loss of the cluster, running jobs with in-flight data will have to be replayed. +Apache Kafka is used in Hopsworks to store the in-flight data that is on its way to the online feature store. +In the event of a total loss of the cluster, running jobs with in-flight data will have to be replayed. ### Configuration Backup -Hopsworks adopts an Infrastructure-as-code philosophy, as such all the configuration files for the different Hopsworks services are generated during the deployment phase. Cluster-specific customizations should be centralized in the cluster definition used to deploy the cluster. 
As such the cluster definition should be backed up (e.g., by committing it to a git repository) to be able to recreate the same cluster in case it needs to be recreated. +Hopsworks adopts an Infrastructure-as-code philosophy, as such all the configuration files for the different Hopsworks services are generated during the deployment phase. +Cluster-specific customizations should be centralized in the cluster definition used to deploy the cluster. +As such the cluster definition should be backed up (e.g., by committing it to a git repository) to be able to recreate the same cluster in case it needs to be recreated. ### RonDB Backup The RonDB backup is divided into two parts: user and privileges backup and data backup. -To take the backup of users and privileges you can run the following command from any of the nodes in the head node group. This command generates a SQL file containing all the user definitions for both the metadata services (Hopsworks, HopsFS, Metastore) as well as the user and permission grants for the online feature store. This command needs to be run as user ‘mysql’ or with sudo privileges. +To take the backup of users and privileges you can run the following command from any of the nodes in the head node group. +This command generates a SQL file containing all the user definitions for both the metadata services (Hopsworks, HopsFS, Metastore) as well as the user and permission grants for the online feature store. +This command needs to be run as user ‘mysql’ or with sudo privileges. ```sh /srv/hops/mysql/bin/mysqlpump -S /srv/hops/mysql-cluster/mysql.sock --exclude-databases=% --exclude-users=root,mysql.sys,mysql.session,mysql.infoschema --users > users.sql ``` -The second step is to trigger the backup of the data. This can be achieved by running the following command as user ‘mysql’ on one of the nodes of the head node group. +The second step is to trigger the backup of the data. +This can be achieved by running the following command as user ‘mysql’ on one of the nodes of the head node group. ```sh /srv/hops/mysql-cluster/ndb/scripts/mgm-client.sh -e "START BACKUP [replace_backup_id] SNAPSHOTEND WAIT COMPLETED" @@ -36,30 +47,39 @@ The second step is to trigger the backup of the data. This can be achieved by ru The backup ID is an integer greater or equal than 1. The script uses the following: `$(date +'%y%m%d%H%M')` instead of an integer as backup id to make it easier to identify backups over time. -The command instructs each RonDB datanode to backup the data it is responsible for. The backup will be located locally on each datanode under the following path: +The command instructs each RonDB datanode to backup the data it is responsible for. +The backup will be located locally on each datanode under the following path: ```sh -/srv/hops/mysql-cluster/ndb/backups/BACKUP - the directory name will be BACKUP-[backup_id] +/srv/hops/mysql-cluster/ndb/backups/BACKUP - the directory name will be BACKUP-[backup_id] ``` -A more comprehensive backup script is available [here](https://github.com/logicalclocks/ndb-chef/blob/master/templates/default/native_ndb_backup.sh.erb) - The script includes the steps above as well as collecting all the partial RonDB backups on a single node. The script is a good starting point and can be adapted to ship the database backup outside the cluster. +You can check out [a more comprehensive backup script](https://github.com/logicalclocks/ndb-chef/blob/master/templates/default/native_ndb_backup.sh.erb). 
+The script includes the steps above as well as collecting all the partial RonDB backups on a single node. +The script is a good starting point and can be adapted to ship the database backup outside the cluster. ### HopsFS Backup -HopsFS is a distributed file system based on Apache HDFS. HopsFS stores its metadata in RonDB, as such metadata backup has already been discussed in the section above. The data is stored in the form of blocks on the different data nodes. +HopsFS is a distributed file system based on Apache HDFS. +HopsFS stores its metadata in RonDB, as such metadata backup has already been discussed in the section above. +The data is stored in the form of blocks on the different data nodes. For availability reasons, the blocks are replicated across three different data nodes. -Within a node, the blocks are stored by default under the following directory, under the ownership of the ‘hdfs’ user: +Within a node, the blocks are stored by default under the following directory, under the ownership of the ‘hdfs’ user: ```sh /srv/hopsworks-data/hops/hopsdata/hdfs/dn/ ``` -To safely backup all the data, a copy of all the datanodes should be taken. As the data is replicated across the different nodes, excluding a set of nodes might result in data loss. +To safely backup all the data, a copy of all the datanodes should be taken. +As the data is replicated across the different nodes, excluding a set of nodes might result in data loss. -Additionally, as HopsFS blocks are files on the file system and the filesystem can be quite large, the backup is not transactional. Consistency is dictated by the metadata. Blocks being added during the copying process will not be visible when restoring as they are not part of the metadata backup taken prior to cloning the HopsFS blocks. +Additionally, as HopsFS blocks are files on the file system and the filesystem can be quite large, the backup is not transactional. +Consistency is dictated by the metadata. +Blocks being added during the copying process will not be visible when restoring as they are not part of the metadata backup taken prior to cloning the HopsFS blocks. -When the HopsFS data blocks are stored in a cloud block storage, for example, Amazon S3, then it is sufficient to only backup the metadata. The blob cloud storage service will ensure durability of the data blocks. +When the HopsFS data blocks are stored in a cloud block storage, for example, Amazon S3, then it is sufficient to only backup the metadata. +The blob cloud storage service will ensure durability of the data blocks. ## Restore @@ -67,16 +87,21 @@ As with the backup phase, the restore operation is broken down in different step ### Cluster deployment -The first step to redeploy the cluster is to redeploy the binaries and configuration. You should reuse the same cluster definition used to deploy the first (original) cluster. This will re-create the same cluster with the same configuration. +The first step to redeploy the cluster is to redeploy the binaries and configuration. +You should reuse the same cluster definition used to deploy the first (original) cluster. +This will re-create the same cluster with the same configuration. ### RonDB restore -The deployment step above created a functioning empty cluster. To restore the cluster, the first step is to restore the metadata and online feature store data stored on RonDB. +The deployment step above created a functioning empty cluster. 
+To restore the cluster, the first step is to restore the metadata and online feature store data stored on RonDB. To restore the state of RonDB, we first need to restore its schemas and tables, then its data, rebuild the indices, and finally restore the users and grants. #### Restore RonDB schemas and tables -This command should be executed on one of the nodes in the head node group and is going to recreate the schemas, tables, and internal RonDB metadata. In the command below, you should replace the node_id with the id of the node you are running the command on, backup_id with the id of the backup you want to restore. Finally, you should replace the mgm_node_ip with the address of the node where the RonDB management service is running. +This command should be executed on one of the nodes in the head node group and is going to recreate the schemas, tables, and internal RonDB metadata. +In the command below, you should replace the node_id with the id of the node you are running the command on, backup_id with the id of the backup you want to restore. +Finally, you should replace the mgm_node_ip with the address of the node where the RonDB management service is running. ```sh /srv/hops/mysql/bin/ndb_restore -n [node_id] -b [backup_id] -m --disable-indexes --ndb-connectstring=[mgm_node_ip]:1186 --backup_path=/srv/hops/mysql-cluster/ndb/backups/BACKUP/BACKUP-[backup_id] @@ -84,7 +109,9 @@ This command should be executed on one of the nodes in the head node group and i #### Restore RonDB data -This command should be executed on all the RonDB datanodes. Each command should be customized with the node id of the node you are trying to restore (i.e., replace the node_id). As for the command above you should replace the backup_id and mgm_node_ip. +This command should be executed on all the RonDB datanodes. +Each command should be customized with the node id of the node you are trying to restore (i.e., replace the node_id). +As for the command above you should replace the backup_id and mgm_node_ip. ```sh /srv/hops/mysql/bin/ndb_restore -n [node_id] -b [backup_id] -r --ndb-connectstring=[mgm_node_ip]:1186 --backup_path=/srv/hops/mysql-cluster/ndb/backups/BACKUP/BACKUP-[backup_id] @@ -92,7 +119,10 @@ This command should be executed on all the RonDB datanodes. Each command should #### Rebuild the indices -In the first command we disable the indices for recovery. This last command will take care of enabling them again. This command needs to run only once on one of the nodes of the head node group. As for the commands above, you should replace node_id, backup_id and mgm_node_id. +In the first command we disable the indices for recovery. +This last command will take care of enabling them again. +This command needs to run only once on one of the nodes of the head node group. +As for the commands above, you should replace node_id, backup_id and mgm_node_id. ```sh /srv/hops/mysql/bin/ndb_restore -n [node_id] -b [backup_id] --rebuild-indexes --ndb-connectstring=[mgm_node_ip]:1186 --backup_path=/srv/hops/mysql-cluster/ndb/backups/BACKUP/BACKUP-[backup_ip] @@ -100,7 +130,9 @@ In the first command we disable the indices for recovery. This last command will #### Restore Users and Grants -In the backup phase, we took the backup of the user and grants separately. The last step of the RonDB restore process is to re-create all the users and grants both for Hopsworks services as well as for the online feature store users. 
This can be achieved by running the following command on one node of the head node group:
+In the backup phase, we took the backup of the users and grants separately.
+The last step of the RonDB restore process is to re-create all the users and grants both for Hopsworks services as well as for the online feature store users.
+This can be achieved by running the following command on one node of the head node group:

```sh
/srv/hops/mysql-cluster/ndb/scripts/mysql-client.sh source users.sql
@@ -108,17 +140,20 @@ In the backup phase, we took the backup of the user and grants separately. The l

### HopsFS restore

-With the metadata restored, you can now proceed to restore the file system blocks on HopsFS and restart the file system. When starting the datanode, it will advertise it’s ID/ClusterID and Storage ID based on the VERSION file that can be found in this directory:
-
+With the metadata restored, you can now proceed to restore the file system blocks on HopsFS and restart the file system.
+When starting the datanode, it will advertise its ID/ClusterID and Storage ID based on the VERSION file that can be found in this directory:
+
```sh
/srv/hopsworks-data/hops/hopsdata/hdfs/dn/current
```

-It’s important that all the datanodes are restored and they report their block to the namenodes processes running on the head nodes. By default the namenodes in HopsFS will exit “SAFE MODE” (i.e., the mode that allows only read operations) only when the datanodes have reported 99.9% of the blocks the namenodes have in the metadata. As such, the namenodes will not resume operations until all the file blocks have been restored.
+It’s important that all the datanodes are restored and that they report their blocks to the namenode processes running on the head nodes.
+By default the namenodes in HopsFS will exit “SAFE MODE” (i.e., the mode that allows only read operations) only when the datanodes have reported 99.9% of the blocks the namenodes have in the metadata. As such, the namenodes will not resume operations until all the file blocks have been restored.

### OpenSearch state rebuild

-The OpenSearch state can be rebuilt using the Hopsworks metadata stored on RonDB. The rebuild process is done by using the re-indexing mechanism provided by ePipe.
+The OpenSearch state can be rebuilt using the Hopsworks metadata stored on RonDB.
+The rebuild process is done by using the re-indexing mechanism provided by ePipe.
The re-indexing can be triggered by running the following command on the head node where ePipe is running:

```sh
@@ -129,11 +164,12 @@ The script is deployed and configured during the platform deployment.

### Kafka topics rebuild

-The backup and restore plan doesn’t cover the data in transit in Kafka, for which the jobs producing it will have to be replayed. However, the RonDB backup contains the information necessary to recreate the topics of all the feature groups.
+The backup and restore plan doesn’t cover the data in transit in Kafka, for which the jobs producing it will have to be replayed.
+However, the RonDB backup contains the information necessary to recreate the topics of all the feature groups.
You can run the following command, as super user, to recreate all the topics with the correct partitioning and replication factors:

```sh
/srv/hops/kafka/bin/kafka-restore.sh
```

-The script is deployed and configured during the platform deployment.
\ No newline at end of file
+The script is deployed and configured during the platform deployment.
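+
+After the restore completes it is worth verifying that the individual services report a healthy state before handing the cluster back to users.
+The commands below are only a sketch of such a sanity check; the Hadoop and Kafka client locations and the broker address are assumptions and may differ in your deployment.
+
+```sh
+# RonDB: all data nodes should show up as started
+/srv/hops/mysql-cluster/ndb/scripts/mgm-client.sh -e "show"
+
+# HopsFS: the namenodes should have left safe mode once the datanodes reported their blocks
+/srv/hops/hadoop/bin/hdfs dfsadmin -safemode get
+
+# Kafka: the feature group topics should be listed again after running kafka-restore.sh
+# (a TLS-secured broker will additionally need client security settings)
+/srv/hops/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --list
+```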
diff --git a/docs/setup_installation/admin/ha-dr/ha.md b/docs/setup_installation/admin/ha-dr/ha.md index b4a42d306..4200b2e9a 100644 --- a/docs/setup_installation/admin/ha-dr/ha.md +++ b/docs/setup_installation/admin/ha-dr/ha.md @@ -1,14 +1,24 @@ # High Availability -At a high level a Hopsworks cluster can be divided into 4 groups of nodes. Each node group should be deployed according to the requirements (e.g., 3/5/7 nodes for the head node group) to guarantee the availability of the components. +At a high level a Hopsworks cluster can be divided into 4 groups of nodes. +Each node group should be deployed according to the requirements (e.g., 3/5/7 nodes for the head node group) to guarantee the availability of the components. -* **Head nodes**: The head node is responsible for running all the metadata, public API, and user interface services that are required for Hopsworks to provide its functionality. They need to be deployed in an odd number (1, 3, 5) as the head nodes run services like Zookeeper and OpenSearch which enforce consistency through quorum based protocols. The head nodes are also responsible for managing the services running on the remaining group of nodes. -* **Worker nodes**: The worker node is responsible for executing the feature engineering pipeline code as well as storing the data for the offline feature store (HopsFS). In an on-prem deployment, the data is stored and replicated on the workers’ local hard drives. By default the data is replicated across 3 workers. In a cloud deployment, HopsFS’ data is persisted in a cloud object store (Amazon S3, Azure Blob Storage, Google Cloud Blob Storage) and the HopsFS datanodes are responsible for persisting, retrieving and caching of blocks from the object store. -* **RonDB Data nodes**: +- **Head nodes**: The head node is responsible for running all the metadata, public API, and user interface services that are required for Hopsworks to provide its functionality. + They need to be deployed in an odd number (1, 3, 5) as the head nodes run services like Zookeeper and OpenSearch which enforce consistency through quorum based protocols. + The head nodes are also responsible for managing the services running on the remaining group of nodes. +- **Worker nodes**: The worker node is responsible for executing the feature engineering pipeline code as well as storing the data for the offline feature store (HopsFS). + In an on-prem deployment, the data is stored and replicated on the workers’ local hard drives. + By default the data is replicated across 3 workers. + In a cloud deployment, HopsFS’ data is persisted in a cloud object store (Amazon S3, Azure Blob Storage, Google Cloud Blob Storage) and the HopsFS datanodes are responsible for persisting, retrieving and caching of blocks from the object store. +- **RonDB Data nodes**: These nodes are responsible for storing the services’ metadata (Hopsworks, HopsFS, Hive Metastore, Airflow) as well as the data for the online feature store. For high availability, at least two data nodes should be deployed and RonDB is typically configured with a replication factor of 2, as it uses synchronous replication with 2-phase commit, not a quorum-based replication protocol. More advanced deployment patterns and best practices are covered in the [RonDB documentation](https://docs.rondb.com). -* **Query brokers**: The query brokers are the entry point for querying the online feature store. 
They handle authentication, authorization and execution of the requests for online feature data being submitted from the feature store APIs. At least two query brokers should be deployed to achieve high availability. Query brokers are stateless. Additional query brokers should be deployed to handle additional load and clients. +- **Query brokers**: The query brokers are the entry point for querying the online feature store. + They handle authentication, authorization and execution of the requests for online feature data being submitted from the feature store APIs. + At least two query brokers should be deployed to achieve high availability. + Query brokers are stateless. + Additional query brokers should be deployed to handle additional load and clients. Example deployment: @@ -17,6 +27,7 @@ Example deployment:
Example High Available deployment
-For higher availability, a Hopsworks cluster should be deployed across multiple availability zones, however, a single cluster cannot be deployed across multiple regions. Multiple region deployments are out of the scope of this guide.
+For higher availability, a Hopsworks cluster should be deployed across multiple availability zones; however, a single cluster cannot be deployed across multiple regions.
+Multiple region deployments are out of the scope of this guide.

A different service placement is also possible, e.g., separating RonDB data nodes between metadata and online feature store or adding more replicas of a metadata service without necessarily adding a whole new head node, however, this is outside the scope of this guide.
diff --git a/docs/setup_installation/admin/ha-dr/intro.md b/docs/setup_installation/admin/ha-dr/intro.md
index 14c4fed8a..a654c7890 100644
--- a/docs/setup_installation/admin/ha-dr/intro.md
+++ b/docs/setup_installation/admin/ha-dr/intro.md
@@ -1,7 +1,9 @@
# Hopsworks High Availability and Disaster Recovery Documentation

-The Hopsworks Feature Store is the underlying component powering enterprise ML pipelines as well as serving feature data to model making user facing predictions. Sometimes the Hopsworks cluster can experience hardware failures or power loss, to help you plan for these occasions and avoid Hopsworks Feature Store downtime, we put together this guide. This guide is divided into three sections:
+The Hopsworks Feature Store is the underlying component powering enterprise ML pipelines as well as serving feature data to models making user-facing predictions.
+Sometimes the Hopsworks cluster can experience hardware failures or power loss. To help you plan for these occasions and avoid Hopsworks Feature Store downtime, we put together this guide.
+This guide is divided into three sections:

-* **High availability**: deployment patterns and best practices to make sure individual component failures do not impact the availability of the Hopsworks cluster.
-* **Backup**: configuration policies and best practices to make sure you have have fresh copy of the data and metadata in case of necessity
-* **Restore**: procedures and best practices to restore a previous backup if needed.
\ No newline at end of file
+- **High availability**: deployment patterns and best practices to make sure individual component failures do not impact the availability of the Hopsworks cluster.
+- **Backup**: configuration policies and best practices to make sure you have a fresh copy of the data and metadata in case of necessity.
+- **Restore**: procedures and best practices to restore a previous backup if needed.
diff --git a/docs/setup_installation/admin/index.md b/docs/setup_installation/admin/index.md
index d4d0c7b0d..23a3e809f 100644
--- a/docs/setup_installation/admin/index.md
+++ b/docs/setup_installation/admin/index.md
@@ -1,7 +1,7 @@
# Cluster Administration

-Hopsworks has a cluster management page that allows you, the administrator, to perform management actions, 
+Hopsworks has a cluster management page that allows you, the administrator, to perform management actions,
monitor and control Hopsworks.

-To access the cluster management page you should log in into Hopsworks using your administrator account.
-In the top right corner, click on your name in the top right corner of the navigation bar and choose Cluster Settings from the dropdown menu.
\ No newline at end of file
+To access the cluster management page you should log in to Hopsworks using your administrator account.
+Click on your name in the top right corner of the navigation bar and choose Cluster Settings from the dropdown menu.
diff --git a/docs/setup_installation/admin/ldap/configure-krb.md b/docs/setup_installation/admin/ldap/configure-krb.md
index 2b83cc823..d6730fd5e 100644
--- a/docs/setup_installation/admin/ldap/configure-krb.md
+++ b/docs/setup_installation/admin/ldap/configure-krb.md
@@ -1,19 +1,21 @@
# Configure Kerberos

## Introduction
+
Kerberos is a network authentication protocol that allow nodes to communicating over a non-secure network to prove
their identity to one another in a secure manner. This tutorial shows an administrator how to configure Kerberos
authentication. Kerberos need some server configuration before you can enable it from the UI.

## Prerequisites
-A server configured with Kerberos. See [Server Configuration for Kerberos](../configure-server/#server-configuration-for-kerberos) for
-instruction on how to do this.
+
+A server configured with Kerberos.
+See [Server Configuration for Kerberos](./configure-server.md#step-2-server-configuration-for-kerberos) for instructions on how to do this.

### Step 1: Enable Kerberos
-After configuring the server you can configure Authentication methods by clicking on your name in the top right
-corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu.
-In the _Authentication_ tab you can find in **Cluster Settings**, you can enable Kerberos by clicking on the Kerberos checkbox.
+
+After configuring the server you can configure Authentication methods by clicking on your name in the top right corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu.
+In the *Authentication* tab, which you can find in **Cluster Settings**, you can enable Kerberos by clicking on the Kerberos checkbox.

If LDAP/Kerberos checkbox is not checked, make sure that you configured your application server and enable it by
clicking on the checkbox.

@@ -24,6 +26,7 @@ clicking on the checkbox.
### Step 2: Edit configuration + Finally, click on edit configuration and fill in the attributes.
@@ -31,23 +34,34 @@ Finally, click on edit configuration and fill in the attributes.
Configure Kerberos
-- Account status: the status a user will be assigned when logging in for the first time. If a user is assigned a status - different from _Activated_ an admin needs to manually activate each user from the [User management](../../user). -- Group mapping: allows you to specify a mapping between LDAP groups and Hopsworks groups. The mapping is a - semicolon separated string in the form ```Directory Administrators->HOPS_ADMIN;IT People-> HOPS_USER```. Default - is empty. If no mapping is specified, users need to be assigned a role by an admin before they can log in. -- User id: the id field in LDAP with a string placeholder. Default ```uid=%s```. -- User given name: the given name field in LDAP. Default ```givenName```. -- User surname: the surname field in LDAP. Default ```sn```. -- User email: the email field in LDAP. Default ```mail```. -- User search filter: the search filter for user. Default ```uid=%s```. -- Principal search filter: the search filter for principal name. Default ```krbPrincipalName=%s```. -- Group search filter: the search filter for groups. Default ```member=%d```. -- Group target: the target to search for groups in the LDAP directory tree. Default ```cn```. -- Dynamic group target: the target to search for dynamic groups in the LDAP directory tree. Default ```memberOf```. -- User dn: specify the distinguished name (DN) of the container or base point where the users are stored. Default is - empty. -- Group dn: specify the DN of the container or base point where the groups are stored. Default is empty. +- Account status: the status a user will be assigned when logging in for the first time. + If a user is assigned a status different from *Activated* an admin needs to manually activate each user from the [User management](../user.md). +- Group mapping: allows you to specify a mapping between LDAP groups and Hopsworks groups. + The mapping is a semicolon separated string in the form ```Directory Administrators->HOPS_ADMIN;IT People-> HOPS_USER```. + Default is empty. + If no mapping is specified, users need to be assigned a role by an admin before they can log in. +- User id: the id field in LDAP with a string placeholder. + Default ```uid=%s```. +- User given name: the given name field in LDAP. + Default ```givenName```. +- User surname: the surname field in LDAP. + Default ```sn```. +- User email: the email field in LDAP. + Default ```mail```. +- User search filter: the search filter for user. + Default ```uid=%s```. +- Principal search filter: the search filter for principal name. + Default ```krbPrincipalName=%s```. +- Group search filter: the search filter for groups. + Default ```member=%d```. +- Group target: the target to search for groups in the LDAP directory tree. + Default ```cn```. +- Dynamic group target: the target to search for dynamic groups in the LDAP directory tree. + Default ```memberOf```. +- User dn: specify the distinguished name (DN) of the container or base point where the users are stored. + Default is empty. +- Group dn: specify the DN of the container or base point where the groups are stored. + Default is empty. All defaults are taken from [OpenLDAP](https://www.openldap.org/). 
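+
+Before enabling Kerberos, it can be useful to check the filters above directly against your directory.
+The ldapsearch call below is only a sketch; the host, base DN, bind credentials and the sample user are placeholders to replace with your own values.
+
+```bash
+# Resolve a user with the default user and principal search filters (uid=%s, krbPrincipalName=%s)
+ldapsearch -LLL -H ldap://ldap.example.com -b "dc=example,dc=com" \
+  -D "cn=admin,dc=example,dc=com" -w 'admin_password' \
+  "(|(uid=jdoe)(krbPrincipalName=jdoe@EXAMPLE.COM))" givenName sn mail memberOf
+```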
diff --git a/docs/setup_installation/admin/ldap/configure-ldap.md b/docs/setup_installation/admin/ldap/configure-ldap.md
index 10f0c9a33..b5fbf19af 100644
--- a/docs/setup_installation/admin/ldap/configure-ldap.md
+++ b/docs/setup_installation/admin/ldap/configure-ldap.md
@@ -1,23 +1,23 @@
# Configure LDAP/Kerberos

## Introduction
-LDAP (Lightweight Directory Access Protocol) is a software protocol for enabling anyone in a network to gain
-access to resources such as files and devices. This tutorial shows an administrator how to configure LDAP authentication.
+LDAP (Lightweight Directory Access Protocol) is a software protocol for enabling anyone in a network to gain access to resources such as files and devices.
+This tutorial shows an administrator how to configure LDAP authentication.
LDAP need some server configuration before you can enable it from the UI.

## Prerequisites
-A server configured with LDAP. See [Server Configuration for LDAP](../configure-server/#server-configuration-for-ldap) for
-instruction on how to do this.
+
+A server configured with LDAP.
+See [Server Configuration for LDAP](./configure-server.md#step-1-server-configuration-for-ldap) for instructions on how to do this.

### Step 1: Enable LDAP
-After configuring the server you can configure Authentication methods by clicking on your name in the top right
-corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu.
-In the _Authentication_ tab you can find in **Cluster Settings**, you can enable LDAP by clicking on the LDAP checkbox.
-If LDAP/Kerberos checkbox is not checked make sure that you configured your application server and enable it by
-clicking on the checkbox.
+After configuring the server you can configure Authentication methods by clicking on your name in the top right corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu.
+In the *Authentication* tab, which you can find in **Cluster Settings**, you can enable LDAP by clicking on the LDAP checkbox.
+
+If the LDAP/Kerberos checkbox is not checked, make sure that you configured your application server and enable it by clicking on the checkbox.
Authentication config @@ -25,6 +25,7 @@ clicking on the checkbox.
### Step 2: Edit configuration + Finally, click on edit configuration and fill in the attributes.
@@ -32,22 +33,32 @@ Finally, click on edit configuration and fill in the attributes.
Configure LDAP
-- Account status: the status a user will be assigned when logging in for the first time. If a use is assigned a status - different from _Activated_ an admin needs to manually activate each user from the [User management](../../user). -- Group mapping: allows you to specify a mapping between LDAP groups and Hopsworks groups. The mapping is a - semicolon separated string in the form ```Directory Administrators->HOPS_ADMIN;IT People-> HOPS_USER```. Default - is empty. If no mapping is specified, users need to be assigned a role by an admin before they can log in. -- User id: the id field in LDAP with a string placeholder. Default ```uid=%s```. -- User given name: the given name field in LDAP. Default ```givenName```. -- User surname: the surname field in LDAP. Default ```sn```. -- User email: the email field in LDAP. Default ```mail```. -- User search filter: the search filter for user. Default ```uid=%s```. -- Group search filter: the search filter for groups. Default ```member=%d```. -- Group target: the target to search for groups in the LDAP directory tree. Default ```cn```. -- Dynamic group target: the target to search for dynamic groups in the LDAP directory tree. Default ```memberOf```. -- User dn: specify the distinguished name (DN) of the container or base point where the users are stored. Default is - empty. -- Group dn: specify the DN of the container or base point where the groups are stored. Default is empty. +- Account status: the status a user will be assigned when logging in for the first time. + If a user is assigned a status different from *Activated* an admin needs to manually activate each user from the [User management](../user.md). +- Group mapping: allows you to specify a mapping between LDAP groups and Hopsworks groups. + The mapping is a semicolon separated string in the form ```Directory Administrators->HOPS_ADMIN;IT People-> HOPS_USER```. + Default is empty. + If no mapping is specified, users need to be assigned a role by an admin before they can log in. +- User id: the id field in LDAP with a string placeholder. + Default ```uid=%s```. +- User given name: the given name field in LDAP. + Default ```givenName```. +- User surname: the surname field in LDAP. + Default ```sn```. +- User email: the email field in LDAP. + Default ```mail```. +- User search filter: the search filter for user. + Default ```uid=%s```. +- Group search filter: the search filter for groups. + Default ```member=%d```. +- Group target: the target to search for groups in the LDAP directory tree. + Default ```cn```. +- Dynamic group target: the target to search for dynamic groups in the LDAP directory tree. + Default ```memberOf```. +- User dn: specify the distinguished name (DN) of the container or base point where the users are stored. + Default is empty. +- Group dn: specify the DN of the container or base point where the groups are stored. + Default is empty. All defaults are taken from [OpenLDAP](https://www.openldap.org/). diff --git a/docs/setup_installation/admin/ldap/configure-project-mapping.md b/docs/setup_installation/admin/ldap/configure-project-mapping.md index 9d74129af..b92cbef8d 100644 --- a/docs/setup_installation/admin/ldap/configure-project-mapping.md +++ b/docs/setup_installation/admin/ldap/configure-project-mapping.md @@ -1,15 +1,19 @@ # Configure LDAP/Kerberos group to project mapping - + ## Introduction -A group-to-project mapping lets you automatically add all members of an LDAP group to a project, eliminating the need to add each user individually. 
To create a mapping, you simply select the LDAP group, choose the project it should be linked to, and assign the role that its members will have within that project. +A group-to-project mapping lets you automatically add all members of an LDAP group to a project, eliminating the need to add each user individually. +To create a mapping, you simply select the LDAP group, choose the project it should be linked to, and assign the role that its members will have within that project. + +Once a mapping is created, project membership is controlled through LDAP group membership. +Any updates made to the LDAP group—such as adding or removing users—will automatically be reflected in Hopsworks. +For example, if a user is removed from the LDAP group, they will also be removed from the corresponding project. -Once a mapping is created, project membership is controlled through LDAP group membership. Any updates made to the LDAP group—such as adding or removing users—will automatically be reflected in Hopsworks. For example, if a user is removed from the LDAP group, they will also be removed from the corresponding project. - ## Prerequisites -1. A server configured with LDAP or Kerberos. See [Server Configuration for Kerberos](../configure-server/#server-configuration-for-kerberos) and -[Server Configuration for LDAP](../configure-server/#server-configuration-for-ldap) for instructions on how to do this. -2. LDAP group mapping sync enabled. This can be done by setting the variable ```ldap_group_mapping_sync_enabled=true```. + +1. A server configured with LDAP or Kerberos. See [Server Configuration for Kerberos](./configure-server.md#step-2-server-configuration-for-kerberos) and [Server Configuration for LDAP](./configure-server.md#step-1-server-configuration-for-ldap) for instructions on how to do this. +2. LDAP group mapping sync enabled. This can be done by setting the variable ```ldap_group_mapping_sync_enabled=true```. + See [Cluster Configuration](../variables.md) on how to change variable values in Hopsworks.
@@ -26,19 +30,19 @@ If you can not find the variable ```ldap_group_mapping_sync_enabled``` create it
Create ldap mapping enabled variable
- + ### Step 1: Create a mapping -To create a mapping go to **Cluster Settings** by clicking on your name in the top right -corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu. -In the _Project mapping_ tab, you can create a new mapping by clicking on _Create new mapping_. - + +To create a mapping go to **Cluster Settings** by clicking on your name in the top right corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu. +In the *Project mapping* tab, you can create a new mapping by clicking on *Create new mapping*. +
Project mapping tab
Project mapping
- + This will take you to the create mapping page shown below
@@ -46,53 +50,55 @@ This will take you to the create mapping page shown below
Create mapping
- -Here you can choose from your LDAP groups and map them to a project from the _Project_ drop down list. -You can also choose the _Project role_ users will be assigned when they are added to the project. - -Finally, click on _Create mapping_ and go back to mappings. You should see the newly created mapping(s) as shown below. - + +Here you can choose from your LDAP groups and map them to a project from the *Project* drop down list. +You can also choose the *Project role* users will be assigned when they are added to the project. + +Finally, click on *Create mapping* and go back to mappings. You should see the newly created mapping(s) as shown below. +
Project mappings
Project mappings
- + !!!Note - If there are no groups in the _Remote group_ drop down list check if **ldap_groups_search_filter** is correct by using the value - in ```ldapsearch``` replacing ```%c``` with ```*```, as shown in the example below. - - ```ldapsearch -LLL -H ldap:/// -b '' -D '' -w '(&(objectClass=groupOfNames)(cn=*))'``` + If there are no groups in the *Remote group* drop down list check if **ldap_groups_search_filter** is correct by using the value in ```ldapsearch``` replacing ```%c``` with ```*```, as shown in the example below. + + ```bash + ldapsearch -LLL -H ldap:/// -b '' -D '' -w '(&(objectClass=groupOfNames)(cn=*))' + ``` - This should return all the groups in your LDAP. - + This should return all the groups in your LDAP. See [Cluster Configuration](../variables.md) on how to find and update the value of this variable. - + ### Step 2: Edit a mapping - -From the list of mappings click on the edit button (:material-pencil:). This will open a popup that will allow you to change the _remote group_, _project name_, and _project role_ of a mapping. - + +From the list of mappings click on the edit button (:material-pencil:). +This will open a popup that will allow you to change the *remote group*, *project name*, and *project role* of a mapping. +
Edit mapping
Edit mapping
- -!!!Warning - Updating a mapping's _remote group_ or _project name_ will remove all members of the previous group from the project. - + +!!! Warning + Updating a mapping's *remote group* or *project name* will remove all members of the previous group from the project. + ### Step 3: Delete a mapping - + To delete a mapping click on the delete button. - -!!!Warning + +!!! Warning Deleting a mapping will remove all members of that group from the project. -### Step 4: Configure sync interval +### Step 4: Configure sync interval After configuring all the group mappings users will be added to or removed from the projects in the mapping when they login to Hopsworks. -It is also possible to synchronize mappings without requiring users to log out. This can be done by setting ```ldap_group_mapping_sync_interval``` -to an interval greater or equal to 2 minutes. If ```ldap_group_mapping_sync_interval``` is set group mapping sync will run periodically based on the interval and +It is also possible to synchronize mappings without requiring users to log out. +This can be done by setting ```ldap_group_mapping_sync_interval``` to an interval greater or equal to 2 minutes. +If ```ldap_group_mapping_sync_interval``` is set group mapping sync will run periodically based on the interval and add or remove users from projects. diff --git a/docs/setup_installation/admin/ldap/configure-server.md b/docs/setup_installation/admin/ldap/configure-server.md index 0ea08d3f1..0e7ec17ba 100644 --- a/docs/setup_installation/admin/ldap/configure-server.md +++ b/docs/setup_installation/admin/ldap/configure-server.md @@ -1,18 +1,19 @@ # Configure Server for LDAP and Kerberos ## Introduction -LDAP and Kerberos integration need some configuration in the helm charts for your -cluster definition used to deploy your Hopsworks cluster. This tutorial shows an administrator how to configure the application -server for LDAP and Kerberos integration. + +LDAP and Kerberos integration need some configuration in the helm charts for your cluster definition used to deploy your Hopsworks cluster. +This tutorial shows an administrator how to configure the application server for LDAP and Kerberos integration. ## Prerequisites -An accessible LDAP domain. + +An accessible LDAP domain. A Kerberos Key Distribution Center (KDC) running on the same domain as Hopsworks (Only for Kerberos). ### Step 1: Server Configuration for LDAP -The LDAP attributes below are used to configure JNDI external resource in Payara. The JNDI resource will communicate -with your LDAP server to perform the authentication. +The LDAP attributes below are used to configure JNDI external resource in Payara. +The JNDI resource will communicate with your LDAP server to perform the authentication. ```yaml ldap: @@ -34,8 +35,9 @@ ldap: - security_credentials: contains the password of the user that will be used to query LDAP. - referral: whether to follow or ignore an alternate location in which an LDAP Request may be processed. -An already deployed instance can be configured to connect to LDAP. -Go to the payara admin UI and create a new JNDI external resource. The name of the resource should be __ldap/LdapResource__. +An already deployed instance can be configured to connect to LDAP. +Go to the payara admin UI and create a new JNDI external resource. +The name of the resource should be __ldap/LdapResource__.
LDAP Resource @@ -61,9 +63,9 @@ asadmin create-jndi-resource \ ### Step 2: Server Configuration for Kerberos -The Kerberos attributes are used to configure [SPNEGO](http://spnego.sourceforge.net/). -SPNEGO is used to establish a secure context between the requester and the application server when using Kerberos -authentication. +The Kerberos attributes are used to configure [SPNEGO](https://spnego.sourceforge.net/). +SPNEGO is used to establish a secure context between the requester and the application server when using Kerberos +authentication. ```yaml kerberos: @@ -83,12 +85,13 @@ ldap: additional_props: "" ``` -Both Kerberos and LDAP attributes need to be specified to configure Kerberos. The LDAP attributes are explained above. +Both Kerberos and LDAP attributes need to be specified to configure Kerberos. +The LDAP attributes are explained above. -- krb_conf_path: contains the path to the krb5.conf used by SPNEGO to get information about the default domain and the - location of the Kerberos KDC. The file is copied by the recipe in to /srv/hops/domains/domain1/config. -- krb_server_key_tab_path: contains the path to the Kerberos service keytab. The keytab is copied by the recipe in to - /srv/hops/domains/domain/config with the name set in the **krb_server_key_tab_name** attribute. -- spnego_server_conf: contains the configuration that will be appended to Payara's (application serve used to host hopsworks) - login.conf. In particular, it should contain useKeyTab=true, and the principal name to be used in the authentication phase. +- krb_conf_path: contains the path to the krb5.conf used by SPNEGO to get information about the default domain and the location of the Kerberos KDC. + The file is copied by the recipe in to /srv/hops/domains/domain1/config. +- krb_server_key_tab_path: contains the path to the Kerberos service keytab. + The keytab is copied by the recipe in to /srv/hops/domains/domain/config with the name set in the __krb_server_key_tab_name__ attribute. +- spnego_server_conf: contains the configuration that will be appended to Payara's (application serve used to host hopsworks) login.conf. + In particular, it should contain useKeyTab=true, and the principal name to be used in the authentication phase. Initiator should be set to false. diff --git a/docs/setup_installation/admin/monitoring/export-metrics.md b/docs/setup_installation/admin/monitoring/export-metrics.md index dd94d72fc..dd818a688 100644 --- a/docs/setup_installation/admin/monitoring/export-metrics.md +++ b/docs/setup_installation/admin/monitoring/export-metrics.md @@ -1,31 +1,39 @@ # Exporting Hopsworks metrics ## Introduction -Hopsworks services produce metrics which are centrally gathered by [Prometheus](https://prometheus.io/) and visualized in [Grafana](../grafana). + +Hopsworks services produce metrics which are centrally gathered by [Prometheus](https://prometheus.io/) and visualized in [Grafana](./grafana.md). Although the system is self-contained, it is possible for another *federated* Prometheus instance to scrape these metrics or directly push them to another system. This is useful if you have a centralized monitoring system with already configured alerts. ## Prerequisites + In order to configure Prometheus to export metrics you need to have the right to change the remote Prometheus configuration. ## Exporting metrics + Prometheus can be configured to export metrics to another Prometheus instance (cross-service federation) or to a custom service which knows how to handle them. 
### Prometheus federation + Prometheus servers can be federated to scale better or to just clone all metrics (cross-service federation). In the guide below we assume **Prometheus A** is the service running in Hopsworks and **Prometheus B** is the server you want to clone metrics to. #### Step 1 -**Prometheus B** needs to be able to connect to TCP port `9090` of **Prometheus A** to scrape metrics. If you have any firewall (or Security Group) in place, allow ingress for that port. + +**Prometheus B** needs to be able to connect to TCP port `9090` of **Prometheus A** to scrape metrics. +If you have any firewall (or Security Group) in place, allow ingress for that port. #### Step 2 -The next step is to expose **Prometheus A** running inside Hopsworks Kubernetes cluster. If **Prometheus B** has direct access to **Prometheus A** then you can skip this step. + +The next step is to expose **Prometheus A** running inside Hopsworks Kubernetes cluster. +If **Prometheus B** has direct access to **Prometheus A** then you can skip this step. We will create a Kubernetes *Service* of type *LoadBalancer* to expose port `9090` !!!Warning - If you need to apply custom **annotations**, then modify the Manifest below + If you need to apply custom **annotations**, then modify the Manifest below. The example below assumes Hopsworks is **installed** at Namespace *hopsworks* ```bash @@ -49,7 +57,7 @@ spec: EOF ``` -Then we need to find the External IP address of the newly created Service +Then we need to find the External IP address of the newly created Service: ```bash export NAMESPACE=hopsworks @@ -57,16 +65,17 @@ kubectl -n $NAMESPACE get svc prometheus-external -ojsonpath='{.status.loadBalan ``` !!!Warning - It will take a few seconds until an IP address is assigned to the Service + It will take a few seconds until an IP address is assigned to the Service. -We will use this IP address in Step 2 +We will use this IP address in Step 3. -#### Step 2 -Edit the configuration file of **Prometheus B** server and append the following Job under `scrape_configs` +#### Step 3 + +Edit the configuration file of **Prometheus B** server and append the following Job under `scrape_configs`: !!! note - Replace IP_ADDRESS with the IP address from Step 1 or the IP address of Prometheus service if it is directly accessible. - The snippet below assumes Hopsworks services runs at Namespace **hopsworks** + Replace IP_ADDRESS with the IP address from Step 2 or the IP address of Prometheus service if it is directly accessible. + The snippet below assumes Hopsworks services runs at Namespace **hopsworks**. ```yaml - job_name: 'federate' @@ -84,8 +93,8 @@ Edit the configuration file of **Prometheus B** server and append the following - 'IP_ADDRESS:9090' ``` -The configuration above will scrape for services metrics under the *hopsworks* Namespace. If you want to additionally -scrape *user application* metrics then append `'{job="pushgateway"}'` to the matchers, for example: +The configuration above will scrape for services metrics under the *hopsworks* Namespace. +If you want to additionally scrape *user application* metrics then append `'{job="pushgateway"}'` to the matchers, for example: ```yaml params: @@ -95,16 +104,19 @@ scrape *user application* metrics then append `'{job="pushgateway"}'` to the mat ``` Depending on the Prometheus setup you might need to restart **Prometheus B** service to pick up the new configuration. 
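+
+Before restarting, you can quickly check from the **Prometheus B** host that the federate endpoint is reachable and returns samples.
+The curl call below is only a sketch; replace IP_ADDRESS as above, and adjust the matcher if you changed it in the scrape job.
+
+```bash
+# Should print a plain-text dump of the federated series for the hopsworks Namespace
+curl -sG 'http://IP_ADDRESS:9090/federate' --data-urlencode 'match[]={namespace="hopsworks"}' | head
+```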
-For more details on federation visit Prometheus [documentation](https://prometheus.io/docs/prometheus/latest/federation/#cross-service-federation) +For more details on federation visit Prometheus [documentation](https://prometheus.io/docs/prometheus/latest/federation/#cross-service-federation). ### Custom service -Prometheus can push metrics to another custom resource via HTTP. The custom service is responsible for handling the received metrics. + +Prometheus can push metrics to another custom resource via HTTP. +The custom service is responsible for handling the received metrics. To push metrics with this method we use the `remote_write` configuration. -We will only give a sample configuration as `remote_write` is extensively documented in Prometheus [documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) +We will only give a sample configuration as `remote_write` is extensively documented in Prometheus [documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). In the example below we push metrics to a custom service listening on port 9096 which transforms the metrics and forwards them. -In order to configure Prometheus to push metrics to a remote HTTP service we need to customize our Helm chart values file with the following snippet after changing the *url* accordingly. You can also tweak other configuration parameters to your needs. +In order to configure Prometheus to push metrics to a remote HTTP service we need to customize our Helm chart values file with the following snippet after changing the *url* accordingly. +You can also tweak other configuration parameters to your needs. ```yaml prometheus: @@ -120,4 +132,4 @@ prometheus: If the section already exists, then append the `remoteWrite` section. -Run `helm install` or `helm upgrade` if it's the first time you install Hopsworks or you want to apply the change to an existing cluster respectively. \ No newline at end of file +Run `helm install` or `helm upgrade` if it's the first time you install Hopsworks or you want to apply the change to an existing cluster respectively. diff --git a/docs/setup_installation/admin/monitoring/grafana.md b/docs/setup_installation/admin/monitoring/grafana.md index 7a6d0c9c0..917467a81 100644 --- a/docs/setup_installation/admin/monitoring/grafana.md +++ b/docs/setup_installation/admin/monitoring/grafana.md @@ -1,8 +1,9 @@ -# Services Dashboards +# Services Dashboards ## Introduction -The Hopsworks platform is composed of different services. Hopsworks uses Prometheus to collect health and performance metrics from the different services and Grafana to display them to the Hopsworks administrators. +The Hopsworks platform is composed of different services. +Hopsworks uses Prometheus to collect health and performance metrics from the different services and Grafana to display them to the Hopsworks administrators. In this guide you will learn how to access the Grafana dashboards to monitor the health of the cluster or to troubleshoot performance issues. @@ -10,11 +11,12 @@ In this guide you will learn how to access the Grafana dashboards to monitor the To access the services dashboards in Grafana, you need to have an administrator account on the Hopsworks cluster. -## Step 1: Access Grafana +## Step 1: Access Grafana You can access the admin page of your Hopsworks cluster by clicking on your name, in the top right corner, and choosing _Cluster Settings_ from the dropdown menu. 
-You can then navigate to the _Monitoring_ tab. The _Monitoring_ tab gives you access to several of the observability tools that are already deployed to help you manage the health of the cluster. +You can then navigate to the _Monitoring_ tab. +The _Monitoring_ tab gives you access to several of the observability tools that are already deployed to help you manage the health of the cluster.
monitoring tab
@@ -29,9 +31,10 @@ In the Grafana web application, you can click on the _Home_ button on the top le

Dashboards are organized into three folders:

-- **Hops**: This folder contains all the dashboards of the Hopsworks services (e.g. the web application, the file system, resource manager) as well as the dashboards of the hosts (e.g. EC2 instances, virtual machines, servers) on which the cluster is deployed.
+- **Hops**: This folder contains all the dashboards of the Hopsworks services (e.g., the web application, the file system, resource manager) as well as the dashboards of the hosts (e.g., EC2 instances, virtual machines, servers) on which the cluster is deployed.

-- **RonDB**: This folder contains all the dashboard related to the database. The _Database_ dashboard contains a general overview of the RonDB cluster, while the remaining dashboards focus on specific items (e.g. thread activity, memory management, etc).
+- **RonDB**: This folder contains all the dashboards related to the database.
+The _Database_ dashboard contains a general overview of the RonDB cluster, while the remaining dashboards focus on specific items (e.g., thread activity, memory management, etc.).

- **Kubernetes**: If you have integrated Hopsworks with a Kubernetes cluster, this folder contains the dashboards to monitor the health of the Kubernetes cluster.

@@ -40,11 +43,14 @@
Grafana view
-The default dashboards are read only and cannot be edited. Additional dashboards can be created by logging in to Grafana. You can log in into Grafana using the username and password specified in the cluster definition. +The default dashboards are read only and cannot be edited. +Additional dashboards can be created by logging in to Grafana. +You can log in into Grafana using the username and password specified in the cluster definition. !!! warning - By default Hopsworks keeps metrics information only for the past 15 days. This means that, by default, you will not be able to access health and performance metrics which are older than 15 days. + By default Hopsworks keeps metrics information only for the past 15 days. + This means that, by default, you will not be able to access health and performance metrics which are older than 15 days. ## Going Further diff --git a/docs/setup_installation/admin/monitoring/services-logs.md b/docs/setup_installation/admin/monitoring/services-logs.md index 79a0cfef9..4a2f46612 100644 --- a/docs/setup_installation/admin/monitoring/services-logs.md +++ b/docs/setup_installation/admin/monitoring/services-logs.md @@ -1,8 +1,9 @@ -# Services Logs +# Services Logs ## Introduction -The Hopsworks platform is composed of different services running on different nodes. Hopsworks uses Filebeat, Logstash and OpenSearch to collect, parse, index and present the logs to the Hopsworks administrators. +The Hopsworks platform is composed of different services running on different nodes. +Hopsworks uses Filebeat, Logstash and OpenSearch to collect, parse, index and present the logs to the Hopsworks administrators. In this guide you will learn how to access the Hopsworks logs using OpenSearch Dashboards. @@ -10,11 +11,12 @@ In this guide you will learn how to access the Hopsworks logs using OpenSearch D To access the services logs, you need to have an administrator account on the Hopsworks cluster. -## Step 1: Access the Logs +## Step 1: Access the Logs You can access the admin page of your Hopsworks cluster by clicking on your name, in the top right corner, and choosing _Cluster Settings_ from the dropdown menu. -You can then navigate to the _Monitoring_ tab. The _Monitoring_ tab gives you access to several of the observability tools that are already deployed to help you manage the health of the cluster. +You can then navigate to the _Monitoring_ tab. +The _Monitoring_ tab gives you access to several of the observability tools that are already deployed to help you manage the health of the cluster.
monitoring tab
@@ -25,20 +27,24 @@ Click on the _Service Logs_ link to open the OpenSearch Dashboards web applicati

## Step 2: Search the logs

-In the OpenSearch dashboard web application you will see by default all the logs generated by all monitored services in the last 15 minutes.
+In the OpenSearch Dashboards web application you will see by default all the logs generated by all monitored services in the last 15 minutes.

-You can filter the logs of a specific service by searching for the term `service:[service name]`. As shown in the picture below, you can search for the _namenode_ logs by querying `service:namenode`.
+You can filter the logs of a specific service by searching for the term `service:[service name]`.
+As shown in the picture below, you can search for the _namenode_ logs by querying `service:namenode`.

-Currently only the logs of the following services are collected and indexed: Hopsworks web application (called `domain1` in the log entries), namenodes, resource managers, datanodes, nodemanagers, Kafka brokers, Hive services and RonDB. These are the core component of the platform, additional logs will be added in the future.
+Currently only the logs of the following services are collected and indexed: Hopsworks web application (called `domain1` in the log entries), namenodes, resource managers, datanodes, nodemanagers, Kafka brokers, Hive services and RonDB.
+These are the core components of the platform; additional logs will be added in the future.
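+
+Search terms can be combined with the OpenSearch Dashboards query language, for example to narrow a service filter down to specific messages.
+The queries below are only illustrative; the quoted message fragment is a placeholder, not a predefined field.
+
+```
+service:namenode and "SafeModeException"
+service:domain1 or service:resourcemanager
+```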
OpenSearch Dashboards with services logs
OpenSearch Dashboards displaying the logs
-!!! warning +!!! warning - By default, logs are rotated automatically after 7 days. This means that by default, you will not be able to access logs through OpenSearch Dashboards which are older than 7 days. Depending on the service and on the Hopsworks configuration, you can still access the logs by SSH directly into the machines of the cluster. + By default, logs are rotated automatically after 7 days. + This means that by default, you will not be able to access logs through OpenSearch Dashboards which are older than 7 days. + Depending on the service and on the Hopsworks configuration, you can still access the logs by SSH directly into the machines of the cluster. ## Going Further diff --git a/docs/setup_installation/admin/oauth2/configure-project-mapping.md b/docs/setup_installation/admin/oauth2/configure-project-mapping.md index d718c6523..590bdbd0c 100644 --- a/docs/setup_installation/admin/oauth2/configure-project-mapping.md +++ b/docs/setup_installation/admin/oauth2/configure-project-mapping.md @@ -1,13 +1,13 @@ # Configure OAuth2 group to project mapping - + ## Introduction A group-to-project mapping lets you automatically add all members of an OAuth2 group to a project, eliminating the need to add each user individually. To create a mapping, you simply select an OAuth2 group, choose the project it should be linked to, and assign the role that its members will have within that project. Once a mapping is created, project membership is controlled through OAuth2 group membership. Any updates made to the OAuth2 group—such as adding or removing users—will automatically be reflected in Hopsworks. For example, if a user is removed from the OAuth2 group, they will also be removed from the corresponding project. - + ## Prerequisites -1. A server configured with OAuth2. See [Register Identity Provider in Hopsworks](../create-client) for instructions on how to do this. -2. OAuth2 group mapping sync enabled. This can be done by setting the variable ```oauth_group_mapping_sync_enabled=true```. +1. A server configured with OAuth2. See [Register Identity Provider in Hopsworks](./create-client.md) for instructions on how to do this. +2. OAuth2 group mapping sync enabled. This can be done by setting the variable ```oauth_group_mapping_sync_enabled=true```. See [Cluster Configuration](../variables.md) on how to change variable values in Hopsworks.
@@ -24,19 +24,19 @@ If you can not find the variable ```oauth_group_mapping_sync_enabled``` create i
Create OAuth2 mapping enabled variable
- + ### Step 1: Create a mapping To create a mapping go to **Cluster Settings** by clicking on your name in the top right corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu. In the _Project mapping_ tab, you can create a new mapping by clicking on _Create new mapping_. - +
Project mapping tab
Project mapping
- + This will take you to the create mapping page shown below
@@ -44,41 +44,41 @@ This will take you to the create mapping page shown below
Create mapping
- + Here you can enter your OAuth2 group and map it to a project from the _Project_ drop down list. You can also choose the _Project role_ users will be assigned when they are added to the project. - + Finally, click on _Create mapping_ and go back to mappings. You should see the newly created mapping(s) as shown below. - +
Project mappings
Project mappings
- + !!!Note - Make sure the group names from your OAuth2 provider match the one you entered above. + Make sure the group names from your OAuth2 provider match the one you entered above. If your identity provider uses a claim name other than ```groups``` or ```roles``` to represent group information, be sure to specify that claim name in the **Group Claim** field when setting up your identity provider. - + ### Step 2: Edit a mapping - + From the list of mappings click on the edit button (:material-pencil:). This will open a popup that will allow you to change the _remote group_, _project name_, and _project role_ of a mapping. - +
Edit mapping
Edit mapping
- + !!!Warning Updating a mapping's _remote group_ or _project name_ will remove all members of the previous group from the project. - + ### Step 3: Delete a mapping - + To delete a mapping click on the delete button. - + !!!Warning Deleting a mapping will remove all members of that group from the project. \ No newline at end of file diff --git a/docs/setup_installation/admin/oauth2/create-azure-client.md b/docs/setup_installation/admin/oauth2/create-azure-client.md index b9f75f375..dc55a98ae 100644 --- a/docs/setup_installation/admin/oauth2/create-azure-client.md +++ b/docs/setup_installation/admin/oauth2/create-azure-client.md @@ -1,17 +1,23 @@ -# Create An Application in Azure Active Directory. +# Create An Application in Azure Active Directory ## Introduction -This example uses Azure Active Directory as the identity provider, but the same can be done with any identity provider + +This example uses Azure Active Directory as the identity provider, but the same can be done with any identity provider supporting OAuth2 OpenID Connect protocol. ## Prerequisites + Azure account. ### Step 1: Register Hopsworks as an application in your identity provider -To use OAuth2 in Hopsworks you first need to create and configure an OAuth client in your identity provider. We will take the example of Azure AD for the remaining of this documentation, but equivalent steps can be taken on other identity providers. +To use OAuth2 in Hopsworks you first need to create and configure an OAuth client in your identity provider. +We will take the example of Azure AD for the remaining of this documentation, but equivalent steps can be taken on other identity providers. -Navigate to the [Microsoft Azure Portal](https://portal.azure.com) and authenticate. Navigate to [Azure Active Directory](https://portal.azure.com/#blade/Microsoft_AAD_IAM/ActiveDirectoryMenuBlade/Overview). Click on [App Registrations](https://portal.azure.com/#blade/Microsoft_AAD_IAM/ActiveDirectoryMenuBlade/RegisteredApps). Click on *New Registration*. +Navigate to the [Microsoft Azure Portal](https://portal.azure.com) and authenticate. +Navigate to [Azure Active Directory](https://portal.azure.com/#blade/Microsoft_AAD_IAM/ActiveDirectoryMenuBlade/Overview). +Click on [App Registrations](https://portal.azure.com/#blade/Microsoft_AAD_IAM/ActiveDirectoryMenuBlade/RegisteredApps). +Click on *New Registration*.

@@ -20,7 +26,9 @@ Navigate to the [Microsoft Azure Portal](https://portal.azure.com) and authentic

-Enter a name for the client such as *hopsworks_oauth_client*. Verify the Supported account type is set to *Accounts in this organizational directory only*. And Click Register. +Enter a name for the client such as *hopsworks_oauth_client*. +Verify the Supported account type is set to *Accounts in this organizational directory only*. +Click Register.
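If you prefer to script the registration rather than click through the portal, a rough Azure CLI equivalent is sketched below. The application name is the example used above and the exact flags can differ between Azure CLI versions, so treat this as an illustration only.

```bash
# Register an application limited to accounts in this directory only
az ad app create \
  --display-name hopsworks_oauth_client \
  --sign-in-audience AzureADMyOrg
```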

@@ -30,8 +38,9 @@ Enter a name for the client such as *hopsworks_oauth_client*. Verify the Support

### Step 2: Get the necessary fields for client registration -In the Overview section, copy the *Application (client) ID field*. We will use it in -[Identity Provider registration](../create-client) under the name *Client id*. + +In the Overview section, copy the *Application (client) ID field*. +We will use it in [Identity Provider registration](./create-client.md) under the name *Client id*.
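If the application was registered with the Azure CLI, the same value can be read back without opening the portal; the sketch below assumes the example application name used earlier.

```bash
# Print the Application (client) ID of the registered application
az ad app list --display-name hopsworks_oauth_client --query "[0].appId" -o tsv
```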

@@ -40,8 +49,8 @@ In the Overview section, copy the *Application (client) ID field*. We will use i

-Click on *Endpoints* and copy the *OpenId Connect metadata document* endpoint excluding the *.well-known/openid-configuration* part. -We will use it in [Identity Provider registration](../create-client) under the name *Connection URL*. +Click on *Endpoints* and copy the *OpenId Connect metadata document* endpoint excluding the *.well-known/openid-configuration* part. +We will use it in [Identity Provider registration](./create-client.md) under the name *Connection URL*.
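To double-check that you copied the right base URL, you can fetch the metadata document directly; the tenant ID below is a placeholder for your own tenant.

```bash
# The response should list the authorization, token and jwks endpoints
curl -s "https://login.microsoftonline.com/<TENANT_ID>/v2.0/.well-known/openid-configuration" | python3 -m json.tool
```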

@@ -51,7 +60,8 @@ We will use it in [Identity Provider registration](../create-client) under the n

!!! note - If you have multiple tenants in your Azure Active Directory, the `OpenID Connect metadata document` endpoint might use `organizations` instead of a specific tenant ID. In such cases, replace `organizations` with your actual tenant ID to target a specific directory. + If you have multiple tenants in your Azure Active Directory, the `OpenID Connect metadata document` endpoint might use `organizations` instead of a specific tenant ID. + In such cases, replace `organizations` with your actual tenant ID to target a specific directory. example: @@ -68,7 +78,9 @@ Click on *Certificates & secrets*, then Click on *New client secret*.

-Add a *description* of the secret. Select an expiration period. And, Click *Add*. +Add a *description* of the secret. +Select an expiration period. +Click *Add*.
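A rough CLI alternative is shown below; `az ad app credential reset` generates a new secret and prints it once in its output, and the application ID is a placeholder.

```bash
# Create a client secret valid for one year; the generated password is only shown in this output
az ad app credential reset --id <APPLICATION_CLIENT_ID> --years 1
```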

@@ -77,8 +89,8 @@ Add a *description* of the secret. Select an expiration period. And, Click *Add*

-Copy the secret. This will be used in [Identity Provider registration](../create-client) under the name -*Client Secret*. +Copy the secret. +This will be used in [Identity Provider registration](./create-client.md) under the name *Client Secret*.

@@ -87,7 +99,8 @@ Copy the secret. This will be used in [Identity Provider registration](../create

-Click on *Authentication*. Then click on *Add a platform* +Click on *Authentication*. +Then click on *Add a platform*.

@@ -105,7 +118,8 @@ In *Configure platforms* click on *Web*.

-Enter the *Redirect URI* and click on *Configure*. The redirect URI is *HOPSWORKS-URI/callback* with *HOPSWORKS-URI* the URI of your Hopsworks cluster. +Enter the *Redirect URI* and click on *Configure*. +The redirect URI is *HOPSWORKS-URI/callback*, where *HOPSWORKS-URI* is the URI of your Hopsworks cluster.
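On recent Azure CLI versions the redirect URI can also be set from the command line, as sketched below; the host name is a placeholder for your own Hopsworks URI and the flag name may differ on older CLI releases.

```bash
# Register the Hopsworks callback endpoint as a Web redirect URI
az ad app update \
  --id <APPLICATION_CLIENT_ID> \
  --web-redirect-uris "https://hopsworks.example.com/callback"
```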

diff --git a/docs/setup_installation/admin/oauth2/create-client.md b/docs/setup_installation/admin/oauth2/create-client.md index b455c68f4..fc3114482 100644 --- a/docs/setup_installation/admin/oauth2/create-client.md +++ b/docs/setup_installation/admin/oauth2/create-client.md @@ -1,44 +1,40 @@ # Register Identity Provider in Hopsworks ## Introduction -Before registering your identity provider in Hopsworks you need to create a client application in your identity provider and -acquire a _client id_ and a _client secret_. An example on how to create a client using [Okta](https://www.okta.com/) -and [Azure Active Directory](https://portal.azure.com/#blade/Microsoft_AAD_IAM/ActiveDirectoryMenuBlade/Overview) -identity providers can be found [here](../create-okta-client) and [here](../create-azure-client) respectively. + +Before registering your identity provider in Hopsworks you need to create a client application in your identity provider and acquire a _client id_ and a _client secret_. +Examples of how to create a client using [Okta](https://www.okta.com/) and [Azure Active Directory](https://portal.azure.com/#blade/Microsoft_AAD_IAM/ActiveDirectoryMenuBlade/Overview) identity providers can be found in the following guides: [Create Okta Client](./create-okta-client.md) and [Create Azure Client](./create-azure-client.md). ## Prerequisites + Acquired a _client id_ and a _client secret_ from your identity provider. ### Step 1: Register a client -After acquiring the _client id_ and _client secret_ create the client in Hopsworks by [enabling OAuth2](../../auth) -and clicking on _add another identity provider_ in the [Authentication configuration page](../../auth). Then set -base uri of your identity provider in _Connection URL_ give a name to your identity provider (the name will be used -in the login page as an alternative login method) and set the _client id_ and _client secret_ in their respective -fields, as shown in the figure below. + +After acquiring the _client id_ and _client secret_ create the client in Hopsworks by [enabling OAuth2](../auth.md) and clicking on _add another identity provider_ in the [Authentication configuration page](../auth.md). +Then set the base URI of your identity provider in _Connection URL_, give a name to your identity provider (the name will be used on the login page as an alternative login method), and set the _client id_ and _client secret_ in their respective fields, as shown in the figure below.
Application overview
Application overview
-- _Connection URL_: (provider Uri) is the base uri of the identity provider's API (URI should contain scheme http:// or - https://). +- _Connection URL_: (provider Uri) is the base uri of the identity provider's API (URI should contain scheme http:// or https://). Additional configuration can be set here: -- _Verify email_: if checked only users with verified email address (in the identity provider) can log in to Hopsworks. -- _Code challenge_: if your identity provider requires code challenge for authorization request check - the _code challenge_ check box. This will allow you to choose code challenge method that can be either _plain_ or - _S256_. -- _Logo URL_: optionally a logo URL to an image can be added. The logo will be shown on the login page with the name - as shown in the figure below. -- Claim names for given name, family name, email and group can also be set here. If left empty the default openid claim names will be used. +- _Verify email_: if checked only users with verified email address (in the identity provider) can log in to Hopsworks. +- _Code challenge_: if your identity provider requires code challenge for authorization request check the _code challenge_ check box. + This will allow you to choose code challenge method that can be either _plain_ or _S256_. +- _Logo URL_: optionally a logo URL to an image can be added. + The logo will be shown on the login page with the name as shown in the figure below. +- Claim names for given name, family name, email and group can also be set here. + If left empty the default openid claim names will be used. ### Step 2: Add Group mappings -Optionally you can add a group mapping from your identity provider to Hopsworks groups, by clicking on your name in the -top right corner of the navigation bar and choosing *Cluster Settings* from the dropdown menu. In the *Cluster -Settings* _Configuration_ tab search for _oauth\_group\_mapping_ and click on the edit button. +Optionally you can add a group mapping from your identity provider to Hopsworks groups, by clicking on your name in the top right corner of the navigation bar and choosing _Cluster Settings_ from the dropdown menu. +In the _Cluster Settings_ _Configuration_ tab search for _oauth\_group\_mapping_ and click on the edit button.
Set variables @@ -47,18 +43,18 @@ Settings* _Configuration_ tab search for _oauth\_group\_mapping_ and click on th !!! Note - Setting ```oauth_group_mapping``` to ```ANY_GROUP->HOPS_USER``` will assign the role *user* to any user from any group in - your identity provider when they log into Hopsworks with OAuth for the first time. You can replace *ANY_GROUP* with - the group of your choice in the identity provider. You can replace *HOPS_USER* by *HOPS_ADMIN* if you want the - users of that group to be admins in Hopsworks. You can do several mappings by separating them with a semicolon. + Setting ```oauth_group_mapping``` to ```ANY_GROUP->HOPS_USER``` will assign the role *user* to any user from any group in your identity provider when they log into Hopsworks with OAuth for the first time. + You can replace *ANY_GROUP* with the group of your choice in the identity provider. + You can replace *HOPS_USER* by *HOPS_ADMIN* if you want the users of that group to be admins in Hopsworks. + You can do several mappings by separating them with a semicolon. Group mapping can be disabled by setting ```oauth_group_mapping_enabled=false``` in the [Configuration](../variables.md) UI. When group mapping is disabled an administrator needs to activate each user from the [User Management](../user.md) page. If group mapping is disabled then ```oauth_account_status``` in the [Configuration](../variables.md) UI should be set to 1 (Verified). -Users will now see a new button on the login page. The button has the name you set above for _Name_ and will -redirect to your identity provider. +Users will now see a new button on the login page. +The button has the name you set above for _Name_ and will redirect to your identity provider.
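As a concrete illustration of the mapping syntax described in the note above, a value combining several mappings could look like the following; the group names on the left-hand side are purely illustrative and must match groups defined in your identity provider.

```
ANY_GROUP->HOPS_USER;data-scientists->HOPS_USER;platform-admins->HOPS_ADMIN
```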
OAuth2 login @@ -67,7 +63,6 @@ redirect to your identity provider. !!! note - When creating a client make sure you can access the provider metadata by making a GET request on the well known - endpoint of the provider. The well-known URL, will typically be the _Connection URL_ plus - `.well-known/openid-configuration`. For the above client it would be - `https://dev-86723251.okta.com/.well-known/openid-configuration`. + When creating a client make sure you can access the provider metadata by making a GET request on the well known endpoint of the provider. + The well-known URL, will typically be the _Connection URL_ plus `.well-known/openid-configuration`. + For the above client it would be `https://dev-86723251.okta.com/.well-known/openid-configuration`. diff --git a/docs/setup_installation/admin/oauth2/create-okta-client.md b/docs/setup_installation/admin/oauth2/create-okta-client.md index 6fb7f1d10..ee2e83b52 100644 --- a/docs/setup_installation/admin/oauth2/create-okta-client.md +++ b/docs/setup_installation/admin/oauth2/create-okta-client.md @@ -1,13 +1,16 @@ # Create An Application in Okta ## Introduction -This example uses an Okta development account to create an application that will represent a Hopsworks client in the -identity provider. + +This example uses an Okta development account to create an application that will represent a Hopsworks client in the identity provider. ## Prerequisites -Okta development account. To create a developer account go to [Okta developer](https://developer.okta.com/signup/). + +Okta development account. +To create a developer account go to [Okta developer](https://developer.okta.com/signup/). ### Step 1: Register Hopsworks as an application in your identity provider + After creating a developer account register a client by going to _Applications_ and click on **Create App Integration**.
@@ -15,24 +18,23 @@ After creating a developer account register a client by going to _Applications_
Okta Applications
-This will open a popup as shown in the figure below. Select **OIDC** as _Sign-in-method_ and **Web Application** as -_Application type_ and click next. +This will open a popup as shown in the figure below. +Select **OIDC** as _Sign-in-method_ and **Web Application** as _Application type_ and click next.
Create New Application
Create new Application
-Give your application a name and select **Client credential** as _Grant Type_. Then add a _Sign-in redirect URI_ -that is your Hopsworks cluster domain name (including the port number if needed) with path _/callback_, and a _Sign-out -redirect URI_ that is Hopsworks cluster domain name (including the port number if needed) with no path. +Give your application a name and select **Client credential** as _Grant Type_. +Then add a _Sign-in redirect URI_ that is your Hopsworks cluster domain name (including the port number if needed) with path _/callback_, and a _Sign-out redirect URI_ that is your Hopsworks cluster domain name (including the port number if needed) with no path.
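For example, assuming the cluster is reachable at the placeholder domain `hopsworks.example.com`, the two URIs would look like this:

```
Sign-in redirect URI:  https://hopsworks.example.com/callback
Sign-out redirect URI: https://hopsworks.example.com
```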
New Application
New Application
-If you want to limit who can access your Hopsworks cluster select _Limit access to selected groups_ and -select group(s) you want to give access to. Here we will allow everyone in the organization to access the cluster. +If you want to limit who can access your Hopsworks cluster select _Limit access to selected groups_ and select group(s) you want to give access to. +Here we will allow everyone in the organization to access the cluster.
Group assignment @@ -41,11 +43,11 @@ select group(s) you want to give access to. Here we will allow everyone in the o ## Group mapping -You can also create mappings from groups in Okta to groups in Hopsworks. To achieve this you need to configure Okta to -send _Groups_ with user information. To do this go to _Applications_ and select your application name. In the _Sign -On_ tab click edit _OpenID Connect ID Token_ and select **Filter** for _Groups claim type_, then for _Groups claim -filter_ add **groups** as the claim name, select **Match Regex** from the dropdown and .* (dot star) as Regex to -match all groups. See [Group mapping](../create-client/#group-mapping) on how to do the mapping in Hopsworks. +You can also create mappings from groups in Okta to groups in Hopsworks. +To achieve this you need to configure Okta to send _Groups_ with user information. +To do this go to _Applications_ and select your application name. +In the _Sign On_ tab click edit _OpenID Connect ID Token_ and select **Filter** for _Groups claim type_, then for _Groups claim filter_ add **groups** as the claim name, select **Match Regex** from the dropdown and .* (dot star) as Regex to match all groups. +See [Group mapping](./create-client.md#step-2-add-group-mappings) on how to do the mapping in Hopsworks.
Group claim @@ -53,9 +55,9 @@ match all groups. See [Group mapping](../create-client/#group-mapping) on how to
### Step 2: Get the necessary fields for client registration -After the application is created go back to _Applications_ and click on the application you just created. Use the -_Okta domain_ (_Connection URL_), _client id_ and _client secret_ generated for your app in the -[Identity Provider registration](../create-client) in Hopsworks. + +After the application is created go back to _Applications_ and click on the application you just created. +Use the _Okta domain_ (_Connection URL_), _client id_ and _client secret_ generated for your app in the [Identity Provider registration](./create-client.md) in Hopsworks.
Application overview @@ -64,5 +66,4 @@ _Okta domain_ (_Connection URL_), _client id_ and _client secret_ generated for !!! note - When copying the domain in the figure above make sure to add the url scheme (http:// or https://) when using it - in the _Connection URL_ in the [Identity Provider registration form](../create-client). + When copying the domain in the figure above make sure to add the url scheme (http:// or https://) when using it in the _Connection URL_ in the [Identity Provider registration form](./create-client.md). diff --git a/docs/setup_installation/admin/project.md b/docs/setup_installation/admin/project.md index 9a6818e61..afeb8ba3b 100644 --- a/docs/setup_installation/admin/project.md +++ b/docs/setup_installation/admin/project.md @@ -6,7 +6,8 @@ description: Guide on how to manage projects and quotas as a Hopsworks administr Hopsworks provides an administrator with a view of the projects in a Hopsworks cluster. -A Hopsworks administrator is not automatically a member of all the projects in a cluster. However, they can see which projects exist, who is the project owner, and they can limit the storage quota and compute quota for each project. +A Hopsworks administrator is not automatically a member of all the projects in a cluster. +However, they can see which projects exist, who is the project owner, and they can limit the storage quota and compute quota for each project. ## Prerequisites @@ -21,7 +22,8 @@ You can find the Project management page by clicking on your name, in the top ri
Project page
-This page will list all the projects in a cluster, their name, owner and when its quota was last updated. By clicking on the _edit configuration_ link of a project you will be able to edit the quotas of that project. +This page will list all the projects in a cluster, their name, owner and when their quota was last updated. +By clicking on the _edit configuration_ link of a project you will be able to edit the quotas of that project.
Project quotas @@ -30,31 +32,43 @@ This page will list all the projects in a cluster, their name, owner and when it ### Storage -Storage quota represents the amount of data a project can store. The storage quota is broken down in three different areas: +Storage quota represents the amount of data a project can store. +The storage quota is broken down in three different areas: -- **Feature Store**: This represents the storage quota for files and directories stored in the `_featurestore.db` dataset in the project. This dataset contains all the feature group offline data for the project. -- **Hive DB**: This represents the storage quota for files and directories stored in the `[projectName].db` dataset in the project. This is a general purpose Hive database for the project that can be used for analytics. +- **Feature Store**: This represents the storage quota for files and directories stored in the `_featurestore.db` dataset in the project. +This dataset contains all the feature group offline data for the project. +- **Hive DB**: This represents the storage quota for files and directories stored in the `[projectName].db` dataset in the project. +This is a general purpose Hive database for the project that can be used for analytics. - **Project**: This represents the storage quota for all the data stored on any other dataset. -Each storage quota is divided into space quota, i.e., how much space the files can consume, and namespace quota, i.e., how many files and directories there can be. If Hopsworks is deployed on-premise using hard drives to store the data, i.e., Hopsworks is not configured to store its data in a S3-compliant storage system, the data is replicated across multiple nodes (by default 3) and the space quota takes the replication factor into consideration. As an example, a 100MB file stored with a replication factor of 3, will consume 300MB of space quota. +Each storage quota is divided into space quota, i.e., how much space the files can consume, and namespace quota, i.e., how many files and directories there can be. +If Hopsworks is deployed on-premise using hard drives to store the data, i.e., Hopsworks is not configured to store its data in a S3-compliant storage system, the data is replicated across multiple nodes (by default 3) and the space quota takes the replication factor into consideration. +As an example, a 100MB file stored with a replication factor of 3, will consume 300MB of space quota. -By default, all storage quotas are disabled and not enforced. Administrators can change this default by changing the following configuration in the [Configuration](../admin/variables.md) UI and/or the cluster definition: -``` +By default, all storage quotas are disabled and not enforced. +Administrators can change this default by changing the following configuration in the [Configuration](../admin/variables.md) UI and/or the cluster definition: + +```yaml hopsworks: featurestore_default_quota: [default quota in bytes, -1 to disable] hdfs_default_quota: [default quota in bytes, -1 to disable] hive_default_quota: [default quota in bytes, -1 to disable] ``` + The values specified will be set during project creation and administrators will be able to customize each project using this UI. ### Compute -Compute quotas represents the amount of compute a project can use to run Spark and Flink applications as well as Tez queries. Quota is expressed as number of seconds a container of size 1 CPU and 1GB of RAM can run for. 
+Compute quotas represents the amount of compute a project can use to run Spark and Flink applications as well as Tez queries. +Quota is expressed as number of seconds a container of size 1 CPU and 1GB of RAM can run for. -If the Hopsworks cluster is connected to a Kubernetes cluster, Python jobs, Jupyter notebooks and KServe models are not subject to the compute quota. Currently, Hopsworks does not support defining quotas for compute scheduled on the connected Kubernetes cluster. +If the Hopsworks cluster is connected to a Kubernetes cluster, Python jobs, Jupyter notebooks and KServe models are not subject to the compute quota. +Currently, Hopsworks does not support defining quotas for compute scheduled on the connected Kubernetes cluster. -By default, the compute quota is disabled. Administrators can change this default by changing the following configuration in the [Configuration](../admin/variables.md) UI and/or the cluster definition: -``` +By default, the compute quota is disabled. +Administrators can change this default by changing the following configuration in the [Configuration](../admin/variables.md) UI and/or the cluster definition: + +```yaml hopsworks: yarn_default_payment_type: [NOLIMIT to disable the quota, PREPAID to enable it] yarn_default_quota: [default quota in seconds] @@ -64,21 +78,27 @@ The values specified will be set during project creation and administrators will ### Kakfa Topics -Kafka is used within Hopsworks to enable users to write data to the feature store in Real-Time and from a variety of different frameworks. If a user creates a feature group with the stream APIs enabled, then a Kafka topic will be created for that feature group. By default, a project can have up to 100 Kafka topics. -Administrators can increase the number of Kafka topics a project is allowed to create by increasing the quota in the project admin UI. +Kafka is used within Hopsworks to enable users to write data to the feature store in Real-Time and from a variety of different frameworks. +If a user creates a feature group with the stream APIs enabled, then a Kafka topic will be created for that feature group. +By default, a project can have up to 100 Kafka topics. +Administrators can increase the number of Kafka topics a project is allowed to create by increasing the quota in the project admin UI. ## Force deleting a project -Administrators have the option to force delete a project. This is useful if the project was not created or deleted properly, e.g., because of an error. +Administrators have the option to force delete a project. +This is useful if the project was not created or deleted properly, e.g., because of an error. ## Controlling who can create projects -Every user on Hopsworks can create projects. By default, each user can create up to 10 projects. For production environments, the number of projects should be limited and controlled for resource allocation purposes as well as closer control over the data. +Every user on Hopsworks can create projects. +By default, each user can create up to 10 projects. +For production environments, the number of projects should be limited and controlled for resource allocation purposes as well as closer control over the data. 
Administrators can control how many projects a user can provision by setting the following configuration in the [Configuration](../admin/variables.md) UI and/or cluster definition: -``` +```yaml hopsworks: max_num_proj_per_user: [Maximum number of projects each user can create] ``` -This value will be set when the user is provisioned. Administrators can grant additional projects to a specific user through the [User Administration](../admin/user.md) UI. +This value will be set when the user is provisioned. +Administrators can grant additional projects to a specific user through the [User Administration](../admin/user.md) UI. diff --git a/docs/setup_installation/admin/roleChaining.md b/docs/setup_installation/admin/roleChaining.md index ccf6bff69..b796f0469 100644 --- a/docs/setup_installation/admin/roleChaining.md +++ b/docs/setup_installation/admin/roleChaining.md @@ -2,11 +2,15 @@ ## Introduction -When running Hopsworks in Amazon EKS you have several options to give the Hopsworks user access to AWS resources. The simplest is to assign [Amazon EKS node IAM role](https://docs.aws.amazon.com/eks/latest/userguide/create-node-role.html) access to the resources. But, this will make these resources accessible by all users. To manage access to resources on a project base you need to use [Role chaining](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_terms-and-concepts.html#iam-term-role-chaining). +When running Hopsworks in Amazon EKS you have several options to give the Hopsworks user access to AWS resources. +The simplest is to assign [Amazon EKS node IAM role](https://docs.aws.amazon.com/eks/latest/userguide/create-node-role.html) access to the resources. +But this will make these resources accessible to all users. +To manage access to resources on a per-project basis you need to use [Role chaining](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_terms-and-concepts.html#iam-term-role-chaining). In this document we will see how to configure AWS and Hopsworks to use Role chaining in your Hopsworks projects. ## Prerequisites + Before you begin this guide you'll need the following: - A Hopsworks cluster running on EKS. @@ -14,11 +18,12 @@ Before you begin this guide you'll need the following: - Administrator account on the Hopsworks cluster. ### Step 1: Create an IAM role and associate it with a Kubernetes service account -To use role chaining the hopsworks instance pods need to be able to impersonate the roles you want to be linked to your project. For this you need to create an IAM role and associate it with your Kubernetes service accounts with assume role permissions and attach it to your hopsworks instance pods. -For more details on how to create an IAM roles for Kubernetes service accounts see the [aws documentation](https://docs.aws.amazon.com/eks/latest/userguide/associate-service-account-role.html). +To use role chaining the hopsworks instance pods need to be able to impersonate the roles you want to be linked to your project. +For this you need to create an IAM role and associate it with your Kubernetes service accounts with assume role permissions and attach it to your hopsworks instance pods. +For more details on how to create an IAM role for Kubernetes service accounts, see the [aws documentation](https://docs.aws.amazon.com/eks/latest/userguide/associate-service-account-role.html).
-!!!note +!!!note To ensure that users can't use the service account role and impersonate the roles by their own means, you need to ensure that the service account is only attached to the hopsworks instance pods. ```sh @@ -27,7 +32,6 @@ oidc_provider=$(aws eks describe-cluster --name my-cluster --region $AWS_REGION ``` - ```sh export namespace=hopsworks export service_account=my-service-account @@ -54,6 +58,7 @@ export service_account=my-service-account ] } ``` +
Example trust policy for a service account.
```json @@ -74,6 +79,7 @@ export service_account=my-service-account ] } ``` +
Example policy for assuming four roles.
The IAM role will need to add a trust policy to allow the service account to assume the role, and permissions to assume the different roles that will be used to access resources. @@ -85,7 +91,9 @@ kubectl annotate serviceaccount -n $namespace $service_account eks.amazonaws.com ``` ### Step 2: Create the resource roles -For the service account role to be able to impersonate the roles you also need to configure the roles themselves to allow it. This is done by adding the service account role to the role's [Trust relationships](https://docs.aws.amazon.com/directoryservice/latest/admin-guide/edit_trust.html). + +For the service account role to be able to impersonate the roles you also need to configure the roles themselves to allow it. +This is done by adding the service account role to the role's [Trust relationships](https://docs.aws.amazon.com/directoryservice/latest/admin-guide/edit_trust.html). ```json { @@ -101,9 +109,11 @@ For the service account role to be able to impersonate the roles you also need t ] } ``` +
Example resource roles.
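If you manage the resource roles with the AWS CLI instead of the console, a sketch of applying a trust policy like the one above and then verifying the chain could look like the following; the role name, account ID and file name are placeholders.

```bash
# Attach the trust policy (saved locally as trust-policy.json) to the resource role
aws iam update-assume-role-policy \
  --role-name my-resource-role \
  --policy-document file://trust-policy.json

# From a pod running with the annotated service account, verify that the role can be assumed
aws sts assume-role \
  --role-arn arn:aws:iam::<ACCOUNT_ID>:role/my-resource-role \
  --role-session-name role-chaining-check
```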
### Step 3: Create mappings + Now that the service account IAM role can assume the roles we need to configure Hopsworks to delegate access to the roles on a project base. In Hopsworks, click on your name in the top right corner of the navigation bar and choose _Cluster Settings_ from the dropdown menu. @@ -114,13 +124,18 @@ In the Cluster Settings' _IAM Role Chaining_ tab you can configure the mappings
Role Chaining
-Add mappings by clicking on *New role chaining*. Enter the project name. Select the type of user that can assume the role. Enter the role ARN. And click on *Create new role chaining* +Add mappings by clicking on _New role chaining_. +Enter the project name. +Select the type of user that can assume the role. +Enter the role ARN. +Finally, click on _Create new role chaining_.
Create Role Chaining
Create Role Chaining
-Project member can now create connectors using *temporary credentials* to assume the role you configured. More detail about using temporary credentials can be found [here](../../user_guides/fs/data_source/creation/s3.md#temporary-credentials). +Project members can now create connectors using _temporary credentials_ to assume the role you configured. +More details about using temporary credentials can be found in the [Temporary Credentials section](../../user_guides/fs/data_source/creation/s3.md#temporary-credentials) of the S3 datasource creation guide. -Project member can see the list of role they can assume by going the _Project Settings_ -> [Assuming IAM Roles](../../../user_guides/projects/iam_role/iam_role_chaining) page. +Project members can see the list of roles they can assume by going to the _Project Settings_ -> [Assuming IAM Roles](../../user_guides/projects/iam_role/iam_role_chaining.md) page. diff --git a/docs/setup_installation/admin/user.md b/docs/setup_installation/admin/user.md index 4539bb17a..ca4d6a016 100644 --- a/docs/setup_installation/admin/user.md +++ b/docs/setup_installation/admin/user.md @@ -1,18 +1,17 @@ # User Management ## Introduction -Whether you run Hopsworks on-premise, or on the cloud using kubernetes, -you have a Hopsworks cluster which contains all users and projects. + +Whether you run Hopsworks on-premise, or on the cloud using kubernetes, you have a Hopsworks cluster which contains all users and projects. ## Prerequisites + Administrator account on a Hopsworks cluster. ### Step 1: Go to user management -All the users of your Hopsworks instance have access to your cluster with different access rights. -You can find them by clicking on your name in the top right corner of the navigation bar and choosing _Cluster -Settings_ from the dropdown menu and going to the _Users_ tab (You need to have _Admin_ role to get access to the -_Cluster Settings_ page). +All the users of your Hopsworks instance have access to your cluster with different access rights. +You can find them by clicking on your name in the top right corner of the navigation bar and choosing _Cluster Settings_ from the dropdown menu and going to the _Users_ tab (you need to have the _Admin_ role to get access to the _Cluster Settings_ page).
active users @@ -24,14 +23,14 @@ _Cluster Settings_ page). Roles let you manage the access rights of a user to the cluster. - User: users with this role are only allowed to use the cluster by creating a limited number of projects. -- Admin: users with this role are allowed to manage the cluster. This includes accepting new users to the cluster or - blocking them, managing user quota, [configure alerts](../alert) and setting up [authentication methods](../auth). +- Admin: users with this role are allowed to manage the cluster. + This includes accepting new users to the cluster or blocking them, managing user quota, [configuring alerts](./alert.md) and setting up [authentication methods](./auth.md). You can change the role of a user by clicking on the _select dropdown_ that shows the current role of the user. ### Step 3: Validating and blocking users -By default, a user who register on Hopsworks using their own credentials are not granted access to the cluster. +By default, users who register on Hopsworks using their own credentials are not granted access to the cluster. First, a user with an admin role needs to validate their account. By clicking on the _Review Requests_ button you can open a _user request review_ popup as shown in the image below. @@ -41,12 +40,12 @@ By clicking on the _Review Requests_ button you can open a _user request review_
Review user request
-On the user request review popup you can activate or block users. Users with a validated email address will have a -check mark on their email. +On the user request review popup you can activate or block users. +Users with a validated email address will have a check mark on their email. -Similarly, if a user is no longer allowed access to the cluster you can block them. To keep consistency with the -history of your datasets, a user can not be deleted but only blocked. If necessary a user can be -deleted manually in the cluster using the command line. +Similarly, if a user is no longer allowed access to the cluster you can block them. +To keep consistency with the history of your datasets, a user can not be deleted but only blocked. +If necessary a user can be deleted manually in the cluster using the command line. You can block a user by clicking on the block icon on the right side of the user in the list.
@@ -54,13 +53,13 @@ You can block a user by clicking on the block icon on the right side of the user
Blocked Users
-Blocked users will appear on the lower section of the page. Click on _display blocked users_ to show all the blocked -users in your cluster. If a user is blocked by mistake you can reactivate it by clicking on the check mark icon -that corresponds to that user in the blocked users list. +Blocked users will appear on the lower section of the page. +Click on _display blocked users_ to show all the blocked users in your cluster. +If a user is blocked by mistake you can reactivate them by clicking on the check mark icon that corresponds to that user in the blocked users list. -If there are too many users in your cluster, use the search box (available for blocked users too) to filter users by -name or email. It is also possible to filter activated users by role. For example to see all administrators in you -cluster click on the _select dropdown_ to the right of the search box and choose _Admin_. +If there are too many users in your cluster, use the search box (available for blocked users too) to filter users by name or email. +It is also possible to filter activated users by role. +For example, to see all administrators in your cluster, click on the _select dropdown_ to the right of the search box and choose _Admin_. ### Step 4: Create a new users @@ -71,13 +70,14 @@ If you want to allow users to login without registering you can pre-create them
Create new user
-After setting the user's name and email chose the type of user you want to create (Hopsworks, Kerberos or LDAP). To -create a Kerberos or LDAP user you need to get the users **UUID** from the Kerberos or LDAP server. Hopsworks user -can also be assigned a _Role_. Kerberos and LDAP users on the other hand can only be assigned a role through group +After setting the user's name and email, choose the type of user you want to create (Hopsworks, Kerberos or LDAP). +To create a Kerberos or LDAP user you need to get the user's **UUID** from the Kerberos or LDAP server. +A Hopsworks user can also be assigned a _Role_. +Kerberos and LDAP users, on the other hand, can only be assigned a role through group mapping. -A temporary password will be generated and displayed when you click on _Create new user_. Copy the password and pass -it securely to the user. +A temporary password will be generated and displayed when you click on _Create new user_. +Copy the password and pass it securely to the user.
create user @@ -86,17 +86,18 @@ it securely to the user. ### Step 5: Reset user password -In the case where a user loses her/his password and can not recover it with the -[password recovery](../../user_guides/projects/auth/recovery.md), an administrator can reset it for them. +In the case where a user loses her/his password and can not recover it with the [password recovery](../../user_guides/projects/auth/recovery.md), an administrator can reset it for them. -On the bottom of the _Users_ page click on the _Reset a user password_ link. A popup window with a dropdown for -searching users by name or email will open. Find the user and click on _Reset new password_. +On the bottom of the _Users_ page click on the _Reset a user password_ link. +A popup window with a dropdown for searching users by name or email will open. +Find the user and click on _Reset new password_.
reset password
Reset user password
-A temporary password will be displayed. Copy the password and pass it to the user securely. +A temporary password will be displayed. +Copy the password and pass it to the user securely.
temp password diff --git a/docs/setup_installation/admin/variables.md b/docs/setup_installation/admin/variables.md index 88a32306a..dab67d429 100644 --- a/docs/setup_installation/admin/variables.md +++ b/docs/setup_installation/admin/variables.md @@ -1,13 +1,16 @@ # Cluster Configuration ## Introduction + Whether you run Hopsworks on-premise, or on the cloud using kubernetes, it is possible to change a variety of configurations on the cluster, changing its default behaviour. -This section is not going into detail for every setting, since every Hopsworks cluster comes with a robust default setup. However, this guide is to explain where to find the configurations and if necessary, how to change them. +This section is not going into detail for every setting, since every Hopsworks cluster comes with a robust default setup. +However, this guide is to explain where to find the configurations and if necessary, how to change them. !!! note In most cases you will be only be prompted to change these configurations by a Hopsworks Solutions Engineer or similar. ## Prerequisites + An administrator account on a Hopsworks cluster. ### Step 1: The configuration page @@ -29,7 +32,9 @@ Once you have made the change, don't forget to click *save* to persist the chang #### Visibility -The visibility setting indicates whether a setting can be read only by **Hops Admins** or also by simple **Hops Users**, that is everyone. Additionally, you can also allow to read the setting even when **not authenticated**. If the setting contains a password or sensitive information, you can also hide the value so it's not shown in the UI. +The visibility setting indicates whether a setting can be read only by **Hops Admins** or also by simple **Hops Users**, that is everyone. +Additionally, you can also allow to read the setting even when **not authenticated**. +If the setting contains a password or sensitive information, you can also hide the value so it's not shown in the UI. ### Step 3: Adding a new configuration diff --git a/docs/setup_installation/aws/getting_started.md b/docs/setup_installation/aws/getting_started.md index 95113acb8..37d654e35 100644 --- a/docs/setup_installation/aws/getting_started.md +++ b/docs/setup_installation/aws/getting_started.md @@ -1,33 +1,37 @@ # AWS - Getting started Kubernetes and Helm are used to install & run Hopsworks and the Feature Store -in the cloud. They both integrate seamlessly with third-party platforms such as Databricks, -SageMaker and KubeFlow. This guide shows how to set up the Hopsworks platform in your organization's AWS account. +in the cloud. +They both integrate seamlessly with third-party platforms such as Databricks, +SageMaker and KubeFlow. +This guide shows how to set up the Hopsworks platform in your organization's AWS account. ## Prerequisites To follow the instruction on this page you will need the following: - Kubernetes Version: Hopsworks can be deployed on EKS clusters running Kubernetes >= 1.27.0. -- [aws-cli](https://aws.amazon.com/cli/) to provision the AWS resources +- [aws-cli](https://aws.amazon.com/cli/) to provision the AWS resources - [eksctl](https://eksctl.io/) to interact with the AWS APIs and provision the EKS cluster - [helm](https://helm.sh/) to deploy Hopsworks ### ECR Registry -Hopsworks allows users to customize the images used by Python jobs, Jupyter Notebooks and (Py)Spark applications running in their projects. The images are stored in ECR. Hopsworks needs access to an ECR repository to push the project images. 
+Hopsworks allows users to customize the images used by Python jobs, Jupyter Notebooks and (Py)Spark applications running in their projects. +The images are stored in ECR. +Hopsworks needs access to an ECR repository to push the project images. ### Permissions - The deployment requires cluster admin access to create ClusterRoles, ServiceAccounts, and ClusterRoleBindings. -- A namespace is required to deploy the Hopsworks stack. If you don’t have permissions to create a namespace, ask your EKS administrator to provision one. +- A namespace is required to deploy the Hopsworks stack. +If you don’t have permissions to create a namespace, ask your EKS administrator to provision one. ## EKS Deployment The following steps describe how to deploy an EKS cluster and related resources so that it’s compatible with Hopsworks. - ## Step 1: AWS EKS Setup ### Step 1.1: Create S3 Bucket @@ -38,7 +42,7 @@ aws s3 mb s3://BUCKET_NAME --region REGION --profile PROFILE ### Step 1.2: Create ECR Repository -Create the repository to host the projects images. +Create the repository to host the projects images. ```bash aws --profile PROFILE ecr create-repository --repository-name NAMESPACE/hopsworks-base --region REGION @@ -111,7 +115,6 @@ When creating the cluster using eksctl the following parameters are required in - The following policies are required: [IAM policies - eksctl](https://eksctl.io/usage/iam-policies/#attaching-policies-by-arn) - ```bash - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy @@ -122,7 +125,7 @@ The following is required if you are using the EKS AWS Load Balancer Controller ```bash withAddonPolicies: - awsLoadBalancerController: true + awsLoadBalancerController: true ``` You need to update the CLUSTER NAME and the POLICY ARN generated above @@ -134,7 +137,7 @@ kind: ClusterConfig metadata: name: CLUSTER_NAME region: REGION - version: "1.29" + version: "1.29" iam: withOIDC: true @@ -180,6 +183,7 @@ You should see the list of nodes provisioned for the cluster. ### Step 1.4: Install the AWS LoadBalancer Addon For Hopsworks to provision the necessary network and application load balancers, we need to install the AWS LoadBalancer plugin (See [AWS Documentation](https://docs.aws.amazon.com/eks/latest/userguide/lbc-helm.html) ) + ```bash helm repo add eks https://aws.github.io/eks-charts helm repo update eks @@ -188,7 +192,8 @@ helm install aws-load-balancer-controller eks/aws-load-balancer-controller -n ku ### Step 1.5: (Optional) Create GP3 Storage Class -By default EKS comes with GP2 as storage class. GP3 is more cost effective, we can use it with Hopsworks by creating the storage class +By default EKS comes with GP2 as storage class. +GP3 is more cost effective, we can use it with Hopsworks by creating the storage class ```bash kubectl apply -f - <= 1.27.0. -- An Azure resource group in which the Hopsworks cluster will be deployed. +- An Azure resource group in which the Hopsworks cluster will be deployed. - The [azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) installed and [logged in](https://docs.microsoft.com/en-us/cli/azure/authenticate-azure-cli). - kubectl (to manage the AKS cluster) - helm (to deploy Hopsworks) @@ -18,8 +20,9 @@ To follow the instruction on this page you will need the following: The deployment requires cluster admin access to create ClusterRoles, ServiceAccounts, and ClusterRoleBindings in AKS. -A namespace is also required for deploying the Hopsworks stack. 
If you don’t have permissions to create a namespace, ask your AKS administrator to provision one for you. - +A namespace is also required for deploying the Hopsworks stack. +If you don’t have permissions to create a namespace, ask your AKS administrator to provision one for you. + To run all the commands on this page the user needs to have at least the following permissions on the Azure resource group: You will also need to have a role such as *Application Administrator* on the Azure Active Directory to be able to create the hopsworks.ai service principal. @@ -28,7 +31,8 @@ You will also need to have a role such as *Application Administrator* on the Azu ### Step 1.1: Create an Azure Blob Storage Account -Create a storage account to host project data. Ensure that the storage account is in the same region as the AKS cluster for performance and cost reasons: +Create a storage account to host project data. +Ensure that the storage account is in the same region as the AKS cluster for performance and cost reasons: ```bash az storage account create --name $STORAGE_ACCOUNT_NAME --resource-group $RESOURCE_GROUP --location $REGION @@ -97,7 +101,8 @@ az role assignment create --role hopsfs-storage-permissions --assignee-object-id ### Step 1.5: Create Service Principal for Hopsworks services -Create a service principal to grant Hopsworks applications with access to the container registry. For example, Hopsworks uses this service principal to push new Python environments created via the Hopsworks UI. +Create a service principal to grant Hopsworks applications with access to the container registry. +For example, Hopsworks uses this service principal to push new Python environments created via the Hopsworks UI. ```bash export SP_PASSWORD=`az ad sp create-for-rbac --name $SP_NAME --scopes $ACR_ID --role AcrPush --years 1 --query "password" --output tsv` @@ -149,8 +154,7 @@ kubectl create secret docker-registry azregcred \ ### Step 4.1: Add the Hopsworks Helm repository -To obtain access to the Hopsworks helm chart repository, please obtain -an evaluation/startup licence [here](https://www.hopsworks.ai/try). +To obtain access to the Hopsworks helm chart repository, please [obtain](https://www.hopsworks.ai/try) an evaluation/startup licence. Once you have the helm chart repository URL, replace the environment variable $HOPSWORKS_REPO in the following command with this URL. 
@@ -228,7 +232,6 @@ kubectl expose deployment hopsworks --type=LoadBalancer --name=hopsworks-service Check out our other guides for how to get started with Hopsworks and the Feature Store: -* Get started with the [Hopsworks Feature Store](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"} -* Follow one of our [tutorials](../../tutorials/index.md) -* Follow one of our [Guide](../../user_guides/index.md) - +- Get started with the [Hopsworks Feature Store](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"} +- Follow one of our [tutorials](../../tutorials/index.md) +- Follow one of our [Guide](../../user_guides/index.md) diff --git a/docs/setup_installation/common/arrow_flight_duckdb.md b/docs/setup_installation/common/arrow_flight_duckdb.md index fd8464067..2460c50cb 100644 --- a/docs/setup_installation/common/arrow_flight_duckdb.md +++ b/docs/setup_installation/common/arrow_flight_duckdb.md @@ -1,4 +1,5 @@ # ArrowFlight Server with DuckDB + By default, Hopsworks uses big data technologies (Spark or Hive) to create training data and read data for Python clients. This is great for large datasets, but for small or moderately sized datasets (think of the size of data that would fit in a Pandas DataFrame in your local Python environment), the overhead of starting a Spark or Hive job and doing distributed data processing can be significant. @@ -8,12 +9,11 @@ and batch inference data from the Feature Store, as well as creating moderately- When the service is enabled, clients will automatically use it for the following operations: -- [reading Feature Groups](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#read) -- [reading Queries](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/query_api/#read) -- [reading Training Datasets](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#get_training_data) -- [creating In-Memory Training Datasets](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#training_data) -- [reading Batch Inference Data](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#get_batch_data) - +- [reading Feature Groups][hsfs.feature_group.FeatureGroup.read] +- [reading Queries][hsfs.constructor.query.Query.read] +- [reading Training Datasets][hsfs.feature_view.FeatureView.get_training_data] +- [creating In-Memory Training Datasets][hsfs.feature_view.FeatureView.training_data] +- [reading Batch Inference Data][hsfs.feature_view.FeatureView.get_batch_data] For larger datasets, clients can still make use of the Spark/Hive backend by explicitly setting `read_options={"use_hive": True}`. @@ -43,11 +43,9 @@ To deploy ArrowFlight Server on a cluster: 3. Tick the checkbox `Enable ArrowFlight Server`. (*) The service should have at least the 2x the amount of memory available that a typical Python client would have. - Because RonDB and ArrowFlight Server share the same node we recommend selecting an instance type with at least 4x the - client memory. For example, if the service serves Python clients with typically 4GB of memory, - an instance with at least 16GB of memory should be selected. 
- An instance with 16GB of memory will be able to read feature groups and training datasets of up to 10-100M rows, - depending on the number of columns and size of the features (~2GB in parquet). The same instance will be able to create - point-in-time correct training datasets with 1-10M rows, also depending on the number and the size of the features. - Larger instances are able to handle larger datasets. The numbers scale roughly linearly with the instance size. - + Because RonDB and ArrowFlight Server share the same node we recommend selecting an instance type with at least 4x the client memory. + For example, if the service serves Python clients with typically 4GB of memory, an instance with at least 16GB of memory should be selected. + An instance with 16GB of memory will be able to read feature groups and training datasets of up to 10-100M rows, depending on the number of columns and size of the features (~2GB in parquet). + The same instance will be able to create point-in-time correct training datasets with 1-10M rows, also depending on the number and the size of the features. + Larger instances are able to handle larger datasets. + The numbers scale roughly linearly with the instance size. diff --git a/docs/setup_installation/gcp/getting_started.md b/docs/setup_installation/gcp/getting_started.md index 8f6ebfc11..10d3ef749 100644 --- a/docs/setup_installation/gcp/getting_started.md +++ b/docs/setup_installation/gcp/getting_started.md @@ -1,32 +1,33 @@ # GCP - Getting started with GKE Kubernetes and Helm are used to install & run Hopsworks and the Feature Store -in the cloud. They both integrate seamlessly with third-party platforms such as Databricks, -SageMaker and KubeFlow. This guide shows how to set up the Hopsworks platform in your organization's Google Cloud Platform's (GCP) account. - +in the cloud. +They both integrate seamlessly with third-party platforms such as Databricks, +SageMaker and KubeFlow. +This guide shows how to set up the Hopsworks platform in your organization's Google Cloud Platform's (GCP) account. ## Prerequisites To follow the instruction on this page you will need the following: - Kubernetes Version: Hopsworks can be deployed on GKE clusters running Kubernetes >= 1.27.0. -- [gcloud CLI](https://cloud.google.com/sdk/gcloud) to provision the GCP resources +- [gcloud CLI](https://cloud.google.com/sdk/gcloud) to provision the GCP resources - [gke-gcloud-auth-plugin](https://cloud.google.com/blog/products/containers-kubernetes/kubectl-auth-changes-in-gke) to manage authentication with the GKE cluster - [helm](https://helm.sh/) to deploy Hopsworks - ### Permissions - The deployment requires cluster admin access to create ClusterRoles, ServiceAccounts, and ClusterRoleBindings. -- A namespace is required to deploy the Hopsworks stack. If you don’t have permissions to create a namespace, ask your GKE administrator to provision one. - +- A namespace is required to deploy the Hopsworks stack. +If you don’t have permissions to create a namespace, ask your GKE administrator to provision one. ## Step 1: GCP GKE Setup ### Step 1.1: Create a Google Cloud Storage (GCS) bucket -Create a bucket to store project data. Ensure the bucket is in the same region as your GKE cluster for performance and cost optimization. +Create a bucket to store project data. +Ensure the bucket is in the same region as your GKE cluster for performance and cost optimization. 
```bash gsutil mb -l $region gs://$bucket_name @@ -60,7 +61,8 @@ includedPermissions: - artifactregistry.tags.delete ``` -Execute the following gcloud command to create a custom role from the file. Replace $PROJECT_ID with your GCP project id: +Execute the following gcloud command to create a custom role from the file. +Replace $PROJECT_ID with your GCP project id: ```bash gcloud iam roles create hopsworksai_instances \ @@ -68,7 +70,8 @@ gcloud iam roles create hopsworksai_instances \ --file=hopsworksai_role.yaml ``` -Execute the following gcloud command to create a service account for Hopsworks AI instances. Replace $PROJECT_ID with your GCP project id: +Execute the following gcloud command to create a service account for Hopsworks AI instances. +Replace $PROJECT_ID with your GCP project id: ```bash gcloud iam service-accounts create hopsworksai_instances \ @@ -77,7 +80,8 @@ gcloud iam service-accounts create hopsworksai_instances \ --display-name="Hopsworks AI instances" ``` -Execute the following gcloud command to bind the custom role to the service account. Replace all occurrences $PROJECT_ID with your GCP project id: +Execute the following gcloud command to bind the custom role to the service account. +Replace all occurrences $PROJECT_ID with your GCP project id: ```bash gcloud projects add-iam-policy-binding $PROJECT_ID \ @@ -95,6 +99,7 @@ gcloud container clusters create \ --enable-ip-alias \ --service-account my-service-account@my-project.iam.gserviceaccount.com ``` + Once the creation process is completed, you should be able to access the cluster using the kubectl CLI tool: ```bash @@ -103,7 +108,9 @@ kubectl get nodes ### Step 1.4: Create GCR repository -Hopsworks allows users to customize images for Python jobs, Jupyter Notebooks, and (Py)Spark applications. These images should be stored in Google Container Registry (GCR). The GKE cluster needs access to a GCR repository to push project images. +Hopsworks allows users to customize images for Python jobs, Jupyter Notebooks, and (Py)Spark applications. +These images should be stored in Google Container Registry (GCR). +The GKE cluster needs access to a GCR repository to push project images. Enable Artifact Registry and create a GCR repository to store images: @@ -117,8 +124,7 @@ gcloud artifacts repositories create \ ### Step 3.1: Add the Hopsworks Helm repository -To obtain access to the Hopsworks helm chart repository, please obtain -an evaluation/startup licence [here](https://www.hopsworks.ai/try). +To obtain access to the Hopsworks helm chart repository, please [obtain](https://www.hopsworks.ai/try) an evaluation/startup licence. Once you have the helm chart repository URL, replace the environment variable $HOPSWORKS_REPO in the following command with this URL. 
@@ -150,11 +156,11 @@ global: credHelper: enabled: true secretName: &gcpregcred "gcpregcred" - + managedObjectStorage: enabled: true s3: - bucket: + bucket: name: &bucket "hopsworks" region: &region "europe-north1" endpoint: &gcpendpoint "https://storage.cloud.google.com" @@ -195,6 +201,6 @@ kubectl expose deployment hopsworks --type=LoadBalancer --name=hopsworks-service Check out our other guides for how to get started with Hopsworks and the Feature Store: -* Get started with the [Hopsworks Feature Store](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"} -* Follow one of our [tutorials](../../tutorials/index.md) -* Follow one of our [Guide](../../user_guides/index.md) +- Get started with the [Hopsworks Feature Store](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"} +- Follow one of our [tutorials](../../tutorials/index.md) +- Follow one of our [Guides](../../user_guides/index.md) diff --git a/docs/setup_installation/index.md index bf0e220d7..2348be8d0 100644 --- a/docs/setup_installation/index.md +++ b/docs/setup_installation/index.md @@ -1,6 +1,6 @@ # Setup and Administration -This section contains installation guides for the **Hopsworks Platform** using kubernetes, on +This section contains installation guides for the **Hopsworks Platform** using Kubernetes, on - [AWS](aws/getting_started.md) - [Azure](azure/getting_started.md) diff --git a/docs/setup_installation/on_prem/contact_hopsworks.md index fee9e4600..266a1df42 100644 --- a/docs/setup_installation/on_prem/contact_hopsworks.md +++ b/docs/setup_installation/on_prem/contact_hopsworks.md @@ -4,8 +4,10 @@ description: Requirements and instructions on how to install the Hopsworks on-pr # Hopsworks On-Premise Installation -It is possible to use Hopsworks on-premises, which means that companies can run their machine learning workloads on their own hardware and infrastructure, rather than relying on a cloud provider. This can provide greater flexibility, control, and cost savings, as well as enabling companies to meet specific compliance and security requirements. +It is possible to use Hopsworks on-premises, which means that companies can run their machine learning workloads on their own hardware and infrastructure, rather than relying on a cloud provider. +This can provide greater flexibility, control, and cost savings, as well as enabling companies to meet specific compliance and security requirements. -Working on-premises with Hopsworks typically involves collaboration with the Hopsworks engineering teams, as each infrastructure is unique and requires a tailored approach to deployment and configuration. The process begins with an assessment of the company's existing infrastructure and requirements, including network topology, security policies, and hardware specifications. +Working on-premises with Hopsworks typically involves collaboration with the Hopsworks engineering teams, as each infrastructure is unique and requires a tailored approach to deployment and configuration. +The process begins with an assessment of the company's existing infrastructure and requirements, including network topology, security policies, and hardware specifications. For further details about on-premise installations; [contact us](https://www.hopsworks.ai/contact).
diff --git a/docs/setup_installation/on_prem/external_kafka_cluster.md b/docs/setup_installation/on_prem/external_kafka_cluster.md index 746a1fef7..ea944d44b 100644 --- a/docs/setup_installation/on_prem/external_kafka_cluster.md +++ b/docs/setup_installation/on_prem/external_kafka_cluster.md @@ -1,11 +1,12 @@ --- -description: Administrator guide on how to integrate Hopsworks with an external Kafka cluster to handle data ingestion into the feature store. +description: Administrator guide on how to integrate Hopsworks with an external Kafka cluster to handle data ingestion into the feature store. --- # External Kafka cluster -Hopsworks uses [Apache Kafka](https://kafka.apache.org/) to ingest data to the feature store. Streaming applications and external clients send data to the Kafka cluster for ingestion to the online and offline feature store. -By default, Hopsworks comes with an embedded Kafka cluster managed by Hopsworks itself, however, users can configure Hopsworks to leverage an existing external cluster. +Hopsworks uses [Apache Kafka](https://kafka.apache.org/) to ingest data to the feature store. +Streaming applications and external clients send data to the Kafka cluster for ingestion to the online and offline feature store. +By default, Hopsworks comes with an embedded Kafka cluster managed by Hopsworks itself, however, users can configure Hopsworks to leverage an existing external cluster. This guide will cover how to configure an Hopsworks cluster to leverage an external Kafka cluster. ## Configure the external Kafka cluster integration @@ -13,7 +14,7 @@ This guide will cover how to configure an Hopsworks cluster to leverage an exter To enable the integration with an external Kafka cluster, you should set the `enable_bring_your_own_kafka` [configuration option](../admin/variables.md) to `true`. This can also be achieved in the cluster definition by setting the following attribute: -``` +```yaml hopsworks: enable_bring_your_own_kafka: "true" ``` @@ -25,18 +26,18 @@ This can be achieved by provisioning the necessary credentials for OnlineFS to s OnlineFs can be configured to use these credentials by adding the following configurations to the cluster definition used to deploy Hopsworks: -``` +```yaml onlinefs: config_dir: "/home/ubuntu/cluster-definitions/byok" kafka_cosumers: topic_list: "comma separated list of kafka topics to subscribe to" ``` -In particular, the `onlinefs/config_dir` should contain the credentials necessary for the Kafka consumers to authenticate. +In particular, the `onlinefs/config_dir` should contain the credentials necessary for the Kafka consumers to authenticate. Additionally the directory should contain a file name `onlinefs-kafka.properties` with the Kafka consumer configuration. The following is an example of the `onlinefs-kafka.properties` file: -``` +```properties bootstrap.servers=cluster_identifier.us-east-2.aws.confluent.cloud:9092 security.protocol=SASL_SSL sasl.jaas.config=org.apache.kafka.common.security.plain.PlainLoginModule required username="username" password="password"; @@ -44,14 +45,16 @@ sasl.mechanism=PLAIN ``` !!! note "Hopsworks will not provision topics" - Please note that when using an external Kafka cluster, Hopsworks will not provision the topics for the different projects. Users are responsible for provisioning the necessary topics and configure the projects accordingly (see next section). + Please note that when using an external Kafka cluster, Hopsworks will not provision the topics for the different projects. 
+ Users are responsible for provisioning the necessary topics and configuring the projects accordingly (see next section). Users should also specify the list of topics OnlineFS should subscribe to by providing the `onlinefs/kafka_consumers/topic_list` option in the cluster definition. ### Project configuration #### Topic configuration -As mentioned above, when configuring Hopsworks to use an external Kafka cluster, Hopsworks will not provision the topics for the different projects. Instead, when creating a project, users will be asked to provide the topic name to use for the feature store operations. +As mentioned above, when configuring Hopsworks to use an external Kafka cluster, Hopsworks will not provision the topics for the different projects. +Instead, when creating a project, users will be asked to provide the topic name to use for the feature store operations.
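Because Hopsworks will not create these topics, they must be provisioned on the external cluster before the project is created. The following is a purely illustrative sketch of such provisioning with the `confluent-kafka` Python client; the broker address mirrors the example properties above, while the credentials, topic name, partition count, and replication factor are placeholders for your own setup (any Kafka admin tooling works equally well):

```python
# Illustrative sketch only: pre-provision a project topic on the external
# Kafka cluster, since Hopsworks will not create it when an external cluster
# is used. All connection details and names below are placeholders.
from confluent_kafka.admin import AdminClient, NewTopic

admin = AdminClient({
    "bootstrap.servers": "cluster_identifier.us-east-2.aws.confluent.cloud:9092",
    "security.protocol": "SASL_SSL",
    "sasl.mechanism": "PLAIN",
    "sasl.username": "username",
    "sasl.password": "password",
})

# Use the same topic name that you later enter when creating the project
# and that you list under `onlinefs/kafka_consumers/topic_list`.
futures = admin.create_topics(
    [NewTopic("my_project_topic", num_partitions=3, replication_factor=3)]
)
for topic, future in futures.items():
    future.result()  # raises an exception if the topic could not be created
    print(f"Provisioned topic: {topic}")
```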

@@ -60,7 +63,7 @@ As mentioned above, when configuring Hopsworks to use an external Kafka cluster,

-#### Data Source configuration +#### Data Source configuration Users should create a [Kafka Data Source](../../user_guides/fs/data_source/creation/kafka.md) named `kafka_connector` which is going to be used by the feature store clients to configure the necessary Kafka producers to send data. The configuration is done for each project to ensure its members have the necessary authentication/authorization. diff --git a/docs/templates/python/material/attribute.html.jinja b/docs/templates/python/material/attribute.html.jinja new file mode 100644 index 000000000..8e0f9b2f9 --- /dev/null +++ b/docs/templates/python/material/attribute.html.jinja @@ -0,0 +1,12 @@ +{% extends "_base/attribute.html.jinja" %} + +{% block heading scoped %} + {% block source_link scoped %} + {% if config.link_source and attribute.source_link %} + [source] + {% endif %} + {% endblock source_link %} + + {{ super() }} + +{% endblock heading %} diff --git a/docs/templates/python/material/class.html.jinja b/docs/templates/python/material/class.html.jinja new file mode 100644 index 000000000..76b687016 --- /dev/null +++ b/docs/templates/python/material/class.html.jinja @@ -0,0 +1,12 @@ +{% extends "_base/class.html.jinja" %} + +{% block heading scoped %} + {% block source_link scoped %} + {% if config.link_source and class.source_link %} + [source] + {% endif %} + {% endblock source_link %} + + {{ super() }} + +{% endblock heading %} diff --git a/docs/templates/python/material/function.html.jinja b/docs/templates/python/material/function.html.jinja new file mode 100644 index 000000000..7cd8a95c1 --- /dev/null +++ b/docs/templates/python/material/function.html.jinja @@ -0,0 +1,12 @@ +{% extends "_base/function.html.jinja" %} + +{% block heading scoped %} + {% block source_link scoped %} + {% if config.link_source and function.source_link %} + [source] + {% endif %} + {% endblock source_link %} + + {{ super() }} + +{% endblock heading %} diff --git a/docs/templates/python/material/module.html.jinja b/docs/templates/python/material/module.html.jinja new file mode 100644 index 000000000..a4a624c85 --- /dev/null +++ b/docs/templates/python/material/module.html.jinja @@ -0,0 +1,12 @@ +{% extends "_base/module.html.jinja" %} + +{% block heading scoped %} + {% block source_link scoped %} + {% if config.link_source and module.source_link %} + [source] + {% endif %} + {% endblock source_link %} + + {{ super() }} + +{% endblock heading %} diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index b1d4d077d..ba806925e 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -5,14 +5,18 @@ hide: # Tutorials -We are happy to welcome you to our collection of tutorials dedicated to exploring the fundamentals of Hopsworks and Machine Learning development. In addition to offering different types of use cases and common subjects in the field, it facilitates navigation and use of models in a production environment using Hopsworks Feature Store. +We are happy to welcome you to our collection of tutorials dedicated to exploring the fundamentals of Hopsworks and Machine Learning development. +In addition to offering different types of use cases and common subjects in the field, it facilitates navigation and use of models in a production environment using Hopsworks Feature Store. ## How to run the tutorials -In order to run the tutorials, you will need a Hopsworks account. To do so, go to [app.hopsworks.ai](https://app.hopsworks.ai) and create one. With a managed account, just run the Jupyter notebook from within Hopsworks. 
+In order to run the tutorials, you will need a Hopsworks account. +To do so, go to [app.hopsworks.ai](https://app.hopsworks.ai) and create one. +With a managed account, just run the Jupyter notebook from within Hopsworks. Generally the notebooks contain the information you will need on how to interact with the Hopsworks Platform. -The easiest way to get started is by using [Google Colab](https://colab.research.google.com/) to run the notebooks. However, you can also run them in your local Python environment with Jupyter. +The easiest way to get started is by using [Google Colab](https://colab.research.google.com/) to run the notebooks. +However, you can also run them in your local Python environment with Jupyter. You can find the raw notebook files in our [tutorials repository](https://github.com/logicalclocks/hopsworks-tutorials). ## Fraud Tutorial @@ -20,45 +24,52 @@ You can find the raw notebook files in our [tutorials repository](https://github This is a quick-start of the Hopsworks Feature Store; using a fraud use case we will load data into the feature store, create two feature groups from which we will make a training dataset and train a model. ### Batch + This is a batch use case variant of the fraud tutorial, it will give you a high level view on how to use our python APIs and the UI to navigate the feature groups. -| Notebooks | | -| ----------- | ------------------------------------ | -| 1. [How to load, engineer and create feature groups](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/1_fraud_batch_feature_pipeline.ipynb){:target="_blank"} | +| Notebooks | +| --- | +| 1. [How to load, engineer and create feature groups](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/1_fraud_batch_feature_pipeline.ipynb){:target="_blank"} | | 2. [How to create training datasets](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/2_fraud_batch_training_pipeline.ipynb){:target="_blank"} | -| 3. [How to train a model from the feature store](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/3_fraud_batch_inference.ipynb){:target="_blank"} | +| 3. [How to train a model from the feature store](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/3_fraud_batch_inference.ipynb){:target="_blank"} | ### Online + This is a online use case variant of the fraud tutorial, it is similar to the batch use case, however, in this tutorial you will get introduced to the usage of Feature Groups which are kept in online storage, and how to access single feature vectors from the online storage -at low latency. Additionally, the model will be deployed as a model serving instance, to provide a REST endpoint for real time serving. +at low latency. +Additionally, the model will be deployed as a model serving instance, to provide a REST endpoint for real time serving. -| Notebooks | | -| ----------- | ------------------------------------ | -| 1. [How to load, engineer and create feature groups](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/real-time-ai-systems/fraud_online/1_fraud_online_feature_pipeline.ipynb){:target="_blank"} | +| Notebooks | +| --- | +| 1. [How to load, engineer and create feature groups](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/real-time-ai-systems/fraud_online/1_fraud_online_feature_pipeline.ipynb){:target="_blank"} | | 2. 
[How to create training datasets](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/real-time-ai-systems/fraud_online/2_fraud_online_training_pipeline.ipynb){:target="_blank"} | -| 3. [How to train a model from the feature store and deploying it as a serving instance together with the online feature store](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/real-time-ai-systems/fraud_online/3_fraud_online_inference_pipeline.ipynb){:target="_blank"} | +| 3. [How to train a model from the feature store and deploying it as a serving instance together with the online feature store](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/real-time-ai-systems/fraud_online/3_fraud_online_inference_pipeline.ipynb){:target="_blank"} | ## Churn Tutorial -This is a churn tutorial with the Hopsworks feature store and model serving to build a prediction service. In this tutorial you will get introduced to the usage of Feature Groups which are kept in online storage, and how to access single feature vectors from the online storage -at low latency. Additionally, the model will be deployed as a model serving instance, to provide a REST endpoint for real time serving. +This is a churn tutorial with the Hopsworks feature store and model serving to build a prediction service. +In this tutorial you will get introduced to the usage of Feature Groups which are kept in online storage, and how to access single feature vectors from the online storage +at low latency. +Additionally, the model will be deployed as a model serving instance, to provide a REST endpoint for real time serving. -| Notebooks | | -| ----------- | ------------------------------------ | -| 1. How to load, engineer and create feature groups | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/churn/1_churn_feature_pipeline.ipynb){:target="_blank"} | -| 2. How to create training datasets | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/churn/2_churn_training_pipeline.ipynb){:target="_blank"} | -| 3. How to train a model from the feature store and deploying it as a serving instance together with the online feature store | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/churn/3_churn_batch_inference.ipynb){:target="_blank"} | +| Notebooks | | +| --- | --- | +| 1. How to load, engineer and create feature groups | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/churn/1_churn_feature_pipeline.ipynb){:target="_blank"} | +| 2. How to create training datasets | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/churn/2_churn_training_pipeline.ipynb){:target="_blank"} | +| 3. 
How to train a model from the feature store and deploying it as a serving instance together with the online feature store | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/churn/3_churn_batch_inference.ipynb){:target="_blank"} | ## Integration Tutorials -Hopsworks is easily integrated with many tools, especially from the Python world. In this section you will find examples for some popular libraries and services. +Hopsworks is easily integrated with many tools, especially from the Python world. +In this section you will find examples for some popular libraries and services. ### Great Expectations -Great Expectations is a library for data validation. You can use Great Expectations within Hopsworks to validate data which is to be inserted into the feature store, in order to ensure that only high-quality features end up in the feature store. +Great Expectations is a library for data validation. +You can use Great Expectations within Hopsworks to validate data which is to be inserted into the feature store, in order to ensure that only high-quality features end up in the feature store. -| Notebooks | | -| ----------- | ------------------------------------ | +| Notebooks | | +| --- | --- | | 1. A brief introduction to Great Expectations concepts which are relevant for integration with the Hopsworks MLOps platform | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/Great_Expectations_Hopsworks_Concepts.ipynb){:target="_blank"} | | 2. How to integrate Great Expectations seamlessly with your Hopsworks feature pipelines | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb){:target="_blank"} | @@ -68,8 +79,8 @@ Weights and Biases is a developer tool for machine learning model training that This tutorial is a variant of the batch fraud tutorial using Weights and Biases for model training, tracking and as model registry. -| Notebooks | | -| ----------- | ------------------------------------ | -| 1. How to load, engineer and create feature groups | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/wandb/1_feature_groups.ipynb){:target="_blank"} | -| 2. How to create training datasets | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/wandb/2_feature_view_creation.ipynb){:target="_blank"} | -| 3. How to train a model from the feature store and use Weights and Biases to track the process | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/wandb/3_model_training.ipynb){:target="_blank"} | +| Notebooks | | +| --- | --- | +| 1. 
How to load, engineer and create feature groups | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/wandb/1_feature_groups.ipynb){:target="_blank"} | +| 2. How to create training datasets | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/wandb/2_feature_view_creation.ipynb){:target="_blank"} | +| 3. How to train a model from the feature store and use Weights and Biases to track the process | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/wandb/3_model_training.ipynb){:target="_blank"} | diff --git a/docs/user_guides/client_installation/index.md b/docs/user_guides/client_installation/index.md index 4c9747aa2..d5ce91667 100644 --- a/docs/user_guides/client_installation/index.md +++ b/docs/user_guides/client_installation/index.md @@ -3,17 +3,19 @@ description: Documentation on how to install the Hopsworks Python and Java libra --- # Client Installation Guide -## Hopsworks Python library +## Hopsworks Python library -The Hopsworks Python client library is required to connect to Hopsworks from your local machine or any other Python environment such as Google Colab or AWS Sagemaker. Execute the following command to install the Hopsworks client library in your Python environment: +The Hopsworks Python client library is required to connect to Hopsworks from your local machine or any other Python environment such as Google Colab or AWS Sagemaker. +Execute the following command to install the Hopsworks client library in your Python environment: !!! note "Virtual environment" It is recommended to use a virtual python environment instead of the system environment used by your operating system, in order to avoid any side effects regarding interfering dependencies. !!! attention "Windows/Conda Installation" - On Windows systems you might need to install twofish manually before installing hopsworks, if you don't have the Microsoft Visual C++ Build Tools installed. In that case, it is recommended to use a conda environment and run the following commands: - + On Windows systems you might need to install twofish manually before installing hopsworks, if you don't have the Microsoft Visual C++ Build Tools installed. +In that case, it is recommended to use a conda environment and run the following commands: + ```bash conda install twofish pip install hopsworks[python] @@ -22,18 +24,19 @@ The Hopsworks Python client library is required to connect to Hopsworks from you ```bash pip install hopsworks[python] ``` + Supported versions of Python: 3.9, 3.10, 3.11, 3.12, 3.13 ([PyPI ↗](https://pypi.org/project/hopsworks/)) ### Profiles The Hopsworks library has several profiles that bring additional dependencies and enable additional functionalities: -| Profile Name | Description | -| ------------------ | ------------- | -| No Profile | This is the base installation. Supports interacting with the feature store metadata, model registry and deployments. It also supports reading and writing from the feature store from PySpark environments. | -| `python` | This profile enables reading and writing from/to the feature store from a Python environment | +| Profile Name | Description | +| --- | --- | +| No Profile | This is the base installation. 
Supports interacting with the feature store metadata, model registry and deployments. It also supports reading and writing from the feature store from PySpark environments. | +| `python` | This profile enables reading and writing from/to the feature store from a Python environment | | `great-expectations` | This profile installs the [Great Expectations](https://greatexpectations.io/) Python library and enables data validation on feature pipelines | -| `polars` | This profile installs the [Polars](https://pola.rs/) library and enables reading and writing Polars DataFrames | +| `polars` | This profile installs the [Polars](https://pola.rs/) library and enables reading and writing Polars DataFrames | You can install all the above profiles with the following command: @@ -41,17 +44,20 @@ You can install all the above profiles with the following command: pip install hopsworks[python,great-expectations,polars] ``` -## HSFS Java Library: +## HSFS Java Library If you want to interact with the Hopsworks Feature Store from environments such as Spark, Flink or Beam, you can use the Hopsworks Feature Store (HSFS) Java library. -!!!note "Feature Store Only" +!!! note "Feature Store Only" - The Java library only allows interaction with the Feature Store component of the Hopsworks platform. Additionally each environment might restrict the supported API operation. You can see which API operation is supported by which environment [here](../fs/compute_engines) + The Java library only allows interaction with the Feature Store component of the Hopsworks platform. + Additionally each environment might restrict the supported API operation. + You can see which API operation is supported by which environment [here](../fs/compute_engines.md) -The HSFS library is available on the Hopsworks' Maven repository. If you are using Maven as build tool, you can add the following in your `pom.xml` file: +The HSFS library is available on the Hopsworks' Maven repository. +If you are using Maven as build tool, you can add the following in your `pom.xml` file: -``` +```xml Hops @@ -67,13 +73,13 @@ The HSFS library is available on the Hopsworks' Maven repository. If you are usi ``` -The library has different builds targeting different environments: +The library has different builds targeting different environments: ### HSFS Java The `artifactId` for the HSFS Java build is `hsfs`, if you are using Maven as build tool, you can add the following dependency: -``` +```xml com.logicalclocks hsfs @@ -81,12 +87,11 @@ The `artifactId` for the HSFS Java build is `hsfs`, if you are using Maven as bu ``` - ### Spark The `artifactId` for the Spark build is `hsfs-spark-spark{spark.version}`, if you are using Maven as build tool, you can add the following dependency: -``` +```xml com.logicalclocks hsfs-spark-spark3.1 @@ -100,7 +105,7 @@ Hopsworks provides builds for Spark 3.1, 3.3 and 3.5. 
The builds are also provid The `artifactId` for the Flink build is `hsfs-flink`, if you are using Maven as build tool, you can add the following dependency: -``` +```xml com.logicalclocks hsfs-flink @@ -112,7 +117,7 @@ The `artifactId` for the Flink build is `hsfs-flink`, if you are using Maven as The `artifactId` for the Beam build is `hsfs-beam`, if you are using Maven as build tool, you can add the following dependency: -``` +```xml com.logicalclocks hsfs-beam @@ -126,5 +131,5 @@ If you are using a local python environment and want to connect to Hopsworks, yo ## Other environments -The Hopsworks Feature Store client libraries can also be installed in external environments, such as Databricks, AWS Sagemaker, or Azure Machine Learning. For more information, see [Client Integrations](../integrations/index.md). - +The Hopsworks Feature Store client libraries can also be installed in external environments, such as Databricks, AWS Sagemaker, or Azure Machine Learning. +For more information, see [Client Integrations](../integrations/index.md). diff --git a/docs/user_guides/fs/compute_engines.md b/docs/user_guides/fs/compute_engines.md index 26e44acef..3c721c0f5 100644 --- a/docs/user_guides/fs/compute_engines.md +++ b/docs/user_guides/fs/compute_engines.md @@ -10,7 +10,7 @@ As such, Hopsworks supports five computational engines: 2. [Python](https://www.python.org/): For pure Python environments without dependencies on Spark, Hopsworks supports [Pandas Dataframes](https://pandas.pydata.org/) and [Polars Dataframes](https://pola.rs/). 3. [Apache Flink](https://flink.apache.org): Flink Data Streams are currently supported as an experimental feature from Java/Scala environments. 4. [Apache Beam](https://beam.apache.org/) *experimental*: Beam Data Streams are currently supported as an experimental feature from Java/Scala environments. -5. [Java](https://www.java.com): For pure Java environments without dependencies on Spark, Hopsworks supports writing using List of POJO Objects. +5. [Java](https://www.java.com): For pure Java environments without dependencies on Spark, Hopsworks supports writing using List of POJO Objects. Hopsworks supports running [compute on the platform itself](../../concepts/dev/inside.md) in the form of [Jobs](../projects/jobs/pyspark_job.md) or in [Jupyter Notebooks](../projects/jupyter/python_notebook.md). Alternatively, you can also connect to Hopsworks using Python or Spark from [external environments](../../concepts/dev/outside.md), given that there is network connectivity. @@ -19,65 +19,72 @@ Alternatively, you can also connect to Hopsworks using Python or Spark from [ext Hopsworks is aiming to provide functional parity between the computational engines, however, there are certain Hopsworks functionalities which are exclusive to the engines. 
-| Functionality | Method | Spark | Python | Flink | Beam | Java | Comment | -| ----------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------ | ---------------------- | ------------------ | ------------------ |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Feature Group Creation from dataframes | [`FeatureGroup.create_feature_group()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#create_feature_group) | :white_check_mark: | :white_check_mark: | - | - | - | Currently Flink/Beam/Java doesn't support registering feature group metadata. Thus it needs to be pre-registered before you can write real time features computed by Flink/Beam. | -| Training Dataset Creation from dataframes | [`TrainingDataset.save()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/training_dataset_api/#save) | :white_check_mark: | - | - | - | - | Functionality was deprecated in version 3.0 | -| Data validation using Great Expectations for streaming dataframes | [`FeatureGroup.validate()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#validate)
[`FeatureGroup.insert_stream()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#insert_stream) | - | - | - | - | - | `insert_stream` does not perform any data validation even when a expectation suite is attached. | -| Stream ingestion | [`FeatureGroup.insert_stream()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#insert_stream) | :white_check_mark: | - | :white_check_mark: | :white_check_mark: | :white_check_mark: | Python/Pandas/Polars has currently no notion of streaming. | -| Reading from Streaming Storage Connectors | [`KafkaConnector.read_stream()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/storage_connector_api/#read_stream) | :white_check_mark: | - | - | - | - | Python/Pandas/Polars has currently no notion of streaming. For Flink/Beam/Java only write operations are supported | -| Reading training data from external storage other than S3 | [`FeatureView.get_training_data()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#get_training_data) | :white_check_mark: | - | - | - | - | Reading training data that was written to external storage using a Storage Connector other than S3 can currently not be read using HSFS APIs, instead you will have to use the storage's native client. | -| Reading External Feature Groups into Dataframe | [`ExternalFeatureGroup.read()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/external_feature_group_api/#read) | :white_check_mark: | - | - | - | - | Reading an External Feature Group directly into a Pandas/Polars Dataframe is not supported, however, you can use the [Query API](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/query_api/) to create Feature Views/Training Data containing External Feature Groups. | -| Read Queries containing External Feature Groups into Dataframe | [`Query.read()`](https://docs.hopsworks.ai/feature-store-api/{{{ hopsworks_version }}}/generated/api/query_api/#read) | :white_check_mark: | - | - | - | - | Reading a Query containing an External Feature Group directly into a Pandas/Polars Dataframe is not supported, however, you can use the Query to create Feature Views/Training Data and write the data to a Storage Connector, from where you can read up the data into a Pandas/Polars Dataframe. | +| Functionality | Method | Spark | Python | Flink | Beam | Java | Comment | +| --- | --- | --- | --- | --- | --- | --- | --- | +| Feature Group Creation from dataframes | [`FeatureStore.create_feature_group`][hsfs.feature_store.FeatureStore.create_feature_group] | :white_check_mark: | :white_check_mark: | - | - | - | Currently Flink/Beam/Java doesn't support registering feature group metadata. Thus it needs to be pre-registered before you can write real time features computed by Flink/Beam. | +| Training Dataset Creation from dataframes | [`TrainingDataset.save`][hsfs.training_dataset.TrainingDataset.save] | :white_check_mark: | - | - | - | - | Functionality was deprecated in version 3.0 | +| Data validation using Great Expectations for streaming dataframes | [`FeatureGroup.validate`][hsfs.feature_group.FeatureGroup.validate]
[`FeatureGroup.insert_stream`][hsfs.feature_group.FeatureGroup.insert_stream] | - | - | - | - | - | `insert_stream` does not perform any data validation even when a expectation suite is attached. | +| Stream ingestion | [`FeatureGroup.insert_stream`][hsfs.feature_group.FeatureGroup.insert_stream] | :white_check_mark: | - | :white_check_mark: | :white_check_mark: | :white_check_mark: | Python/Pandas/Polars has currently no notion of streaming. | +| Reading from Streaming Storage Connectors | [`KafkaConnector.read_stream`][hsfs.storage_connector.KafkaConnector.read_stream] | :white_check_mark: | - | - | - | - | Python/Pandas/Polars has currently no notion of streaming. For Flink/Beam/Java only write operations are supported | +| Reading training data from external storage other than S3 | [`FeatureView.get_training_data`][hsfs.feature_view.FeatureView.get_training_data] | :white_check_mark: | - | - | - | - | Reading training data that was written to external storage using a Storage Connector other than S3 can currently not be read using HSFS APIs, instead you will have to use the storage's native client. | +| Reading External Feature Groups into Dataframe | [`ExternalFeatureGroup.read`][hsfs.feature_group.ExternalFeatureGroup.read] | :white_check_mark: | - | - | - | - | Reading an External Feature Group directly into a Pandas/Polars Dataframe is not supported, however, you can use the [Query API][hsfs.constructor.query.Query] to create Feature Views/Training Data containing External Feature Groups. | +| Read Queries containing External Feature Groups into Dataframe | [`Query.read`][hsfs.constructor.query.Query.read] | :white_check_mark: | - | - | - | - | Reading a Query containing an External Feature Group directly into a Pandas/Polars Dataframe is not supported, however, you can use the Query to create Feature Views/Training Data and write the data to a Storage Connector, from where you can read up the data into a Pandas/Polars Dataframe. | ## Python -### Inside Hopsworks +### Python Inside Hopsworks -If you are using Spark or Python within Hopsworks, there is no further configuration required. Head over to the [Getting Started Guide](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"}. +If you are using Spark or Python within Hopsworks, there is no further configuration required. +Head over to the [Getting Started Guide](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"}. -### Outside Hopsworks +### Python Outside Hopsworks -Connecting to the Feature Store from any Python environment, such as your local environment or Google Colab, requires setting up an API Key and installing the HSFS Python client library. The [Python integration guide](../integrations/python.md) explains step by step how to connect to the Feature Store from any Python environment. +Connecting to the Feature Store from any Python environment, such as your local environment or Google Colab, requires setting up an API Key and installing the HSFS Python client library. +The [Python integration guide](../integrations/python.md) explains step by step how to connect to the Feature Store from any Python environment. ## Spark -### Inside Hopsworks +### Spark Inside Hopsworks -If you are using Spark or Python within Hopsworks, there is no further configuration required. 
Head over to the [Getting Started Guide](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"}. +If you are using Spark or Python within Hopsworks, there is no further configuration required. +Head over to the [Getting Started Guide](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"}. -### Outside Hopsworks +### Spark Outside Hopsworks -Connecting to the Feature Store from an external Spark cluster, such as Cloudera or Databricks, requires configuring it with the Hopsworks client jars, configuration and certificates. The [Spark integration guide](../integrations/spark.md) explains step by step how to connect to the Feature Store from an external Spark cluster. +Connecting to the Feature Store from an external Spark cluster, such as Cloudera or Databricks, requires configuring it with the Hopsworks client jars, configuration and certificates. +The [Spark integration guide](../integrations/spark.md) explains step by step how to connect to the Feature Store from an external Spark cluster. ## Flink -### Inside Hopsworks +### Flink Inside Hopsworks -If you are using Flink within Hopsworks, there is no further configuration required. For more details head over to the [Getting Started Guide](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/flink). +If you are using Flink within Hopsworks, there is no further configuration required. +For more details head over to the [Getting Started Guide](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/flink). -### Outside Hopsworks +### Flink Outside Hopsworks -Connecting to the Feature Store from an external Flink cluster, such as GCP DataProc or AWS EMR, requires configuring the Hopsworks certificates. The [Flink integration guide](../integrations/flink.md) explains step by step how to connect to the Feature Store from an external Flink cluster. +Connecting to the Feature Store from an external Flink cluster, such as GCP DataProc or AWS EMR, requires configuring the Hopsworks certificates. +The [Flink integration guide](../integrations/flink.md) explains step by step how to connect to the Feature Store from an external Flink cluster. ## Beam -### Inside Hopsworks +### Beam Inside Hopsworks Beam is only supported as an external client. -### Outside Hopsworks +### Beam Outside Hopsworks -Connecting to the Feature Store from Beam DataFlowRunner, requires configuring the Hopsworks certificates. The [Beam integration guide](../integrations/beam.md) explains step by step how to connect to the Feature Store from Beam Dataflow Runner. +Connecting to the Feature Store from Beam DataFlowRunner, requires configuring the Hopsworks certificates. +The [Beam integration guide](../integrations/beam.md) explains step by step how to connect to the Feature Store from Beam Dataflow Runner. !!! warning Apache Beam integration with Hopsworks feature store was only tested using Dataflow Runner. - For more details head over to the [Getting Started Guide](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/beam). ## Java + It is also possible to interact to Hopsworks feature store using pure Java environments without dependencies on Spark, Flink or Beam. For more details head over to the [Getting Started Guide](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/java). 
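To make the "Outside Hopsworks" Python path above concrete, a minimal connection sketch with the `hopsworks` client library is shown below; the host, project name, API key, and feature group name are placeholders, and the Python integration guide linked above covers how to create an API key:

```python
# Minimal sketch: connect to the Feature Store from an external Python
# environment. Host, project, API key and feature group name are placeholders.
import hopsworks

project = hopsworks.login(
    host="my-instance.cloud.hopsworks.ai",
    project="my_project",
    api_key_value="YOUR_API_KEY",
)

fs = project.get_feature_store()
fg = fs.get_feature_group("transactions", version=1)
df = fg.read()  # returns a Pandas DataFrame when using the Python engine
print(df.head())
```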
diff --git a/docs/user_guides/fs/data_source/creation/adls.md b/docs/user_guides/fs/data_source/creation/adls.md index d0d7400ac..367e75e69 100644 --- a/docs/user_guides/fs/data_source/creation/adls.md +++ b/docs/user_guides/fs/data_source/creation/adls.md @@ -2,19 +2,24 @@ ## Introduction -Azure Data Lake Storage (ADLS) Gen2 is a HDFS-compatible filesystem on Azure for data analytics. The ADLS Gen2 filesystem stores its data in Azure Blob storage, ensuring low-cost storage, high availability, and disaster recovery. In Hopsworks, you can access ADLS Gen2 by defining a Data Source and creating and granting permissions to a service principal. +Azure Data Lake Storage (ADLS) Gen2 is a HDFS-compatible filesystem on Azure for data analytics. +The ADLS Gen2 filesystem stores its data in Azure Blob storage, ensuring low-cost storage, high availability, and disaster recovery. +In Hopsworks, you can access ADLS Gen2 by defining a Data Source and creating and granting permissions to a service principal. In this guide, you will configure a Data Source in Hopsworks to save all the authentication information needed in order to set up a connection to your Azure ADLS filesystem. -When you're finished, you'll be able to read files using Spark through HSFS APIs. You can also use the connector to write out training data from the Feature Store, in order to make it accessible by third parties. +When you're finished, you'll be able to read files using Spark through HSFS APIs. +You can also use the connector to write out training data from the Feature Store, in order to make it accessible by third parties. !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information from your Azure ADLS account: -- **Data Lake Storage Gen2 Account:** Create an [Azure Data Lake Storage Gen2 account](https://docs.microsoft.com/azure/storage/data-lake-storage/quickstart-create-account) and [initialize a filesystem, enabling the hierarchical namespace](https://docs.microsoft.com/azure/storage/data-lake-storage/namespace). Note that your storage account must belong to an Azure resource group. +- **Data Lake Storage Gen2 Account:** Create an [Azure Data Lake Storage Gen2 account](https://docs.microsoft.com/azure/storage/data-lake-storage/quickstart-create-account) and [initialize a filesystem, enabling the hierarchical namespace](https://docs.microsoft.com/azure/storage/data-lake-storage/namespace). +Note that your storage account must belong to an Azure resource group. - **Azure AD application and service principal:** [Create an Azure AD application and service principal](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal) that can access your ADLS storage account and its resource group. - **Service Principal Registration:** Register the service principal, granting it a role assignment such as Storage Blob Data Contributor, on the Azure Data Lake Storage Gen2 account. @@ -22,6 +27,7 @@ Before you begin this guide you'll need to retrieve the following information fr When you specify the 'container name' in the ADLS data source, you need to have previously created that container - the Hopsworks Feature Store will not create that storage container for you. 
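Once the connector described in the steps below has been created, reading a file through it typically follows the generic data source usage pattern. The following is a hedged sketch from a (Py)Spark environment; the connector name, path, and file format are placeholders, and the exact read call and path layout may differ between library versions (see the usage guide linked at the end of this page):

```python
# Hedged sketch: read a file from ADLS through an existing connector in a
# (Py)Spark environment. Connector name, path and format are placeholders.
import hopsworks

project = hopsworks.login()  # no arguments needed when running inside Hopsworks
fs = project.get_feature_store()

adls = fs.get_storage_connector("my_adls_connector")

# Read a Parquet file into a Spark DataFrame; the path is assumed here to be
# relative to the container configured on the connector.
df = adls.read(data_format="parquet", path="my-folder/my-file.parquet")
df.show(5)
```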
## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -33,7 +39,8 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter ADLS Information -Enter the details for your ADLS connector. Start by giving it a **name** and an optional **description**. +Enter the details for your ADLS connector. +Start by giving it a **name** and an optional **description**.
![ADLS Connector Creation](../../../../assets/images/guides/fs/data_source/adls_creation.png) @@ -50,7 +57,9 @@ Enter the details for your ADLS connector. Start by giving it a **name** and an ### Step 3: Azure Create an ADLS Resource -When programmatically signing in, you need to pass the tenant ID with your authentication request and the application ID. You also need a certificate or an authentication key (described in the following section). To get those values, use the following steps: +When programmatically signing in, you need to pass the tenant ID with your authentication request and the application ID. +You also need a certificate or an authentication key (described in the following section). +To get those values, use the following steps: 1. Select Azure Active Directory. 2. From App registrations in Azure AD, select your application. @@ -74,14 +83,16 @@ When programmatically signing in, you need to pass the tenant ID with your authe #### Common Problems -If you get a permission denied error when writing or reading to/from a ADLS container, it is often because the storage principal (app) does not have the correct permissions. Have you added the "Storage Blob Data Owner" or "Storage Blob Data Contributor" role to the resource group for your storage account (or the subscription for your storage group, if you apply roles at the subscription level)? Go to your resource group, then in "Access Control (IAM)", click the "Add" button to add a "role assignment". +If you get a permission denied error when writing or reading to/from a ADLS container, it is often because the storage principal (app) does not have the correct permissions. +Have you added the "Storage Blob Data Owner" or "Storage Blob Data Contributor" role to the resource group for your storage account (or the subscription for your storage group, if you apply roles at the subscription level)? +Go to your resource group, then in "Access Control (IAM)", click the "Add" button to add a "role assignment". If you get an error "StatusCode=404 StatusDescription=The specified filesystem does not exist.", then maybe you have not created the storage account or the storage container. #### References -* [How to create a service principal on Azure](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal) +- [How to create a service principal on Azure](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal) ## Next Steps -Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created ADLS connector. \ No newline at end of file +Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created ADLS connector. diff --git a/docs/user_guides/fs/data_source/creation/bigquery.md b/docs/user_guides/fs/data_source/creation/bigquery.md index fa4c563c1..936bb30a4 100644 --- a/docs/user_guides/fs/data_source/creation/bigquery.md +++ b/docs/user_guides/fs/data_source/creation/bigquery.md @@ -3,38 +3,34 @@ ## Introduction A BigQuery data source provides integration to Google Cloud BigQuery. -BigQuery is Google Cloud's managed data warehouse supporting that lets you run analytics and -execute SQL queries over large scale data. Such data warehouses are often the source of raw data for feature -engineering pipelines. +BigQuery is Google Cloud's managed data warehouse supporting that lets you run analytics and execute SQL queries over large scale data. 
+Such data warehouses are often the source of raw data for feature engineering pipelines. -In this guide, you will configure a Data Source in Hopsworks to connect to your BigQuery project by saving the -necessary information. -When you're finished, you'll be able to execute queries and read results of BigQuery using Spark through -HSFS APIs. +In this guide, you will configure a Data Source in Hopsworks to connect to your BigQuery project by saving the necessary information. +When you're finished, you'll be able to execute queries and read results of BigQuery using Spark through HSFS APIs. The data source uses the Google `spark-bigquery-connector` behind the scenes. -To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage -'github.com/GoogleCloudDataproc/spark-bigquery-connector') +To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage 'github.com/GoogleCloudDataproc/spark-bigquery-connector') !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information about your GCP account: -- **BigQuery Project:** You need a BigQuery project, dataset and table created and have read access to it. Or, if - you wish to query a public dataset you need its corresponding details. -- **Authentication Method:** Authentication to GCP account is handled by uploading the `JSON keyfile for service - account` to the Hopsworks Project. You will need to create this JSON keyfile from GCP. For more information on - service accounts - and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account - 'creating service account keyfile') +- **BigQuery Project:** You need a BigQuery project, dataset and table created and have read access to it. + Or, if you wish to query a public dataset you need its corresponding details. +- **Authentication Method:** Authentication to GCP account is handled by uploading the `JSON keyfile for service account` to the Hopsworks Project. + You will need to create this JSON keyfile from GCP. + For more information on service accounts and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account 'creating service account keyfile') !!! note To read data, the BigQuery service account user needs permission to `create read sesssion` which is available in **BigQuery Admin role**. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -44,11 +40,10 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2).
The Data Source View in the User Interface
- ### Step 2: Enter source details -Enter the details for your BigQuery storage. Start by giving -it a unique **name** and an optional -**description**. + +Enter the details for your BigQuery storage. +Start by giving it a unique **name** and an optional **description**.
![BigQuery Creation](../../../../assets/images/guides/fs/data_source/bigquery_creation.png) @@ -56,25 +51,26 @@ it a unique **name** and an optional
1. Select "Google BigQuery" as the storage. -2. Next, set the name of the parent BigQuery project. This is used for billing by GCP. -3. Authentication: Here you should upload your `JSON keyfile for service - account` used for authentication. You can choose to either - upload from your local using `Upload new file` or choose an existing file within project using `From Project`. +2. Next, set the name of the parent BigQuery project. + This is used for billing by GCP. +3. Authentication: Here you should upload your `JSON keyfile for service account` used for authentication. + You can choose to either upload from your local using `Upload new file` or choose an existing file within project using `From Project`. 4. Read Options: - In the UI set the below fields, - 1. *BigQuery Project*: The BigQuery project to read - 2. *BigQuery Dataset*: The dataset of the table (Optional) - 3. *BigQuery Table*: The table to read (Optional) - - -!!! note - *Materialization Dataset*: Temporary dataset used by BigQuery for writing. It must be set to a dataset where the GCP user has table creation permission. The queried table must be in the same location as the `materializationDataset` (e.g 'EU' or 'US'). Also, if a table in the `SQL statement` is from project other than the `parentProject` then use the fully qualified table name i.e. `[project].[dataset].[table]` - (Read more details from Google documentation on usage of query for BigQuery spark connector [here](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#reading-data-from-a-bigquery-query)). + In the UI set the below fields, + 1. *BigQuery Project*: The BigQuery project to read + 2. *BigQuery Dataset*: The dataset of the table (Optional) + 3. *BigQuery Table*: The table to read (Optional) + + !!! note + *Materialization Dataset*: Temporary dataset used by BigQuery for writing. + It must be set to a dataset where the GCP user has table creation permission. + The queried table must be in the same location as the `materializationDataset` (e.g 'EU' or 'US'). + Also, if a table in the `SQL statement` is from project other than the `parentProject` then use the fully qualified table name i.e. `[project].[dataset].[table]`. + For details, read the Google documentation on [usage of query for BigQuery Spark connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#reading-data-from-a-bigquery-query). 5. Spark Options: Optionally, you can set additional spark options using the `Key - Value` pairs. -6. Click on "Save Credentials". +6. Click on "Save Credentials". ## Next Steps -Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created BigQuery -connector. \ No newline at end of file +Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created BigQuery connector. diff --git a/docs/user_guides/fs/data_source/creation/gcs.md b/docs/user_guides/fs/data_source/creation/gcs.md index c34b6571c..587f40859 100644 --- a/docs/user_guides/fs/data_source/creation/gcs.md +++ b/docs/user_guides/fs/data_source/creation/gcs.md @@ -2,40 +2,40 @@ ## Introduction -This particular type of Data Source provides integration to Google Cloud Storage (GCS). GCS is -an object storage service offered by Google Cloud. An object could be simply any piece -of immutable data consisting of a file of any format, for example a `CSV` or `PARQUET`. These objects are stored in -containers called as `buckets`. 
+This particular type of Data Source provides integration to Google Cloud Storage (GCS). +GCS is an object storage service offered by Google Cloud. +An object could be simply any piece of immutable data consisting of a file of any format, for example a `CSV` or `PARQUET`. +These objects are stored in containers called `buckets`. These types of storages are often the source for raw data from which features can be engineered. -In this guide, you will configure a Data Source in Hopsworks to connect to your GCS bucket by saving the -necessary information. +In this guide, you will configure a Data Source in Hopsworks to connect to your GCS bucket by saving the necessary information. When you're finished, you'll be able to read files from the GCS bucket using Spark through HSFS APIs. -The Data Source uses the Google `gcs-connector-hadoop` behind the scenes. For more information, check out [Google Cloud Data Source for Spark and Hadoop]( -https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop') +The Data Source uses the Google `gcs-connector-hadoop` behind the scenes. +For more information, check out [Google Cloud Data Source for Spark and Hadoop](https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop'). !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information about your GCP account and bucket: -- **Bucket:** You need a GCS bucket created and have read access to it. The bucket is identified by its name. -- **Authentication Method:** Authentication to GCP account is handled by uploading the `JSON keyfile for service - account` to the Hopsworks Project. You will need to create this JSON keyfile from GCP. For more information on - service accounts - and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account - 'creating service account keyfile') -- **Server-side Encryption** GCS encrypts the data on server side by default. The connector additionally supports the - optional encryption method `Customer Supplied Encryption Key` by GCP. You can choose the encryption option `AES-256` and provide AES-256 key and hash, encoded in - standard Base64. The encryption details are stored as [Secrets](../../../projects/secrets/create_secret.md) - in the Hopsworks for keeping it secure. +- **Bucket:** You need a GCS bucket created and read access to it. + The bucket is identified by its name. +- **Authentication Method:** Authentication to the GCP account is handled by uploading the `JSON keyfile for service account` to the Hopsworks Project. + You will need to create this JSON keyfile from GCP. + For more information on service accounts and creating a keyfile in GCP, read the [Google Cloud documentation](https://cloud.google.com/docs/authentication/production#create_service_account 'creating service account keyfile'). +- **Server-side Encryption:** GCS encrypts the data on the server side by default.
+ The connector additionally supports the optional encryption method `Customer Supplied Encryption Key` by GCP. + You can choose the encryption option `AES-256` and provide the AES-256 key and hash, encoded in standard Base64. + The encryption details are stored as [Secrets](../../../projects/secrets/create_secret.md) in Hopsworks to keep them secure. Read more about encryption on [Google Documentation.](https://cloud.google.com/storage/docs/encryption/customer-supplied-keys) ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -47,9 +47,8 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter connector details -Enter the details for your GCS connector. Start by giving -it a unique **name** and an optional -**description**. +Enter the details for your GCS connector. +Start by giving it a unique **name** and an optional **description**.
![GCS Connector Creation](../../../../assets/images/guides/fs/data_source/gcs_creation.png) @@ -58,14 +57,13 @@ it a unique **name** and an optional 1. Select `Google Cloud Storage` as the storage. 2. Next, set the name of the GCS Bucket you wish to connect with. -3. Authentication: Here you should upload your `JSON keyfile for service - account` used for authentication. You can choose to either - upload from your local using `Upload new file` or choose an existing file within project using `From Project`. -4. GCS Server Side Encryption: You can leave this to `Default Encryption` if you do not wish to provide explicit encrypting keys. Otherwise, -optionally you can set the encryption setting for `AES-256` and provide the encryption key and hash when selected. +3. Authentication: Here you should upload your `JSON keyfile for service account` used for authentication. + You can choose to either upload from your local machine using `Upload new file` or choose an existing file within the project using `From Project`. +4. GCS Server Side Encryption: You can leave this as `Default Encryption` if you do not wish to provide explicit encryption keys. + Otherwise, optionally you can set the encryption setting for `AES-256` and provide the encryption key and hash when selected. 5. Click on `Save Credentials`. ## Next Steps -Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created GCS -connector. \ No newline at end of file +Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created GCS +connector. diff --git a/docs/user_guides/fs/data_source/creation/hopsfs.md b/docs/user_guides/fs/data_source/creation/hopsfs.md index 3d89946b5..b12a03f88 100644 --- a/docs/user_guides/fs/data_source/creation/hopsfs.md +++ b/docs/user_guides/fs/data_source/creation/hopsfs.md @@ -2,19 +2,26 @@ ## Introduction -HopsFS is a HDFS-compatible filesystem on AWS/Azure/on-premises for data analytics. HopsFS stores its data on object storage in the cloud (S3 in AWs and Blob storage on Azure) and on commodity servers on-premises, ensuring low-cost storage, high availability, and disaster recovery. In Hopsworks, you can access HopsFS natively in programs (Spark, TensorFlow, etc) without the need to define a Data Source. By default, every Project has a Data Source for Training Datasets. When you create training datasets from features in the Feature Store the HopsFS connector is the default Data Source. However, if you want to output data to a different dataset, you can define a new Data Source for that dataset. +HopsFS is an HDFS-compatible filesystem on AWS/Azure/on-premises for data analytics. +HopsFS stores its data on object storage in the cloud (S3 in AWS and Blob storage on Azure) and on commodity servers on-premises, ensuring low-cost storage, high availability, and disaster recovery. +In Hopsworks, you can access HopsFS natively in programs (Spark, TensorFlow, etc.) without the need to define a Data Source. +By default, every Project has a Data Source for Training Datasets. +When you create training datasets from features in the Feature Store, the HopsFS connector is the default Data Source. +However, if you want to output data to a different dataset, you can define a new Data Source for that dataset. In this guide, you will configure a HopsFS Data Source in Hopsworks which points at a different directory on the file system than the Training Datasets directory.
When you're finished, you'll be able to write training data to different locations in your cluster through HSFS APIs. !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to identify a **directory on the filesystem** of Hopsworks, to which you want to point the Data Source that you are going to create. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -26,11 +33,12 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter HopsFS Settings -Enter the details for your HopsFS connector. Start by giving it a **name** and an optional **description**. +Enter the details for your HopsFS connector. +Start by giving it a **name** and an optional **description**. 1. Select "HopsFS" as the storage. 2. Select the top-level dataset to point the connector to. -3. Click on "Save Credentials". +3. Click on "Save Credentials".
![HopsFS Connector Creation](../../../../assets/images/guides/fs/data_source/hopsfs_creation.png) @@ -39,4 +47,4 @@ Enter the details for your HopsFS connector. Start by giving it a **name** and a ## Next Steps -Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created HopsFS connector. \ No newline at end of file +Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created HopsFS connector. diff --git a/docs/user_guides/fs/data_source/creation/jdbc.md b/docs/user_guides/fs/data_source/creation/jdbc.md index b1fa4324c..ec4e8e080 100644 --- a/docs/user_guides/fs/data_source/creation/jdbc.md +++ b/docs/user_guides/fs/data_source/creation/jdbc.md @@ -2,27 +2,33 @@ ## Introduction -JDBC is an API provided by many database systems. Using JDBC connections one can query and update data in a database, usually oriented towards relational databases. Examples of databases you can connect to using JDBC are MySQL, Postgres, Oracle, DB2, MongoDB or Microsoft SQLServer. +JDBC is an API provided by many database systems. +Using JDBC connections one can query and update data in a database, usually oriented towards relational databases. +Examples of databases you can connect to using JDBC are MySQL, Postgres, Oracle, DB2, MongoDB or Microsoft SQLServer. In this guide, you will configure a Data Source in Hopsworks to save all the authentication information needed in order to set up a JDBC connection to your database of choice. When you're finished, you'll be able to query the database using Spark through HSFS APIs. !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information from your JDBC compatible database: -- **JDBC Connection URL:** Consult the documentation of your target database to determine the correct JDBC URL and parameters. As an example, for MySQL the URL could be: +- **JDBC Connection URL:** Consult the documentation of your target database to determine the correct JDBC URL and parameters. + As an example, for MySQL the URL could be: -``` +```plaintext jdbc:mysql://10.0.2.15:3306/[databaseName]?useSSL=false&allowPublicKeyRetrieval=true ``` -- **Username and Password:** Typically, you will need to add username and password in your JDBC URL or as key/value parameters. So make sure you have retrieved a username and password with the suitable permissions for the database and table you want to query. +- **Username and Password:** Typically, you will need to add username and password in your JDBC URL or as key/value parameters. + So make sure you have retrieved a username and password with the suitable permissions for the database and table you want to query. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -42,21 +48,23 @@ Enter the details for your JDBC enabled database.
1. Select "JDBC" as the storage. -2. Enter the JDBC connection url. This can for example also contain the username and password. -3. Add additional key/value arguments to be passed to the connection, such as username or password. These might differ by database. - +2. Enter the JDBC connection url. + This can for example also contain the username and password. +3. Add additional key/value arguments to be passed to the connection, such as username or password. + These might differ by database. - !!! note - Driver class name is a mandatory argument even if using the default MySQL driver. Add it by specifying a property with the name `driver` and class name as value. The driver class name will differ based on the database. For MySQL databases, the class name is `com.mysql.cj.jdbc.Driver`, as shown in the example image. - - -4. Click on "Save Credentials". + !!! note + Driver class name is a mandatory argument even if using the default MySQL driver. + Add it by specifying a property with the name `driver` and class name as value. + The driver class name will differ based on the database. + For MySQL databases, the class name is `com.mysql.cj.jdbc.Driver`, as shown in the example image. +4. Click on "Save Credentials". !!! note - To be able to use the connector, you need to upload the driver JAR file to the [Jupyter configuration](../../../projects/jupyter/spark_notebook.md) or [Job configuration](../../../projects/jobs/pyspark_job.md) in `Addtional Jars`. For MySQL connections the default JDBC driver is already included in Hopsworks so this step can be skipped. - + To be able to use the connector, you need to upload the driver JAR file to the [Jupyter configuration](../../../projects/jupyter/spark_notebook.md) or [Job configuration](../../../projects/jobs/pyspark_job.md) in `Addtional Jars`. + For MySQL connections the default JDBC driver is already included in Hopsworks so this step can be skipped. ## Next Steps -Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created JDBC connector. \ No newline at end of file +Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created JDBC connector. diff --git a/docs/user_guides/fs/data_source/creation/kafka.md b/docs/user_guides/fs/data_source/creation/kafka.md index 51c4aa03d..4c68b5413 100644 --- a/docs/user_guides/fs/data_source/creation/kafka.md +++ b/docs/user_guides/fs/data_source/creation/kafka.md @@ -2,23 +2,32 @@ ## Introduction -Apache Kafka is a distributed event store and stream-processing platform. It's a very popular framework for handling realtime data streams and is often used as a message broker for events coming from production systems until they are being processed and either loaded into a data warehouse or aggregated into features for Machine Learning. +Apache Kafka is a distributed event store and stream-processing platform. +It's a very popular framework for handling realtime data streams and is often used as a message broker for events coming from production systems until they are being processed and either loaded into a data warehouse or aggregated into features for Machine Learning. In this guide, you will configure a Data Source in Hopsworks to save all the authentication information needed in order to set up a connection to your Kafka cluster. When you're finished, you'll be able to read from Kafka topics in your cluster using Spark through HSFS APIs. !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. 
You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information from Kafka cluster, the following options are **mandatory**: -- **Kafka Bootstrap servers:** It is the url of one of the Kafka brokers which you give to fetch the initial metadata about your Kafka cluster. The metadata consists of the topics, their partitions, the leader brokers for those partitions etc. Depending upon this metadata your producer or consumer produces or consumes the data. -- **Security Protocol:** The security protocol you want to use to authenticate with your Kafka cluster. Make sure the chosen protocol is supported by your cluster. For an overview of the available protocols, please see the [Confluent Kafka Documentation](https://docs.confluent.io/platform/current/kafka/overview-authentication-methods.html). -- **Certificates:** Depending on the chosen security protocol, you might need TrustStore and KeyStore files along with the corresponding key password. Contact your Kafka administrator, if you don't know how to retrieve these. If you want to setup a data source to Hopsworks' internal Kafka cluster, you can download the needed certificates from the integration tab in your project settings. +- **Kafka Bootstrap servers:** It is the url of one of the Kafka brokers which you give to fetch the initial metadata about your Kafka cluster. + The metadata consists of the topics, their partitions, the leader brokers for those partitions etc. + Depending upon this metadata your producer or consumer produces or consumes the data. +- **Security Protocol:** The security protocol you want to use to authenticate with your Kafka cluster. + Make sure the chosen protocol is supported by your cluster. + For an overview of the available protocols, please see the [Confluent Kafka Documentation](https://docs.confluent.io/platform/current/kafka/overview-authentication-methods.html). +- **Certificates:** Depending on the chosen security protocol, you might need TrustStore and KeyStore files along with the corresponding key password. + Contact your Kafka administrator, if you don't know how to retrieve these. + If you want to setup a data source to Hopsworks' internal Kafka cluster, you can download the needed certificates from the integration tab in your project settings. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -30,35 +39,41 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter Kafka Settings -Enter the details for your Kafka connector. Start by giving it a **name** and an optional **description**. +Enter the details for your Kafka connector. +Start by giving it a **name** and an optional **description**. -1. Select "Kafka" as the storage. -2. Add all the bootstrap server addresses and ports that you want the consumers/producers to connect to. The client will make use of all servers irrespective of which servers are specified here for bootstrapping—this list only impacts the initial hosts used to discover the full set of servers. -3. Choose the Security protocol. +01. Select "Kafka" as the storage. +02. Add all the bootstrap server addresses and ports that you want the consumers/producers to connect to. 
+ The client will make use of all servers irrespective of which servers are specified here for bootstrapping—this list only impacts the initial hosts used to discover the full set of servers. +03. Choose the Security protocol. !!! example "TLS/SSL" - By default, Apache Kafka communicates in `PLAINTEXT`, which means that all data is sent in the clear. To encrypt communication, you should configure all the Confluent Platform components in your deployment to use TLS/SSL encryption. + By default, Apache Kafka communicates in `PLAINTEXT`, which means that all data is sent in the clear. + To encrypt communication, you should configure all the Confluent Platform components in your deployment to use TLS/SSL encryption. TLS uses private-key/certificate pairs, which are used during the TLS handshake process. Each broker needs its own private-key/certificate pair, and the client uses the certificate to authenticate the broker. Each logical client needs a private-key/certificate pair if client authentication is enabled, and the broker uses the certificate to authenticate the client. - These are provided in the form of *TrustStore* and *KeyStore* `JKS` files together with a key password. For more information, refer to the official [Apacha Kafka Guide for TSL/SSL authentication](https://docs.confluent.io/platform/current/kafka/authentication_ssl.html). + These are provided in the form of *TrustStore* and *KeyStore* `JKS` files together with a key password. + For more information, refer to the official [Apache Kafka Guide for TLS/SSL authentication](https://docs.confluent.io/platform/current/kafka/authentication_ssl.html). !!! example "SASL SSL or SASL plaintext" - Apache Kafka brokers support client authentication using SASL. SASL authentication can be enabled concurrently with TLS/SSL encryption (TLS/SSL client authentication will be disabled). + Apache Kafka brokers support client authentication using SASL. + SASL authentication can be enabled concurrently with TLS/SSL encryption (TLS/SSL client authentication will be disabled). - This authentication method often requires extra arguments depending on your setup. Make use of the optional additional key/value arguments (5) to provide these. + This authentication method often requires extra arguments depending on your setup. + Make use of the optional additional key/value arguments (5) to provide these. SASL authentication can be enabled concurrently with TLS/SSL encryption (TLS/SSL client authentication will be disabled). For more information, please refer to the official [Apache Kafka Guide for SASL authentication](https://docs.confluent.io/platform/current/kafka/authentication_sasl/index.html). - - -4. The endpoint identification algorithm used by clients to validate server host name. The default value is `https`. Clients including client connections created by the broker for inter-broker communication verify that the broker host name matches the host name in the broker’s certificate. -5. Optional additional key/value arguments. -6. Click on "Save Credentials". +04. The endpoint identification algorithm used by clients to validate the server host name. + The default value is `https`. + Clients, including client connections created by the broker for inter-broker communication, verify that the broker host name matches the host name in the broker’s certificate. +05. Optional additional key/value arguments. +06. Click on "Save Credentials".
![Kafka Connector Creation](../../../../assets/images/guides/fs/data_source/kafka_creation.png) @@ -67,4 +82,4 @@ Enter the details for your Kafka connector. Start by giving it a **name** and an ## Next Steps -Move on to the [usage guide for Data Sources](../usage.md) to see how you can use your newly created Kafka connector. \ No newline at end of file +Move on to the [usage guide for Data Sources](../usage.md) to see how you can use your newly created Kafka connector. diff --git a/docs/user_guides/fs/data_source/creation/rds.md b/docs/user_guides/fs/data_source/creation/rds.md index 50a38db16..9419ce477 100644 --- a/docs/user_guides/fs/data_source/creation/rds.md +++ b/docs/user_guides/fs/data_source/creation/rds.md @@ -2,19 +2,21 @@ ## Introduction -Amazon RDS (Relational Database Service) is a managed relational database service that supports several popular database engines, such as MySQL, PostgreSQL, Oracle, and Microsoft SQL Server. Using JDBC connections, you can query and update data in your RDS database from Hopsworks. +Amazon RDS (Relational Database Service) is a managed relational database service that supports several popular database engines, such as MySQL, PostgreSQL, Oracle, and Microsoft SQL Server. +Using JDBC connections, you can query and update data in your RDS database from Hopsworks. In this guide, you will configure a Data Source in Hopsworks to securely store the authentication information needed to set up a JDBC connection to your Amazon RDS instance. Once configured, you will be able to query your RDS database. !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin, ensure you have the following information from your Amazon RDS instance: -- **Host:** You can find the endpoint for your RDS instance in the AWS Console. +- **Host:** You can find the endpoint for your RDS instance in the AWS Console. 1. Go to the AWS Console → `Aurora and RDS` 2. Click on your DB instance. @@ -24,11 +26,11 @@ Before you begin, ensure you have the following information from your Amazon RDS mydb.abcdefg1234.us-west-2.rds.amazonaws.com - - **Database:** You can specify which database to use + - **Database:** You can specify which database to use - - **Port:** Provide the port to connect to + - **Port:** Provide the port to connect to - - **Username and Password:** Obtain the username and password for your RDS database with the necessary permissions to access the required tables. + - **Username and Password:** Obtain the username and password for your RDS database with the necessary permissions to access the required tables. ## Creation in the UI @@ -59,4 +61,4 @@ Enter the details for your Amazon RDS database. ## Next Steps -Proceed to the [usage guide for data sources](../usage.md) to learn how to use your newly created connector. \ No newline at end of file +Proceed to the [usage guide for data sources](../usage.md) to learn how to use your newly created connector. 
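To make the JDBC and RDS guides above more concrete, here is a minimal, hedged sketch of reading from such a data source with the HSFS `read` API once the connector has been saved in the UI. The connector name `rds_orders` and the SQL text are illustrative placeholders, not values from the Hopsworks documentation.

```python
import hopsworks

# Connect to the Hopsworks feature store of the current project
project = hopsworks.login()
feature_store = project.get_feature_store()

# Retrieve the data source by the unique name it was given in the UI (placeholder name)
connector = feature_store.get_storage_connector("rds_orders")

# Run a SQL query against the RDS/JDBC database and load the result into a dataframe
df = connector.read(query="SELECT customer_id, amount FROM orders LIMIT 100")
```

The same pattern applies to the plain JDBC connector described earlier; only the connection URL and query dialect differ.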
diff --git a/docs/user_guides/fs/data_source/creation/redshift.md b/docs/user_guides/fs/data_source/creation/redshift.md index 241b32c4e..512001a95 100644 --- a/docs/user_guides/fs/data_source/creation/redshift.md +++ b/docs/user_guides/fs/data_source/creation/redshift.md @@ -2,30 +2,37 @@ ## Introduction -Amazon Redshift is a popular managed data warehouse on AWS, used as a data warehouse in many enterprises. +Amazon Redshift is a popular managed data warehouse on AWS, used as a data warehouse in many enterprises. -Data warehouses are often the source of raw data for feature engineering pipelines and Redshift supports scalable feature computation with SQL. However, Redshift is not viable as an online feature store that serves features to models in production, with its columnar database layout its latency is too high compared to OLTP databases or key-value stores. +Data warehouses are often the source of raw data for feature engineering pipelines and Redshift supports scalable feature computation with SQL. +However, Redshift is not viable as an online feature store that serves features to models in production; with its columnar database layout, its latency is too high compared to OLTP databases or key-value stores. In this guide, you will configure a Data Source in Hopsworks to save all the authentication information needed in order to set up a connection to your AWS Redshift cluster. When you're finished, you'll be able to query the database using Spark through HSFS APIs. !!! note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information from your AWS account and Redshift database, the following options are **mandatory**: - **Cluster identifier:** The name of the cluster. -- **Database endpoint:** The endpoint for the database. Should be in the format of `[UUID].eu-west-1.redshift.amazonaws.com`. +- **Database endpoint:** The endpoint for the database. + Should be in the format of `[UUID].eu-west-1.redshift.amazonaws.com`. - **Database name:** The name of the database to query. -- **Database port:** The port of the cluster. Defaults to 5349. -- **Authentication method:** There are three options available for authenticating with the Redshift cluster. The first option is to configure a username and a password. -The second option is to configure an IAM role. With IAM roles, Jobs or notebooks launched on Hopsworks do not need to explicitly authenticate with Redshift, as the HSFS library will transparently use the IAM role to acquire a temporary credential to authenticate the specified user. -Read more about IAM roles in our [AWS credentials pass-through guide](../../../../setup_installation/admin/roleChaining.md). Lastly, - option `Instance Role` will use the default ARN Role configured for the cluster instance. +- **Database port:** The port of the cluster. + Defaults to 5439. +- **Authentication method:** There are three options available for authenticating with the Redshift cluster. + The first option is to configure a username and a password. + The second option is to configure an IAM role.
+ With IAM roles, Jobs or notebooks launched on Hopsworks do not need to explicitly authenticate with Redshift, as the HSFS library will transparently use the IAM role to acquire a temporary credential to authenticate the specified user. + Read more about IAM roles in our [AWS credentials pass-through guide](../../../../setup_installation/admin/roleChaining.md). + Lastly, option `Instance Role` will use the default ARN Role configured for the cluster instance. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -37,22 +44,23 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter The Connector Information -Enter the details for your Redshift connector. Start by giving it a **name** and an optional **description**. - -1. Select "Redshift" as the storage. -2. The name of the cluster. -3. The database endpoint. Should be in the format `[UUID].eu-west-1.redshift.amazonaws.com`. For example, if the endpoint info - displayed in Redshift is `cluster-id.uuid.eu-north-1.redshift.amazonaws.com:5439/dev` the value to enter - here is just `uuid.eu-north-1.redshift.amazonaws.com` -4. The database name. -5. The database port. -6. The database username, here you have the possibility to let Hopsworks auto-create the username for you. -7. Database Driver (optional): You can use the default JDBC Redshift Driver `com.amazon.redshift.jdbc42.Driver` - included in Hopsworks or set a different driver (More on this later). -8. Optionally provide the database group and table for the connector. A database group is the group created - for the user if applicable. More information, at [redshift documentation](https://docs.aws.amazon.com/redshift/latest/dg/r_Groups.html) -9. Set the appropriate authentication method. -10. Click on "Save Credentials". +Enter the details for your Redshift connector. +Start by giving it a **name** and an optional **description**. + +01. Select "Redshift" as the storage. +02. The name of the cluster. +03. The database endpoint. + Should be in the format `[UUID].eu-west-1.redshift.amazonaws.com`. + For example, if the endpoint info displayed in Redshift is `cluster-id.uuid.eu-north-1.redshift.amazonaws.com:5439/dev` the value to enter here is just `uuid.eu-north-1.redshift.amazonaws.com` +04. The database name. +05. The database port. +06. The database username, here you have the possibility to let Hopsworks auto-create the username for you. +07. Database Driver (optional): You can use the default JDBC Redshift Driver `com.amazon.redshift.jdbc42.Driver` included in Hopsworks or set a different driver (More on this later). +08. Optionally provide the database group and table for the connector. + A database group is the group created for the user if applicable. + More information, at [redshift documentation](https://docs.aws.amazon.com/redshift/latest/dg/r_Groups.html) +09. Set the appropriate authentication method. +10. Click on "Save Credentials".
![Redshift Connector Creation](../../../../assets/images/guides/fs/data_source/redshift_creation.png) @@ -61,18 +69,22 @@ Enter the details for your Redshift connector. Start by giving it a **name** and !!! warning "Session Duration" By default, the session duration that the role will be assumed for is 1 hour or 3600 seconds. - This means if you want to use the data source for example to [read or create an external Feature Group from Redshift](../usage.md##creating-an-external-feature-group), the operation cannot take longer than one hour. + This means if you want to use the data source for example to [read or create an external Feature Group from Redshift](../usage.md#creating-an-external-feature-group), the operation cannot take longer than one hour. - Your administrator can change the default session duration for AWS data sources, by first [increasing the max session duration of the IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html#id_roles_use_view-role-max-session) that you are assuming. And then changing the `fs_data_source_session_duration` [configuration property](../../../../setup_installation/admin/variables.md) to the appropriate value in seconds. + Your administrator can change the default session duration for AWS data sources, by first [increasing the max session duration of the IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html#id_roles_use_view-role-max-session) that you are assuming. + And then changing the `fs_data_source_session_duration` [configuration property](../../../../setup_installation/admin/variables.md) to the appropriate value in seconds. ### Step 3: Upload the Redshift database driver (optional) -The `redshift-jdbc42` JDBC driver is included by default in the Hopsworks distribution. -If you wish to use a different driver, you need to upload it on Hopsworks and add it as a dependency of Jobs and Jupyter Notebooks that need it. First, you need to [download the library](https://docs.aws.amazon.com/redshift/latest/mgmt/jdbc20-download-driver.html). Select the driver version without the AWS SDK. +The `redshift-jdbc42` JDBC driver is included by default in the Hopsworks distribution. +If you wish to use a different driver, you need to upload it on Hopsworks and add it as a dependency of Jobs and Jupyter Notebooks that need it. +First, you need to [download the library](https://docs.aws.amazon.com/redshift/latest/mgmt/jdbc20-download-driver.html). +Select the driver version without the AWS SDK. #### Add the driver to Jupyter Notebooks and Spark jobs -You can now add the driver file to the default job and Jupyter configuration. This way, all jobs and Jupyter instances in the project will have the driver attached so that Spark can access it. +You can now add the driver file to the default job and Jupyter configuration. +This way, all jobs and Jupyter instances in the project will have the driver attached so that Spark can access it. 1. Go into the Project's settings. 2. Select "Compute configuration". @@ -84,8 +96,10 @@ You can now add the driver file to the default job and Jupyter configuration. Th
Attaching the Redshift Driver to all Jobs and Jupyter Instances of the Project
-Alternatively, you can choose the "From Project" option. You will first have to upload the jar file to the Project using the File Browser. After you have uploaded the jar -file, you can select it using the "From Project" option. To upload the jar file to the Project through the File Browser, see the example below: +Alternatively, you can choose the "From Project" option. +You will first have to upload the jar file to the Project using the File Browser. +After you have uploaded the jar file, you can select it using the "From Project" option. +To upload the jar file to the Project through the File Browser, see the example below: 1. Open File Browser 2. Navigate to "Resources" directory @@ -96,14 +110,10 @@ file, you can select it using the "From Project" option. To upload the jar file
Redshift Driver Upload in the File Browser
- - -!!! tip - If you face network connectivity issues to your Redshift cluster, a common cause could be the cluster database port - not being accessible from outside the Redshift cluster VPC network. A quick and dirty way to enable connectivity is - to [Enable Publicly Accessible](https://aws.amazon.com/premiumsupport/knowledge-center/redshift-cluster-private-public/). - However, in a production setting, you should use [VPC peering](https://docs.aws.amazon.com/vpc/latest/peering/what-is-vpc-peering.html) - or some equivalent mechanism for connecting the clusters. +!!! tip + If you face network connectivity issues to your Redshift cluster, a common cause could be the cluster database port not being accessible from outside the Redshift cluster VPC network. + A quick and dirty way to enable connectivity is to [Enable Publicly Accessible](https://aws.amazon.com/premiumsupport/knowledge-center/redshift-cluster-private-public/). + However, in a production setting, you should use [VPC peering](https://docs.aws.amazon.com/vpc/latest/peering/what-is-vpc-peering.html) or some equivalent mechanism for connecting the clusters. ## Next Steps diff --git a/docs/user_guides/fs/data_source/creation/s3.md b/docs/user_guides/fs/data_source/creation/s3.md index ef88615a3..47d71f7c9 100644 --- a/docs/user_guides/fs/data_source/creation/s3.md +++ b/docs/user_guides/fs/data_source/creation/s3.md @@ -2,27 +2,40 @@ ## Introduction -Amazon S3 or Amazon Simple Storage Service is a service offered by AWS that provides object storage. That means you can store arbitrary objects associated with a key. These kind of storage systems are often used as Data Lakes with large volumes of unstructured data or file based storage. Popular file formats are `CSV` or `PARQUET`. +Amazon S3 or Amazon Simple Storage Service is a service offered by AWS that provides object storage. +That means you can store arbitrary objects associated with a key. +These kinds of storage systems are often used as Data Lakes with large volumes of unstructured data or file based storage. +Popular file formats are `CSV` or `PARQUET`. -There are so called Data Lake House technologies such as Delta Lake or Apache Hudi, building an additional layer on top of object based storage with files, to provide database semantics like ACID transactions among others. This has the advantage that cheap storage can be turned into a cloud native data warehouse. These kind of storages are often the source for raw data from which features can be engineered. +There are so called Data Lake House technologies such as Delta Lake or Apache Hudi, building an additional layer on top of object based storage with files, to provide database semantics like ACID transactions among others. +This has the advantage that cheap storage can be turned into a cloud native data warehouse. +These kinds of storages are often the source for raw data from which features can be engineered. In this guide, you will configure a Data Source in Hopsworks to save all the authentication information needed in order to set up a connection to your AWS S3 bucket. -When you're finished, you'll be able to read files using Spark through HSFS APIs. You can also use the connector to write out training data from the Feature Store, in order to make it accessible by third parties. +When you're finished, you'll be able to read files using Spark through HSFS APIs. +You can also use the connector to write out training data from the Feature Store, in order to make it accessible by third parties. !!! 
note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. +You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information from your AWS S3 account and bucket: -- **Bucket:** You will need a S3 bucket that you have access to. The bucket is identified by its name. +- **Bucket:** You will need a S3 bucket that you have access to. + The bucket is identified by its name. - **Path (Optional):** If needed, a path can be defined to ensure that all operations are restricted to a specific location within the bucket. -- **Region (Optional):** You will need an S3 region to have complete control over data when managing the feature group that relies on this data source. The region is identified by its code. -- **Authentication Method:** You can authenticate using Access Key/Secret, or use IAM roles. If you want to use an IAM role it either needs to be attached to the entire Hopsworks cluster or Hopsworks needs to be able to assume the role. See [IAM role documentation](../../../../setup_installation/admin/roleChaining.md) for more information. -- **Server Side Encryption details:** If your bucket has server side encryption (SSE) enabled, make sure you know which algorithm it is using (AES256 or SSE-KMS). If you are using SSE-KMS, you need the resource ARN of the managed key. +- **Region (Optional):** You will need an S3 region to have complete control over data when managing the feature group that relies on this data source. + The region is identified by its code. +- **Authentication Method:** You can authenticate using Access Key/Secret, or use IAM roles. + If you want to use an IAM role it either needs to be attached to the entire Hopsworks cluster or Hopsworks needs to be able to assume the role. + See [IAM role documentation](../../../../setup_installation/admin/roleChaining.md) for more information. +- **Server Side Encryption details:** If your bucket has server side encryption (SSE) enabled, make sure you know which algorithm it is using (AES256 or SSE-KMS). + If you are using SSE-KMS, you need the resource ARN of the managed key. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -34,7 +47,8 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter Bucket Information -Enter the details for your S3 connector. Start by giving it a **name** and an optional **description**. +Enter the details for your S3 connector. +Start by giving it a **name** and an optional **description**. And set the name of the S3 Bucket you want to point the connector to. Optionally, specify the region if you wish to have a Hopsworks-managed feature group stored using this connector. @@ -46,41 +60,58 @@ Optionally, specify the region if you wish to have a Hopsworks-managed feature g ### Step 3: Configure Authentication #### Instance Role + Choose instance role if you have an EC2 instance profile attached to your Hopsworks cluster nodes with a role which grants you access to the specified bucket. #### Temporary Credentials -Choose temporary credentials if you are using [AWS Role chaining](../../../../setup_installation/admin/roleChaining.md) to control the access permission on a project and user role base. 
Once you have selected *Temporary Credentials* select the role that give access to the specified bucket. For this role to appear in the list it needs to have been configured by an administrator, see the [AWS Role chaining documentation](../../../../setup_installation/admin/roleChaining.md) for more details. + +Choose temporary credentials if you are using [AWS Role chaining](../../../../setup_installation/admin/roleChaining.md) to control the access permissions on a project and user role basis. +Once you have selected *Temporary Credentials*, select the role that gives access to the specified bucket. +For this role to appear in the list, it needs to have been configured by an administrator; see the [AWS Role chaining documentation](../../../../setup_installation/admin/roleChaining.md) for more details. !!! warning "Session Duration" By default, the session duration that the role will be assumed for is 1 hour or 3600 seconds. This means if you want to use the data source for example to write [training data to S3](../usage.md#writing-training-data), the training dataset creation cannot take longer than one hour. - Your administrator can change the default session duration for AWS data sources, by first [increasing the max session duration of the IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html#id_roles_use_view-role-max-session) that you are assuming. And then changing the `fs_data_source_session_duration` [configuration variable](../../../../setup_installation/admin/variables.md) to the appropriate value in seconds. + Your administrator can change the default session duration for AWS data sources, by first [increasing the max session duration of the IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html#id_roles_use_view-role-max-session) that you are assuming. + And then changing the `fs_data_source_session_duration` [configuration variable](../../../../setup_installation/admin/variables.md) to the appropriate value in seconds. #### Access Key/Secret + The simplest authentication method is Access Key/Secret; choose this option to get started quickly if you are able to retrieve the keys using the IAM user administration. ### Step 4: Configure Server Side Encryption + Additionally, you can specify if your Bucket has SSE enabled. #### AES256 -For AES256, there is nothing to do but enabling the encryption by toggling the `AES256` option. This is using S3-Managed Keys, also called [SSE-S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/serv-side-encryption.html). + +For AES256, there is nothing to do but enable the encryption by toggling the `AES256` option. +This is using S3-Managed Keys, also called [SSE-S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/serv-side-encryption.html). #### SSE-KMS -With this option the [encryption key is managed by AWS KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/serv-side-encryption.html), with some additional benefits and charges for using this service. The difference is that you need to provide the resource ARN of the key. + +With this option, the [encryption key is managed by AWS KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/serv-side-encryption.html), with some additional benefits and charges for using this service. +The difference is that you need to provide the resource ARN of the key. If you have SSE-KMS enabled for your bucket, you can find the key ARN in the "Properties" section of the bucket details on AWS.
### Step 5: Add Spark Options (optional) -Here you can specify any additional spark options that you wish to add to the spark context at runtime. Multiple options can be added as key - value pairs. -To connect to a S3 compatiable storage other than AWS S3, you can add the option with key as `fs.s3a.endpoint` and the endpoint you want to use as value. The data source will then be able to read from your specified S3 compatible storage. +Here you can specify any additional Spark options that you wish to add to the Spark context at runtime. +Multiple options can be added as key-value pairs. + +To connect to an S3 compatible storage other than AWS S3, you can add the option with the key `fs.s3a.endpoint` and the endpoint you want to use as the value. +The data source will then be able to read from your specified S3 compatible storage. !!! warning "Spark Configuration" - When using the data source within a Spark application, the credentials are set at application level. This allows users to access multiple buckets with the same data source within the same application (assuming the credentials allow it). - You can disable this behaviour by setting the option `fs.s3a.global-conf` to `False`. If the `global-conf` option is disabled, the credentials are set on a per-bucket basis and users will be able to use the credentials to access data only from the bucket specified in the data source configuration. + When using the data source within a Spark application, the credentials are set at application level. + This allows users to access multiple buckets with the same data source within the same application (assuming the credentials allow it). + You can disable this behaviour by setting the option `fs.s3a.global-conf` to `False`. + If the `global-conf` option is disabled, the credentials are set on a per-bucket basis and users will be able to use the credentials to access data only from the bucket specified in the data source configuration. ### Step 6: Save changes + Click on "Save Credentials". ## Next Steps diff --git a/docs/user_guides/fs/data_source/creation/snowflake.md b/docs/user_guides/fs/data_source/creation/snowflake.md index d2ca19ff5..2ba508135 100644 --- a/docs/user_guides/fs/data_source/creation/snowflake.md +++ b/docs/user_guides/fs/data_source/creation/snowflake.md @@ -2,30 +2,37 @@ ## Introduction -Snowflake provides a cloud-based data storage and analytics service, used as a data warehouse in many enterprises. +Snowflake provides a cloud-based data storage and analytics service, used as a data warehouse in many enterprises. -Data warehouses are often the source of raw data for feature engineering pipelines and Snowflake supports scalable feature computation with SQL. However, Snowflake is not viable as an online feature store that serves features to models in production, with its columnar database layout its latency is too high compared to OLTP databases or key-value stores. +Data warehouses are often the source of raw data for feature engineering pipelines and Snowflake supports scalable feature computation with SQL. +However, Snowflake is not viable as an online feature store that serves features to models in production; with its columnar database layout, its latency is too high compared to OLTP databases or key-value stores. In this guide, you will configure a Data Source in Hopsworks to save all the authentication information needed in order to set up a connection to your Snowflake database. When you're finished, you'll be able to query the database using Spark through HSFS APIs. !!!
note - Currently, it is only possible to create data sources in the Hopsworks UI. You cannot create a data source programmatically. + Currently, it is only possible to create data sources in the Hopsworks UI. + You cannot create a data source programmatically. ## Prerequisites Before you begin this guide you'll need to retrieve the following information from your Snowflake account and database, the following options are **mandatory**: -- **Snowflake Connection URL:** Consult the documentation of your target snowflake account to determine the correct connection URL. This is usually some form of your [Snowflake account identifier](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html). +- **Snowflake Connection URL:** Consult the documentation of your target snowflake account to determine the correct connection URL. + This is usually some form of your [Snowflake account identifier](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html). For example: -``` + +```plaintext .snowflakecomputing.com ``` + OR: -``` + +```plaintext https://-.snowflakecomputing.com ``` -The account and organization details can be viewed in the Snowsight UI under **Admin > Account** or by querying it in + +The account and organization details can be viewed in the Snowsight UI under **Admin > Account** or by querying it in SQL, as explained in [Snowflake documentation](https://docs.snowflake.com/en/user-guide/organizations-gs.html#viewing-the-name-of-your-organization-and-its-accounts). Below is an example of how to view the account and organization to get the account identifier from the Snowsight UI. @@ -37,9 +44,11 @@ Below is an example of how to view the account and organization to get the accou !!! warning "Token-based authentication or password based" The Snowflake data source supports both username and password authentication as well as token-based authentication. - Currently token-based authentication is in beta phase. Users are advised to use username/password and/or create a service account for accessing Snowflake from Hopsworks. + Currently token-based authentication is in beta phase. + Users are advised to use username/password and/or create a service account for accessing Snowflake from Hopsworks. -- **Username and Password:** Login name for the Snowflake user and password. This is often also referred to as `sfUser` and `sfPassword`. +- **Username and Password:** Login name for the Snowflake user and password. + This is often also referred to as `sfUser` and `sfPassword`. - **Warehouse:** The warehouse to use for the session after connecting - **Database:** The database to use for the session after connecting. - **Schema:** The schema to use for the session after connecting. @@ -47,9 +56,11 @@ Below is an example of how to view the account and organization to get the accou These are a few additional **optional** arguments: - **Role:** The role field can be used to specify which [Snowflake security role](https://docs.snowflake.com/en/user-guide/security-access-control-overview.html#system-defined-roles) to assume for the session after the connection is established. -- **Application:** The application field can also be specified to have better observability in Snowflake with regards to which application is running which query. The application field can be a simple string like “Hopsworks” or, for instance, the project name, to track usage and queries from each Hopsworks project. 
+- **Application:** The application field can also be specified to have better observability in Snowflake with regards to which application is running which query. + The application field can be a simple string like “Hopsworks” or, for instance, the project name, to track usage and queries from each Hopsworks project. ## Creation in the UI + ### Step 1: Set up new Data Source Head to the Data Source View on Hopsworks (1) and set up a new data source (2). @@ -61,18 +72,19 @@ Head to the Data Source View on Hopsworks (1) and set up a new data source (2). ### Step 2: Enter Snowflake Settings -Enter the details for your Snowflake connector. Start by giving it a **name** and an optional **description**. +Enter the details for your Snowflake connector. +Start by giving it a **name** and an optional **description**. -1. Select "Snowflake" as storage. -2. Specify the hostname for your account in the following format `.snowflakecomputing.com` -or `https://-.snowflakecomputing.com`. -3. Login name for the Snowflake user. -4. Password for the Snowflake user or Token. -5. The warehouse to connect to. -6. The database to use for the connection. -7. Add any additional optional arguments. For example, you can specify `Schema`, `Table`, `Role`, and `Application`. -8. Optional additional key/value arguments. -9. Click on "Save Credentials". +01. Select "Snowflake" as storage. +02. Specify the hostname for your account in the following format `.snowflakecomputing.com` or `https://-.snowflakecomputing.com`. +03. Login name for the Snowflake user. +04. Password for the Snowflake user or Token. +05. The warehouse to connect to. +06. The database to use for the connection. +07. Add any additional optional arguments. + For example, you can specify `Schema`, `Table`, `Role`, and `Application`. +08. Optional additional key/value arguments. +09. Click on "Save Credentials".
![Snowflake Connector Creation](../../../../assets/images/guides/fs/data_source/snowflake_creation.png) @@ -81,4 +93,4 @@ or `https://-.snowflakecomputing.com`. ## Next Steps -Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created Snowflake connector. \ No newline at end of file +Move on to the [usage guide for data sources](../usage.md) to see how you can use your newly created Snowflake connector. diff --git a/docs/user_guides/fs/data_source/index.md b/docs/user_guides/fs/data_source/index.md index 1758155a5..f9b499d0d 100644 --- a/docs/user_guides/fs/data_source/index.md +++ b/docs/user_guides/fs/data_source/index.md @@ -1,6 +1,8 @@ # Data Source Guides -You can define data sources in Hopsworks for batch and streaming data sources. Data Sources securely store the authentication information about how to connect to an external data store. They can be used from programs within Hopsworks or externally. +You can define data sources in Hopsworks for batch and streaming data sources. +Data Sources securely store the authentication information about how to connect to an external data store. +They can be used from programs within Hopsworks or externally. !!!warning @@ -9,11 +11,15 @@ You can define data sources in Hopsworks for batch and streaming data sources. D There are four main use cases for Data Sources: - Simply use it to read data from the storage into a dataframe. -- [External (on-demand) Feature Groups](../../../concepts/fs/feature_group/external_fg.md) can be defined with data sources. This way, Hopsworks stores only the metadata about the features, but does not keep a copy of the data itself. This is also called the Connector API. +- [External (on-demand) Feature Groups](../../../concepts/fs/feature_group/external_fg.md) can be defined with data sources. + This way, Hopsworks stores only the metadata about the features, but does not keep a copy of the data itself. + This is also called the Connector API. - Write [training data](../../../concepts/fs/feature_view/offline_api.md) to an external storage system to make it accessible by third parties. -- Managed [feature group](../../../user_guides/fs/feature_group/create.md) that stores offline data in an external storage system. Currently [S3](../data_source/creation/s3.md) and [GCS](../data_source/creation/gcs.md) connectors are supported. +- Managed [feature group](../../../user_guides/fs/feature_group/create.md) that stores offline data in an external storage system. + Currently [S3](../data_source/creation/s3.md) and [GCS](../data_source/creation/gcs.md) connectors are supported. -Data Sources provide two main mechanisms for authentication: using credentials or an authentication role (IAM Role on AWS or Managed Identity on Azure). Hopsworks supports both a single IAM role (AWS) or Managed Identity (Azure) for the whole Hopsworks cluster or multiple IAM roles (AWS) or Managed Identities (Azure) that can only be assumed by users with a specific role in a specific project. +Data Sources provide two main mechanisms for authentication: using credentials or an authentication role (IAM Role on AWS or Managed Identity on Azure). +Hopsworks supports both a single IAM role (AWS) or Managed Identity (Azure) for the whole Hopsworks cluster or multiple IAM roles (AWS) or Managed Identities (Azure) that can only be assumed by users with a specific role in a specific project. 
By default, each project is created with three default Data Sources: A JDBC connector to the online feature store, a HopsFS connector to the Training Datasets directory of the project and a JDBC connector to the offline feature store. @@ -54,4 +60,4 @@ For GCP the following storage systems are supported: ## Next Steps -Move on to the [Configuration and Creation Guides](creation/jdbc.md) to learn how to set up a data source. \ No newline at end of file +Move on to the [Configuration and Creation Guides](creation/jdbc.md) to learn how to set up a data source. diff --git a/docs/user_guides/fs/data_source/usage.md b/docs/user_guides/fs/data_source/usage.md index 84fde75db..d17957dbf 100644 --- a/docs/user_guides/fs/data_source/usage.md +++ b/docs/user_guides/fs/data_source/usage.md @@ -1,4 +1,5 @@ # Data Source Usage + Here, we look at how to use a Data Source after it has been created. Data Sources provide an important first step for integrating with external data. The 4 fundamental functionalities where data sources are used are: @@ -11,11 +12,12 @@ The 4 fundamental functionalities where data sources are used are: We will walk through each functionality in the sections below. ## Retrieving a Data Source + We retrieve a data source simply by its unique name. === "PySpark" ```python - import hopsworks + import hopsworks # Connect to the Hopsworks feature store project = hopsworks.login() feature_store = project.get_feature_store() @@ -28,22 +30,20 @@ We retrieve a data source simply by its unique name. import com.logicalclocks.hsfs._ val connection = HopsworksConnection.builder().build(); val featureStore = connection.getFeatureStore(); - // get directly via connector sub-type class e.g. for GCS type + // get directly via connector sub-type class, e.g., for GCS type val connector = featureStore.getGcsConnector("data_source_name") ``` ## Reading a Spark Dataframe from a Data Source One of the most common usages of a Data Source is to read data directly into a Spark Dataframe. -It's achieved via the `read` API of the connector object, which hides all the complexity of authentication and integration -with a data storage source. +It's achieved via the `read` API of the connector object, which hides all the complexity of authentication and integration with a data storage source. The `read` API primarily has two parameters for specifying the data source, `path` and `query`, depending on the data source type. The exact behaviour could change depending on the fdata source type, but broadly they could be classified as below ### Data lake/object based connectors -For data sources based on object/file storage such as AWS S3, ADLS, GCS, we set the full object path in the `path` argument -and users should pass a Spark data format (parquet, csv, orc, hudi, delta) to the `data_format` argument. +For data sources based on object/file storage such as AWS S3, ADLS, GCS, we set the full object path in the `path` argument and users should pass a Spark data format (parquet, csv, orc, hudi, delta) to the `data_format` argument. === "PySpark" ```python @@ -59,21 +59,20 @@ and users should pass a Spark data format (parquet, csv, orc, hudi, delta) to th #### Prepare Spark API -Additionally, for reading file based data sources, another way to read the data is using the `prepare_spark` method. This method -can be used if you are reading the data directly through Spark. +Additionally, for reading file based data sources, another way to read the data is using the `prepare_spark` method. 
+This method can be used if you are reading the data directly through Spark. -Firstly, it handles the setup of all Spark configurations or properties necessary for a particular type of connector and -prepares the absolute path to read from, along with bucket name and the appropriate file scheme of the data source. A Spark session can handle only one configuration setup at a time, so HSFS cannot set the Spark configurations when retrieving the connector since it would lead to only always initialising the last connector being retrieved. -Instead, user can do this setup explicitly with the `prepare_spark` method and therefore potentially -use multiple connectors in one Spark session. `prepare_spark` handles only one bucket associated with that particular connector, however, it is possible to set up multiple connectors with different types as long as their Spark properties do not interfere with each other. +Firstly, it handles the setup of all Spark configurations or properties necessary for a particular type of connector and prepares the absolute path to read from, along with bucket name and the appropriate file scheme of the data source. +A Spark session can handle only one configuration setup at a time, so HSFS cannot set the Spark configurations when retrieving the connector since it would lead to only always initialising the last connector being retrieved. +Instead, user can do this setup explicitly with the `prepare_spark` method and therefore potentially use multiple connectors in one Spark session. `prepare_spark` handles only one bucket associated with that particular connector, however, it is possible to set up multiple connectors with different types as long as their Spark properties do not interfere with each other. So, for example a S3 connector and a Snowflake connector can be used in the same session, without calling `prepare_spark` multiple times, as the properties don’t interfere with each other. If the data source is used in another API call, `prepare_spark` gets implicitly invoked, for example, when a user materialises a training dataset using a data source or uses the data source to set up an External Feature Group. -So users do not need to call `prepare_spark` every time they do an operation with a connector, it is only necessary when reading directly using Spark . Using `prepare_spark` is also -not necessary when using the `read` API. +So users do not need to call `prepare_spark` every time they do an operation with a connector, it is only necessary when reading directly using Spark. +Using `prepare_spark` is also not necessary when using the `read` API. -For example, to read directly from a S3 connector, we use the `prepare_spark` as follows +For example, to read directly from a S3 connector, we use the `prepare_spark` as follows: === "PySpark" ```python @@ -85,9 +84,10 @@ For example, to read directly from a S3 connector, we use the `prepare_spark` as ### Data warehouse/SQL based connectors -For data sources accessed via SQL such as data warehouses and JDBC compliant databases, e.g. Redshift, Snowflake, BigQuery, JDBC, users pass the SQL query to read the data to the `query` -argument. In most cases, this will be some form of a `SELECT` query. Depending on the connector type, users can also just set the table path and read the whole table without explicitly -passing any SQL query to the `query` argument. This is mostly relevant for Google BigQuery. 
+For data sources accessed via SQL such as data warehouses and JDBC compliant databases, e.g., Redshift, Snowflake, BigQuery, JDBC, users pass the SQL query to read the data to the `query` argument. +In most cases, this will be some form of a `SELECT` query. +Depending on the connector type, users can also just set the table path and read the whole table without explicitly passing any SQL query to the `query` argument. +This is mostly relevant for Google BigQuery. === "PySpark" ```python @@ -105,8 +105,7 @@ passing any SQL query to the `query` argument. This is mostly relevant for Googl ### Streaming based connector -For reading data streams, the Kafka Data Source supports reading a Kafka topic into Spark Structured Streaming Dataframes -instead of a static Dataframe as in other connector types. +For reading data streams, the Kafka Data Source supports reading a Kafka topic into Spark Structured Streaming Dataframes instead of a static Dataframe as in other connector types. === "PySpark" @@ -116,19 +115,14 @@ instead of a static Dataframe as in other connector types. ## Creating an External Feature Group -Another important aspect of a data source is its ability to facilitate creation of external feature groups with -the [Connector API](../../../concepts/fs/feature_group/external_fg.md). [External feature groups](../feature_group/create_external.md) are basically offline feature groups -and essentially stored as tables on external data sources. +Another important aspect of a data source is its ability to facilitate creation of external feature groups with the [Connector API](../../../concepts/fs/feature_group/external_fg.md). [External feature groups](../feature_group/create_external.md) are basically offline feature groups and essentially stored as tables on external data sources. The `Connector API` relies on data sources behind the scenes to integrate with external datasource. This enables seamless integration with any data source as long as there is a data source defined. -To create an external feature group, we use the `create_external_feature_group` API, also known as `Connector API`, -and simply pass the data source created before to the `storage_connector` argument. -Depending on the external source, we should set either the `query` argument for data warehouse based sources, or -the `path` and `data_format` arguments for data lake based sources, similar to reading into dataframes as explained in above section. +To create an external feature group, we use the `create_external_feature_group` API, also known as `Connector API`, and simply pass the data source created before to the `storage_connector` argument. +Depending on the external source, we should set either the `query` argument for data warehouse based sources, or the `path` and `data_format` arguments for data lake based sources, similar to reading into dataframes as explained in above section. -Example for any data warehouse/SQL based external sources, we set the desired SQL to `query` argument, and set the `storage_connector` -argument to the data source object of desired data source. +Example for any data warehouse/SQL based external sources, we set the desired SQL to `query` argument, and set the `storage_connector` argument to the data source object of desired data source. === "PySpark" ```python fg = feature_store.create_external_feature_group(name="sales", @@ -141,31 +135,29 @@ argument to the data source object of desired data source. 
) ``` -`Connector API` (external feature groups) only stores the metadata about the features within Hopsworks, -while the actual data is still stored externally. This enables users to create feature groups within Hopsworks without the hassle of data migration. +`Connector API` (external feature groups) only stores the metadata about the features within Hopsworks, while the actual data is still stored externally. +This enables users to create feature groups within Hopsworks without the hassle of data migration. For more information on `Connector API`, read detailed guide about [external feature groups](../feature_group/create_external.md). ## Writing Training Data -Data Sources are also used while writing training data to external sources. While calling the -[Feature View](../../../concepts/fs/feature_view/fv_overview.md) API `create_training_data` , we can pass the `storage_connector` argument which is necessary to materialise -the data to external sources, as shown below. +Data Sources are also used while writing training data to external sources. +While calling the [Feature View](../../../concepts/fs/feature_view/fv_overview.md) API `create_training_data` , we can pass the `storage_connector` argument which is necessary to materialise the data to external sources, as shown below. === "PySpark" ```python # materialise a training dataset version, job = feature_view.create_training_data( description = 'describe training data', - data_format = 'spark_data_format', # e.g. data_format = "parquet" or data_format = "csv" + data_format = 'spark_data_format', # e.g., data_format = "parquet" or data_format = "csv" write_options = {"wait_for_job": False}, storage_connector = connector ) ``` -Read more about training data creation [here](../feature_view/training-data.md). +For a detailed walkthrough on managing and utilizing training data, refer to the [training data guide](../feature_view/training-data.md). ## Next Steps -We have gone through the basic use cases of a data source. -For more details about the API functionality for any specific connector type, -checkout the [API section](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/storage_connector_api/#storage-connector). +We have gone through the basic use cases of a data source. +For more details about the API functionality for any specific connector type, checkout the [API section][hsfs.storage_connector.StorageConnector]. diff --git a/docs/user_guides/fs/feature_group/create.md b/docs/user_guides/fs/feature_group/create.md index b7e25c817..51c630397 100644 --- a/docs/user_guides/fs/feature_group/create.md +++ b/docs/user_guides/fs/feature_group/create.md @@ -4,9 +4,10 @@ description: Documentation on how to create a Feature Group and the different AP # How to create a Feature Group -### Introduction +## Introduction -In this guide you will learn how to create and register a feature group with Hopsworks. This guide covers creating a feature group using the HSFS APIs as well as the user interface. +In this guide you will learn how to create and register a feature group with Hopsworks. +This guide covers creating a feature group using the HSFS APIs as well as the user interface. ## Prerequisites @@ -14,11 +15,13 @@ Before you begin this guide we suggest you read the [Feature Group](../../../con ## Create using the HSFS APIs -To create a feature group using the HSFS APIs, you need to provide a Pandas, Polars or Spark DataFrame. 
The DataFrame will contain all the features you want to register within the feature group, as well as the primary key, event time and partition key. +To create a feature group using the HSFS APIs, you need to provide a Pandas, Polars or Spark DataFrame. +The DataFrame will contain all the features you want to register within the feature group, as well as the primary key, event time and partition key. ### Create a Feature Group -The first step to create a feature group is to create the API metadata object representing a feature group. Using the HSFS API you can execute: +The first step to create a feature group is to create the API metadata object representing a feature group. +Using the HSFS API you can execute: #### Batch Write API @@ -36,44 +39,61 @@ The first step to create a feature group is to create the API metadata object re ) ``` -The full method documentation is available [here](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#featuregroup). If you need to create a feature group with vector similarity search supported, refer to [this guide](../vector_similarity_search.md#extending-feature-groups-with-similarity-search). `name` is the only mandatory parameter of the `create_feature_group` and represents the name of the feature group. +You can read the full [`FeatureStore.create_feature_group`][hsfs.feature_store.FeatureStore.create_feature_group] documentation to get more details. +If you need to create a feature group with vector similarity search supported, refer to [the vector similarity guide](../vector_similarity_search.md#extending-feature-groups-with-similarity-search). +`name` is the only mandatory parameter of the `create_feature_group` and represents the name of the feature group. In the example above we created the first version of a feature group named *weather*, we provide a description to make it searchable to the other project members, as well as making the feature group available online. -Additionally we specify which columns of the DataFrame will be used as primary key, partition key and event time. Composite primary key and multi level partitioning is also supported. +Additionally we specify which columns of the DataFrame will be used as primary key, partition key and event time. +Composite primary key and multi level partitioning is also supported. The version number is optional, if you don't specify the version number the APIs will create a new version by default with a version number equals to the highest existing version number plus one. -The last parameter used in the examples above is `stream`. The `stream` parameter controls whether to enable the streaming write APIs to the online and offline feature store. When using the APIs in a Python environment this behavior is the default and it requires the time travel format to be set to 'HUDI'. +The last parameter used in the examples above is `stream`. +The `stream` parameter controls whether to enable the streaming write APIs to the online and offline feature store. +When using the APIs in a Python environment this behavior is the default and it requires the time travel format to be set to 'HUDI'. ##### Primary key -A primary key is required when using the default table format (Hudi) to store offline feature data. When inserting data in a feature group on the offline feature store, the DataFrame you are writing is checked against the existing data in the feature group. If a row with the same primary key is found in the feature group, the row will be updated. 
If the primary key is not found, the row is appended to the feature group. +A primary key is required when using the default table format (Hudi) to store offline feature data. +When inserting data in a feature group on the offline feature store, the DataFrame you are writing is checked against the existing data in the feature group. +If a row with the same primary key is found in the feature group, the row will be updated. +If the primary key is not found, the row is appended to the feature group. When writing data on the online feature store, existing rows with the same primary key will be overwritten by new rows with the same primary key. ##### Event time -The event time column represents the time at which the event was generated. For example, with transaction data, the event time is the time at which a given transaction happened. -In the context of feature pipelines, the event time is often also the end timestamp of the interval of events included in the feature computation. For example, computing the feature "number of purchases by customer last week", the event time should be the last day of this "last week" window. +The event time column represents the time at which the event was generated. +For example, with transaction data, the event time is the time at which a given transaction happened. +In the context of feature pipelines, the event time is often also the end timestamp of the interval of events included in the feature computation. +For example, computing the feature "number of purchases by customer last week", the event time should be the last day of this "last week" window. -The event time is added to the primary key when writing to the offline feature store. This will make sure that the offline feature store has the entire history of feature values over time. As an example, if a user has made multiple purchases on a website, each of the purchases for a given user (identified by a user_id) will be saved in the feature group, with each purchase having a different event time (the combination of user_id and event_time makes up the primary key for the offline feature store). +The event time is added to the primary key when writing to the offline feature store. +This will make sure that the offline feature store has the entire history of feature values over time. +As an example, if a user has made multiple purchases on a website, each of the purchases for a given user (identified by a user_id) will be saved in the feature group, with each purchase having a different event time (the combination of user_id and event_time makes up the primary key for the offline feature store). -The event time **is not** part of the primary key when writing to the online feature store. This will ensure that the online feature store has the most recent version of the feature vector for each primary key. +The event time **is not** part of the primary key when writing to the online feature store. +This will ensure that the online feature store has the most recent version of the feature vector for each primary key. !!!note "Event time data type restriction" The supported data types for the event time column are: `timestamp`, `date` and `bigint`. ##### Partition key -It is best practice to add a partition key. When you specify a partition key, the data in the feature group will be stored under multiple directories based on the value of the partition column(s). +It is best practice to add a partition key. 
+When you specify a partition key, the data in the feature group will be stored under multiple directories based on the value of the partition column(s). All the rows with a given value as partition key will be stored in the same directory. Choosing the correct partition key has significant impact on the query performance as the execution engine (Spark) will be able to skip listing and reading files belonging to partitions which are not included in the query. As an example, if you have partitioned your feature group by day and you are creating a training dataset that includes only the last year of data, Spark will read only 365 partitions and not the entire history of data. -On the other hand, if the partition key is too fine grained (e.g. timestamp at millisecond resolution) - a large number of small partitions will be generated. This will slow down query execution as Spark will need to list and read a large amount of small directories/files. +On the other hand, if the partition key is too fine grained (e.g., timestamp at millisecond resolution), a large number of small partitions will be generated. +This will slow down query execution as Spark will need to list and read a large number of small directories/files. If you do not provide a partition key, all the feature data will be stored as files in a single directory. -The system has a limit of 10240 direct children (files or other subdirectories) per directory. This means that, as you add new data to a non-partitioned feature group, new files will be created and you might reach the limit. If you do reach the limit, your feature engineering pipeline will fail with the following error: +The system has a limit of 10240 direct children (files or other subdirectories) per directory. +This means that, as you add new data to a non-partitioned feature group, new files will be created and you might reach the limit. +If you do reach the limit, your feature engineering pipeline will fail with the following error: ```sh MaxDirectoryItemsExceededException - The directory item limit is exceeded: limit=10240 items=10240 @@ -83,19 +103,23 @@ By using partitioning the system will write the feature data in different subdir ##### Table format -When you create a feature group, you can specify the table format you want to use to store the data in your feature group by setting the `time_travel_format` parameter. The currently support values are "HUDI", "DELTA", "NONE" (which defaults to Parquet). +When you create a feature group, you can specify the table format you want to use to store the data in your feature group by setting the `time_travel_format` parameter. +The currently supported values are "HUDI", "DELTA" and "NONE" (which defaults to Parquet). ##### Data Source -During the creation of a feature group, it is possible to define the `storage_connector` parameter, this allows for management of offline data in the desired table format outside the Hopsworks cluster. Currently, [S3](../data_source/creation/s3.md) and [GCS](../data_source/creation/gcs.md) connectors and "DELTA" `time_travel_format` format is supported. +During the creation of a feature group, it is possible to define the `storage_connector` parameter, which allows offline data to be managed in the desired table format outside the Hopsworks cluster. +Currently, the [S3](../data_source/creation/s3.md) and [GCS](../data_source/creation/gcs.md) connectors and the "DELTA" `time_travel_format` are supported.
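To make the `storage_connector` option concrete, here is a minimal, hypothetical sketch (the connector name, feature names and keys are illustrative, not taken from this guide) of creating a feature group whose offline data is kept in Delta format on an external bucket:

```python
# Assumes an existing S3 data source named "s3_ds" is registered in the project
s3_connector = feature_store.get_storage_connector("s3_ds")

fg = feature_store.create_feature_group(
    name="weather",
    version=1,
    primary_key=["city_id"],
    event_time="observation_time",
    online_enabled=True,
    time_travel_format="DELTA",      # currently the supported table format for external offline storage
    storage_connector=s3_connector,  # offline data is written to the connector's bucket
)
fg.insert(df)  # df is the Pandas/Polars/Spark DataFrame to register
```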
##### Online Table Configuration -When defining online-enabled feature groups it is also possible to configure the online table. You can specify [table options](https://docs.rondb.com/table_options/#table-options) by providing comments. Additionally, it is also possible to define whether online data is stored in memory or on disk using [table space](https://docs.rondb.com/disk_columns/#disk-columns). +When defining online-enabled feature groups it is also possible to configure the online table. +You can specify [table options](https://docs.rondb.com/table_options/#table-options) by providing comments. +Additionally, it is also possible to define whether online data is stored in memory or on disk using [table space](https://docs.rondb.com/disk_columns/#disk-columns). The code example shows the creation of an online-enabled feature group that stores online data on disk using `ts_1` table space and sets several table properties in the comment section. -``` +```python fg = fs.create_feature_group( name='air_quality', description='Air Quality characteristics of each day', @@ -107,7 +131,8 @@ fg = fs.create_feature_group( ``` !!! note Table Space - The table space needs to be provisioned at system level before it can be used. You can do so by adding the following parameters to the values.yaml file used for your deployment with the Helm Charts: + The table space needs to be provisioned at system level before it can be used. + You can do so by adding the following parameters to the values.yaml file used for your deployment with the Helm Charts: ```yaml rondb: @@ -117,8 +142,6 @@ fg = fs.create_feature_group( diskColumnGiB: 2 ``` - - #### Streaming Write API As explained above, the stream parameter controls whether to enable the streaming write APIs to the online and offline feature store. @@ -153,8 +176,11 @@ For Python environments, only the stream API is supported (stream=True). ) ``` -When using the streaming API, the data will be written directly to the online storage (if `online_enabled=True`). However, you can control when the sync to -the offline storage is going to happen. You can do it synchronously after every call to `fg.insert()`, which is the default. Often, you defer writes to a later point in order to batch together multiple writes to the offline storage (useful to reduce the overhead of many small writes): +When using the streaming API, the data will be written directly to the online storage (if `online_enabled=True`). +However, you can control when the sync to +the offline storage is going to happen. +You can do it synchronously after every call to `fg.insert()`, which is the default. +Often, you defer writes to a later point in order to batch together multiple writes to the offline storage (useful to reduce the overhead of many small writes): ```python # run multiple inserts without starting the offline materialization job @@ -186,9 +212,11 @@ Four main considerations influence the write and the query performance: ##### Partitioning on a feature group level -**Partitioning on the feature group level** allows Hopsworks and the table format (Hudi or Delta) to push down filters to the filesystem when reading from feature groups. In practice that means, less directories need to be listed and less files need to be read, speeding up queries. +**Partitioning on the feature group level** allows Hopsworks and the table format (Hudi or Delta) to push down filters to the filesystem when reading from feature groups. 
+In practice that means, less directories need to be listed and less files need to be read, speeding up queries. For example, most commonly, filtering is done on the event time column of a feature group when generating training data or batches of data: + ```python query = fg.select_all() @@ -222,13 +250,16 @@ list and read the files in the directories of those six months that are being qu A good practice are partition keys with at most daily granularity, if they are based on time. Additionally, one can look at the size of a partition directory, which should be in the 100s of MB. -Additionally, if you are commonly training models for different categories of your data, you can add another level of partitioning for this. That is, if the query contains +Additionally, if you are commonly training models for different categories of your data, you can add another level of partitioning for this. +That is, if the query contains an additional filter: + ```python query = fg.select_all().filter(fg.country_code == "US") ``` The feature group can be created with the following partition key in order to push down filters also for the `country_code` category: + ```python fg = feature_store.create_feature_group(... partition_key=['day', 'country_code'], @@ -267,7 +298,8 @@ If the inserted Dataframe contains multiple feature group partitions, the parque In practice that means the shuffle parallelism should be set equal to the number of feature group partitions in the inserted dataframe. This will create one parquet file per feature group partition, which in many cases is optimal. - Theoretically, this rule holds up to a partition size of 2GB, which is the limit of Spark. However, one should bump this up accordingly already for smaller inputs. + Theoretically, this rule holds up to a partition size of 2GB, which is the limit of Spark. + However, one should bump this up accordingly already for smaller inputs. We recommend having shuffle parallelism `hoodie.[insert|upsert|bulkinsert].shuffle.parallelism` such that its at least input_data_size/500MB. You can change the write options on every insert, depending also on the size of the data you are writing: @@ -282,10 +314,9 @@ If the inserted Dataframe contains multiple feature group partitions, the parque ##### Backfilling of feature group partitions -Hudi scales well with the number of partitions to write, when performing backfilling of old feature partitions, meaning moving backwards in time with the event-time, -it makes sense to **batch those feature group partitions** together into a single `fg.insert()` call. As shown in the figure above, the number of utilised executors you choose for the insert -depends highly on the number of partitions and shuffle parallelism you are writing. So by writing multiple feature group partitions in a single insert, you can scale up your Spark application -and fully utilise the workers. +Hudi scales well with the number of partitions to write, when performing backfilling of old feature partitions, meaning moving backwards in time with the event-time, it makes sense to **batch those feature group partitions** together into a single `fg.insert()` call. +As shown in the figure above, the number of utilised executors you choose for the insert depends highly on the number of partitions and shuffle parallelism you are writing. +So by writing multiple feature group partitions in a single insert, you can scale up your Spark application and fully utilise the workers. 
In that case you can increase the Hudi shuffle parallelism accordingly. !!! danger "Concurrent feature group inserts" @@ -300,19 +331,21 @@ In that case you can increase the Hudi shuffle parallelism accordingly. ##### The choice of topic for data ingestion When creating a feature group that uses streaming write APIs for data ingestion it is possible to define the Kafka topics that should be utilized. -The default approach of using a project-wide topic functions great for use cases involving little to no overlap when producing data. However, -concurrently inserting into multiple feature groups could cause read amplification for the offline materialization job (e.g., Hudi Delta Streamer). Therefore, it is -advised to utilize separate topics when ingestions overlap or there is a large frequently running insertion into a specific feature group. +The default approach of using a project-wide topic works well for use cases involving little to no overlap when producing data. +However, concurrently inserting into multiple feature groups could cause read amplification for the offline materialization job (e.g., Hudi Delta Streamer). +Therefore, it is advised to utilize separate topics when ingestions overlap or there is a large, frequently running insertion into a specific feature group. ### Register the metadata and save the feature data -The snippet above only created the metadata object on the Python interpreter running the code. To register the feature group metadata and to save the feature data with Hopsworks, you should invoke the `insert` method: +The snippet above only created the metadata object on the Python interpreter running the code. +To register the feature group metadata and to save the feature data with Hopsworks, you should invoke the `insert` method: ```python fg.insert(df) ``` -The save method takes in input a Pandas, Polars or Spark DataFrame. HSFS will use the DataFrame columns and types to determine the name and types of features, primary key, partition key and event time. +The `insert` method takes as input a Pandas, Polars or Spark DataFrame. +HSFS will use the DataFrame columns and types to determine the name and types of features, primary key, partition key and event time. The DataFrame *must* contain the columns specified as primary keys, partition key and event time in the `create_feature_group` call. @@ -320,11 +353,12 @@ If a feature group is online enabled, the `insert` method will store the feature ### API Reference -[FeatureGroup](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#featuregroup) +[`FeatureGroup`][hsfs.feature_group.FeatureGroup] ## Create using the UI -You can also create a new feature group through the UI. For this, navigate to the `Feature Groups` section and press the `Create` button at the top-right corner. +You can also create a new feature group through the UI. +For this, navigate to the `Feature Groups` section and press the `Create` button at the top-right corner.

@@ -332,7 +366,9 @@ You can also create a new feature group through the UI. For this, navigate to th

-Subsequently, you will be able to define its properties (such as name, mode, features, and more). Refer to the documentation above for an explanation of the parameters available, they are the same as when you create a feature group using the SDK. Finally, complete the creation by clicking `Create New Feature Group` at the bottom of the page. +Subsequently, you will be able to define its properties (such as name, mode, features, and more). +Refer to the documentation above for an explanation of the parameters available, they are the same as when you create a feature group using the SDK. +Finally, complete the creation by clicking `Create New Feature Group` at the bottom of the page.

diff --git a/docs/user_guides/fs/feature_group/create_external.md b/docs/user_guides/fs/feature_group/create_external.md index ac29c6e34..e7f7d36e4 100644 --- a/docs/user_guides/fs/feature_group/create_external.md +++ b/docs/user_guides/fs/feature_group/create_external.md @@ -4,9 +4,10 @@ description: Documentation on how to create an external feature group in Hopswor # How to create an External Feature Group -### Introduction +## Introduction -In this guide you will learn how to create and register an external feature group with Hopsworks. This guide covers creating an external feature group using the HSFS APIs as well as the user interface. +In this guide you will learn how to create and register an external feature group with Hopsworks. +This guide covers creating an external feature group using the HSFS APIs as well as the user interface. ## Prerequisites @@ -26,7 +27,8 @@ To create an external feature group using the HSFS APIs you need to provide an e ### Create an External Feature Group -The first step is to instantiate the metadata through the `create_external_feature_group` method. Once you have defined the metadata, you can +The first step is to instantiate the metadata through the `create_external_feature_group` method. +Once you have defined the metadata, you can [persist the metadata and create the feature group](#register-the-metadata) in Hopsworks by calling `fg.save()`. #### SQL based external feature group @@ -75,17 +77,21 @@ The first step is to instantiate the metadata through the `create_external_featu fg.save() ``` -The full method documentation is available [here](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/external_feature_group_api/#externalfeaturegroup). `name` is a mandatory parameter of the `create_external_feature_group` and represents the name of the feature group. +You can read the full [`FeatureStore.create_external_feature_group`][hsfs.feature_store.FeatureStore.create_external_feature_group] documentation for more details. +`name` is a mandatory parameter of the `create_external_feature_group` and represents the name of the feature group. The version number is optional, if you don't specify the version number the APIs will create a new version by default with a version number equals to the highest existing version number plus one. -If the data source is defined for a data warehouse (e.g. JDBC, Snowflake, Redshift) you need to provide a SQL statement that will be executed to compute the features. If the data source is defined for a data lake, the location of the data as well as the format need to be provided. +If the data source is defined for a data warehouse (e.g., JDBC, Snowflake, Redshift) you need to provide a SQL statement that will be executed to compute the features. +If the data source is defined for a data lake, the location of the data as well as the format need to be provided. -Additionally we specify which columns of the DataFrame will be used as primary key, and event time. Composite primary keys are also supported. +Additionally we specify which columns of the DataFrame will be used as primary key, and event time. +Composite primary keys are also supported. ### Register the metadata -In the snippet above it's important that the created metadata object gets registered in Hopsworks. To do so, you should invoke the `save` method: +In the snippet above it's important that the created metadata object gets registered in Hopsworks. 
+To do so, you should invoke the `save` method: === "Python" @@ -95,7 +101,8 @@ In the snippet above it's important that the created metadata object gets regist ### Enable online storage -You can enable online storage for external feature groups, however, the sync from the external storage to Hopsworks online storage is not automatic and needs to be setup manually. For an external feature group to be available online, during the creation of the feature group, the `online_enabled` option needs to be set to `True`. +You can enable online storage for external feature groups; however, the sync from the external storage to Hopsworks online storage is not automatic and needs to be set up manually. +For an external feature group to be available online, during the creation of the feature group, the `online_enabled` option needs to be set to `True`. === "Python" @@ -118,23 +125,24 @@ You can enable online storage for external feature gro external_fg.insert(df) ``` -The `insert()` method takes a DataFrame as parameter and writes it _only_ to the online feature store. Users can select which subset of the feature group data they want to make available on the online feature store by using the [query APIs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/query_api/). +The `insert()` method takes a DataFrame as a parameter and writes it _only_ to the online feature store. +Users can select which subset of the feature group data they want to make available on the online feature store by using the [query APIs][hsfs.constructor.query.Query]. ### Limitations Hopsworks Feature Store does not support time-travel queries on external feature groups. Additionally, support for the `.read()` and `.show()` methods when using the Python engine is limited to external feature groups defined on BigQuery and Snowflake and only when using the [Feature Query Service](../../../setup_installation/common/arrow_flight_duckdb.md). -Nevertheless, external feature groups defined top of any data source can be used to create a training dataset from a Python environment invoking one of the following methods: [create_training_data](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#create_training_data), [create_train_test_split](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#create_train_test_split) or the [create_train_validation_test_split](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#create_train_validation_test_split) - +Nevertheless, external feature groups defined on top of any data source can be used to create a training dataset from a Python environment by invoking one of the following methods: [`FeatureView.create_training_data`][hsfs.feature_view.FeatureView.create_training_data], [`FeatureView.create_train_test_split`][hsfs.feature_view.FeatureView.create_train_test_split] or [`FeatureView.create_train_validation_test_split`][hsfs.feature_view.FeatureView.create_train_validation_test_split]. ### API Reference -[External FeatureGroup](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/external_feature_group_api/#externalfeaturegroup) +[`ExternalFeatureGroup`][hsfs.feature_group.ExternalFeatureGroup] ## Create using the UI -You can also create a new feature group through the UI. 
For this, navigate to the `Data Source` section and make sure you have you have available Data Source for the desired platform or create [new](../data_source/index.md). +You can also create a new feature group through the UI. +For this, navigate to the `Data Source` section and make sure you have an available Data Source for the desired platform or create a [new one](../data_source/index.md).

@@ -150,9 +158,11 @@ To create a feature group, proceed by clicking `Next: Select Tables` once all of

-The database navigation structure depends on your specific data source. You'll navigate through the appropriate hierarchy for your platform—such as Database → Schema → Table for Snowflake, or Project → Dataset → Table for BigQuery. +The database navigation structure depends on your specific data source. +You'll navigate through the appropriate hierarchy for your platform—such as Database → Schema → Table for Snowflake, or Project → Dataset → Table for BigQuery. -In the UI you can select one or more tables, for each selected table, you must designate one or more columns as primary keys before proceeding. You can also optionally select a single column as a timestamp for the row (supported types are timestamp, date and bigint), edit names and data types of individual columns you want to include. +In the UI you can select one or more tables; for each selected table, you must designate one or more columns as primary keys before proceeding. +You can also optionally select a single column as a timestamp for the row (supported types are timestamp, date and bigint) and edit the names and data types of the individual columns you want to include.

@@ -160,7 +170,8 @@ In the UI you can select one or more tables, for each selected table, you must d

-Complete the creation by clicking `Next: Review Configuration` at the bottom of the page. As the last step, you will be able to rename the feature groups and confirm their creation. +Complete the creation by clicking `Next: Review Configuration` at the bottom of the page. +As the last step, you will be able to rename the feature groups and confirm their creation.

diff --git a/docs/user_guides/fs/feature_group/create_spine.md b/docs/user_guides/fs/feature_group/create_spine.md index be8382bcb..0efba91c2 100644 --- a/docs/user_guides/fs/feature_group/create_spine.md +++ b/docs/user_guides/fs/feature_group/create_spine.md @@ -4,7 +4,7 @@ description: Documentation on how to create Spine Group in Hopsworks and the dif # How to create Spine Group -### Introduction +## Introduction In this guide you will learn how to create and register a Spine Group with Hopsworks. @@ -16,7 +16,8 @@ Before you begin this guide we suggest you read the [Spine Group](../../../conce ### Create a Spine Group -Instead of using a feature group to save the label, you can also use a spine to use a Dataframe containing the labels on the fly. A spine is essentially a metadata object similar to a Feature Group, which tells the feature store the relevant event time column and primary key columns to perform point-in-time correct joins. +Instead of using a feature group to save the label, you can also use a spine to use a Dataframe containing the labels on the fly. +A spine is essentially a metadata object similar to a Feature Group, which tells the feature store the relevant event time column and primary key columns to perform point-in-time correct joins. Additionally, apart from primary key and event time information, a Spark dataframe is required in order to infer the schema of the group from. @@ -41,7 +42,8 @@ Once created, note that you can inspect the dataframe in the Spine Group: trans_spine.dataframe.show() ``` -And you can always also replace the dataframe contained within the Spine Group. You just need to make sure it has the same schema. +And you can always also replace the dataframe contained within the Spine Group. +You just need to make sure it has the same schema. === "Python" @@ -53,8 +55,9 @@ And you can always also replace the dataframe contained within the Spine Group. !!! warning "Python support" - Currently the HSFS library does not support usage of Spine Groups for training data creation or batch data retrieval in the Python engine. However, it is supported to create Spine Groups from the Python engine. + Currently the HSFS library does not support usage of Spine Groups for training data creation or batch data retrieval in the Python engine. + However, it is supported to create Spine Groups from the Python engine. ### API Reference -[SpineGroup](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/spine_group_api/#spinegroup) +[`SpineGroup`][hsfs.feature_group.SpineGroup]. diff --git a/docs/user_guides/fs/feature_group/data_types.md b/docs/user_guides/fs/feature_group/data_types.md index e0796d16c..84454771c 100644 --- a/docs/user_guides/fs/feature_group/data_types.md +++ b/docs/user_guides/fs/feature_group/data_types.md @@ -1,6 +1,6 @@ # How to manage schema and feature data types -### Introduction +## Introduction In this guide, you will learn how to manage the feature group schema and control the data type of the features in a feature group. @@ -13,12 +13,13 @@ We also suggest you familiarize yourself with the APIs to [create a feature grou When a feature is stored in both the online and offline feature stores, it will be stored in a data type native to each store. -* **[Offline data type](#offline-data-types)**: The data type of the feature when stored on the offline feature store. 
The offline feature store is based on Apache Hudi and Hive Metastore, as such, - [Hive Data Types](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types) can be leveraged. -* **[Online data type](#online-data-types)**: The data type of the feature when stored on the online feature store. The online storage is based on RonDB and hence, - [MySQL Data Types](https://dev.mysql.com/doc/refman/8.0/en/data-types.html) can be leveraged. +- **[Offline data type](#offline-data-types)**: The data type of the feature when stored on the offline feature store. + The offline feature store is based on Apache Hudi and Hive Metastore, as such, [Hive Data Types](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types) can be leveraged. +- **[Online data type](#online-data-types)**: The data type of the feature when stored on the online feature store. + The online storage is based on RonDB and hence, [MySQL Data Types](https://dev.mysql.com/doc/refman/8.0/en/data-types.html) can be leveraged. -The offline data type is always required, even if the feature group is stored only online. On the other hand, if the feature group is not *online_enabled*, its features will not have an online data type. +The offline data type is always required, even if the feature group is stored only online. +On the other hand, if the feature group is not *online_enabled*, its features will not have an online data type. The offline and online types for each feature are automatically inferred from the Spark or Pandas types of the input DataFrame as outlined in the following two sections. The default mapping, however, can be overwritten by using an [explicit schema definition](#explicit-schema-definition). @@ -89,10 +90,12 @@ More on how Hopsworks handles [string types](#string-online-data-types), [compl #### String online data types -String types are stored as *VARCHAR(100)* by default. This type is fixed-size, meaning it can only hold as many characters as specified in the argument (e.g. VARCHAR(100) can hold up to 100 unicode characters). -The size should thus be within the maximum string length of the input data. Furthermore, the VARCHAR size has to be in line with the [online restrictions for row size](#online-restrictions-for-row-size). +String types are stored as *VARCHAR(100)* by default. +This type is fixed-size, meaning it can only hold as many characters as specified in the argument (e.g., VARCHAR(100) can hold up to 100 unicode characters). +The size should thus be within the maximum string length of the input data. +Furthermore, the VARCHAR size has to be in line with the [online restrictions for row size](#online-restrictions-for-row-size). -If the string size exceeds 100 characters, a larger type (e.g. VARCHAR(500)) can be specified via an [explicit schema definition](#explicit-schema-definition). +If the string size exceeds 100 characters, a larger type (e.g., VARCHAR(500)) can be specified via an [explicit schema definition](#explicit-schema-definition). If the string size is unknown or if it exceeds the maximum row size, then the [TEXT type](https://docs.rondb.com/blobs/) can be used instead. String data that exceeds the specified VARCHAR size will lead to an error when data gets written to the online feature store. @@ -100,15 +103,16 @@ When in doubt, use the TEXT type instead, but note that it comes with a potentia #### Complex online data types -Hopsworks allows users to store complex types (e.g. *ARRAY*) in the online feature store. 
Hopsworks serializes the complex features transparently and stores them as VARBINARY in the online feature store. -The serialization happens when calling the [save()](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#save), -[insert()](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#insert) or [insert_stream()](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#insert_stream) methods. -The deserialization will be executed when calling the [get_serving_vector()](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/training_dataset_api/#get_serving_vector) method to retrieve data from the online feature store. +Hopsworks allows users to store complex types (e.g. *ARRAY*) in the online feature store. +Hopsworks serializes the complex features transparently and stores them as VARBINARY in the online feature store. +The serialization happens when calling the [`FeatureGroup.save`][hsfs.feature_group.FeatureGroup.save], +[`FeatureGroup.insert`][hsfs.feature_group.FeatureGroup.insert] or [`FeatureGroup.insert_stream`][hsfs.feature_group.FeatureGroup.insert_stream] methods. +The deserialization will be executed when calling the [`TrainingDataset.get_serving_vector`][hsfs.training_dataset.TrainingDataset.get_serving_vector] method to retrieve data from the online feature store. If users query directly the online feature store, for instance using the `fs.sql("SELECT ...", online=True)` statement, it will return a binary blob. On the feature store UI, the online feature type for complex features will be reported as *VARBINARY*. -If the binary size exceeds 100 bytes, a larger type (e.g. VARBINARY(500)) can be specified via an [explicit schema definition](#explicit-schema-definition). +If the binary size exceeds 100 bytes, a larger type (e.g., VARBINARY(500)) can be specified via an [explicit schema definition](#explicit-schema-definition). If the binary size is unknown of if it exceeds the maximum row size, then the [BLOB type](https://docs.rondb.com/blobs/) can be used instead. Binary data that exceeds the specified VARBINARY size will lead to an error when data gets written to the online feature store. @@ -120,7 +124,7 @@ When a feature is being used as a primary key, certain types are not allowed. Examples of such types are *FLOAT*, *DOUBLE*, *TEXT* and *BLOB*. Additionally, the size of the sum of the primary key online data types storage requirements **should not exceed 4KB**. -#### Online restrictions for row size +#### Online restrictions for row size The online feature store supports **up to 500 columns** and all column types combined **should not exceed 30000 Bytes**. The byte size of each column is determined by its data type and calculated as follows: @@ -145,9 +149,11 @@ The byte size of each column is determined by its data type and calculated as fo !!! note "VARCHAR / VARBINARY overhead" - For VARCHAR and VARBINARY data types, an additional 1 byte is required if the size is less than 256 bytes. If the size is 256 bytes or greater, 2 additional bytes are required. + For VARCHAR and VARBINARY data types, an additional 1 byte is required if the size is less than 256 bytes. + If the size is 256 bytes or greater, 2 additional bytes are required. - Memory allocation is performed in groups of 4 bytes. For example, a VARBINARY(100) requires 104 bytes of memory: + Memory allocation is performed in groups of 4 bytes. 
+ For example, a VARBINARY(100) requires 104 bytes of memory: - 100 bytes for the data itself - 1 byte of overhead @@ -155,114 +161,121 @@ The byte size of each column is determined by its data type and calculated as fo Since memory is allocated in 4-byte groups, storing 101 bytes requires 26 groups (26 × 4 = 104 bytes) of allocated memory. - #### Pre-insert schema validation for online feature groups -For online enabled feature groups, the dataframe to be ingested needs to adhere to the online schema definitions. The input dataframe is validated for schema checks accordingly. + +For online enabled feature groups, the dataframe to be ingested needs to adhere to the online schema definitions. +The input dataframe is validated for schema checks accordingly. The validation is enabled by default and can be disabled by setting below key word argument when calling `insert()` === "Python" ```python feature_group.insert(df, validation_options={'online_schema_validation':False}) ``` -The most important validation checks or error messages are mentioned below along with possible corrective actions. +The most important validation checks or error messages are mentioned below along with possible corrective actions. -1. Primary key contains null values +01. Primary key contains null values - **Rule** Primary key column should not contain any null values. - - **Example correction** Drop the rows containing null primary keys. Alternatively, find the null values and assign them an unique value as per preferred strategy for data imputation. - - === "Pandas" - ```python - # Drop rows: assuming 'id' is the primary key column - df = df.dropna(subset=['id']) - # For composite keys - df = df.dropna(subset=['id1', 'id2']) - - # Data imputation: replace null values with incrementing last interger id - # existing max id - max_id = df['id'].max() - # counter to generate new id - next_id = max_id + 1 - # for each null id, assign the next id incrementally - for idx in df[df['id'].isna()].index: - df.loc[idx, 'id'] = next_id - next_id += 1 - ``` - -2. Primary key column missing + - **Example correction** Drop the rows containing null primary keys. + Alternatively, find the null values and assign them an unique value as per preferred strategy for data imputation. + + === "Pandas" + ```python + # Drop rows: assuming 'id' is the primary key column + df = df.dropna(subset=['id']) + # For composite keys + df = df.dropna(subset=['id1', 'id2']) + + # Data imputation: replace null values with incrementing last interger id + # existing max id + max_id = df['id'].max() + # counter to generate new id + next_id = max_id + 1 + # for each null id, assign the next id incrementally + for idx in df[df['id'].isna()].index: + df.loc[idx, 'id'] = next_id + next_id += 1 + ``` + +02. Primary key column missing - **Rule** The dataframe to be inserted must contain all the columns defined as primary key(s) in the feature group. - **Example correction** Add all the primary key columns in the dataframe. - - === "Pandas" - ```python - # increamenting primary key upto the length of dataframe - df['id'] = range(1, len(df) + 1) - ``` -3. String length exceeded + === "Pandas" + ```python + # increamenting primary key upto the length of dataframe + df['id'] = range(1, len(df) + 1) + ``` - - **Rule** The character length of a string should be within the maximum length capacity in the online schema type of a feature. 
If the feature group is not created and explicit feature schema was not provided, the limit will be auto-increased to the maximum length found in a string column in the dataframe. +03. String length exceeded + + - **Rule** The character length of a string should be within the maximum length capacity in the online schema type of a feature. + If the feature group is not created and explicit feature schema was not provided, the limit will be auto-increased to the maximum length found in a string column in the dataframe. - **Example correction** - - - Trim the string values to fit within maximum limit set during feature group creation. - - === "Pandas" - ```python - max_length = 100 - df['text_column'] = df['text_column'].str.slice(0, max_length) - ``` - - - Another option is to simply [create new version of the feature group](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_or_create_feature_group) and insert the dataframe. - - - !!!note - The total row size limit should be less than 30kb as per [row size restrictions](#online-restrictions-for-row-size). In such cases it is possible to define the feature as **TEXT** or **BLOB**. - Below is an example of explicitly defining the string column as TEXT as online type. - - === "Pandas" - ```python - import pandas as pd - # example dummy dataframe with the string column - df = pd.DataFrame(columns=['id', 'string_col']) - from hsfs.feature import Feature - features = [ - Feature(name="id",type="bigint",online_type="bigint"), - Feature(name="string_col",type="string",online_type="text") - ] - - fg = fs.get_or_create_feature_group(name="fg_manual_text_schema", - version=1, - features=features, - online_enabled=True, - primary_key=['id']) - fg.insert(df) - ``` + + - Trim the string values to fit within maximum limit set during feature group creation. + + === "Pandas" + ```python + max_length = 100 + df['text_column'] = df['text_column'].str.slice(0, max_length) + ``` + + - Another option is to simply [create new version of the feature group][hsfs.feature_store.FeatureStore.get_or_create_feature_group] and insert the dataframe. + + !!!note + The total row size limit should be less than 30kb as per [row size restrictions](#online-restrictions-for-row-size). + In such cases it is possible to define the feature as **TEXT** or **BLOB**. + Below is an example of explicitly defining the string column as TEXT as online type. + + === "Pandas" + ```python + import pandas as pd + # example dummy dataframe with the string column + df = pd.DataFrame(columns=['id', 'string_col']) + from hsfs.feature import Feature + features = [ + Feature(name="id",type="bigint",online_type="bigint"), + Feature(name="string_col",type="string",online_type="text") + ] + + fg = fs.get_or_create_feature_group(name="fg_manual_text_schema", + version=1, + features=features, + online_enabled=True, + primary_key=['id']) + fg.insert(df) + ``` ### Timestamps and Timezones -All timestamp features are stored in Hopsworks in UTC time. Also, all timestamp-based functions (such as [point-in-time joins](../../../concepts/fs/feature_view/offline_api.md#point-in-time-correct-training-data)) use UTC time. +All timestamp features are stored in Hopsworks in UTC time. +Also, all timestamp-based functions (such as [point-in-time joins](../../../concepts/fs/feature_view/offline_api.md#point-in-time-correct-training-data)) use UTC time. 
This ensures consistency of timestamp features across different client timezones and simplifies working with timestamp-based functions in general. -When ingesting timestamp features, the [Feature Store Write API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#insert) will automatically handle the conversion to UTC, if necessary. +When ingesting timestamp features, the [`FeatureGroup.insert`][hsfs.feature_group.FeatureGroup.insert] will automatically handle the conversion to UTC, if necessary. The following table summarizes how different timestamp types are handled: -| Data Frame (Data Type) | Environment | Handling | -|---------------------------------------|-------------------------|----------------------------------------------------------| -| Pandas DataFrame (datetime64[ns]) | Python-only and PySpark | interpreted as UTC, independent of the client's timezone | -| Pandas DataFrame (datetime64[ns, tz]) | Python-only and PySpark | timezone-sensitive conversion from 'tz' to UTC | -| Spark (TimestampType) | PySpark and Spark | interpreted as UTC, independent of the client's timezone | +| Data Frame (Data Type) | Environment | Handling | +| --- | --- | --- | +| Pandas DataFrame (datetime64[ns]) | Python-only and PySpark | interpreted as UTC, independent of the client's timezone | +| Pandas DataFrame (datetime64[ns, tz]) | Python-only and PySpark | timezone-sensitive conversion from 'tz' to UTC | +| Spark (TimestampType) | PySpark and Spark | interpreted as UTC, independent of the client's timezone | -Timestamp features retrieved from the Feature Store, e.g. using the [Feature Store Read API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#read), use a timezone-unaware format: +Timestamp features retrieved from the Feature Store, e.g., using the [Feature Store Read API][hsfs.feature_group.FeatureGroup.read], use a timezone-unaware format: | Data Frame (Data Type) | Environment | Timezone | |---------------------------------------|-------------------------|------------------------| | Pandas DataFrame (datetime64[ns]) | Python-only | timezone-unaware (UTC) | | Spark (TimestampType) | PySpark and Spark | timezone-unaware (UTC) | -Note that our PySpark/Spark client automatically sets the Spark SQL session's timezone to UTC. This ensures that Spark SQL will correctly interpret all timestamps as UTC. The setting will only apply to the client's session, and you don't have to worry about setting/unsetting the configuration yourself. +Note that our PySpark/Spark client automatically sets the Spark SQL session's timezone to UTC. +This ensures that Spark SQL will correctly interpret all timestamps as UTC. +The setting will only apply to the client's session, and you don't have to worry about setting/unsetting the configuration yourself. ## Explicit schema definition -When creating a feature group it is possible for the user to control both the offline and online data type of each column. If users explicitly define the schema for the feature group, Hopsworks is going to use that schema to create the feature group, without performing any type mapping. +When creating a feature group it is possible for the user to control both the offline and online data type of each column. +If users explicitly define the schema for the feature group, Hopsworks is going to use that schema to create the feature group, without performing any type mapping. 
You can explicitly define the feature group schema as follows:

=== "Python"
@@ -282,7 +295,8 @@ You can explicitly define the feature group schema as follows:

 ## Append features to existing feature groups

-Hopsworks supports appending additional features to an existing feature group. Adding additional features to an existing feature group is not considered a breaking change.
+Hopsworks supports appending additional features to an existing feature group.
+Adding additional features to an existing feature group is not considered a breaking change.

 === "Python"
     ```python
@@ -297,4 +311,5 @@ Hopsworks supports appending additional features to an existing feature group. A

     fg.append_features(features)
     ```

-When adding additional features to a feature group, you can provide a default values for existing entries in the feature group. You can also backfill the new features for existing entries by running an `insert()` operation and update all existing combinations of *primary key* - *event time*.
+When adding additional features to a feature group, you can provide default values for existing entries in the feature group.
+You can also backfill the new features for existing entries by running an `insert()` operation and updating all existing combinations of *primary key* - *event time*.
diff --git a/docs/user_guides/fs/feature_group/data_validation.md b/docs/user_guides/fs/feature_group/data_validation.md
index 84402694a..e17216853 100644
--- a/docs/user_guides/fs/feature_group/data_validation.md
+++ b/docs/user_guides/fs/feature_group/data_validation.md
@@ -7,64 +7,103 @@

 ## Introduction

-Clean, high quality feature data is of paramount importance to being able to train and serve high quality models. Hopsworks offers integration with [Great Expectations](https://greatexpectations.io/) to enable a smooth data validation workflow. This guide is designed to help you integrate a data validation step when inserting new DataFrames into a Feature Group. Note that validation is performed inline as part of your feature pipeline (on the client machine) - it is not executed by Hopsworks after writing features.
+Clean, high-quality feature data is of paramount importance to being able to train and serve high-quality models.
+Hopsworks offers integration with [Great Expectations](https://greatexpectations.io/) to enable a smooth data validation workflow.
+This guide is designed to help you integrate a data validation step when inserting new DataFrames into a Feature Group.
+Note that validation is performed inline as part of your feature pipeline (on the client machine) - it is not executed by Hopsworks after writing features.

 ## UI

 ### Create a Feature Group (Pre-requisite)

-In the UI, you must create a Feature Group first before attaching an Expectation Suite. You can find out more information about creating a Feature Group [here](create.md). You can attach at most one expectation suite to a Feature Group. Data validation is an optional step and is not required to write to a Feature Group.
+In the UI, you must create a Feature Group first before attaching an Expectation Suite.
+You can find out more information about [creating a Feature Group](create.md).
+You can attach at most one expectation suite to a Feature Group.
+Data validation is an optional step and is not required to write to a Feature Group.

 ### Step 1: Find and Edit Feature Group

-Click on the Feature Group section in the navigation menu. Find your Feature Group in the list and click on its name to access the Feature Group page. 
Select `edit` in the top right corner or scroll to the Expectations section and click on `Edit Expectation Suite`. +Click on the Feature Group section in the navigation menu. +Find your Feature Group in the list and click on its name to access the Feature Group page. +Select `edit` in the top right corner or scroll to the Expectations section and click on `Edit Expectation Suite`. ### Step 2: Edit General Expectation Suite Settings -Scroll to the Expectation Suite section. Click add Expectation Suite and edit its metadata: +Scroll to the Expectation Suite section. +Click add Expectation Suite and edit its metadata: - Choose a name for your expectation suite. -- Checkbox enabled. This controls whether the Expectation Suite will be used to validate a Dataframe automatically upon insertion into a Feature Group. Note that validation is executed by the client. Disabling validation allows you to skip the validation step without deleting the Expectation Suite. -- 'ALWAYS' vs. 'STRICT' mode. This option controls what happens after validation. Hopsworks defaults to 'ALWAYS', where data is written to the Feature Group regardless of the validation result. This means that even if expectations are failing or throw an exception, Hopsworks will attempt to insert the data into the Feature Group. In 'STRICT' mode, Hopsworks will only write data to the Feature Group if each individual expectation has been successful. +- Checkbox enabled. + This controls whether the Expectation Suite will be used to validate a Dataframe automatically upon insertion into a Feature Group. + Note that validation is executed by the client. + Disabling validation allows you to skip the validation step without deleting the Expectation Suite. +- 'ALWAYS' vs. 'STRICT' mode. + This option controls what happens after validation. + Hopsworks defaults to 'ALWAYS', where data is written to the Feature Group regardless of the validation result. + This means that even if expectations are failing or throw an exception, Hopsworks will attempt to insert the data into the Feature Group. + In 'STRICT' mode, Hopsworks will only write data to the Feature Group if each individual expectation has been successful. ### Step 3: Add new expectations -By clicking on `Add expectation` one can choose an expectation type from a searchable dropdown menu. Currently, only the built-in expectations from the Great Expectations framework are supported. For user-defined expectations, please use the Rest API or python client. +By clicking on `Add expectation` one can choose an expectation type from a searchable dropdown menu. +Currently, only the built-in expectations from the Great Expectations framework are supported. +For user-defined expectations, please use the Rest API or python client. -All default kwargs associated to the selected expectation type are populated as a json below the dropdown menu. Edit the arguments in the json to configure the Expectation. In particular, arguments such as `column`, `columnA`, `columnB`, `column_set` and `column_list` require valid feature name(s). Click the tick button to save the expectation configuration and append it to the Expectation Suite locally. +All default kwargs associated to the selected expectation type are populated as a json below the dropdown menu. +Edit the arguments in the json to configure the Expectation. +In particular, arguments such as `column`, `columnA`, `columnB`, `column_set` and `column_list` require valid feature name(s). 
+Click the tick button to save the expectation configuration and append it to the Expectation Suite locally. !!! info - Click the `Save feature group` button to persist your changes! + Click the `Save feature group` button to persist your changes! -You can use the button `Clear Expectation Suite` to clean up before saving changes if you changed your mind. If the Expectation Suite is already registered, it will instead show a button to delete the Expectation Suite. +You can use the button `Clear Expectation Suite` to clean up before saving changes if you changed your mind. +If the Expectation Suite is already registered, it will instead show a button to delete the Expectation Suite. ### Step 4: Save new data to a Feature Group -Use the python client to write a DataFrame to the Feature Group. Note that if an expectation suite is enabled for a Feature Group, calling the `insert` method will run validation and default to uploading the corresponding validation report to Hopsworks. The report is uploaded even if validation fails and 'STRICT' mode is selected. +Use the python client to write a DataFrame to the Feature Group. +Note that if an expectation suite is enabled for a Feature Group, calling the `insert` method will run validation and default to uploading the corresponding validation report to Hopsworks. +The report is uploaded even if validation fails and 'STRICT' mode is selected. ### Step 5: Check Validation Results Summary -Hopsworks shows a visual summary of validation reports. To check it out, go to your Feature Group overview and scroll to the expectation section. Click on the `Validation Results` tab and check that all went according to plan. Each row corresponds to an expectation in the suite. Features can have several corresponding expectations and the same type of expectation can be applied to different features. +Hopsworks shows a visual summary of validation reports. +To check it out, go to your Feature Group overview and scroll to the expectation section. +Click on the `Validation Results` tab and check that all went according to plan. +Each row corresponds to an expectation in the suite. +Features can have several corresponding expectations and the same type of expectation can be applied to different features. -You can navigate to older reports using the dropdown menu. Should you need more than the information displayed in the UI for e.g., debugging, the full report can be downloaded by clicking on the corresponding button. +You can navigate to older reports using the dropdown menu. +Should you need more than the information displayed in the UI for e.g., debugging, the full report can be downloaded by clicking on the corresponding button. ### Step 6: Check Validation History -The `Validation Reports` tab in the Expectations section displays a brief history of recent validations. Each row corresponds to a validation report, with some summary information about the success of the validation step. You can download the full report by clicking the download icon button that appears at the end of the row. +The `Validation Reports` tab in the Expectations section displays a brief history of recent validations. +Each row corresponds to a validation report, with some summary information about the success of the validation step. +You can download the full report by clicking the download icon button that appears at the end of the row. ## Code -Hopsworks python client interfaces with the Great Expectations library to enable you to add data validation to your feature engineering pipeline. 
In this section, we show you how in a single line you enable automatic validation on each insertion of new data into your Feature Group. Whether you have an existing Feature Group you want to add validation to or Follow the guide or get your hands dirty by running our [tutorial data validation notebook](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb) in google colab.
+The Hopsworks python client interfaces with the Great Expectations library to enable you to add data validation to your feature engineering pipeline.
+In this section, we show you how a single line of code enables automatic validation on each insertion of new data into your Feature Group.
+Whether you have an existing Feature Group you want to add validation to or are creating a new one, follow the guide or get your hands dirty by running our [tutorial data validation notebook](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb) in Google Colab.

-First checkout the pre-requisite and Hopsworks setup to follow the guide below. Create a project, install the hopsworks client and connect via the generated API key. You are ready to load your data in a DataFrame. The second step is a short introduction to the relevant Great Expectations API to build data validation suited to your data. Third and final step shows how to attach your Expectation Suite to the Feature Group to benefit from automatic validation on insertion capabilities.
+First, check out the prerequisites and Hopsworks setup to follow the guide below.
+Create a project, install the Hopsworks client and connect via the generated API key.
+You are then ready to load your data in a DataFrame.
+The second step is a short introduction to the relevant Great Expectations API to build data validation suited to your data.
+The third and final step shows how to attach your Expectation Suite to the Feature Group to benefit from automatic validation on insertion.

 ### Step 1: Pre-requisite

 In order to define and validate an expectation when writing to a Feature Group, you will need:

-- A Hopsworks project. If you don't have a project yet you can go to [app.hopsworks.ai](https://app.hopsworks.ai), signup with your email and create your first project.
+- A Hopsworks project.
+  If you don't have a project yet, you can go to [app.hopsworks.ai](https://app.hopsworks.ai), sign up with your email and create your first project.
 - An API key, you can get one by going to "Account Settings" on [app.hopsworks.ai](https://app.hopsworks.ai).
-- The [Hopsworks Python library](https://pypi.org/project/hopsworks) installed in your client. See the [installation guide](../../client_installation/index.md).
+- The [Hopsworks Python library](https://pypi.org/project/hopsworks) installed in your client.
+  See the [installation guide](../../client_installation/index.md).

 #### Connect your notebook to Hopsworks

@@ -78,7 +117,8 @@ project = hopsworks.login()

 fs = project.get_feature_store()
 ```

-You will be prompt to paste your API key to connect the notebook to your project. The `fs` Feature Store entity is now ready to be used to insert or read data from Hopsworks.
+You will be prompted to paste your API key to connect the notebook to your project.
+The `fs` Feature Store entity is now ready to be used to insert or read data from Hopsworks. 
#### Import your data @@ -94,11 +134,14 @@ df.head(3) ### Step 2: Great Expectation Introduction -To validate the data, we will use the [Great Expectations](https://greatexpectations.io/) library. Below is a short introduction on how to build an Expectation Suite to validate your data. Everything is done using the Great Expectations API so you can re-use any prior knowledge you may have of the library. +To validate the data, we will use the [Great Expectations](https://greatexpectations.io/) library. +Below is a short introduction on how to build an Expectation Suite to validate your data. +Everything is done using the Great Expectations API so you can re-use any prior knowledge you may have of the library. #### Create an Expectation Suite -Create (or import an existing) expectation suite using the Great Expectations library. This suite will hold all the validation tests we want to perform on our data before inserting them into Hopsworks. +Create (or import an existing) expectation suite using the Great Expectations library. +This suite will hold all the validation tests we want to perform on our data before inserting them into Hopsworks. ```python3 import great_expectations as ge @@ -110,7 +153,8 @@ expectation_suite = ge.core.ExpectationSuite( #### Add Expectations in the Source Code -Add some expectation to your suite. Each expectation configuration corresponds to a validation test to be run against your data. +Add some expectation to your suite. +Each expectation configuration corresponds to a validation test to be run against your data. ```python3 expectation_suite.add_expectation( @@ -138,7 +182,8 @@ expectation_suite.add_expectation( #### Using Great Expectations Profiler -Building Expectation Suite by hand can be a major time commitment when you have dozens of Features. Great Expectations offers `Profiler` classes to inspect a sample of your data and infers a suitable Expectation Suite that you will be able to register with Hopsworks. +Building Expectation Suite by hand can be a major time commitment when you have dozens of Features. +Great Expectations offers `Profiler` classes to inspect a sample of your data and infers a suitable Expectation Suite that you will be able to register with Hopsworks. ```python3 ge_profiler = ge.profile.BasicSuiteBuilderProfiler() @@ -147,9 +192,12 @@ expectation_suite_profiler, _ = ge_profiler.profile(ge.from_pandas(df)) Once you have built an Expectation Suite you are satisfied with, it is time to create your first validation enabled Feature Group. -### Step 3: Attach an Expectation Suite to your Feature Group to enable Automatic Validation on Insertion. +### Step 3: Attach an Expectation Suite to your Feature Group to enable Automatic Validation on Insertion -Writing data in Hopsworks is done using Feature Groups. Once a Feature Group is registered in the Feature Store, you can use it to insert your pandas DataFrames. For more information see [create Feature Group](create.md). To benefit from automatic validation on insertion, attach your newly created Expectation Suite when creating the Feature Group: +Writing data in Hopsworks is done using Feature Groups. +Once a Feature Group is registered in the Feature Store, you can use it to insert your pandas DataFrames. +For more information see [create Feature Group](create.md). 
+To benefit from automatic validation on insertion, attach your newly created Expectation Suite when creating the Feature Group:

 ```python3
 fg = fs.create_feature_group(
@@ -168,22 +216,29 @@ or, if the Feature Group already exist, you can simply run:
 fg.save_expectation_suite(expectation_suite)
 ```

-That is all there is to it. Hopsworks will now automatically use your suite to validate the DataFrames you want to write to the Feature Group. Try it out!
+That is all there is to it.
+Hopsworks will now automatically use your suite to validate the DataFrames you want to write to the Feature Group.
+Try it out!

 ```python3
 job, validation_report = fg.insert(df.head(5))
 ```

-As you can see, Hopsworks runs the validation in the client before attempting to insert the data. By default, Hopsworks will try to insert the data even if validation fails to prevent data loss. However it can be configured for production setup to be more restrictive, checkout the [data validation advanced guide](data_validation_advanced.md).
+As you can see, Hopsworks runs the validation in the client before attempting to insert the data.
+By default, Hopsworks will try to insert the data even if validation fails, to prevent data loss.
+However, it can be configured to be more restrictive for production setups; check out the [data validation advanced guide](data_validation_advanced.md).

 !!!info
-    Note that once the Expectation Suite is attached to the Feature Group, any subsequent attempt to insert to this Feature Group will apply the Data Validation step even from a different client or in a scheduled job.
+    Note that once the Expectation Suite is attached to the Feature Group, any subsequent attempt to insert to this Feature Group will apply the Data Validation step even from a different client or in a scheduled job.

 ### Step 4: Data Quality Monitoring

-Upon running validation, Great Expectations generates a report to help you assess the quality of your data. Nothing to do here, Hopsworks client automatically uploads the validation report to the backend when ingesting new data. It enables you to monitor the quality of the inserted data in the Feature Group over time.
+Upon running validation, Great Expectations generates a report to help you assess the quality of your data.
+There is nothing to do here: the Hopsworks client automatically uploads the validation report to the backend when ingesting new data.
+It enables you to monitor the quality of the inserted data in the Feature Group over time.

-You can checkout a summary of the reports in the UI on your Feature Group page. As you can see, your Feature Group conveniently gather all in one place: your data, the Expectation Suite and the reports generated each time you inserted data!
+You can check out a summary of the reports in the UI on your Feature Group page.
+As you can see, your Feature Group conveniently gathers everything in one place: your data, the Expectation Suite and the reports generated each time you insert data!

 Hopsworks client API allows you to retrieve validation reports for further analysis.

@@ -206,11 +261,15 @@ validation_history = fg.get_validation_history(

 You can find the expectationIds in the UI or using `fg.get_expectation_suite` and looking it up in the expectation's meta field.

 !!! info
-    If Validation Reports or Results are too long, they can be truncated to fit in the database. 
+ A full version of the reports can be downloaded from the UI. ## Conclusion -The integration between Hopsworks and Great Expectations makes it simple to add a data validation step to your feature engineering pipeline. Build your Expectation Suite and attach it to your Feature Group with a single line of code. No need to add any code to your pipeline or job scripts, calling `fg.insert` will now automatically validate the data before inserting them in the Feature Group. The validation reports are stored along your data in Hopsworks allowing us to provide basic monitoring capabilities to quickly spot a data quality issue in the UI. +The integration between Hopsworks and Great Expectations makes it simple to add a data validation step to your feature engineering pipeline. +Build your Expectation Suite and attach it to your Feature Group with a single line of code. +No need to add any code to your pipeline or job scripts, calling `fg.insert` will now automatically validate the data before inserting them in the Feature Group. +The validation reports are stored along your data in Hopsworks allowing us to provide basic monitoring capabilities to quickly spot a data quality issue in the UI. ## Going Further diff --git a/docs/user_guides/fs/feature_group/data_validation_advanced.md b/docs/user_guides/fs/feature_group/data_validation_advanced.md index 0ef1c10c2..2c2f745d9 100644 --- a/docs/user_guides/fs/feature_group/data_validation_advanced.md +++ b/docs/user_guides/fs/feature_group/data_validation_advanced.md @@ -1,21 +1,27 @@ # Advanced Data Validation Options and Best Practices -The introduction to the data validation guide can be found [here](data_validation.md). The notebook example to get started with Data Validation in Hopsworks can be found [here](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb). +The introduction to the data validation guide can be found in the [Data Validation Guide](data_validation.md). +The notebook example to get started with Data Validation in Hopsworks can be found in the [Fraud Batch Data Validation Tutorial](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb). ## Data Validation Configuration Options in Hopsworks ### Validation Ingestion Policy -Depending on your use case you can setup data validation as a monitoring or gatekeeping tool when trying to insert new data in your Feature Group. Switch behaviour by using the `validation_ingestion_policy` kwarg: +Depending on your use case you can setup data validation as a monitoring or gatekeeping tool when trying to insert new data in your Feature Group. +Switch behaviour by using the `validation_ingestion_policy` kwarg: -- `"ALWAYS"` is the default option and will attempt to insert the data regardless of the validation result. Hassle free, it is ideal to monitor data ingestion in a development setup. -- `"STRICT"` is the best option for production ready projects. This will prevent insertion of DataFrames which do not pass all data quality requirements. Ideal to avoid "garbage-in, garbage-out" scenarios, at the price of a potential loss of data. Check out the best practice section for more on that. +- `"ALWAYS"` is the default option and will attempt to insert the data regardless of the validation result. + Hassle free, it is ideal to monitor data ingestion in a development setup. 
+- `"STRICT"` is the best option for production ready projects.
+  This will prevent insertion of DataFrames which do not pass all data quality requirements.
+  Ideal to avoid "garbage-in, garbage-out" scenarios, at the price of a potential loss of data.
+  Check out the best practices section for more on that.

-#### In the UI
+#### Validation Ingestion Policy in UI

 Go to the Feature Group edit page, in the Expectation section you can choose between the options above.

-#### In the python client
+#### Validation Ingestion Policy in Python

 ```python3
 fg.expectation_suite.validation_ingestion_policy = "ALWAYS" # "STRICT"
@@ -27,19 +33,23 @@ If your suite is registered with Hopsworks, it will persist the change to the se

 Should you wish to do so, you can disable data validation on a punctual basis or until further notice.

-#### In the UI
+#### Disable Data Validation in UI

-You can do it in the UI in the Expectation section of the Feature Group edit page. Simply tick or untick the enabled checkbox. This will be used as the default option but can be overriden via the API.
+You can do it in the UI in the Expectation section of the Feature Group edit page.
+Simply tick or untick the enabled checkbox.
+This will be used as the default option but can be overridden via the API.

-#### In the python client
+#### Disable Data Validation in Python

-To disable data validation until further notice in the API, you can update the `run_validation` field of the expectation suite. If your suite is registered with Hopsworks, this will persist the change to the server.
+To disable data validation until further notice in the API, you can update the `run_validation` field of the expectation suite.
+If your suite is registered with Hopsworks, this will persist the change to the server.

 ```python3
 fg.expectation_suite.run_validation = False
 ```

-If you wish to override the default behaviour of the suite when inserting data in the Feature Group, you can do so via the `validate_options` kwarg. The example below will enable validation for this insertion only.
+If you wish to override the default behaviour of the suite when inserting data in the Feature Group, you can do so via the `validation_options` kwarg.
+The example below will enable validation for this insertion only.

 ```python3
 fg.insert(df_to_validate, validation_options={"run_validation" : True})
@@ -49,15 +59,23 @@ We recommend to avoid using this option in scheduled job as it silently changes

 ### Edit Expectations

-The one constant in life is change. If you need to add, remove or edit an expectation you can do it both in the UI or via the python client. Note that changing the expectation type or its corresponding feature will throw an error in order to preserve a meaningful validation history.
+The one constant in life is change.
+If you need to add, remove or edit an expectation, you can do it either in the UI or via the python client.
+Note that changing the expectation type or its corresponding feature will throw an error in order to preserve a meaningful validation history.

-#### In Hopworks UI
+#### Edit Expectations in UI

-Go to the Feature Group edit page, in the expectation section. You can click on the expectation you want to edit and edit the json configuration. 
+Go to the Feature Group edit page, in the expectation section.
+You can click on the expectation you want to edit and edit the json configuration. 
+Check out the Great Expectations documentation if you need more information on a particular expectation.

-#### In Hopsworks Python Client
+#### Edit Expectations in Python

-There are several way to edit an Expectation in the python client. You can use Great Expectations API or directly go through Hopsworks. In the latter case, if you want to edit or remove an expectation, you will need the Hopsworks expectation ID. It can be found in the UI or in the meta field of an expectation. Note that you must have inserted data in the FG and attached the expectation suite to enable the Expectation API.
+There are several ways to edit an Expectation in the python client.
+You can use the Great Expectations API or go through Hopsworks directly.
+In the latter case, if you want to edit or remove an expectation, you will need the Hopsworks expectation ID.
+It can be found in the UI or in the meta field of an expectation.
+Note that you must have inserted data in the Feature Group and attached the expectation suite to enable the Expectation API.

 Get an expectation with a given id:

@@ -112,11 +130,14 @@ fg.save_expectation_suite(my_suite)

 ### Save Validation Reports

-When running validation using Great Expectations, a validation report is generated containing all validation results for the different expectations. Each result provides information about whether the provided DataFrame conforms to the corresponding expectation. These reports can be stored in Hopsworks to save a validation history for the data written to a particular Feature Group.
+When running validation using Great Expectations, a validation report is generated containing all validation results for the different expectations.
+Each result provides information about whether the provided DataFrame conforms to the corresponding expectation.
+These reports can be stored in Hopsworks to save a validation history for the data written to a particular Feature Group.

-The boilerplate of uploading report on insertion is taken care of by hopsworks, however for custom pipelines we provide an alternative method in the python client. The UI does not currently support upload of a validation report.
+The boilerplate of uploading a report on insertion is taken care of by Hopsworks; however, for custom pipelines we provide an alternative method in the python client.
+The UI does not currently support upload of a validation report.

-#### In Hopsworks Python Client
+#### Save Validation Reports in Python

 ```python3
 fg.save_validation_report(ge_report)
@@ -124,13 +145,15 @@ fg.save_validation_report(ge_report)

 ### Monitor and Fetch Validation Reports

-A summary of uploaded reports will then be available via an API call or in the Hopsworks UI enabling easy monitoring. For in-depth analysis, it is possible to download the complete report from the UI.
+A summary of uploaded reports will then be available via an API call or in the Hopsworks UI, enabling easy monitoring.
+For in-depth analysis, it is possible to download the complete report from the UI.

-#### In Hopsworks UI
+#### Monitor and Fetch Validation Reports in UI

-Open the Feature Group overview page and go to the Expectations section. 
+One tab allows you to check the report history with general information, while the other tab allows you to explore a summary of the result for individual expectations. -#### In Hopsworks Python Client +#### Monitor and Fetch Validation Reports in Python ```python3 # convenience method for rapid development @@ -142,15 +165,18 @@ ge_latest_report = fg.get_latest_validation_report() validation_history = fg.get_validation_reports() ``` -### Validate your data manually +### Validate Your Data Manually -While Hopsworks provides automatic validation on insertion logic, we recognise that some use cases may require a more fine-grained control over the validation process. Therefore, Feature Group objects offers a convenience wrapper around Great Expectations to manually trigger validation using the registered Expectation Suite. +While Hopsworks provides automatic validation on insertion logic, we recognise that some use cases may require a more fine-grained control over the validation process. +Therefore, Feature Group objects offers a convenience wrapper around Great Expectations to manually trigger validation using the registered Expectation Suite. -#### In the UI +#### Validate Your Data Manually in UI -You can validate data already ingested in the Feature Group by going to the Feature Group overview page. In the top right corner is a button to trigger a validation. The button will launch a job which will read the Feature Group data, run validation and persist the associated report. +You can validate data already ingested in the Feature Group by going to the Feature Group overview page. +In the top right corner is a button to trigger a validation. +The button will launch a job which will read the Feature Group data, run validation and persist the associated report. -#### In the python client +#### Validate Your Data Manually in Python ```python3 ge_report = fg.validate(df, ingestion_result="EXPERIMENT") @@ -159,7 +185,8 @@ ge_report = fg.validate(df, ingestion_result="EXPERIMENT") # ge_report = fg.validate(df, save_report=False) ``` -If you want to apply validation to the data already in the Feature Group you can call the `.validate` without providing data. It will read the data in the Feature Group. +If you want to apply validation to the data already in the Feature Group you can call the `.validate` without providing data. +It will read the data in the Feature Group. ```python3 report = fg.validate() diff --git a/docs/user_guides/fs/feature_group/data_validation_best_practices.md b/docs/user_guides/fs/feature_group/data_validation_best_practices.md index ddeb32a9f..05072b7f7 100644 --- a/docs/user_guides/fs/feature_group/data_validation_best_practices.md +++ b/docs/user_guides/fs/feature_group/data_validation_best_practices.md @@ -1,23 +1,30 @@ # Best practices -Below is a set of recommendations and code snippets to help our users follow best practices when it comes to integrating a data validation step in your feature engineering pipelines. Rather than being prescriptive, we want to showcase how the API and configuration options can help adapt validation to your use-case. +Below is a set of recommendations and code snippets to help our users follow best practices when it comes to integrating a data validation step in your feature engineering pipelines. +Rather than being prescriptive, we want to showcase how the API and configuration options can help adapt validation to your use-case. 
## Development -Data validation is generally considered to be a production-only feature and as such is often only setup once a project has reached the end of the development phase. At Hopsworks, we think there is a lot of value in setting up validation during early development. That's why we made it quick to get started and ensured that by default data validation is never an obstacle to inserting data. +Data validation is generally considered to be a production-only feature and as such is often only setup once a project has reached the end of the development phase. +At Hopsworks, we think there is a lot of value in setting up validation during early development. +That's why we made it quick to get started and ensured that by default data validation is never an obstacle to inserting data. ### Validate Early -As often with data validation, the best piece of advice is to set it up early in your development process. Use this phase to build a history you can then use when it becomes time to set quality requirements for a project in production. We made a code snippet to help you get started quickly: +As often with data validation, the best piece of advice is to set it up early in your development process. +Use this phase to build a history you can then use when it becomes time to set quality requirements for a project in production. +We made a code snippet to help you get started quickly: ```python3 -# Load sample data. Replace it with your own! +# Load sample data. +# Replace it with your own! my_data_df = pd.read_csv("https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv") # Use Great Expectation profiler (ignore deprecation warning) expectation_suite_profiled, validation_report = ge.from_pandas(my_data_df).profile(profiler=ge.profile.BasicSuiteBuilderProfiler) -# Create a Feature Group on hopsworks with an expectation suite attached. Don't forget to change the primary key! +# Create a Feature Group on hopsworks with an expectation suite attached. +# Don't forget to change the primary key! my_validated_data_fg = fs.get_or_create_feature_group( name="my_validated_data_fg", version=1, @@ -33,11 +40,16 @@ Any data you insert in the Feature Group from now will be validated and a report insert_job, validation_report = my_validated_data_fg.insert(my_data_df) ``` -Great Expectations profiler can inspect your data to build a standard Expectation Suite. You can attach this Expectation Suite directly when creating your Feature Group to make sure every piece of data finding its way in Hopsworks gets validated. Hopsworks will default to its `"ALWAYS"` ingestion policy, meaning data are ingested whether validation succeeds or not. This way data validation is not a barrier, just a monitoring tool. +Great Expectations profiler can inspect your data to build a standard Expectation Suite. +You can attach this Expectation Suite directly when creating your Feature Group to make sure every piece of data finding its way in Hopsworks gets validated. +Hopsworks will default to its `"ALWAYS"` ingestion policy, meaning data are ingested whether validation succeeds or not. +This way data validation is not a barrier, just a monitoring tool. ### Identify Unreliable Features -Once you setup data validation, every insertion will upload a validation report to Hopsworks. Identifying Features which often have null values or wild statistical variations can help detecting unreliable Features that need refinements or should be avoided. 
Here are a few expectations you might find useful: +Once you setup data validation, every insertion will upload a validation report to Hopsworks. +Identifying Features which often have null values or wild statistical variations can help detecting unreliable Features that need refinements or should be avoided. +Here are a few expectations you might find useful: - `expect_column_values_to_not_be_null` - `expect_column_(min/max/mean/stdev)_to_be_between` @@ -45,13 +57,17 @@ Once you setup data validation, every insertion will upload a validation report ### Get the stakeholders involved -Hopsworks UI helps involve every project stakeholder by enabling both setting and monitoring of data quality requirements. No coding skills needed! You can monitor data quality requirements by checking out the validation reports and results on the Feature Group page. +Hopsworks UI helps involve every project stakeholder by enabling both setting and monitoring of data quality requirements. +No coding skills needed! You can monitor data quality requirements by checking out the validation reports and results on the Feature Group page. -If you need to set or edit the existing requirements, you can go on the Feature Group edit page. The Expectation suite section allows you to edit individual expectations and set success parameters that match ever changing business requirements. +If you need to set or edit the existing requirements, you can go on the Feature Group edit page. +The Expectation suite section allows you to edit individual expectations and set success parameters that match ever changing business requirements. ## Production -Models in production require high-quality data to make accurate predictions for your customers. Hopsworks can use your Expectation Suite as a gatekeeper to make it simple to prevent low-quality data to make its way into production. Below are some simple tips and snippets to make the most of your data validation when your project is ready to enter its production phase. +Models in production require high-quality data to make accurate predictions for your customers. +Hopsworks can use your Expectation Suite as a gatekeeper to make it simple to prevent low-quality data to make its way into production. +Below are some simple tips and snippets to make the most of your data validation when your project is ready to enter its production phase. ### Be Strict in Production @@ -63,11 +79,13 @@ fg_prod.save_expectation_suite( validation_ingestion_policy="STRICT") ``` -In this setup, Hopsworks will abort inserting a DataFrame that does not successfully fulfill all expectations in the attached Expectation Suite. This ensures data quality standards are upheld for every insertion and provide downstream users with strong guarantees. +In this setup, Hopsworks will abort inserting a DataFrame that does not successfully fulfill all expectations in the attached Expectation Suite. +This ensures data quality standards are upheld for every insertion and provide downstream users with strong guarantees. ### Avoid Data Loss on materialization jobs -Aborting insertions of DataFrames which do not satisfy the data quality standards can lead to data loss in your materialization job. To avoid such loss we recommend creating a duplicate Feature Group with the same Expectation Suite in `"ALWAYS"` mode which will hold the rejected data. +Aborting insertions of DataFrames which do not satisfy the data quality standards can lead to data loss in your materialization job. 
+To avoid such loss we recommend creating a duplicate Feature Group with the same Expectation Suite in `"ALWAYS"` mode which will hold the rejected data. ```python3 job, report = fg_prod.insert(df) @@ -78,7 +96,8 @@ if report["success"] is False: ### Take Advantage of the Validation History -You can easily retrieve the validation history of a specific expectation to export it to your favourite visualisation tool. You can filter on time and on whether insertion was successful or not. +You can easily retrieve the validation history of a specific expectation to export it to your favourite visualisation tool. +You can filter on time and on whether insertion was successful or not. ```python3 validation_history = fg.get_validation_history( @@ -99,9 +118,13 @@ timeseries = pd.DataFrame( ### Setup Alerts -While checking your feature engineering pipeline executed properly in the morning can be good enough in the development phase, it won't make the cut for demanding production use-cases. In Hopsworks, you can setup alerts if ingestion fails or succeeds. +While checking your feature engineering pipeline executed properly in the morning can be good enough in the development phase, it won't make the cut for demanding production use-cases. +In Hopsworks, you can setup alerts if ingestion fails or succeeds. -First you will need to configure your preferred communication endpoint: slack, email or pagerduty. Check out [this page](../../../setup_installation/admin/alert.md) for more information on how to set it up. A typical use-case would be to add an alert on ingestion success to a Feature Group you created to hold data that failed validation. Here is a quick walkthrough: +First you will need to configure your preferred communication endpoint: slack, email or pagerduty. +Check out [this page](../../../setup_installation/admin/alert.md) for more information on how to set it up. +A typical use-case would be to add an alert on ingestion success to a Feature Group you created to hold data that failed validation. +Here is a quick walkthrough: 1. Go the Feature Group page in the UI 2. Scroll down and click on the `Add an alert` button. @@ -109,4 +132,7 @@ First you will need to configure your preferred communication endpoint: slack, e ## Conclusion -Hopsworks extends Great Expectations by automatically running the validation, persisting the reports along your data and allowing you to monitor data quality in its UI. How you decide to make use of these tools depends on your application and requirements. Whether in development or in production, real-time or batch, we think there is configuration that will work for your team. Check out our [quick hands-on tutorial](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb) to start applying what you learned so far. +Hopsworks extends Great Expectations by automatically running the validation, persisting the reports along your data and allowing you to monitor data quality in its UI. +How you decide to make use of these tools depends on your application and requirements. +Whether in development or in production, real-time or batch, we think there is configuration that will work for your team. +Check out our [quick hands-on tutorial](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/integrations/great_expectations/fraud_batch_data_validation.ipynb) to start applying what you learned so far. 
diff --git a/docs/user_guides/fs/feature_group/deprecation.md b/docs/user_guides/fs/feature_group/deprecation.md index 5ffc44bd4..531c11a3b 100644 --- a/docs/user_guides/fs/feature_group/deprecation.md +++ b/docs/user_guides/fs/feature_group/deprecation.md @@ -4,7 +4,7 @@ description: Documentation on how to deprecate a feature group in Hopsworks. # How to deprecate a Feature Group -### Introduction +## Introduction To discourage the usage of specific feature groups it is possible to deprecate them. When a feature group is deprecated, user will be warned when they try to use it or use a feature view that depends on it. @@ -13,7 +13,8 @@ In this guide you will learn how to deprecate a feature group within Hopsworks, ## Prerequisites -Before you begin this guide it is expected that there is an existing feature group in your project. You can familiarize yourself with [the creation of a feature group](./create.md) in the user guide. +Before you begin this guide it is expected that there is an existing feature group in your project. +You can familiarize yourself with [the creation of a feature group](./create.md) in the user guide. ## Deprecate using the HSFS APIs @@ -47,7 +48,8 @@ Users can also un-deprecate the feature group if need be, by setting the `deprec ## Deprecate using the UI -You can deprecate/de-deprecate feature groups through the UI. For this, navigate to the `Feature Groups` section and select a feature group. +You can deprecate/de-deprecate feature groups through the UI. +For this, navigate to the `Feature Groups` section and select a feature group.
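+
+For reference, below is a minimal sketch of the API route mentioned above.
+The `deprecate` call and its boolean flag are assumptions inferred from the wording of this guide, not confirmed signatures.
+
+```python3
+import hopsworks
+
+project = hopsworks.login()
+fs = project.get_feature_store()
+
+fg = fs.get_feature_group("my_fg", version=1)
+
+# Mark the feature group as deprecated (assumed call)
+fg.deprecate()
+
+# Un-deprecate it again by setting the deprecate flag back to False (assumed signature)
+fg.deprecate(deprecate=False)
+```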

diff --git a/docs/user_guides/fs/feature_group/feature_monitoring.md b/docs/user_guides/fs/feature_group/feature_monitoring.md index 1412be9d9..509d741c9 100644 --- a/docs/user_guides/fs/feature_group/feature_monitoring.md +++ b/docs/user_guides/fs/feature_group/feature_monitoring.md @@ -1,27 +1,36 @@ # Feature Monitoring for Feature Groups -Feature Monitoring complements the Hopsworks data validation capabilities for Feature Groups by allowing you to monitor your data once they have been ingested into the Feature Store. Hopsworks feature monitoring is centered around two functionalities: **scheduled statistics** and **statistics comparison**. +Feature Monitoring complements the Hopsworks data validation capabilities for Feature Groups by allowing you to monitor your data once they have been ingested into the Feature Store. +Hopsworks feature monitoring is centered around two functionalities: **scheduled statistics** and **statistics comparison**. Before continuing with this guide, see the [Feature monitoring guide](../feature_monitoring/index.md) to learn more about how feature monitoring works, and get familiar with the different use cases of feature monitoring for Feature Groups described in the **Use cases** sections of the [Scheduled statistics guide](../feature_monitoring/scheduled_statistics.md#use-cases) and [Statistics comparison guide](../feature_monitoring/statistics_comparison.md#use-cases). !!! warning "Limited UI support" - Currently, feature monitoring can only be configured using the [Hopsworks Python library](https://pypi.org/project/hopsworks). However, you can enable/disable a feature monitoring configuration or trigger the statistics comparison manually from the UI, as shown in the [Advanced guide](../feature_monitoring/feature_monitoring_advanced.md). + Currently, feature monitoring can only be configured using the [Hopsworks Python library](https://pypi.org/project/hopsworks). + However, you can enable/disable a feature monitoring configuration or trigger the statistics comparison manually from the UI, as shown in the [Advanced guide](../feature_monitoring/feature_monitoring_advanced.md). ## Code -In this section, we show you how to setup feature monitoring in a Feature Group using the ==Hopsworks Python library==. Alternatively, you can get started quickly by running our [tutorial for feature monitoring](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/feature_monitoring.ipynb). +In this section, we show you how to setup feature monitoring in a Feature Group using the ==Hopsworks Python library==. +Alternatively, you can get started quickly by running our [tutorial for feature monitoring](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/feature_monitoring.ipynb). -First, checkout the pre-requisite and Hopsworks setup to follow the guide below. Create a project, install the [Hopsworks Python library](https://pypi.org/project/hopsworks) in your environment, connect via the generated API key. The second step is to start a new configuration for feature monitoring. +First, checkout the pre-requisite and Hopsworks setup to follow the guide below. +Create a project, install the [Hopsworks Python library](https://pypi.org/project/hopsworks) in your environment, connect via the generated API key. +The second step is to start a new configuration for feature monitoring. 
-After that, you can optionally define a detection window of data to compute statistics on, or use the default detection window (i.e., whole feature data). If you want to setup scheduled statistics alone, you can jump to the last step to save your configuration. Otherwise, the third and fourth steps are also optional and show you how to setup the comparison of statistics on a schedule by defining a reference window and specifying the statistics metric to monitor. +After that, you can optionally define a detection window of data to compute statistics on, or use the default detection window (i.e., whole feature data). +If you want to setup scheduled statistics alone, you can jump to the last step to save your configuration. +Otherwise, the third and fourth steps are also optional and show you how to setup the comparison of statistics on a schedule by defining a reference window and specifying the statistics metric to monitor. ### Step 1: Pre-requisite In order to setup feature monitoring for a Feature Group, you will need: -- A Hopsworks project. If you don't have a project yet you can go to [app.hopsworks.ai](https://app.hopsworks.ai), signup with your email and create your first project. +- A Hopsworks project. + If you don't have a project yet you can go to [app.hopsworks.ai](https://app.hopsworks.ai), signup with your email and create your first project. - An API key, you can get one by going to "Account Settings" on [app.hopsworks.ai](https://app.hopsworks.ai). -- The Hopsworks Python library installed in your client. See the [installation guide](../../client_installation/index.md). +- The Hopsworks Python library installed in your client. + See the [installation guide](../../client_installation/index.md). - A Feature Group #### Connect your notebook to Hopsworks @@ -38,11 +47,14 @@ Connect the client running your notebooks to Hopsworks. fs = project.get_feature_store() ``` -You will be prompted to paste your API key to connect the notebook to your project. The `fs` Feature Store entity is now ready to be used to insert or read data from Hopsworks. +You will be prompted to paste your API key to connect the notebook to your project. +The `fs` Feature Store entity is now ready to be used to insert or read data from Hopsworks. #### Get or create a Feature Group -Feature monitoring can be enabled on already created Feature Groups. We suggest you read the [Feature Group](../../../concepts/fs/feature_group/fg_overview.md) concept page to understand what a feature group is and how it fits in the ML pipeline. We also suggest you familiarize with the APIs to [create a feature group](./create.md). +Feature monitoring can be enabled on already created Feature Groups. +We suggest you read the [Feature Group](../../../concepts/fs/feature_group/fg_overview.md) concept page to understand what a feature group is and how it fits in the ML pipeline. +We also suggest you familiarize with the APIs to [create a feature group](./create.md). The following is a code example for getting or creating a Feature Group with name `trans_fg` for transaction data. 
@@ -51,7 +63,7 @@ The following is a code example for getting or creating a Feature Group with nam ```python3 # Retrieve an existing feature group trans_fg = fs.get_feature_group("trans_fg", version=1) - + # Or, create a new feature group with transactions trans_fg = fs.get_or_create_feature_group( name="trans_fg", @@ -88,7 +100,8 @@ You can setup statistics monitoring on a ==single feature or multiple features== #### Statistics comparison -When enabling the comparison of statistics in a feature monitoring configuration, you need to specify a ==single feature== of your Feature Group. You can create multiple feature monitoring configurations for the same Feature Group, but each of them should point to a single feature in the Feature Group. +When enabling the comparison of statistics in a feature monitoring configuration, you need to specify a ==single feature== of your Feature Group. +You can create multiple feature monitoring configurations for the same Feature Group, but each of them should point to a single feature in the Feature Group. === "Python" @@ -102,7 +115,8 @@ When enabling the comparison of statistics in a feature monitoring configuration #### Custom schedule or percentage of window data -By default, the computation of statistics is scheduled to run endlessly, every day at 12PM. You can modify the default schedule by adjusting the `cron_expression`, `start_date_time` and `end_date_time` parameters. +By default, the computation of statistics is scheduled to run endlessly, every day at 12PM. +You can modify the default schedule by adjusting the `cron_expression`, `start_date_time` and `end_date_time` parameters. === "Python" @@ -110,7 +124,7 @@ By default, the computation of statistics is scheduled to run endlessly, every d fg_monitoring_config = trans_fg.create_statistics_monitoring( name="trans_fg_all_features_monitoring", description="Compute statistics on all data of all features of the Feature Group on a weekly basis", - cron_expression="0 0 12 ? * MON *", # weekly + cron_expression="0 0 12 ? * MON *", # weekly row_percentage=0.8, # use 80% of the data ) @@ -119,14 +133,16 @@ By default, the computation of statistics is scheduled to run endlessly, every d name="trans_fg_amount_monitoring", feature_name="amount", description="Compute descriptive statistics on the amount Feature of the Feature Group on a weekly basis", - cron_expression="0 0 12 ? * MON *", # weekly + cron_expression="0 0 12 ? * MON *", # weekly row_percentage=0.8, # use 80% of the data ) ``` ### Step 3: (Optional) Define a detection window -By default, the detection window is an _expanding window_ covering the whole Feature Group data. You can define a different detection window using the `window_length` and `time_offset` parameters provided in the `with_detection_window` method. Additionally, you can specify the percentage of feature data on which statistics will be computed using the `row_percentage` parameter. +By default, the detection window is an _expanding window_ covering the whole Feature Group data. +You can define a different detection window using the `window_length` and `time_offset` parameters provided in the `with_detection_window` method. +Additionally, you can specify the percentage of feature data on which statistics will be computed using the `row_percentage` parameter. 
=== "Python" @@ -160,13 +176,15 @@ When setting up feature monitoring for a Feature Group, reference windows can be ### Step 5: (Optional) Define the statistics comparison criteria -In order to compare detection and reference statistics, you need to provide the criteria for such comparison. First, you select the metric to consider in the comparison using the `metric` parameter. Then, you can define a relative or absolute threshold using the `threshold` and `relative` parameters. +In order to compare detection and reference statistics, you need to provide the criteria for such comparison. +First, you select the metric to consider in the comparison using the `metric` parameter. +Then, you can define a relative or absolute threshold using the `threshold` and `relative` parameters. === "Python" ```python3 fm_monitoring_config.compare_on( - metric="mean", + metric="mean", threshold=0.2, # a relative change over 20% is considered anomalous relative=True, # relative or absolute change strict=False, # strict or relaxed comparison @@ -176,10 +194,10 @@ In order to compare detection and reference statistics, you need to provide the !!! info "Difference values and thresholds" For more information about the computation of difference values and the comparison against threshold bounds see the [Comparison criteria section](../feature_monitoring/statistics_comparison.md#comparison-criteria) in the Statistics comparison guide. - ### Step 6: Save configuration -Finally, you can save your feature monitoring configuration by calling the `save` method. Once the configuration is saved, the schedule for the statistics computation and comparison will be activated automatically. +Finally, you can save your feature monitoring configuration by calling the `save` method. +Once the configuration is saved, the schedule for the statistics computation and comparison will be activated automatically. === "Python" diff --git a/docs/user_guides/fs/feature_group/index.md b/docs/user_guides/fs/feature_group/index.md index f08e15f26..9f87c47cf 100644 --- a/docs/user_guides/fs/feature_group/index.md +++ b/docs/user_guides/fs/feature_group/index.md @@ -8,4 +8,4 @@ This section serves to provide guides and examples for the common usage of abstr - [Data Types and Schema management](data_types.md) - [Statistics](statistics.md) - [Data Validation](data_validation.md) -- [Feature Monitoring](feature_monitoring.md) \ No newline at end of file +- [Feature Monitoring](feature_monitoring.md) diff --git a/docs/user_guides/fs/feature_group/notification.md b/docs/user_guides/fs/feature_group/notification.md index 5ee2091ec..27402ed39 100644 --- a/docs/user_guides/fs/feature_group/notification.md +++ b/docs/user_guides/fs/feature_group/notification.md @@ -18,7 +18,7 @@ Subsequently [create a Kafka topic](../../projects/kafka/create_topic.md), this ## Using HSFS APIs -### Create a Feature Group with Change Data Capture +### Create a Feature Group with Change Data Capture using Python To enable Change Data Capture for an online-enabled feature group using the HSFS APIs you need to [create a feature group](./create.md) and set the `notification_topic_name` properties value to the previously created topic. 
@@ -33,7 +33,7 @@ To enable Change Data Capture for an online-enabled feature group using the HSFS notification_topic_name="notification_topic_name") ``` -### Update Feature Group Change Data Capture topic +### Update Feature Group with Change Data Capture topic using Python The notification topic name can be changed after the creation of the feature group. By setting the `notification_topic_name` value to `None` or empty string notification will be disabled. @@ -48,7 +48,7 @@ With the default configuration, it can take up to 30 minutes for these changes t ## Using UI -### Create a Feature Group with Change Data Capture +### Create a Feature Group with Change Data Capture using UI During the creation of the feature group enable online feature serving. When enabled you will be able to set the `CDC topic name` property. @@ -59,7 +59,7 @@ When enabled you will be able to set the `CDC topic name` property.

-### Update Feature Group with Change Data Capture topic +### Update Feature Group with Change Data Capture topic using UI The notification topic name can be changed after creation by editing the feature group. By setting the `CDC topic name` value to empty the notifications will be disabled. @@ -77,25 +77,25 @@ Once properly set up the online feature store service will produce events to the Here is an example output: -``` +```jsonc { - "projectName":"project_name", # name of the project the feature group belongs to - "projectId":119, # id of the project the feature group belongs to - "featureStoreId":67, # feature store where changes took place - "featureGroupId":14, # id of the feature group - "featureGroupName":"fg_name", # name of the feature group - "featureGroupVersion":1, # version of the feature group - "entry":{ # values of the affected feature group entry + "projectName":"project_name", // name of the project the feature group belongs to + "projectId":119, // id of the project the feature group belongs to + "featureStoreId":67, // feature store where changes took place + "featureGroupId":14, // id of the feature group + "featureGroupName":"fg_name", // name of the feature group + "featureGroupVersion":1, // version of the feature group + "entry":{ // values of the affected feature group entry "id":"15", "text":"test" }, - "featureViews":[ # list of feature views affected + "featureViews":[ // list of feature views affected { - "projectName":"project_name", # name of the project the feature view belongs to - "id":9, # id of the feature view - "name":"test", # name of the feature view - "version":1, # version of the feature view - "featurestoreId":67 # feature store where feature view resides + "projectName":"project_name", // name of the project the feature view belongs to + "id":9, // id of the feature view + "name":"test", // name of the feature view + "version":1, // version of the feature view + "featurestoreId":67 // feature store where feature view resides } ] } diff --git a/docs/user_guides/fs/feature_group/on_demand_transformations.md b/docs/user_guides/fs/feature_group/on_demand_transformations.md index d961e77a5..8fbd20652 100644 --- a/docs/user_guides/fs/feature_group/on_demand_transformations.md +++ b/docs/user_guides/fs/feature_group/on_demand_transformations.md @@ -1,18 +1,24 @@ # On-Demand Transformation Functions -[On-demand transformations](https://www.hopsworks.ai/dictionary/on-demand-transformation) produce on-demand features, which usually require parameters accessible during inference for their calculation. Hopsworks facilitates the creation of on-demand transformations without introducing [online-offline skew](https://www.hopsworks.ai/dictionary/online-offline-feature-skew), ensuring consistency while allowing their dynamic computation during online inference. +[On-demand transformations](https://www.hopsworks.ai/dictionary/on-demand-transformation) produce on-demand features, which usually require parameters accessible during inference for their calculation. +Hopsworks facilitates the creation of on-demand transformations without introducing [online-offline skew](https://www.hopsworks.ai/dictionary/online-offline-feature-skew), ensuring consistency while allowing their dynamic computation during online inference. ## On Demand Transformation Function Creation - -An on-demand transformation function may be created by associating a [transformation function](../transformation_functions.md) with a feature group. 
Each on-demand transformation function can generate one or multiple on-demand features. If the on-demand transformation function returns a single feature, it is automatically assigned the same name as the transformation function. However, if it returns multiple features, they are by default named using the format `functionName_outputColumnNumber`. For instance, in the example below, the on-demand transformation function `transaction_age` produces an on-demand feature named `transaction_age` and the on-demand transformation function `stripped_strings` produces the on-demand features names `stripped_strings_0` and `stripped_strings_1`. Alternatively, the name of the resulting on-demand feature can be explicitly defined using the [`alias`](../transformation_functions.md#specifying-output-features–names-for-transformation-functions) function. +An on-demand transformation function may be created by associating a [transformation function](../transformation_functions.md) with a feature group. +Each on-demand transformation function can generate one or multiple on-demand features. +If the on-demand transformation function returns a single feature, it is automatically assigned the same name as the transformation function. +However, if it returns multiple features, they are by default named using the format `functionName_outputColumnNumber`. +For instance, in the example below, the on-demand transformation function `transaction_age` produces an on-demand feature named `transaction_age` and the on-demand transformation function `stripped_strings` produces the on-demand features names `stripped_strings_0` and `stripped_strings_1`. +Alternatively, the name of the resulting on-demand feature can be explicitly defined using the [`alias`](../transformation_functions.md#specifying-output-features-names-for-transformation-functions) function. !!! warning "On-demand transformation" All on-demand transformation functions attached to a feature group must have unique names and, in contrast to model-dependent transformations, they do not have access to training dataset statistics. -Each on-demand transformation function can map specific features to its arguments by explicitly providing their names as arguments to the transformation function. If no feature names are provided, the transformation function will default to using features that match the name of the transformation function's argument. +Each on-demand transformation function can map specific features to its arguments by explicitly providing their names as arguments to the transformation function. +If no feature names are provided, the transformation function will default to using features that match the name of the transformation function's argument. -=== "Python" +=== "Python" !!! example "Creating on-demand transformation functions." ```python # Define transformation function @@ -35,12 +41,11 @@ Each on-demand transformation function can map specific features to its argument ) ``` - ### Specifying input features -The features to be used by the on-demand transformation function can be specified by providing the feature names as input to the transformation functions. +The features to be used by the on-demand transformation function can be specified by providing the feature names as input to the transformation functions. -=== "Python" +=== "Python" !!! example "Creating on-demand transformations by specifying features to be passed to transformation function." 
```python fg = feature_store.create_feature_group(name="fg_transactions", @@ -55,21 +60,24 @@ The features to be used by the on-demand transformation function can be specifie ## Usage -On-demand transformation functions attached to a feature group are automatically executed in the feature pipeline when you [insert data](../create/#batch-write-api) into a feature group and [by the Python client while retrieving feature vectors](../feature_view/feature-vectors.md#retrieval) for online inference using feature views that contain on-demand features. +On-demand transformation functions attached to a feature group are automatically executed in the feature pipeline when you [insert data](./create.md#batch-write-api) into a feature group and [by the Python client while retrieving feature vectors](../feature_view/feature-vectors.md#retrieval) for online inference using feature views that contain on-demand features. The on-demand features computed by on-demand transformation functions are positioned after all other features in a feature group and are ordered alphabetically by their names. ### Inserting data -All on-demand transformation functions attached to a feature group are executed whenever new data is inserted. This process computes on-demand features from historical data. The DataFrame used for insertion must include all features required for executing all on-demand transformation functions in the feature group. +All on-demand transformation functions attached to a feature group are executed whenever new data is inserted. +This process computes on-demand features from historical data. +The DataFrame used for insertion must include all features required for executing all on-demand transformation functions in the feature group. Inserting on-demand features as historical features saves time and computational resources by removing the need to compute all on-demand features while generating training or batch data. ### Accessing on-demand features in feature views -A feature view can include on-demand features from feature groups by selecting them in the [query](../feature_view/query.md) used to create the feature view. These on-demand features are equivalent to regular features, and [model-dependent transformations](../feature_view/model-dependent-transformations.md) can be applied to them if required. +A feature view can include on-demand features from feature groups by selecting them in the [query](../feature_view/query.md) used to create the feature view. +These on-demand features are equivalent to regular features, and [model-dependent transformations](../feature_view/model-dependent-transformations.md) can be applied to them if required. -=== "Python" +=== "Python" !!! example "Creating feature view with on-demand features" ```python @@ -89,18 +97,23 @@ A feature view can include on-demand features from feature groups by selecting t ### Computing on-demand features -On-demand features in the feature view are computed in real-time during online inference using the same on-demand transformation functions used to create them. Hopsworks, by default, automatically computes all on-demand features when retrieving feature view input features (feature vectors) with the functions `get_feature_vector` and `get_feature_vectors`. Additionally, on-demand features can be computed using the `compute_on_demand_features` function or by manually executing the same on-demand transformation function. 
+On-demand features in the feature view are computed in real-time during online inference using the same on-demand transformation functions used to create them. +Hopsworks, by default, automatically computes all on-demand features when retrieving feature view input features (feature vectors) with the functions `get_feature_vector` and `get_feature_vectors`. +Additionally, on-demand features can be computed using the `compute_on_demand_features` function or by manually executing the same on-demand transformation function. -The values for the input parameters required to compute on-demand features can be provided using the `request_parameters` argument. If values are not provided through the `request_parameters` argument, the transformation function will verify if the feature vector contains the necessary input parameters and will use those values instead. However, if the required input parameters are also not present in the feature vector, an error will be thrown. +The values for the input parameters required to compute on-demand features can be provided using the `request_parameters` argument. +If values are not provided through the `request_parameters` argument, the transformation function will verify if the feature vector contains the necessary input parameters and will use those values instead. +However, if the required input parameters are also not present in the feature vector, an error will be thrown. !!! note By default the functions `get_feature_vector` and `get_feature_vectors` will apply model-dependent transformation present in the feature view after computing on-demand features. #### Retrieving a feature vector -The `get_feature_vector` function retrieves a single feature vector based on the feature view's serving key(s). The on-demand features in the feature vector can be computed using real-time data by passing a dictionary that associates the name of each input parameter needed for the on-demand transformation function with its respective new value to the `request_parameter` argument. +The `get_feature_vector` function retrieves a single feature vector based on the feature view's serving key(s). +The on-demand features in the feature vector can be computed using real-time data by passing a dictionary that associates the name of each input parameter needed for the on-demand transformation function with its respective new value to the `request_parameter` argument. -=== "Python" +=== "Python" !!! example "Computing on-demand features while retrieving a feature vector" ```python feature_vector = feature_view.get_feature_vector( @@ -114,9 +127,10 @@ The `get_feature_vector` function retrieves a single feature vector based on the #### Retrieving feature vectors -The `get_feature_vectors` function retrieves multiple feature vectors using a list of feature view serving keys. The `request_parameter` in this case, can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each serving key or can be a dictionary if the on-demand transformations require the same parameters for all serving keys. +The `get_feature_vectors` function retrieves multiple feature vectors using a list of feature view serving keys. +The `request_parameter` in this case, can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each serving key or can be a dictionary if the on-demand transformations require the same parameters for all serving keys. -=== "Python" +=== "Python" !!! 
example "Computing on-demand features while retrieving a feature vectors" ```python # Specify unique request parameters for each serving key. @@ -146,9 +160,10 @@ The `get_feature_vectors` function retrieves multiple feature vectors using a li #### Retrieving feature vector without on-demand features -The `get_feature_vector` and `get_feature_vectors` methods can return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features. To achieve this, set the parameters `transform` and `on_demand_features` to `False`. +The `get_feature_vector` and `get_feature_vectors` methods can return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features. +To achieve this, set the parameters `transform` and `on_demand_features` to `False`. -=== "Python" +=== "Python" !!! example "Returning untransformed feature vectors" ```python untransformed_feature_vector = feature_view.get_feature_vector( @@ -161,11 +176,13 @@ The `get_feature_vector` and `get_feature_vectors` methods can return untransfor #### Compute all on-demand features -The `compute_on_demand_features` function computes all on-demand features attached to a feature view and adds them to the feature vectors provided as input to the function. This function does not apply model-dependent transformations to any of the features. The `transform` function can be used to apply model-dependent transformations to the returned values if required. +The `compute_on_demand_features` function computes all on-demand features attached to a feature view and adds them to the feature vectors provided as input to the function. +This function does not apply model-dependent transformations to any of the features. +The `transform` function can be used to apply model-dependent transformations to the returned values if required. The `request_parameter` in this case, can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each feature vector given as input to the function or can be a dictionary if the on-demand transformations require the same parameters for all input feature vectors. -=== "Python" +=== "Python" !!! example "Computing all on-demand features and manually applying model dependent transformations." ```python # Specify request parameters for each serving key. @@ -223,7 +240,7 @@ The `request_parameter` in this case, can be a list of dictionaries that specifi On-demand transformation functions can also be accessed and executed as normal functions by using the dictionary `on_demand_transformations` that maps the on-demand features to their corresponding on-demand transformation function. -=== "Python" +=== "Python" !!! example "Executing each on-demand transformation function" ```python # Specify request parameters for each serving key. 
@@ -236,4 +253,4 @@ On-demand transformation functions can also be accessed and executed as normal f "on_demand_feature1" ](feature_vector["transaction_time"], datetime.now()) - ``` \ No newline at end of file + ``` diff --git a/docs/user_guides/fs/feature_group/online_ingestion_observability.md b/docs/user_guides/fs/feature_group/online_ingestion_observability.md index 63eac8d8a..08305c6fe 100644 --- a/docs/user_guides/fs/feature_group/online_ingestion_observability.md +++ b/docs/user_guides/fs/feature_group/online_ingestion_observability.md @@ -6,7 +6,8 @@ description: Documentation on Online ingestion observability in Hopsworks. ## Introduction -Knowing when ingested data becomes available for online serving—and understanding the cause of any ingestion failures—is crucial for users. To address this, the Hopsworks API provides observability features for online ingestion, allowing you to monitor ingestion status and troubleshoot issues. +Knowing when ingested data becomes available for online serving—and understanding the cause of any ingestion failures—is crucial for users. +To address this, the Hopsworks API provides observability features for online ingestion, allowing you to monitor ingestion status and troubleshoot issues. This guide explains how to use these observability features for online feature groups in Hopsworks, with examples using both the HSFS APIs and the user interface. @@ -37,7 +38,7 @@ First, create an online-enabled feature group and insert data into it: After inserting data, you can monitor the ingestion progress: -**Get the latest ingestion instance** +#### Get the latest ingestion instance === "Python" @@ -45,7 +46,7 @@ After inserting data, you can monitor the ingestion progress: oi = fg.get_latest_online_ingestion() ``` -**Get a specific ingestion by its ID** +#### Get a specific ingestion by its ID === "Python" @@ -57,7 +58,7 @@ After inserting data, you can monitor the ingestion progress: The online ingestion object provides methods to track and debug the ingestion process: -**Wait for completion** +#### Wait for completion Wait for the online ingestion to finish (equivalent to `fg.insert(fg_df, wait=True)`): @@ -67,9 +68,10 @@ Wait for the online ingestion to finish (equivalent to `fg.insert(fg_df, wait=Tr oi.wait_for_completion() ``` -**Print mini-batch results** +#### Print mini-batch results -Check the results of the ingestion. If the status is `UPSERTED` and the number of rows matches your data, the ingestion was successful: +Check the results of the ingestion. +If the status is `UPSERTED` and the number of rows matches your data, the ingestion was successful: === "Python" @@ -78,7 +80,7 @@ Check the results of the ingestion. If the status is `UPSERTED` and the number o # Example output: [{'onlineIngestionId': 1, 'status': 'UPSERTED', 'rows': 10}] ``` -**Print ingestion service logs** +#### Print ingestion service logs Retrieve logs from the online ingestion service to diagnose any issues: diff --git a/docs/user_guides/fs/feature_group/statistics.md b/docs/user_guides/fs/feature_group/statistics.md index 681e9521b..a2543a044 100644 --- a/docs/user_guides/fs/feature_group/statistics.md +++ b/docs/user_guides/fs/feature_group/statistics.md @@ -1,33 +1,45 @@ # How to compute statistics on feature data -### Introduction +## Introduction -In this guide you will learn how to configure, compute and visualize statistics for the features registered with Hopsworks. 
+In this guide you will learn how to configure, compute and visualize statistics for the features registered with Hopsworks. Hopsworks groups statistics in four categories: -* **Descriptive**: These are the basic statistics Hopsworks computes. They include an _approximate_ count of the distinctive values and the completeness (i.e. the percentage of non null values). For numerical features Hopsworks also computes the minimum, maximum, mean, standard deviation and the sum of each feature. Enabled by default. +- **Descriptive**: These are the basic statistics Hopsworks computes. + They include an _approximate_ count of the distinctive values and the completeness (i.e., the percentage of non null values). + For numerical features Hopsworks also computes the minimum, maximum, mean, standard deviation and the sum of each feature. + Enabled by default. -* **Histograms**: Hopsworks computes the distribution of the values of a feature. Exact histograms are computed as long as the number of distinct values is less than 20. If a feature has a numerical data type (e.g. integer, float, double, ...) and has more than 20 unique values, then the values are bucketed in 20 buckets and the histogram represents the distribution of values in those buckets. By default histograms are disabled. +- **Histograms**: Hopsworks computes the distribution of the values of a feature. + Exact histograms are computed as long as the number of distinct values is less than 20. If a feature has a numerical data type (e.g., integer, float, double, ...) and has more than 20 unique values, then the values are bucketed in 20 buckets and the histogram represents the distribution of values in those buckets. + By default histograms are disabled. -* **Correlation**: If enabled, Hopsworks computes the Pearson correlation between features of numerical data type within a feature group. By default correlation is disabled. +- **Correlation**: If enabled, Hopsworks computes the Pearson correlation between features of numerical data type within a feature group. + By default correlation is disabled. -* **Exact Statistics**: Exact statistics are an enhancement of the descriptive statistics that provide an exact count of distinctive values, entropy, uniqueness and distinctiveness of the value of a feature. These statistics are more expensive to compute as they take into consideration all the values and they don't use approximations. By default they are disabled. +- **Exact Statistics**: Exact statistics are an enhancement of the descriptive statistics that provide an exact count of distinctive values, entropy, uniqueness and distinctiveness of the value of a feature. + These statistics are more expensive to compute as they take into consideration all the values and they don't use approximations. + By default they are disabled. -When statistics are enabled, they are computed every time new data is written into the *offline* storage of a feature group. Statistics are then displayed on the Hopsworks UI and users can track how data has changed over time. +When statistics are enabled, they are computed every time new data is written into the _offline_ storage of a feature group. +Statistics are then displayed on the Hopsworks UI and users can track how data has changed over time. ## Prerequisites -Before you begin this guide we suggest you read the [Feature Group](../../../concepts/fs/feature_group/fg_overview.md) concept page to understand what a feature group is and how it fits in the ML pipeline. 
+Before you begin this guide we suggest you read the [Feature Group](../../../concepts/fs/feature_group/fg_overview.md) concept page to understand what a feature group is and how it fits in the ML pipeline. We also suggest you familiarize with the APIs to [create a feature group](./create.md). -## Enable statistics when creating a feature group +## Enable statistics when creating a feature group -As mentioned above, by default only descriptive statistics are enabled when creating a feature group. To enable histograms, correlations or exact statistics the `statistics_config` configuration parameter can be provided in the create statement. +As mentioned above, by default only descriptive statistics are enabled when creating a feature group. +To enable histograms, correlations or exact statistics the `statistics_config` configuration parameter can be provided in the create statement. The `statistics_config` parameter takes a dictionary with the keys: `enabled`, `correlations`, `histograms` and `exact_uniqueness` and, as values, a boolean to describe whether or not to compute the specific class of statistics. -Additionally it is possible to restrict the statistics computation to only a subset of columns. This is configurable by adding a `columns` key to the `statistics_config` parameter. The key should contain the list of columns for which to compute statistics. +Additionally it is possible to restrict the statistics computation to only a subset of columns. +This is configurable by adding a `columns` key to the `statistics_config` parameter. +The key should contain the list of columns for which to compute statistics. By default the value is empty list `[]` and the statistics are computed for all columns in the feature group. === "Python" @@ -52,31 +64,36 @@ By default the value is empty list `[]` and the statistics are computed for all ## Enable statistics after creating a feature group -It is possible users to change the statistics configuration after a feature group was created. Either to add or remove a class of statistics, or to change the set of features for which to compute statistics. +It is possible users to change the statistics configuration after a feature group was created. +Either to add or remove a class of statistics, or to change the set of features for which to compute statistics. === "Python" ```python fg.statistics_config = { - "enabled": True, - "histograms": False, - "correlations": False, - "exact_uniqueness": False - "columns": ['location_id', 'min_temp', 'max_temp'] - } + "enabled": True, + "histograms": False, + "correlations": False, + "exact_uniqueness": False, + "columns": ['location_id', 'min_temp', 'max_temp'] + } fg.update_statistics_config() ``` ## Explicitly compute statistics -As mentioned above, the statistics are computed every time new data is written into the *offline* storage of a feature group. By invoking the `compute_statistics` method, users can trigger explicitly the statistics computation for the data available in a feature group. +As mentioned above, the statistics are computed every time new data is written into the _offline_ storage of a feature group. +By invoking the `compute_statistics` method, users can trigger explicitly the statistics computation for the data available in a feature group. -This is useful when a feature group is receiving frequent updates. Users can schedule periodic statistics computation that take into consideration several data commits. +This is useful when a feature group is receiving frequent updates. 
+Users can schedule periodic statistics computation that take into consideration several data commits. -By default, the `compute_statistics` method computes statistics on the most recent version of the data available in a feature group. Users can provide a specific time using the `wallclock_time` parameter, to compute the statistics for a previous version of the data. +By default, the `compute_statistics` method computes statistics on the most recent version of the data available in a feature group. +Users can provide a specific time using the `wallclock_time` parameter, to compute the statistics for a previous version of the data. -Hopsworks can compute statistics of external feature groups. As external feature groups are read only from an Hopsworks perspective, statistics computation can be triggered using the `compute_statistics` method. +Hopsworks can compute statistics of external feature groups. +As external feature groups are read only from an Hopsworks perspective, statistics computation can be triggered using the `compute_statistics` method. === "Python" @@ -84,6 +101,6 @@ Hopsworks can compute statistics of external feature groups. As external feature fg.compute_statistics(wallclock_time='20220611 20:00') ``` -## Inspect statistics +## Inspect statistics You can also create a new feature group through the UI. diff --git a/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md b/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md index 693044489..d820d1ef8 100644 --- a/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md +++ b/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md @@ -1,16 +1,17 @@ # Advanced guide -An introduction to Feature Monitoring can be found in the guides for [Feature Groups](../feature_group/feature_monitoring.md) and [Feature Views](../feature_view/feature_monitoring.md). In addition, you can get started quickly by running our [tutorial for feature monitoring](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/feature_monitoring.ipynb). +An introduction to Feature Monitoring can be found in the guides for [Feature Groups](../feature_group/feature_monitoring.md) and [Feature Views](../feature_view/feature_monitoring.md). +In addition, you can get started quickly by running our [tutorial for feature monitoring](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/feature_monitoring.ipynb). ## Retrieve feature monitoring configurations -### From UI +### Retrieve feature monitoring configurations from UI -An overview of all feature monitoring configurations is listed in the ^^Feature Monitoring^^ section in the Feature Group and Feature View overview page. +An overview of all feature monitoring configurations is listed in the ^^Feature Monitoring^^ section in the Feature Group and Feature View overview page. -### From Python client +### Retrieve feature monitoring configurations from Python -You can retrieve one or more feature monitoring configurations from the Feature Group and Feature View Python objects and filter them by name, configuration id or feature name. +You can retrieve one or more feature monitoring configurations from the Feature Group and Feature View Python objects and filter them by name, configuration id or feature name. 
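The retrieval example itself is elided from this hunk; a hedged sketch of what such a call might look like is shown below. The accessor name `get_feature_monitoring_configs` and its filter arguments are assumptions inferred from the sentence above, so confirm them in the HSFS API reference.

```python
# Retrieve every feature monitoring configuration attached to the feature group.
all_configs = trans_fg.get_feature_monitoring_configs()  # assumed accessor name

# Or filter by configuration name, feature name, or configuration id.
by_name = trans_fg.get_feature_monitoring_configs(name="trans_fg_amount_monitoring")
by_feature = trans_fg.get_feature_monitoring_configs(feature_name="amount")
by_id = trans_fg.get_feature_monitoring_configs(config_id=1)
```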
=== "Python" @@ -39,13 +40,13 @@ You can retrieve one or more feature monitoring configurations from the Feature You can enable or disable feature monitoring while keeping the historical statistics and comparison results. -### From UI +### Disable feature monitoring from UI In the overview page for feature monitoring, you can enable or disable a specific configuration by clicking on the ^^Disable^^ button. ![Disable button in a feature monitoring configuration](../../../assets/images/guides/fs/feature_monitoring/fm-config-disable-arrow.png) -### From Python client +### Disable feature monitoring from Python You can easily enable or disable a specific feature monitoring configuration using the Python object. @@ -63,7 +64,7 @@ You can easily enable or disable a specific feature monitoring configuration usi You can trigger the feature monitoring job on demand, to compute and compare statistics on the detection and reference windows according to the feature monitoring configuration. -### From UI +### Run the statistics comparison manually from UI In the overview page for feature monitoring, you can trigger the computation and comparison of statistics for a specific configuration by clicking on the ^^Run once^^ button. @@ -72,7 +73,7 @@ In the overview page for feature monitoring, you can trigger the computation and ![Run once button in a feature monitoring configuration](../../../assets/images/guides/fs/feature_monitoring/fm-config-run-once-arrow.png) -### From Python client +### Run the statistics comparison manually from Python To trigger the feature monitoring job once from the Python API, use the feature monitoring Python object as shown in the example below. @@ -85,13 +86,14 @@ To trigger the feature monitoring job once from the Python API, use the feature ## Get feature monitoring results -### From UI +### Get feature monitoring results from UI -The easiest way to explore the statistics and comparison results is using the Hopsworks ==interactive graph== for Feature Monitoring. See more information on the [Interactive graph guide](interactive_graph.md). +The easiest way to explore the statistics and comparison results is using the Hopsworks ==interactive graph== for Feature Monitoring. +See more information on the [Interactive graph guide](interactive_graph.md). ![Visualize statistics on a time series](../../../assets/images/guides/fs/feature_monitoring/fm-reference-plot.png) -### From Python client +### Get feature monitoring results from Python Alternatively, you can retrieve all the statistics and comparison results using the feature monitoring configuration Python object as shown in the example below. @@ -112,7 +114,7 @@ Alternatively, you can retrieve all the statistics and comparison results using Deleting a feature monitoring configuration also deletes the historical statistics and comparison results attached to this configuration. -### From Python client +### Delete a feature monitoring configuration from Python You can delete feature monitoring configurations using the Python API only, as shown in the example below. diff --git a/docs/user_guides/fs/feature_monitoring/index.md b/docs/user_guides/fs/feature_monitoring/index.md index d38e8c44f..1916ec18e 100644 --- a/docs/user_guides/fs/feature_monitoring/index.md +++ b/docs/user_guides/fs/feature_monitoring/index.md @@ -2,14 +2,17 @@ ## Introduction -Feature Monitoring complements the Hopsworks data validation capabilities by allowing -you to monitor your data once they have been ingested into the Feature Store. 
Hopsworks -feature monitoring user interface is centered around two functionalities: +Feature Monitoring complements the Hopsworks data validation capabilities by allowing you to monitor your data once they have been ingested into the Feature Store. +Hopsworks feature monitoring user interface is centered around two functionalities: -- **Scheduled Statistics**: The user defines a _detection window_ over its data for which Hopsworks will compute the statistics on a regular basis. The results are stored -in Hopsworks and enable the user to visualise the temporal evolution of statistical metrics on its data. This can be enabled for a whole Feature Group or Feature View, or for a particular Feature. For more details, see the [Scheduled statistics guide](scheduled_statistics.md). +- **Scheduled Statistics**: The user defines a _detection window_ over its data for which Hopsworks will compute the statistics on a regular basis. + The results are stored in Hopsworks and enable the user to visualise the temporal evolution of statistical metrics on its data. + This can be enabled for a whole Feature Group or Feature View, or for a particular Feature. + For more details, see the [Scheduled statistics guide](scheduled_statistics.md). -- **Statistics Comparison**: Enabled only for individual features, this variant allows the user to schedule the statistics computation on both a _detection_ and a _reference window_. By providing information about how to compare those statistics, you can setup alerts to quickly detect critical change in the data. For more details, see the [Statistics comparison guide](statistics_comparison.md). +- **Statistics Comparison**: Enabled only for individual features, this variant allows the user to schedule the statistics computation on both a _detection_ and a _reference window_. + By providing information about how to compare those statistics, you can setup alerts to quickly detect critical change in the data. + For more details, see the [Statistics comparison guide](statistics_comparison.md). !!! important To enable feature monitoring in Hopsworks, you need to set the `enable_feature_monitoring` [configuration option](../../../setup_installation/admin/variables.md) to `true`. @@ -22,29 +25,34 @@ in Hopsworks and enable the user to visualise the temporal evolution of statisti ## Statistics computation on windows of feature data -Statistics are computed on feature data defined by windows. There are different types of windows depending on how they evolve over time. A window can have either a *fixed* length (e.g., static window) or *variable* length (e.g., expanding window). Moreover, windows can stick to a *specific point in time* (e.g., static window) or *move* over time (e.g., sliding or rolling window). +Statistics are computed on feature data defined by windows. +There are different types of windows depending on how they evolve over time. +A window can have either a _fixed_ length (e.g., static window) or _variable_ length (e.g., expanding window). +Moreover, windows can stick to a _specific point in time_ (e.g., static window) or _move_ over time (e.g., sliding or rolling window). ![Types of windows](../../../assets/images/guides/fs/feature_monitoring/fm-types-of-windows.png) -!!! info "Specific values" +!!! info "Specific values" A specific value can be seen as a window of length 1 where the start and end of the window have the same value. -These types of windows apply to both *detection* and *reference* windows. 
Different types of windows allows for different use cases depending on whether you enable feature monitoring on your Feature Groups or Feature Views. +These types of windows apply to both _detection_ and _reference_ windows. +Different types of windows allows for different use cases depending on whether you enable feature monitoring on your Feature Groups or Feature Views. -See more details about *detection* and *reference* windows in the [Detection windows](scheduled_statistics.md#detection-windows) and [Reference windows](statistics_comparison#reference-windows) guides. +See more details about _detection_ and _reference_ windows in the [Detection windows](./scheduled_statistics.md#detection-windows) and [Reference windows](./statistics_comparison.md#reference-windows) guides. ## Visualize statistics on a time series -Hopsworks provides an interactive graph to make the exploration of statistics and statistics comparison results more efficient and help you find unexpected trends or anomalous values faster. See the [Interactive graph guide](interactive_graph.md) for more information. +Hopsworks provides an interactive graph to make the exploration of statistics and statistics comparison results more efficient and help you find unexpected trends or anomalous values faster. +See the [Interactive graph guide](interactive_graph.md) for more information. ![Feature monitoring graph](../../../assets/images/guides/fs/feature_monitoring/fm-show-shifted-points-arrow.png) - ## Alerting -Moreover, feature monitoring integrates with the Hopsworks built-in system for [alerts](../../../setup_installation/admin/alert.md), enabling you to setup alerts that will notify you as soon as shift is detected in your feature values. You can setup alerts for feature monitoring at a Feature Group, Feature View, and project level. +Moreover, feature monitoring integrates with the Hopsworks built-in system for [alerts](../../../setup_installation/admin/alert.md), enabling you to setup alerts that will notify you as soon as shift is detected in your feature values. +You can setup alerts for feature monitoring at a Feature Group, Feature View, and project level. -!!! tip "Select the correct trigger" +!!! tip "Select the correct trigger" When configuring alerts for feature monitoring, make sure you select the `feature monitoring-shift detected` or `feature monitoring-shift undetected` trigger. ![Feature monitoring alerts](../../../assets/images/guides/fs/feature_monitoring/fm-alerts.png) diff --git a/docs/user_guides/fs/feature_monitoring/interactive_graph.md b/docs/user_guides/fs/feature_monitoring/interactive_graph.md index d671347ff..be20dc941 100644 --- a/docs/user_guides/fs/feature_monitoring/interactive_graph.md +++ b/docs/user_guides/fs/feature_monitoring/interactive_graph.md @@ -2,19 +2,22 @@ Hopsworks provides an *interactive graph* to help you explore the statistics com ### Select a feature monitoring configuration -First, you need to select a feature monitoring configuration to visualize. You can achieve that by clicking on the dropdown menu under ^^Feature Selection^^ on the controls menu. +First, you need to select a feature monitoring configuration to visualize. +You can achieve that by clicking on the dropdown menu under ^^Feature Selection^^ on the controls menu. 
![Select feature monitoring config](../../../assets/images/guides/fs/feature_monitoring/fm-select-config-arrow.png) ### Select a statistics metric to visualize -When you select a feature monitoring configuration, the mean values computed over time are visualized by default on the time series graph. You can choose a different statistics metric on the dropdown menu under ^^Statistics Selection^^ on the controls menu. +When you select a feature monitoring configuration, the mean values computed over time are visualized by default on the time series graph. +You can choose a different statistics metric on the dropdown menu under ^^Statistics Selection^^ on the controls menu. ![Select statistics metric](../../../assets/images/guides/fs/feature_monitoring/fm-select-metric-arrow.png) ### Visualize multiple configurations simultaneously -Multiple feature monitoring configurations can be visualized at the same time on the graph. You can add a feature monitoring configuration by clicking on the ==+== button on the controls menu. +Multiple feature monitoring configurations can be visualized at the same time on the graph. +You can add a feature monitoring configuration by clicking on the ==+== button on the controls menu. !!! note The same statistics metric will be visualized for every feature monitoring configuration selected. @@ -23,7 +26,8 @@ Multiple feature monitoring configurations can be visualized at the same time on ### Show reference statistics -In feature monitoring configurations with reference windows, you can also visualize the reference values by enabling the ^^Reference values^^ checkbox on the controls menu. Reference values can be either statistics computed over time or a specific value shown as an horizontal line. +In feature monitoring configurations with reference windows, you can also visualize the reference values by enabling the ^^Reference values^^ checkbox on the controls menu. +Reference values can be either statistics computed over time or a specific value shown as an horizontal line. !!! note The same statistics metric will be visualized for both detection and reference values. @@ -35,33 +39,37 @@ In feature monitoring configurations with reference windows, you can also visual ### Show threshold bounds -In addition to reference windows, you can define thresholds to automate the identification of data points as anomalous values. A threshold can be absolute, or relative to the statistics values under comparison. You can visualize the threshold bounds by enabling the ^^Threshold bounds^^ checkbox on the controls menu. +In addition to reference windows, you can define thresholds to automate the identification of data points as anomalous values. +A threshold can be absolute, or relative to the statistics values under comparison. +You can visualize the threshold bounds by enabling the ^^Threshold bounds^^ checkbox on the controls menu. ![Show threshold bounds](../../../assets/images/guides/fs/feature_monitoring/fm-show-threshold-arrow.png) !!! info More details about statistics comparison options can be found in [Comparison criteria](statistics_comparison.md#comparison-criteria). - ### Highlight shifted data points -If a reference window and threshold are provided, data points that fall out of the threshold bounds are considered anomalous values. You can highlight these data points by enabling the ^^Shift detected^^ checkbox on the controls menu. +If a reference window and threshold are provided, data points that fall out of the threshold bounds are considered anomalous values. 
+You can highlight these data points by enabling the ^^Shift detected^^ checkbox on the controls menu. ![Highlight shifted data points](../../../assets/images/guides/fs/feature_monitoring/fm-show-shifted-points-arrow.png) ### Visualize the computed differences between statistics -Alternatively, you can change the time series to show the differences computed between detection and reference statistics rather than the statistics values themselves. You can achieve that by enabling the ^^Difference^^ checkbox on the controls menu. +Alternatively, you can change the time series to show the differences computed between detection and reference statistics rather than the statistics values themselves. +You can achieve that by enabling the ^^Difference^^ checkbox on the controls menu. ![Show difference between statistics](../../../assets/images/guides/fs/feature_monitoring/fm-show-diff-arrow.png) ### List of configurations -Following the time series graph, you can find an overview of all feature monitoring configurations defined for the corresponding Feature Group or Feature View. This overview includes a summary of the detection and reference windows, statistics comparison criteria and job schedule. +Following the time series graph, you can find an overview of all feature monitoring configurations defined for the corresponding Feature Group or Feature View. +This overview includes a summary of the detection and reference windows, statistics comparison criteria and job schedule. In addition, you can trigger the statistics comparison manually, or disable the schedule of the feature monitoring job by clicking on ^^Run once^^ or ^^Disable^^ buttons, respectively. !!! note Triggering the statistics comparison manually does not affect the schedule of the feature monitoring. -![List of feature monitoring configs](../../../assets/images/guides/fs/feature_monitoring/fm-list-configs.png) \ No newline at end of file +![List of feature monitoring configs](../../../assets/images/guides/fs/feature_monitoring/fm-list-configs.png) diff --git a/docs/user_guides/fs/feature_monitoring/scheduled_statistics.md b/docs/user_guides/fs/feature_monitoring/scheduled_statistics.md index 8e5cee4a8..b739be5d5 100644 --- a/docs/user_guides/fs/feature_monitoring/scheduled_statistics.md +++ b/docs/user_guides/fs/feature_monitoring/scheduled_statistics.md @@ -1,6 +1,8 @@ -Hopsworks scheduled statistics allows you to monitor your feature data once they have been ingested into the Feature Store. You can define a ==detection window== over your data for which Hopsworks will compute the statistics on a regular basis. Statistics can be computed on all or a subset of feature values, and on one or more features simultaneously. +Hopsworks scheduled statistics allows you to monitor your feature data once they have been ingested into the Feature Store. +You can define a ==detection window== over your data for which Hopsworks will compute the statistics on a regular basis. +Statistics can be computed on all or a subset of feature values, and on one or more features simultaneously. -Hopsworks stores the computed statistics and enable you to visualise the temporal evolution of statistical metrics on your data. +Hopsworks stores the computed statistics and enable you to visualise the temporal evolution of statistical metrics on your data. 
![Detection statistics visualization](../../../assets/images/guides/fs/feature_monitoring/fm-detection-plot.png) @@ -9,7 +11,8 @@ Hopsworks stores the computed statistics and enable you to visualise the tempora ## Use cases -Scheduled statistics monitoring is a powerful tool that allows you to monitor your data over time and detect anomalies in your feature data at a glance by visualizing the evolution of the statistics properties of your data in a time series. It can be enabled in both Feature Groups and Feature Views, but for different purposes. +Scheduled statistics monitoring is a powerful tool that allows you to monitor your data over time and detect anomalies in your feature data at a glance by visualizing the evolution of the statistics properties of your data in a time series. +It can be enabled in both Feature Groups and Feature Views, but for different purposes. For **Feature Groups**, scheduled statistics enables you to analyze how your Feature Group data evolve over time, and leverage your intuition to identify trends or detect noisy values in the inserted feature data. @@ -17,16 +20,21 @@ For **Feature Views**, scheduled statistics enables you to analyze the statistic ## Detection windows -Statistics are computed in a scheduled basis on a pre-defined detection window of feature data. Detection windows can be defined on the whole feature data or a subset of feature data depending on the `time_offset` and `window_length` parameters of the `with_detection_window` method. +Statistics are computed on a scheduled basis on a pre-defined detection window of feature data. +Detection windows can be defined on the whole feature data or a subset of feature data depending on the `time_offset` and `window_length` parameters of the `with_detection_window` method. ![Types of detection windows](../../../assets/images/guides/fs/feature_monitoring/fm-detection-windows.png) -In [a previous section](index.md#statistics-computation-on-windows-of-feature-data) we described different types of windows available. Taking a Feature Group as an example, the figure above describes how these windows are applied to Feature Group data, resulting in three different applications: +In [a previous section](index.md#statistics-computation-on-windows-of-feature-data) we described the different types of windows available. +Taking a Feature Group as an example, the figure above describes how these windows are applied to Feature Group data, resulting in three different applications: -- A _expanding window_ covering the whole Feature Group data from its creation until the time when statistics are computing. It can be seen as an snapshot of the **latest version of your feature data**. -- A _rolling window_ covering a variable subset of feature data (e.g., feature data written last week). It helps you analyze the properties of **newly inserted feature data**. +- An _expanding window_ covering the whole Feature Group data from its creation until the time when statistics are computed. + It can be seen as a snapshot of the **latest version of your feature data**. +- A _rolling window_ covering a variable subset of feature data (e.g., feature data written last week). + It helps you analyze the properties of **newly inserted feature data**. See more details on how to define a detection window for your Feature Groups and Feature Views in the Feature Monitoring Guides for [Feature Groups](../feature_group/feature_monitoring.md) and [Feature Views](../feature_view/feature_monitoring.md). -!!! 
info "Next steps" - You can also define a reference window to be used as a baseline to compare against the detection window. See more details in the [Statistics comparison guide](statistics_comparison.md). \ No newline at end of file +!!! info "Next steps" + You can also define a reference window to be used as a baseline to compare against the detection window. + See more details in the [Statistics comparison guide](statistics_comparison.md). diff --git a/docs/user_guides/fs/feature_monitoring/statistics_comparison.md b/docs/user_guides/fs/feature_monitoring/statistics_comparison.md index 2a85ed0ec..dcdc56a8e 100644 --- a/docs/user_guides/fs/feature_monitoring/statistics_comparison.md +++ b/docs/user_guides/fs/feature_monitoring/statistics_comparison.md @@ -1,6 +1,9 @@ -Hopsworks feature monitoring allows you to monitor your feature data once they have been ingested into the Feature Store. You can define ==detection and reference windows== over your data for which Hopsworks will compute the statistics on a regular basis, compare them, and optionally trigger alerts when significant differences are detected. Statistics can be computed on all or a subset of feature values, and on one or more features simultaneously. Also, you can specify the criteria under which statistics will be compared and set thresholds used to classify feature values as anomalous. +Hopsworks feature monitoring allows you to monitor your feature data once they have been ingested into the Feature Store. +You can define ==detection and reference windows== over your data for which Hopsworks will compute the statistics on a regular basis, compare them, and optionally trigger alerts when significant differences are detected. +Statistics can be computed on all or a subset of feature values, and on one or more features simultaneously. +Also, you can specify the criteria under which statistics will be compared and set thresholds used to classify feature values as anomalous. -Hopsworks stores both detection and reference statistics and enable you to visualise the temporal evolution of statistical metrics. +Hopsworks stores both detection and reference statistics and enable you to visualise the temporal evolution of statistical metrics. ![Reference statistics visualization](../../../assets/images/guides/fs/feature_monitoring/fm-reference-plot.png) @@ -9,7 +12,8 @@ Hopsworks stores both detection and reference statistics and enable you to visua ## Use cases -Feature monitoring is a powerful tool that allows you to monitor your data over time and quickly detect anomalies in your feature data by comparing statistics computed on different windows of your feature data, notifying you about anomalies, and/or visualizing the evolution of these statistics and comparison results in a time series. It can be enabled in both Feature Groups and Feature Views, but for different purposes. +Feature monitoring is a powerful tool that allows you to monitor your data over time and quickly detect anomalies in your feature data by comparing statistics computed on different windows of your feature data, notifying you about anomalies, and/or visualizing the evolution of these statistics and comparison results in a time series. +It can be enabled in both Feature Groups and Feature Views, but for different purposes. 
For **Feature Groups**, feature monitoring helps you rapidly identify unexpected trends or anomalous values in your Feature Group data, facilitating the debugging of possible root causes such as newly introduced changes in your feature pipelines. @@ -17,16 +21,23 @@ For **Feature Views**, feature monitoring helps you quickly detect when newly in ## Reference windows -To compare statistics computed on a _detection window_ against a baseline, you need to define a _reference window_ of feature data. Reference windows can be defined in different ways depending on whether you are configuring feature monitoring on a Feature Group or Feature View. +To compare statistics computed on a _detection window_ against a baseline, you need to define a _reference window_ of feature data. +Reference windows can be defined in different ways depending on whether you are configuring feature monitoring on a Feature Group or Feature View. ![Types of reference windows](../../../assets/images/guides/fs/feature_monitoring/fm-reference-windows.png) -In [a previous section](index.md#statistics-computation-on-windows-of-feature-data) we described different types of windows available. Taking a Feature View as an example, the figure above describes how these windows are applied to Feature Group data read by a Feature View query and Training data, resulting in the following applications: +In [a previous section](index.md#statistics-computation-on-windows-of-feature-data) we described the different types of windows available. +Taking a Feature View as an example, the figure above describes how these windows are applied to Feature Group data read by a Feature View query and Training data, resulting in the following applications: -- A _expanding window_ covering the whole Feature Group data from its creation until the time when statistics are computing. It can be seen as an snapshot of the latest version of your feature data. This reference window is useful when you want to compare the statistics of **newly inserted feature data against all the Feature Group data**. -- A _rolling window_ covering a variable subset of feature data (e.g., feature data written last week). It helps you compare the properties of **feature data inserted at different cadences** (e.g., feature data inserted last month and two months ago). -- A _static window_ representing a snapshot of Feature Group data read using the Feature View query at a specific point in time (i.e., Training Dataset). It helps you compare **newly inserted feature data** into your Feature Groups **against a Training Dataset version**. -- A _specific value_. It helps you target the analysis of feature data to a **specific feature and statistics metric**. +- An _expanding window_ covering the whole Feature Group data from its creation until the time when statistics are computed. + It can be seen as a snapshot of the latest version of your feature data. + This reference window is useful when you want to compare the statistics of **newly inserted feature data against all the Feature Group data**. +- A _rolling window_ covering a variable subset of feature data (e.g., feature data written last week). + It helps you compare the properties of **feature data inserted at different cadences** (e.g., feature data inserted last month and two months ago). +- A _static window_ representing a snapshot of Feature Group data read using the Feature View query at a specific point in time (i.e., Training Dataset). + It helps you compare **newly inserted feature data** into your Feature Groups **against a Training Dataset version**. +- A _specific value_. + It helps you target the analysis of feature data to a **specific feature and statistics metric**. See more details on how to define a reference window for your Feature Groups and Training Datasets in the Feature Monitoring guides for [Feature Groups](../feature_group/feature_monitoring.md) and [Feature Views](../feature_view/feature_monitoring.md). @@ -36,27 +47,32 @@ After defining the detection and reference windows, you can specify the criteria ??? no-icon "Statistics metric" - Although all descriptive statistics are computed on the pre-defined windows of feature data, the comparison of statistics is performed only on one of the statistics metrics. In other words, **difference values are only computed for a single statistics metric**. You can select the targeted statistics metric using the `metric` parameter when calling the `compare_on` method. - + Although all descriptive statistics are computed on the pre-defined windows of feature data, the comparison of statistics is performed only on one of the statistics metrics. + In other words, **difference values are only computed for a single statistics metric**. + You can select the targeted statistics metric using the `metric` parameter when calling the `compare_on` method. + ??? no-icon "Threshold bounds" - Threshold bounds are used to classify feature values as anomalous, by comparing them against the difference values computed on a specific statistics metric. You can defined a threshold value using the `threshold` parameter when calling the `compare_on` method. + Threshold bounds are used to classify feature values as anomalous, by comparing them against the difference values computed on a specific statistics metric. + You can define a threshold value using the `threshold` parameter when calling the `compare_on` method. ??? no-icon "Relative or absolute" - _Difference_ values represent the amount of change in the detection statistics with regards to the reference values. They can be computed in absolute or relative terms, as specified in the `relative` boolean parameter when calling the `compare_on` method. + _Difference_ values represent the amount of change in the detection statistics with regard to the reference values. + They can be computed in absolute or relative terms, as specified in the `relative` boolean parameter when calling the `compare_on` method. - **Absolute**: _$detection value - reference value$_ - **Relative**: _$(detection value - reference value) / reference value$_ - ??? no-icon "Strict or relaxed" - Threshold bounds set the limits under which the amount of change between detection and reference values is ==normal==. These bounds can be strict (`<` or `>`) or relaxed (`<=` and `=>`), as defined in the `strict` parameter when calling the `compare_on` method. + Threshold bounds set the limits under which the amount of change between detection and reference values is ==normal==. + These bounds can be strict (`<` or `>`) or relaxed (`<=` and `>=`), as defined in the `strict` parameter when calling the `compare_on` method. Hopsworks stores the results of each statistics comparison and enables you to visualise them together with the detection and reference values in a time series graph. ![Threshold and shift visualization](../../../assets/images/guides/fs/feature_monitoring/fm-threshold-plot.png) -!!! info "Next steps" - You can setup alerts that will notify you whenever anomalies are detected on your feature data. See more details in the [alerting section](index.md#alerting) of the feature monitoring guide. \ No newline at end of file +!!! info "Next steps" + You can set up alerts that will notify you whenever anomalies are detected on your feature data. + See more details in the [alerting section](index.md#alerting) of the feature monitoring guide. diff --git a/docs/user_guides/fs/feature_view/batch-data.md b/docs/user_guides/fs/feature_view/batch-data.md index 1663da9b1..91fc6861e 100644 --- a/docs/user_guides/fs/feature_view/batch-data.md +++ b/docs/user_guides/fs/feature_view/batch-data.md @@ -1,7 +1,10 @@ # Batch data (analytical ML systems) ## Creation -It is very common that ML models are deployed in a "batch" setting where ML pipelines score incoming new data at a regular interval, for example, daily or weekly. Feature views support batch prediction by returning batch data as a DataFrame over a time range, by `start_time` and `end_time`. The resultant DataFrame (or batch-scoring DataFrame) can then be fed to models to make predictions. + +It is very common that ML models are deployed in a "batch" setting where ML pipelines score incoming new data at a regular interval, for example, daily or weekly. +Feature views support batch prediction by returning batch data as a DataFrame over a time range, by `start_time` and `end_time`. +The resultant DataFrame (or batch-scoring DataFrame) can then be fed to models to make predictions. === "Python" ```python @@ -17,7 +20,9 @@ It is very common that ML models are deployed in a "batch" setting where ML pipe ``` ## Retrieve batch data with primary keys and event time -For certain use cases, e.g. time series models, the input data needs to be sorted according to the primary key(s) and event time combination. Or one might want to merge predictions back with the original input data for postmortem analysis. + +For certain use cases, e.g., time series models, the input data needs to be sorted according to the primary key(s) and event time combination. +You might also want to merge predictions back with the original input data for postmortem analysis. Primary key(s) and event time are not usually included in the feature view query as they are not features used for training. To retrieve the primary key(s) and/or event time when retrieving batch data for inference, you need to set the parameters `primary_key=True` and/or `event_time=True`. @@ -32,11 +37,13 @@ To retrieve the primary key(s) and/or event time when retrieving batch data for ) # return a dataframe with primary keys and event time ``` !!! note - All primary and event time columns of all the feature groups included in the feature view will be returned. If they have the same names across feature groups and the join prefix was not provided then reading operation will fail with ambiguous column exception. + All primary and event time columns of all the feature groups included in the feature view will be returned. + If they have the same names across feature groups and no join prefix was provided, the read operation will fail with an ambiguous column exception. Make sure to define the join prefix if primary key and event time columns have the same names across feature groups. 
For Python-clients, handling small or moderately-sized data, we recommend enabling the [ArrowFlight Server with DuckDB](../../../setup_installation/common/arrow_flight_duckdb.md), which will provide significant speedups over Spark/Hive for reading batch data. If the service is enabled, and you want to read this particular batch data with Hive instead, you can set the read_options to `{"use_hive": True}`. + ```python # get batch data with Hive df = feature_view.get_batch_data( @@ -47,34 +54,37 @@ df = feature_view.get_batch_data( ``` ## Creation with transformation -If you have specified transformation functions when creating a feature view, you will get back transformed batch data as well. If your transformation functions require statistics of training dataset, you must also provide the training data version. `init_batch_scoring` will then fetch the statistics and initialize the functions with required statistics. Then you can follow the above examples and create the batch data. Please note that transformed batch data can only be returned in the python client but not in the java client. + +If you have specified transformation functions when creating a feature view, you will get back transformed batch data as well. +If your transformation functions require statistics of training dataset, you must also provide the training data version. `init_batch_scoring` will then fetch the statistics and initialize the functions with required statistics. +Then you can follow the above examples and create the batch data. +Please note that transformed batch data can only be returned in the python client but not in the java client. ```python feature_view.init_batch_scoring(training_dataset_version=1) ``` -It is important to note that in addition to the filters defined in feature view, [extra filters](./training-data.md#Extra-filters) will be applied if they are defined in the given training dataset version. +It is important to note that in addition to the filters defined in feature view, [extra filters](./training-data.md#extra-filters) will be applied if they are defined in the given training dataset version. ## Retrieving untransformed batch data -By default, the `get_batch_data` function returns batch data with model-dependent transformations applied. However, you can retrieve untransformed batch data—while still including on-demand features—by setting the `transform` parameter to `False`. +By default, the `get_batch_data` function returns batch data with model-dependent transformations applied. +However, you can retrieve untransformed batch data—while still including on-demand features—by setting the `transform` parameter to `False`. -=== "Python" +=== "Python" !!! example "Returning untransformed batch data" ```python # Fetching untransformed batch data. untransformed_batch_data = feature_view.get_batch_data(transform=False) ``` - ## Passing Context Variables to Transformation Functions -After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the necessary context variables through the `transformation_context` parameter when fetching batch data. +After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the necessary context variables through the `transformation_context` parameter when fetching batch data. -=== "Python" +=== "Python" !!! 
example "Passing context variables while fetching batch data." ```python # Passing context variable to IN-MEMORY Training Dataset. batch_data = feature_view.get_batch_data(transformation_context={"context_parameter":10}) - - ``` \ No newline at end of file + ``` diff --git a/docs/user_guides/fs/feature_view/feature-server.md b/docs/user_guides/fs/feature_view/feature-server.md index 3212dde6a..8baea1794 100644 --- a/docs/user_guides/fs/feature_view/feature-server.md +++ b/docs/user_guides/fs/feature_view/feature-server.md @@ -6,21 +6,27 @@ description: Using Feature Store REST API Server for retrieving feature vectors This API server allows users to retrieve single/batch feature vectors from a feature view. -## How to use +## How to use -From Hopsworks 3.3, you can connect to the Feature Vector Server via any REST client which supports POST requests. Set the `X-API-KEY` to your Hopsworks API Key and send the request with a JSON body, [single](#request) or [batch](#request-1). By default, the server listens on the `0.0.0.0:4406` and the api version is set to `0.1.0`. Please refer to `/srv/hops/mysql-cluster/rdrs_config.json` config file located on machines running the REST Server for additional configuration parameters. +From Hopsworks 3.3, you can connect to the Feature Vector Server via any REST client which supports POST requests. +Set the `X-API-KEY` to your Hopsworks API Key and send the request with a JSON body, [single](#single-feature-vector-request) or [batch](#batch-feature-vectors-request). +By default, the server listens on the `0.0.0.0:4406` and the api version is set to `0.1.0`. +Please refer to `/srv/hops/mysql-cluster/rdrs_config.json` config file located on machines running the REST Server for additional configuration parameters. -In Hopsworks 3.7, we introduced a python client for the Online Store REST API Server. The python client is available in the `hsfs` module and can be installed using `pip install hsfs`. This client can be used instead of the Online Store SQL client in the `FeatureView.get_feature_vector(s)` methods. Check the corresponding [documentation](./feature-vectors.md) for these methods. +In Hopsworks 3.7, we introduced a python client for the Online Store REST API Server. +The python client is available in the `hsfs` module and can be installed using `pip install hsfs`. +This client can be used instead of the Online Store SQL client in the `FeatureView.get_feature_vector(s)` methods. +Check the corresponding [documentation](./feature-vectors.md) for these methods. -## Single feature vector +## Single Feature Vector -### Request +### Single Feature Vector Request `POST /{api-version}/feature_store` -**Body** +#### Single Feature Vector Request Body -``` +```json { "featureStoreName": "fsdb002", "featureViewName": "sample_2", @@ -40,21 +46,21 @@ In Hopsworks 3.7, we introduced a python client for the Online Store REST API Se } ``` -**Parameters** +#### Single Feature Vector Request Parameters -**parameter** | **type** | **note** ------------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -featureStoreName | string | -featureViewName | string | -featureViewVersion | number(int) | -entries | objects | Map of serving key of feature view as key and value of serving key as value. 
Serving key are a set of the primary key of feature groups which are included in the feature view query. If feature groups are joint with prefix, the primary key needs to be attached with prefix. -passedFeatures | objects | Optional. Map of feature name as key and feature value as value. This overwrites feature values in the response. -metadataOptions | objects | Optional. Map of metadataoption as key and boolean as value. Default metadata option is false. Metadata is returned on request. Metadata options available: 1\. featureName 2\. featureType | -options | objects | Optional. Map of option as key and boolean as value. Default option is false. Options available: 1\. validatePassedFeatures 2\. includeDetailedStatus +| **parameter** | **type** | **note** | +| --- | --- | --- | +| featureStoreName | string | | +| featureViewName | string | | +| featureViewVersion | number(int) | | +| entries | objects | Map of serving key of feature view as key and value of serving key as value. Serving key are a set of the primary key of feature groups which are included in the feature view query. If feature groups are joint with prefix, the primary key needs to be attached with prefix. | +| passedFeatures | objects | Optional. Map of feature name as key and feature value as value. This overwrites feature values in the response. | +| metadataOptions | objects | Optional. Map of metadataoption as key and boolean as value. Default metadata option is false. Metadata is returned on request. Metadata options available: 1\. featureName 2\. featureType | +| options | objects | Optional. Map of option as key and boolean as value. Default option is false. Options available: 1\. validatePassedFeatures 2\. includeDetailedStatus | -### Response +### Single Feature Vector Response -``` +```json { "features": [ 36, @@ -94,19 +100,19 @@ options | objects | Optional. Map of option as key and boolean as } ``` -### Error handling +### Single Feature Vector Errors -**Code** | **reason** | **response** --------- | ------------------------------------- | ------------------------------------ -200 | | -400 | Requested metadata does not exist | -400 | Error in pk or passed feature value | -401 | Access denied | Access unshared feature store failed -500 | Failed to read feature store metadata | +| **Code** | **reason** | **response** | +| -------- | ------------------------------------- | ------------------------------------ | +| 200 | | | +| 400 | Requested metadata does not exist | | +| 400 | Error in pk or passed feature value | | +| 401 | Access denied | Access unshared feature store failed | +| 500 | Failed to read feature store metadata | | -**Response with pk/pass feature error** +#### Response with PK/pass feature error -``` +```json { "code": 12, "message": "Wrong primay-key column. Column: ts", @@ -114,9 +120,9 @@ options | objects | Optional. Map of option as key and boolean as } ``` -**Response with metadata error** +#### Response with metadata error -``` +```json { "code": 2, "message": "", @@ -124,9 +130,9 @@ options | objects | Optional. Map of option as key and boolean as } ``` -**Pk value no match** +#### PK value no match -``` +```json { "features": [ 9876543, @@ -139,22 +145,26 @@ options | objects | Optional. Map of option as key and boolean as } ``` -**Detailed Status** +#### Detailed Status -If `includeDetailedStatus` option is set to true, detailed status is returned in the response. 
Detailed status is a list of feature group id and http status code, corresponding to each read operations perform internally by RonDB. Meaning is as follows: +If the `includeDetailedStatus` option is set to true, detailed status is returned in the response. +Detailed status is a list of feature group id and http status code pairs, corresponding to each read operation performed internally by RonDB. +The meaning is as follows: - `featureGroupId`: Id of the feature group, used to identify which table the operation correspond from. -- `httpStatus`: Http status code of the operation. - * 200 means success - * 400 means bad request, likely pk name is wrong or pk is incomplete. In particular, if pk for this table/feature group is not provided in the request, this http status is returned. - * 404 means no row corresponding to PK - * 500 means internal error. +- `httpStatus`: Http status code of the operation. + - 200 means success + - 400 means bad request, likely the pk name is wrong or the pk is incomplete. + In particular, if the pk for this table/feature group is not provided in the request, this http status is returned. + - 404 means no row corresponding to the PK + - 500 means internal error. -Both `404` and `400` set the status to `MISSING` in the response. Examples below corresponds respectively to missing row and bad request. +Both `404` and `400` set the status to `MISSING` in the response. +The examples below correspond respectively to a missing row and a bad request. +Missing Row: The PK name-value pair was correctly passed, but the corresponding row was not found in the feature group. -Missing Row: The pk name,value was correctly passed but the corresponding row was not found in the feature group. -``` +```json { "features": [ 36, @@ -176,8 +186,9 @@ Missing Row: The pk name,value was correctly passed but the corresponding row wa } ``` -Bad Request e.g pk name,value pair for FG2 not provided or the corresponding column names was incorrect. -``` +Bad Request, e.g., when the PK name-value pair for FG2 is not provided or the corresponding column name is incorrect: + +```json { "features": [ 36, @@ -199,15 +210,15 @@ Bad Request e.g pk name,value pair for FG2 not provided or the corresponding col } ``` -## Batch feature vectors +## Batch Feature Vectors -### Request +### Batch Feature Vectors Request `POST /{api-version}/batch_feature_store` -**Body** +#### Batch Feature Vectors Request Body -``` +```json { "featureStoreName": "fsdb002", "featureViewName": "sample_2", @@ -242,23 +253,22 @@ Bad Request e.g pk name,value pair for FG2 not provided or the corresponding col } ``` -**Parameters** +#### Batch Feature Vectors Request Parameters -**parameter** | **type** | **note** ------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -featureStoreName | string | -featureViewName | string | -featureViewVersion | number(int) | -entries | `array` | Each items is a map of serving key as key and value of serving key as value. Serving key of feature view. -passedFeatures | `array` | Optional. Each items is a map of feature name as key and feature value as value. This overwrites feature values in the response. If provided, its size and order has to be equal to the size of entries. Item can be null. -metadataOptions | objects | Optional. Map of metadataoption as key and boolean as value. Default metadata option is false. 
Metadata is returned on request. Metadata options available: 1\. featureName 2\. featureType -options | objects | Optional. Map of option as key and boolean as value. Default option is false. Options available: 1\. validatePassedFeatures 2\. includeDetailedStatus +| **parameter** | **type** | **note** | +| --- | --- | --- | +| featureStoreName | string | | +| featureViewName | string | | +| featureViewVersion | number(int) | | +| entries | `array` | Each items is a map of serving key as key and value of serving key as value. Serving key of feature view. | +| passedFeatures | `array` | Optional. Each items is a map of feature name as key and feature value as value. This overwrites feature values in the response. If provided, its size and order has to be equal to the size of entries. Item can be null. | +| metadataOptions | objects | Optional. Map of metadataoption as key and boolean as value. Default metadata option is false. Metadata is returned on request. Metadata options available: 1\. featureName 2\. featureType | +| options | objects | Optional. Map of option as key and boolean as value. Default option is false. Options available: 1\. validatePassedFeatures 2\. includeDetailedStatus | -### Response +### Batch Feature Vectors Response -``` +```json { - { "features": [ [ 16, @@ -343,19 +353,19 @@ options | objects | Optional. Map of option as key and boole note: Order of the returned features are the same as the order of entries in the request. -### Error handling +### Batch Feature Vectors Errors -**Code** | **reason** | **response** --------- | ------------------------------------- | ------------------------------------ -200 | | -400 | Requested metadata does not exist | -404 | Missing row corresponding to pk value | -401 | Access denied | Access unshared feature store failed -500 | Failed to read feature store metadata | +| **Code** | **reason** | **response** | +| -------- | ------------------------------------- | ------------------------------------ | +| 200 | | | +| 400 | Requested metadata does not exist | | +| 404 | Missing row corresponding to pk value | | +| 401 | Access denied | Access unshared feature store failed | +| 500 | Failed to read feature store metadata | | -**Response with partial failure** +#### Response with partial failure -``` +```json { "features": [ [ @@ -397,4 +407,5 @@ note: Order of the returned features are the same as the order of entries in the ## Access control to feature store -Currently, the REST API server only supports Hopsworks API Keys for authentication and authorization. Add the API key to the HTTP requests using the `X-API-KEY` header. +Currently, the REST API server only supports Hopsworks API Keys for authentication and authorization. +Add the API key to the HTTP requests using the `X-API-KEY` header. diff --git a/docs/user_guides/fs/feature_view/feature-vectors.md b/docs/user_guides/fs/feature_view/feature-vectors.md index 6af3675c2..508568cb4 100644 --- a/docs/user_guides/fs/feature_view/feature-vectors.md +++ b/docs/user_guides/fs/feature_view/feature-vectors.md @@ -1,10 +1,23 @@ # Feature Vectors -The Hopsworks Platform integrates real-time capabilities with its Online Store. Based on [RonDB](https://www.rondb.com/), your feature vectors are served at scale at in-memory latency (~1-10ms). Checkout the benchmarks results [here](https://www.hopsworks.ai/post/feature-store-benchmark-comparison-hopsworks-and-feast#images-2) and the code [here](https://github.com/featurestoreorg/featurestore-benchmarks). 
The same Feature View which was used to create training datasets can be used to retrieve feature vectors for real-time predictions. This allows you to serve the same features to your model in training and serving, ensuring consistency and reducing boilerplate. Whether you are either inside the Hopsworks platform, a model serving platform, or in an external environment, such as your application server. -Below is a practical guide on how to use the Online Store Python and Java Client. The aim is to get you started quickly by providing code snippets which illustrate various use cases and functionalities of the clients. If you need to get more familiar with the concept of feature vectors, you can read this [short introduction](../../../concepts/fs/feature_view/online_api.md) first. +The Hopsworks Platform integrates real-time capabilities with its Online Store. +Based on [RonDB](https://www.rondb.com/), your feature vectors are served at scale at in-memory latency (~1-10ms). +Check out [the benchmark results](https://www.hopsworks.ai/post/feature-store-benchmark-comparison-hopsworks-and-feast#images-2) and [the benchmark code](https://github.com/featurestoreorg/featurestore-benchmarks). +The same Feature View which was used to create training datasets can be used to retrieve feature vectors for real-time predictions. +This allows you to serve the same features to your model in training and serving, ensuring consistency and reducing boilerplate. +This holds whether you are inside the Hopsworks platform, on a model serving platform, or in an external environment such as your application server. + +Below is a practical guide on how to use the Online Store Python and Java Client. +The aim is to get you started quickly by providing code snippets which illustrate various use cases and functionalities of the clients. +If you need to get more familiar with the concept of feature vectors, you can read this [short introduction](../../../concepts/fs/feature_view/online_api.md) first. ## Retrieval -You can get back feature vectors from either python or java client by providing the primary key value(s) for the feature view. Note that filters defined in feature view and training data will not be applied when feature vectors are returned. If you need to retrieve a complete value of feature vectors without missing values, the required `entry` are [feature_view.primary_keys](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#primary_keys). Alternative, you can provide the primary key of the feature groups as the key of the entry. It is also possible to provide a subset of the entry, which will be discussed [below](#partial-feature-retrieval). + +You can get back feature vectors from either the Python or Java client by providing the primary key value(s) for the feature view. +Note that filters defined in the feature view and training data will not be applied when feature vectors are returned. +If you need to retrieve complete feature vectors without missing values, the required `entry` keys are [FeatureView.primary_keys][hsfs.feature_view.FeatureView.primary_keys]. +Alternatively, you can provide the primary keys of the feature groups as the keys of the entry. +It is also possible to provide a subset of the entry, which will be discussed [below](#partial-feature-retrieval). 
=== "Python" ```python @@ -38,29 +51,36 @@ You can get back feature vectors from either python or java client by providing ``` ### Required entry -Starting from python client v3.4, you can specify different values for the primary key of the same name which exists in multiple feature groups but are not joint by the same name. The table below summarises the value of `primary_keys` in different settings. Considering that you are joining 2 feature groups, namely, `left_fg` and `right_fg`, the feature groups have different primary keys, and features (`feature_*`) in each setting. Also, the 2 feature groups are [joint](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/query_api/#join) on different *join conditions* and *prefix* as `left_fg.join(right_fg, , prefix=)`. - -For java client, and python client before v3.4, the `primary_keys` are the set of primary key of all the feature groups in the query. Python client is backward compatible. It means that the `primary_keys` used before v3.4 can be applied to python client of later versions as well. - -| Setting | primary key of `left_fg` | primary key of `right_fg` | join conditions | prefix | primary_keys | note | -|------|----------------------------|-----------------------------|-------------------------------------------|--------|-----------------------------------------------|----------------------------------------------------------| -| 1 | id | id | ```on=["id"]``` | | id | Same feature name is used in the join. | -| 2 | id1 | id2 | `left_on=["id1"], right_on=["id2"]` | | id1 | Different feature names are used in the join. | -| 3 | id1, id2 | id1 | `on=["id1"]` | | id1, id2 | `id2` is not part of the join conditions | -| 4 | id, user_id | id | `left_on=["user_id"], right_on=["id"]` | | id, user_id | Value of `user_id` is used for retrieving features from `right_fg` | -| 5 | id1 | id1, id2 | `on=["id1"]` | | id1, id2 | `id2` is not part of the join conditions | -| 6 | id | id, user_id | `left_on=["id"], right_on=["user_id"]` | “right_“| id, “right_id“ | Value of “right_id“ and "id" are used for retrieving features from `right_fg` | -| 7 | id | id, user_id | `left_on=["id"], right_on=["user_id"]` | | id, “fgId_<rightFgId>_<joinIndex>_id” | Value of “fgId_<rightFgId>_<joinIndex>_id“ and "id" are used for retrieving features from `right_fg`. See note below. | -| 8 | id | id | `left_on=["id"], right_on=["feature_1"]` | “right_“ | id, “right_id“ | No primary key from `right_fg` is used in the join. Value of `right_id` is used for retrieving features from `right_fg` | -| 9 | id | id | `left_on=["id"], right_on=["feature_1"]` | | id1, “fgId_<rightFgId>_<joinIndex>_id” | No primary key from `right_fg` is used in the join. Value of "fgId_<rightFgId>_<joinIndex>_id" is used for retrieving features from "right_fg`. See note below. | -| 10 | id | id | `left_on=["feature_1"], right_on=["id"]` | “right_“ | id, “right_id“ | No primary key from `left_fg` is used in the join. Value of `right_id` is used for retrieving features from `right_fg` | -| 11 | id | id | `left_on=["feature_1"], right_on=["id"]` | | id1, “fgId_<rightFgId>_<joinIndex>_id” | No primary key from `left_fg` is used in the join. Value of “fgId_<rightFgId>_<joinIndex>_id” is used for retrieving features from `right_fg`. See note below. | -| 12 | user, year | user, year | `left_on=["user"], right_on=["user"]` | “right_“ | user, year, “right_year“ | Value of "user" and "right_year" are used for retrieving features from `right_fg`. 
`right_fg` can be the same as feature group as `left_fg`. | -| 13 | user, year | user, year | `left_on=["user"], right_on=["user"]` | | user, year, “fgId_<rightFgId>_<joinIndex>_year” | Value of "user" and "fgId_<rightFgId>_<joinIndex>_year" are used for retrieving features from `right_fg`. `right_fg` can be the same as feature group as `left_fg`. See note below. | + +Starting from python client v3.4, you can specify different values for the primary key of the same name which exists in multiple feature groups but are not joint by the same name. +The table below summarises the value of `primary_keys` in different settings. +Considering that you are joining 2 feature groups, namely, `left_fg` and `right_fg`, the feature groups have different primary keys, and features (`feature_*`) in each setting. +Also, the 2 feature groups are [joint][hsfs.constructor.query.Query.join] on different *join conditions* and *prefix* as `left_fg.join(right_fg, , prefix=)`. + +For java client, and python client before v3.4, the `primary_keys` are the set of primary key of all the feature groups in the query. +Python client is backward compatible. +It means that the `primary_keys` used before v3.4 can be applied to python client of later versions as well. + +| Setting | primary key of `left_fg` | primary key of `right_fg` | join conditions | prefix | primary_keys | note | +| --- | --- | --- | --- | --- | --- | --- | +| 1 | id | id | ```on=["id"]``` | | id | Same feature name is used in the join. | +| 2 | id1 | id2 | `left_on=["id1"], right_on=["id2"]` | | id1 | Different feature names are used in the join. | +| 3 | id1, id2 | id1 | `on=["id1"]` | | id1, id2 | `id2` is not part of the join conditions | +| 4 | id, user_id | id | `left_on=["user_id"], right_on=["id"]` | | id, user_id | Value of `user_id` is used for retrieving features from `right_fg` | +| 5 | id1 | id1, id2 | `on=["id1"]` | | id1, id2 | `id2` is not part of the join conditions | +| 6 | id | id, user_id | `left_on=["id"], right_on=["user_id"]` | “right_“ | id, “right_id“ | Value of “right_id“ and "id" are used for retrieving features from `right_fg` | +| 7 | id | id, user_id | `left_on=["id"], right_on=["user_id"]` | | id, “fgId_<rightFgId>_<joinIndex>_id” | Value of “fgId_<rightFgId>_<joinIndex>_id“ and "id" are used for retrieving features from `right_fg`. See note below. | +| 8 | id | id | `left_on=["id"], right_on=["feature_1"]` | “right_“ | id, “right_id“ | No primary key from `right_fg` is used in the join. Value of `right_id` is used for retrieving features from `right_fg` | +| 9 | id | id | `left_on=["id"], right_on=["feature_1"]` | | id1, “fgId_<rightFgId>_<joinIndex>_id” | No primary key from `right_fg` is used in the join. Value of "fgId_<rightFgId>_<joinIndex>_id" is used for retrieving features from "right_fg`. See note below. | +| 10 | id | id | `left_on=["feature_1"], right_on=["id"]` | “right_“ | id, “right_id“ | No primary key from `left_fg` is used in the join. Value of `right_id` is used for retrieving features from `right_fg` | +| 11 | id | id | `left_on=["feature_1"], right_on=["id"]` | | id1, “fgId_<rightFgId>_<joinIndex>_id” | No primary key from `left_fg` is used in the join. Value of “fgId_<rightFgId>_<joinIndex>_id” is used for retrieving features from `right_fg`. See note below. | +| 12 | user, year | user, year | `left_on=["user"], right_on=["user"]` | “right_“ | user, year, “right_year“ | Value of "user" and "right_year" are used for retrieving features from `right_fg`. 
`right_fg` can be the same feature group as `left_fg`. | +| 13 | user, year | user, year | `left_on=["user"], right_on=["user"]` | | user, year, “fgId_<rightFgId>_<joinIndex>_year” | Value of "user" and "fgId_<rightFgId>_<joinIndex>_year" are used for retrieving features from `right_fg`. `right_fg` can be the same feature group as `left_fg`. See note below. | Note: -"<rightFgId>" can be found by `right_fg.id`. "<joinIndex>" is the order or the feature group in the join. In the example, it is 1 because `right_fg` is in the first join in the query `left_fg.join(right_fg, )`. +"<rightFgId>" can be found by `right_fg.id`. "<joinIndex>" is the order of the feature group in the join. +In the example, it is 1 because `right_fg` is in the first join in the query `left_fg.join(right_fg, )`. ### Missing Primary Key Entries @@ -114,7 +134,10 @@ That means, `get_feature_vectors` will never return partial feature vector, but If you are aware of missing features, you can use the [*passed features*](#passed-features) or [Partial feature retrieval](#partial-feature-retrieval) functionality, described down below. ### Partial feature retrieval -If your model can handle missing value or if you want to impute the missing value, you can get back feature vectors with partial values using python client starting from version 3.4 (Note that this does not apply to java client.). In the example below, let's say you join 2 feature groups by `fg1.join(fg2, left_on=["pk1"], right_on=["pk2"])`, required keys of the `entry` are `pk1` and `pk2`. If `pk2` is not provided, this returns feature values from the first feature group and null values from the second feature group when using the option `allow_missing=True`, otherwise it raises exception. + +If your model can handle missing values or if you want to impute them, you can get back feature vectors with partial values using the Python client starting from version 3.4 (note that this does not apply to the Java client). +In the example below, let's say you join 2 feature groups by `fg1.join(fg2, left_on=["pk1"], right_on=["pk2"])`; the required keys of the `entry` are `pk1` and `pk2`. +If `pk2` is not provided, this returns feature values from the first feature group and null values from the second feature group when using the option `allow_missing=True`, otherwise it raises an exception. === "Python" ```python @@ -135,7 +158,11 @@ If your model can handle missing valu ``` ### Retrieval with transformation -If you have specified transformation functions when creating a feature view, you receive transformed feature vectors. If your transformation functions require statistics of training dataset, you must also provide the training data version. `init_serving` will then fetch the statistics and initialize the functions with the required statistics. Then you can follow the above examples and retrieve the feature vectors. Please note that transformed feature vectors can only be returned in the python client but not in the java client. + +If you have specified transformation functions when creating a feature view, you receive transformed feature vectors. +If your transformation functions require statistics of the training dataset, you must also provide the training data version. `init_serving` will then fetch the statistics and initialize the functions with the required statistics. +Then you can follow the above examples and retrieve the feature vectors. 
+Please note that transformed feature vectors can only be returned in the python client but not in the java client. === "Python" ```python @@ -143,9 +170,12 @@ If you have specified transformation functions when creating a feature view, you ``` ## Passed features -If some of the features values are only known at prediction time and cannot be computed and cached in the online feature store, you can provide those values as `passed_features` option. The `get_feature_vector` method is going to use the passed values to construct the final feature vector to submit to the model. -You can use the `passed_features` parameter to overwrite individual features being retrieved from the online feature store. The feature view will apply the necessary transformations to the passed features as it does for the feature data retrieved from the online feature store. +If some of the features values are only known at prediction time and cannot be computed and cached in the online feature store, you can provide those values as `passed_features` option. +The `get_feature_vector` method is going to use the passed values to construct the final feature vector to submit to the model. + +You can use the `passed_features` parameter to overwrite individual features being retrieved from the online feature store. +The feature view will apply the necessary transformations to the passed features as it does for the feature data retrieved from the online feature store. Please note that passed features is only available in the python client but not in the java client. @@ -172,7 +202,8 @@ Please note that passed features is only available in the python client but not ) ``` -You can also use the parameter to provide values for all the features which are part of a specific feature group and used in the feature view. In this second case, you do not have to provide the primary key value for that feature group as no data needs to be retrieved from the online feature store. +You can also use the parameter to provide values for all the features which are part of a specific feature group and used in the feature view. +In this second case, you do not have to provide the primary key value for that feature group as no data needs to be retrieved from the online feature store. === "Python" ```python @@ -197,7 +228,7 @@ By default, the `get_feature_vector` and `get_feature_vectors` functions return However, you can retrieve the untransformed feature vectors without applying model-dependent transformations while still including on-demand features by setting the `transform` parameter to False. -=== "Python" +=== "Python" !!! example "Returning untransformed feature vectors" ```python # Fetching untransformed feature vector. @@ -213,9 +244,10 @@ However, you can retrieve the untransformed feature vectors without applying mod ## Retrieving feature vector without on-demand features -The `get_feature_vector` and `get_feature_vectors` methods can also return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features. To achieve this, set the parameters `transform` and `on_demand_features` to `False`. +The `get_feature_vector` and `get_feature_vectors` methods can also return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features. +To achieve this, set the parameters `transform` and `on_demand_features` to `False`. -=== "Python" +=== "Python" !!! 
example "Returning untransformed feature vectors" ```python untransformed_feature_vector = feature_view.get_feature_vector( @@ -227,9 +259,10 @@ The `get_feature_vector` and `get_feature_vectors` methods can also return untra ``` ## Passing Context Variables to Transformation Functions + After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the required context variables using the `transformation_context` parameter when fetching the feature vectors. -=== "Python" +=== "Python" !!! example "Passing context variables while fetching batch data." ```python # Passing context variable to IN-MEMORY Training Dataset. @@ -241,11 +274,21 @@ After [defining a transformation function using a context variable](../transform ## Choose the right Client -The Online Store can be accessed via the **Python** or **Java** client allowing you to use your language of choice to connect to the Online Store. Additionally, the Python client provides two different implementations to fetch data: **SQL** or **REST**. The SQL client is the default implementation. It requires a direct SQL connection to your RonDB cluster and uses python asyncio to offer high performance even when your Feature View rows involve querying multiple different tables. The REST client is an alternative implementation connecting to [RonDB Feature Vector Server](./feature-server.md). Perfect if you want to avoid exposing ports of your database cluster directly to clients. This implementation is available as of Hopsworks 3.7. +The Online Store can be accessed via the **Python** or **Java** client allowing you to use your language of choice to connect to the Online Store. +Additionally, the Python client provides two different implementations to fetch data: **SQL** or **REST**. +The SQL client is the default implementation. +It requires a direct SQL connection to your RonDB cluster and uses python asyncio to offer high performance even when your Feature View rows involve querying multiple different tables. +The REST client is an alternative implementation connecting to [RonDB Feature Vector Server](./feature-server.md). +Perfect if you want to avoid exposing ports of your database cluster directly to clients. +This implementation is available as of Hopsworks 3.7. -Initialise the client by calling the `init_serving` method on the Feature View object before starting to fetch feature vectors. This will initialise the chosen client, test the connection, and initialise the transformation functions registered with the Feature View. Note to use the REST client in the Hopsworks Cluster python environment you will need to provide an API key explicitly as JWT authentication is not yet supported. More configuration options can be found in the [API documentation](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#init_serving). +Initialise the client by calling the `init_serving` method on the Feature View object before starting to fetch feature vectors. +This will initialise the chosen client, test the connection, and initialise the transformation functions registered with the Feature View. +Note to use the REST client in the Hopsworks Cluster python environment you will need to provide an API key explicitly as JWT authentication is not yet supported. +More configuration options can be found in the [API documentation][hsfs.feature_view.FeatureView.init_serving]. 
=== "Python" + ```python # initialize the SQL client to fetch feature vectors from the Online Store my_feature_view.init_serving() @@ -258,9 +301,12 @@ my_feature_view.init_serving( } ) ``` -Once the client is initialised, you can start fetching feature vector(s) via the Feature View methods: `get_feature_vector(s)`. You can initialise both clients for a given Feature View and switch between them by using the force flags in the get_feature_vector(s) methods. + +Once the client is initialised, you can start fetching feature vector(s) via the Feature View methods: `get_feature_vector(s)`. +You can initialise both clients for a given Feature View and switch between them by using the force flags in the get_feature_vector(s) methods. === "Python" + ```python # initialize both clients and set the default to REST my_feature_view.init_serving( @@ -286,5 +332,8 @@ except TimeoutException: ``` ## Feature Server -In addition to Python/Java clients, from Hopsworks 3.3, a new [feature server](./feature-server.md) implemented in Go is introduced. With this new API, single or batch feature vectors can be retrieved in any programming language. Note that you can connect to the Feature Vector Server via any REST client. However registered transformation function will not be applied to values in the JSON response and values stored in Feature Groups which contain embeddings will be missing. +In addition to Python/Java clients, from Hopsworks 3.3, a new [feature server](./feature-server.md) implemented in Go is introduced. +With this new API, single or batch feature vectors can be retrieved in any programming language. +Note that you can connect to the Feature Vector Server via any REST client. +However registered transformation function will not be applied to values in the JSON response and values stored in Feature Groups which contain embeddings will be missing. diff --git a/docs/user_guides/fs/feature_view/feature_logging.md b/docs/user_guides/fs/feature_view/feature_logging.md index 0e3ac35c4..ac3d804e5 100644 --- a/docs/user_guides/fs/feature_view/feature_logging.md +++ b/docs/user_guides/fs/feature_view/feature_logging.md @@ -1,28 +1,41 @@ # User Guide: Feature and Prediction Logging with a Feature View -Feature logging is essential for debugging, monitoring, and auditing the data your models use. This guide explains how to log features and predictions, and retrieve and manage these logs with feature view in Hopsworks. +Feature logging is essential for debugging, monitoring, and auditing the data your models use. +This guide explains how to log features and predictions, and retrieve and manage these logs with feature view in Hopsworks. -## Logging Features and Predictions +## Feature and Prediction Logging -After you have trained a model, you can log the features it uses and the predictions with the feature view used to create the training data for the model. You can log either transformed or/and untransformed features values. +After you have trained a model, you can log the features it uses and the predictions with the feature view used to create the training data for the model. +You can log either transformed or/and untransformed features values. ### Enabling Feature Logging -To enable logging, set `logging_enabled=True` when creating the feature view. Two feature groups will be created for storing transformed and untransformed features, but they are not visible in the UI. 
The logged features will be written to the offline feature store every hour by scheduled materialization jobs which are created automatically. +To enable logging, set `logging_enabled=True` when creating the feature view. +Two feature groups will be created for storing transformed and untransformed features, but they are not visible in the UI. +The logged features will be written to the offline feature store every hour by scheduled materialization jobs which are created automatically. ```python feature_view = fs.create_feature_view("name", query, logging_enabled=True) ``` -Alternatively, you can enable logging on an existing feature view by calling `feature_view.enable_logging()`. Also, calling `feature_view.log()` will implicitly enable logging if it has not already been enabled. +Alternatively, you can enable logging on an existing feature view by calling `feature_view.enable_logging()`. +Also, calling `feature_view.log()` will implicitly enable logging if it has not already been enabled. ### Logging Features and Predictions -You can log features and predictions by calling `feature_view.log`. The logged features are written periodically to the offline store. If you need it to be available immediately, call `feature_view.materialize_log`. +You can log features and predictions by calling `feature_view.log`. +The logged features are written periodically to the offline store. +If you need them to be available immediately, call `feature_view.materialize_log`. -You can log either transformed or/and untransformed features. To get untransformed features, you can specify `transform=False` in `feature_view.get_batch_data` or `feature_view.get_feature_vector(s)`. Inference helper columns are returned along with the untransformed features. If you have On-Demand features as well, call `feature_view.compute_on_demand_features` to get the on demand features before calling `feature_view.log`.To get the transformed features, you can call `feature_view.transform` and pass the untransformed feature with the on-demand feature. +You can log transformed and/or untransformed features. +To get untransformed features, you can specify `transform=False` in `feature_view.get_batch_data` or `feature_view.get_feature_vector(s)`. +Inference helper columns are returned along with the untransformed features. +If you have On-Demand features as well, call `feature_view.compute_on_demand_features` to get the on-demand features before calling `feature_view.log`. To get the transformed features, you can call `feature_view.transform` and pass the untransformed features along with the on-demand features. -Predictions can be optionally provided as one or more columns in the DataFrame containing the features or separately in the `predictions` argument. There must be the same number of prediction columns as there are labels in the feature view. It is required to provide predictions in the `predictions` argument if you provide the features as `list` instead of pandas `dataframe`. The training dataset version will also be logged if you have called either `feature_view.init_serving(...)` or `feature_view.init_batch_scoring(...)` or if the provided model has a training dataset version. +Predictions can be optionally provided as one or more columns in the DataFrame containing the features or separately in the `predictions` argument. +There must be the same number of prediction columns as there are labels in the feature view. 
+It is required to provide predictions in the `predictions` argument if you provide the features as a `list` instead of a pandas `DataFrame`. +The training dataset version will also be logged if you have called either `feature_view.init_serving(...)` or `feature_view.init_batch_scoring(...)`, or if the provided model has a training dataset version. The wallclock time of calling `feature_view.log` is automatically logged, enabling filtering by logging time when retrieving logs. @@ -52,16 +65,17 @@ predictions = pd.DataFrame({ }) # Log features and predictions -feature_view.log(features, - predictions=predictions, - training_dataset_version=1, +feature_view.log(features, + predictions=predictions, + training_dataset_version=1, model=Model(1, "model", version=1) ) ``` #### Example 3: Log Both Transformed and Untransformed Features -**Batch Features** +##### Batch Features + ```python untransformed_df = fv.get_batch_data(transformed=False) # then apply the transformations after: @@ -72,7 +86,8 @@ feature_view.log(untransformed_df) feature_view.log(transformed_features=transformed_df) ``` -**Real-time Features** +##### Real-time Features + ```python untransformed_vector = fv.get_feature_vector({"id": 1}, transform=False) # then apply the transformations after: @@ -85,7 +100,8 @@ feature_view.log(transformed_features=transformed_vector) ## Retrieving the Log Timeline -To audit and review the feature/prediction logs, you might want to retrieve the timeline of log entries. This helps understand when data was logged and monitor the logs. +To audit and review the feature/prediction logs, you might want to retrieve the timeline of log entries. +This helps you understand when data was logged and monitor the logs. ### Retrieve Log Timeline @@ -103,7 +119,8 @@ You may need to read specific log entries for analysis, such as entries within a ### Read all Log Entries -Read all log entries for comprehensive analysis. The output will return all values of the same primary keys instead of just the latest value. +Read all log entries for comprehensive analysis. +The output will return all values of the same primary keys instead of just the latest value. ```python # Read all log entries @@ -113,7 +130,10 @@ print(log_entries) ### Read Log Entries within a Time Range -Focus on logs within a specific time range. You can specify `start_time` and `end_time` for filtering, but the time columns will not be returned in the DataFrame. You can provide the `start/end_time` as `datetime`, `date`, `int`, or `str` type. Accepted date format are: `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f` +Focus on logs within a specific time range. +You can specify `start_time` and `end_time` for filtering, but the time columns will not be returned in the DataFrame. +You can provide the `start/end_time` as `datetime`, `date`, `int`, or `str` type. +Accepted date formats are: `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f` ```python # Read log entries from January 2022 @@ -123,7 +143,8 @@ print(log_entries) ### Read Log Entries by Training Dataset Version -Analyze logs from a particular version of the training dataset. The training dataset version column will be returned in the DataFrame. +Analyze logs from a particular version of the training dataset. +The training dataset version column will be returned in the DataFrame. 
```python # Read log entries of training dataset version 1 @@ -133,7 +154,8 @@ print(log_entries) ### Read Log Entries by Model in Hopsworks -Analyze logs from a particular name and version of the HSML model. The HSML model column will be returned in the DataFrame. +Analyze logs from a particular name and version of the HSML model. +The HSML model column will be returned in the DataFrame. ```python # Read log entries of a specific HSML model @@ -143,7 +165,8 @@ print(log_entries) ### Read Log Entries using a Custom Filter -Provide filters which work similarly to the filter method in the `Query` class. The filter should be part of the query in the feature view. +Provide filters which work similarly to the filter method in the `Query` class. +The filter should be part of the query in the feature view. ```python # Read log entries where feature1 is greater than 0 @@ -175,7 +198,9 @@ feature_view.resume_logging() ## Materializing Logs -Besides the scheduled materialization job, you can materialize logs from Kafka to the offline store on demand. This does not pause the scheduled job. By default, it materializes both transformed and untransformed logs, optionally specifying whether to materialize transformed (transformed=True) or untransformed (transformed=False) logs. +Besides the scheduled materialization job, you can materialize logs from Kafka to the offline store on demand. +This does not pause the scheduled job. +By default, it materializes both transformed and untransformed logs, optionally specifying whether to materialize transformed (transformed=True) or untransformed (transformed=False) logs. ### Materialize Logs @@ -190,11 +215,13 @@ feature_view.materialize_log(wait=True, transformed=True) ## Deleting Logs -When log data is no longer needed, you might want to delete it to free up space and maintain data hygiene. This operation deletes the feature groups and recreates new ones. Scheduled materialization job and log timeline are reset as well. +When log data is no longer needed, you might want to delete it to free up space and maintain data hygiene. +This operation deletes the feature groups and recreates new ones. +Scheduled materialization job and log timeline are reset as well. ### Delete Logs -Remove all log entries (both transformed and untransformed logs), optionally specifying whether to delete transformed (transformed=True) or untransformed (transformed=False) logs. +Remove all log entries (both transformed and untransformed logs), optionally specifying whether to delete transformed (transformed=True) or untransformed (transformed=False) logs. ```python # Delete all log entries @@ -206,4 +233,5 @@ feature_view.delete_log(transformed=True) ## Summary -Feature logging is a crucial part of maintaining and monitoring your machine learning workflows. By following these examples, you can effectively log, retrieve, and delete logs, as well as manage the lifecycle of log materialization jobs, adding observability for your AI system and making it auditable. \ No newline at end of file +Feature logging is a crucial part of maintaining and monitoring your machine learning workflows. +By following these examples, you can effectively log, retrieve, and delete logs, as well as manage the lifecycle of log materialization jobs, adding observability for your AI system and making it auditable. 
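For quick reference, the sketch below ties the main steps of this guide together; it assumes an existing `feature_view` created with `logging_enabled=True` and uses small placeholder DataFrames in place of your own served features and predictions.

```python
import pandas as pd

# Placeholder inference data: the features served to the model and its predictions.
features = pd.DataFrame({"id": [1, 2], "feature1": [0.1, 0.7]})
predictions = pd.DataFrame({"fraud_label": [0, 1]})

# Log the features together with the predictions;
# this implicitly enables logging if it has not been enabled yet.
feature_view.log(features, predictions=predictions, training_dataset_version=1)

# Optionally flush the log from Kafka to the offline store right away
# instead of waiting for the scheduled materialization job.
feature_view.materialize_log(wait=True)
```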
diff --git a/docs/user_guides/fs/feature_view/feature_monitoring.md b/docs/user_guides/fs/feature_view/feature_monitoring.md index c0794cf1b..16c9362b3 100644 --- a/docs/user_guides/fs/feature_view/feature_monitoring.md +++ b/docs/user_guides/fs/feature_view/feature_monitoring.md @@ -1,27 +1,36 @@ # Feature Monitoring for Feature Views -Feature Monitoring complements the Hopsworks data validation capabilities for Feature Group data by allowing you to monitor your data once they have been ingested into the Feature Store. Hopsworks feature monitoring is centered around two functionalities: **scheduled statistics** and **statistics comparison**. +Feature Monitoring complements the Hopsworks data validation capabilities for Feature Group data by allowing you to monitor your data once it has been ingested into the Feature Store. +Hopsworks feature monitoring is centered around two functionalities: **scheduled statistics** and **statistics comparison**. Before continuing with this guide, see the [Feature monitoring guide](../feature_monitoring/index.md) to learn more about how feature monitoring works, and get familiar with the different use cases of feature monitoring for Feature Views described in the **Use cases** sections of the [Scheduled statistics guide](../feature_monitoring/scheduled_statistics.md#use-cases) and [Statistics comparison guide](../feature_monitoring/statistics_comparison.md#use-cases). !!! warning "Limited UI support" - Currently, feature monitoring can only be configured using the [Hopsworks Python library](https://pypi.org/project/hopsworks). However, you can enable/disable a feature monitoring configuration or trigger the statistics comparison manually from the UI, as shown in the [Advanced guide](../feature_monitoring/feature_monitoring_advanced.md). + Currently, feature monitoring can only be configured using the [Hopsworks Python library](https://pypi.org/project/hopsworks). + However, you can enable/disable a feature monitoring configuration or trigger the statistics comparison manually from the UI, as shown in the [Advanced guide](../feature_monitoring/feature_monitoring_advanced.md). ## Code -In this section, we show you how to setup feature monitoring in a Feature View using the ==Hopsworks Python library==. Alternatively, you can get started quickly by running our [tutorial for feature monitoring](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/feature_monitoring.ipynb). +In this section, we show you how to set up feature monitoring in a Feature View using the ==Hopsworks Python library==. +Alternatively, you can get started quickly by running our [tutorial for feature monitoring](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/feature_monitoring.ipynb). -First, checkout the pre-requisite and Hopsworks setup to follow the guide below. Create a project, install the [Hopsworks Python library](https://pypi.org/project/hopsworks) in your environment and connect via the generated API key. The second step is to start a new configuration for feature monitoring. +First, check out the prerequisites and Hopsworks setup to follow the guide below. +Create a project, install the [Hopsworks Python library](https://pypi.org/project/hopsworks) in your environment and connect via the generated API key. +The second step is to start a new configuration for feature monitoring. 
-After that, you can optionally define a detection window of data to compute statistics on, or use the default detection window (i.e., whole feature data). If you want to setup scheduled statistics alone, you can jump to the last step to save your configuration. Otherwise, the third and fourth steps are also optional and show you how to setup the comparison of statistics on a schedule by defining a reference window and specifying the statistics metric to be compared. +After that, you can optionally define a detection window of data to compute statistics on, or use the default detection window (i.e., whole feature data). +If you want to set up scheduled statistics alone, you can jump to the last step to save your configuration. +Otherwise, the third and fourth steps are also optional and show you how to set up the comparison of statistics on a schedule by defining a reference window and specifying the statistics metric to be compared. ### Step 1: Pre-requisite In order to setup feature monitoring for a Feature View, you will need: -- A Hopsworks project. If you don't have a project yet you can go to [app.hopsworks.ai](https://app.hopsworks.ai), signup with your email and create your first project. +- A Hopsworks project. + If you don't have a project yet, you can go to [app.hopsworks.ai](https://app.hopsworks.ai), sign up with your email, and create your first project. - An API key, you can get one by going to "Account Settings" on [app.hopsworks.ai](https://app.hopsworks.ai). -- The [Hopsworks Python library](https://pypi.org/project/hopsworks) installed in your client. See the [installation guide](../../client_installation/index.md). +- The [Hopsworks Python library](https://pypi.org/project/hopsworks) installed in your client. + See the [installation guide](../../client_installation/index.md). - A Feature View - A Training Dataset @@ -38,11 +47,14 @@ Connect the client running your notebooks to Hopsworks. fs = project.get_feature_store() ``` -You will be prompted to paste your API key to connect the notebook to your project. The `fs` Feature Store entity is now ready to be used to insert or read data from Hopsworks. +You will be prompted to paste your API key to connect the notebook to your project. +The `fs` Feature Store entity is now ready to be used to insert or read data from Hopsworks. #### Get or create a Feature View -Feature monitoring can be enabled on already created Feature Views. We suggest you read the [Feature View](../../../concepts/fs/feature_view/fv_overview.md) concept page to understand what a feature view is. We also suggest you familiarize with the APIs to [create a feature view](overview.md) and how to create them using the [query abstraction](query.md). +Feature monitoring can be enabled on already created Feature Views. +We suggest you read the [Feature View](../../../concepts/fs/feature_view/fv_overview.md) concept page to understand what a feature view is. +We also suggest you familiarize yourself with the APIs to [create a feature view](overview.md) and how to create them using the [query abstraction](query.md). The following is a code example for getting or creating a Feature View with name `trans_fv` for transaction data. @@ -102,7 +114,8 @@ You can setup statistics monitoring on a ==single feature or multiple features== #### Statistics comparison -When enabling the comparison of statistics in a feature monitoring configuration, you need to specify a ==single feature== of your Feature Group data, included in your Feature View query. 
You can create multiple feature monitoring configurations on the same Feature View, but each of them should point to a single feature in the Feature View query. +When enabling the comparison of statistics in a feature monitoring configuration, you need to specify a ==single feature== of your Feature Group data, included in your Feature View query. +You can create multiple feature monitoring configurations on the same Feature View, but each of them should point to a single feature in the Feature View query. === "Python" ```python3 @@ -115,15 +128,16 @@ When enabling the comparison of statistics in a feature monitoring configuration #### Custom schedule or percentage of window data -By default, the computation of statistics is scheduled to run endlessly, every day at 12PM. You can modify the default schedule by adjusting the `cron_expression`, `start_date_time` and `end_date_time` parameters. +By default, the computation of statistics is scheduled to run endlessly, every day at 12PM. +You can modify the default schedule by adjusting the `cron_expression`, `start_date_time` and `end_date_time` parameters. === "Python" ```python3 fg_monitoring_config = trans_fv.create_statistics_monitoring( name="trans_fv_all_features_monitoring", description="Compute statistics on all data of all features of the Feature Group data on a weekly basis", - cron_expression="0 0 12 ? * MON *", # weekly - row_percentage=0.8, # use 80% of the data + cron_expression="0 0 12 ? * MON *", # weekly + row_percentage=0.8, # use 80% of the data ) # or @@ -131,14 +145,16 @@ By default, the computation of statistics is scheduled to run endlessly, every d name="trans_fv_amount_monitoring", feature_name="amount", description="Compute descriptive statistics on the amount Feature of the Feature Group data on a weekly basis", - cron_expression="0 0 12 ? * MON *", # weekly + cron_expression="0 0 12 ? * MON *", # weekly row_percentage=0.8, # use 80% of the data ) ``` ### Step 3: (Optional) Define a detection window -By default, the detection window is an _expanding window_ covering the whole Feature Group data. You can define a different detection window using the `window_length` and `time_offset` parameters provided in the `with_detection_window` method. Additionally, you can specify the percentage of feature data on which statistics will be computed using the `row_percentage` parameter. +By default, the detection window is an *expanding window* covering the whole Feature Group data. +You can define a different detection window using the `window_length` and `time_offset` parameters provided in the `with_detection_window` method. +Additionally, you can specify the percentage of feature data on which statistics will be computed using the `row_percentage` parameter. === "Python" ```python3 @@ -175,12 +191,14 @@ When setting up feature monitoring for a Feature View, reference windows can be ### Step 5: (Optional) Define the statistics comparison criteria -In order to compare detection and reference statistics, you need to provide the criteria for such comparison. First, you select the metric to consider in the comparison using the `metric` parameter. 
+Then, you can define a relative or absolute threshold using the `threshold` and `relative` parameters. === "Python" ```python3 fm_monitoring_config.compare_on( - metric="mean", + metric="mean", threshold=0.2, # a relative change over 20% is considered anomalous relative=True, # relative or absolute change strict=False, # strict or relaxed comparison @@ -190,10 +208,10 @@ In order to compare detection and reference statistics, you need to provide the !!! info "Difference values and thresholds" For more information about the computation of difference values and the comparison against threshold bounds see the [Comparison criteria section](../feature_monitoring/statistics_comparison.md#comparison-criteria) in the Statistics comparison guide. - ### Step 6: Save configuration -Finally, you can save your feature monitoring configuration by calling the `save` method. Once the configuration is saved, the schedule for the statistics computation and comparison will be activated automatically. +Finally, you can save your feature monitoring configuration by calling the `save` method. +Once the configuration is saved, the schedule for the statistics computation and comparison will be activated automatically. === "Python" ```python3 diff --git a/docs/user_guides/fs/feature_view/helper-columns.md b/docs/user_guides/fs/feature_view/helper-columns.md index e66677b45..5a50b524c 100644 --- a/docs/user_guides/fs/feature_view/helper-columns.md +++ b/docs/user_guides/fs/feature_view/helper-columns.md @@ -3,28 +3,33 @@ description: Using Helper columns in Feature View queries for online/batch infer --- # Helper columns + Hopsworks Feature Store provides a functionality to define two types of helper columns `inference_helper_columns` and `training_helper_columns` for [feature views](./overview.md). !!! note - Both inference and training helper column name(s) must be part of the `Query` object. If helper column name(s) belong to feature group that is part of a `Join` with `prefix` defined, then this prefix needs to prepended + Both inference and training helper column name(s) must be part of the `Query` object. + If helper column name(s) belong to a feature group that is part of a `Join` with `prefix` defined, then this prefix needs to be prepended to the original column name when defining helper column list. ## Inference Helper columns + `inference_helper_columns` are a list of feature names that are not used for training the model itself but are used for extra information during online or batch inference. -For example, computing an [on-demand feature](../../../concepts/fs/feature_group/on_demand_feature.md) such as `days_valid` (days left that a credit card is valid at the time of the transaction) -in a credit card fraud detection system. The feature `days_valid` will be computed using the credit card expiry date that needs to be fetched from the feature store and compared to the transaction -date that the transaction is performed on (`days_valid` = `expiry_date` - `current_date`). In this use case `expiry_date` is an inference helper column. It is not used for training but is necessary +For example, computing an [on-demand feature](../../../concepts/fs/feature_group/on_demand_feature.md) such as `days_valid` (days left that a credit card is valid at the time of the transaction) 
+The feature `days_valid` will be computed using the credit card expiry date that needs to be fetched from the feature store and compared to the transaction +date that the transaction is performed on (`days_valid` = `expiry_date` - `current_date`). +In this use case `expiry_date` is an inference helper column. +It is not used for training but is necessary for computing the [on-demand feature](../../../concepts/fs/feature_group/on_demand_feature.md)`days_valid` feature. - === "Python" !!! example "Define inference columns for feature views." ```python - # define query object + # define query object query = label_fg.select("fraud_label")\ - .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"])) - + .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"])) + # define feature view with helper columns feature_view = fs.get_or_create_feature_view( name='fv_with_helper_col', @@ -36,8 +41,10 @@ for computing the [on-demand feature](../../../concepts/fs/feature_group/on_dema ) ``` -### Retrieval -When retrieving data for model inference, helper columns will be omitted. However, they can be optionally fetched with inference or training data. +### Inference Data Retrieval + +When retrieving data for model inference, helper columns will be omitted. +However, they can be optionally fetched with inference or training data. #### Batch inference @@ -48,8 +55,8 @@ When retrieving data for model inference, helper columns will be omitted. Howeve # import feature functions from feature_functions import time_delta - - # Fetch feature view object + + # Fetch feature view object feature_view = fs.get_feature_view( name='fv_with_helper_col', version=1, @@ -73,8 +80,8 @@ When retrieving data for model inference, helper columns will be omitted. Howeve ```python from feature_functions import time_delta - - # Fetch feature view object + + # Fetch feature view object feature_view = fs.get_feature_view( name='fv_with_helper_col', version=1, @@ -89,32 +96,32 @@ When retrieving data for model inference, helper columns will be omitted. Howeve # here cc_num, longitute and lattitude are provided as parameters to the application cc_num = ... transaction_date = ... - + # get previous transaction location of this credit card inference_helper = feature_view.get_inference_helper({"cc_num": cc_num}, return_type="dict") - # compute location delta + # compute location delta days_valid = time_delta(transaction_date, inference_helper['expiry_date']) # Now get assembled feature vector for prediction - feature_vector = feature_view.get_feature_vector({"cc_num": cc_num}, + feature_vector = feature_view.get_feature_vector({"cc_num": cc_num}, passed_features={"days_valid": days_valid} ) ``` - ## Training Helper columns -`training_helper_columns` are a list of feature names that are not the part of the model schema itself but are used during training for the extra information. + +`training_helper_columns` are a list of feature names that are not the part of the model schema itself but are used during training for the extra information. For example one might want to use feature like `category` of the purchased product to assign different weights. === "Python" !!! example "Define training helper columns for feature views." 
```python - # define query object + # define query object query = label_fg.select("fraud_label")\ - .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"])) - + .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"])) + # define feature view with helper columns feature_view = fs.get_or_create_feature_view( name='fv_with_helper_col', @@ -126,8 +133,10 @@ For example one might want to use feature like `category` of the purchased produ ) ``` -### Retrieval -When retrieving training data helper columns will be omitted. However, they can be optionally fetched. +### Training Data Retrieval + +When retrieving training data helper columns will be omitted. +However, they can be optionally fetched. === "Python" @@ -136,8 +145,8 @@ When retrieving training data helper columns will be omitted. However, they can # import feature functions from feature_functions import location_delta, time_delta - - # Fetch feature view object + + # Fetch feature view object feature_view = fs.get_feature_view( name='fv_with_helper_col', version=1, @@ -156,7 +165,7 @@ When retrieving training data helper columns will be omitted. However, they can training_dataset_version=1, training_helper_columns=True ) - ``` + ``` !!! note - To use helper columns with materialized training dataset it needs to be created with `training_helper_columns=True`. + To use helper columns with materialized training dataset it needs to be created with `training_helper_columns=True`. diff --git a/docs/user_guides/fs/feature_view/index.md b/docs/user_guides/fs/feature_view/index.md index e8830e210..89c2de606 100644 --- a/docs/user_guides/fs/feature_view/index.md +++ b/docs/user_guides/fs/feature_view/index.md @@ -11,4 +11,4 @@ This section serves to provide guides and examples for the common usage of abstr - [Helper columns](helper-columns.md) - [Model-Dependent Transformation Functions](model-dependent-transformations.md) - [Spines](spine-query.md) -- [Feature Monitoring](feature_monitoring.md) \ No newline at end of file +- [Feature Monitoring](feature_monitoring.md) diff --git a/docs/user_guides/fs/feature_view/model-dependent-transformations.md b/docs/user_guides/fs/feature_view/model-dependent-transformations.md index b6657ae5d..b16972b27 100644 --- a/docs/user_guides/fs/feature_view/model-dependent-transformations.md +++ b/docs/user_guides/fs/feature_view/model-dependent-transformations.md @@ -1,18 +1,29 @@ # Model Dependent Transformation Functions - -[Model-dependent transformations](https://www.hopsworks.ai/dictionary/model-dependent-transformations) transform feature data for a specific model. Feature encoding is one example of such a transformations. Feature encoding is parameterized by statistics from the training dataset, and, as such, many model-dependent transformations require the training dataset statistics as a parameter. Hopsworks enhances the robustness of AI pipelines by preventing [training-inference skew](https://www.hopsworks.ai/dictionary/training-inference-skew) by ensuring that the same model-dependent transformations and statistical parameters are used during both training dataset generation and online inference. +[Model-dependent transformations](https://www.hopsworks.ai/dictionary/model-dependent-transformations) transform feature data for a specific model. +Feature encoding is one example of such a transformations. 
+Feature encoding is parameterized by statistics from the training dataset, and, as such, many model-dependent transformations require the training dataset statistics as a parameter. +Hopsworks enhances the robustness of AI pipelines by preventing [training-inference skew](https://www.hopsworks.ai/dictionary/training-inference-skew), ensuring that the same model-dependent transformations and statistical parameters are used during both training dataset generation and online inference. Additionally, Hopsworks offers built-in model-dependent transformation functions, such as `min_max_scaler`, `standard_scaler`, `robust_scaler`, `label_encoder`, and `one_hot_encoder`, which can be easily imported and declaratively applied to features in a feature view. ## Model Dependent Transformation Function Creation -Hopsworks allows you to create a model-dependent transformation function by attaching a [transformation function](../transformation_functions.md) to a feature view. The attached transformation function can be a simple function that takes one feature as input and outputs the transformed feature data. For example, in the case of min-max scaling a numerical feature, you will have a number as input parameter to the transformation function and a number as output. However, in the case of one-hot encoding a categorical variable, you will have a string as input and an array of 1s and 0s and output. You can also have transformation functions that take multiple features as input and produce one or more values as output. That is, transformation functions can be one-to-one, one-to-many, many-to-one, or many-to-many. - -Each model-dependent transformation function can map specific features to its arguments by explicitly providing their names as arguments to the transformation function. If no feature names are provided, the transformation function will default to using features from the feature view that match the name of the transformation function's argument. +Hopsworks allows you to create a model-dependent transformation function by attaching a [transformation function](../transformation_functions.md) to a feature view. +The attached transformation function can be a simple function that takes one feature as input and outputs the transformed feature data. +For example, in the case of min-max scaling a numerical feature, you will have a number as the input parameter to the transformation function and a number as output. +However, in the case of one-hot encoding a categorical variable, you will have a string as input and an array of 1s and 0s as output. +You can also have transformation functions that take multiple features as input and produce one or more values as output. +That is, transformation functions can be one-to-one, one-to-many, many-to-one, or many-to-many. -Hopsworks by default generates default names of transformed features output by a model-dependent transformation function. The generated names follows a naming convention structured as `functionName_features_outputColumnNumber` if the transformation function outputs multiple columns and `functionName_features` if the transformation function outputs one column. For instance, for the function named `add_one_multiple` that outputs multiple columns in the example given below, produces output columns that would be labeled as  `add_one_multiple_feature1_feature2_feature3_0`,  `add_one_multiple_feature1_feature2_feature3_1` and  `add_one_multiple_feature1_feature2_feature3_2`. 
The function named `add_two` that outputs a single column in the example given below, produces a single output column names as `add_two_feature`. Additionally, Hopsworks also allows users to specify custom names for transformed feature using the [`alias`](../transformation_functions.md#specifying-output-features–names-for-transformation-functions) function. +Each model-dependent transformation function can map specific features to its arguments by explicitly providing their names as arguments to the transformation function. +If no feature names are provided, the transformation function will default to using features from the feature view that match the name of the transformation function's argument. +By default, Hopsworks generates names for the transformed features output by a model-dependent transformation function. +The generated names follow a naming convention structured as `functionName_features_outputColumnNumber` if the transformation function outputs multiple columns and `functionName_features` if the transformation function outputs one column. +For instance, the function named `add_one_multiple` in the example given below, which outputs multiple columns, produces output columns labeled `add_one_multiple_feature1_feature2_feature3_0`, `add_one_multiple_feature1_feature2_feature3_1` and `add_one_multiple_feature1_feature2_feature3_2`. +The function named `add_two`, which outputs a single column in the example given below, produces a single output column named `add_two_feature`. +Additionally, Hopsworks allows users to specify custom names for transformed features using the [`alias`](../transformation_functions.md#specifying-output-features-names-for-transformation-functions) function. === "Python" @@ -22,7 +33,7 @@ Hopsworks by default generates default names of transformed features output by a @udf(return_type=[int, int, int], drop=["feature1", "feature3"]) def add_one_multiple(feature1, feature2, feature3): return pd.DataFrame({"add_one_feature1":feature1 + 1, "add_one_feature2":feature2 + 1, "add_one_feature3":feature3 + 1}) - + # Defining a one to one transformation function. @udf(return_type=int) def add_two(feature): @@ -42,8 +53,7 @@ Hopsworks by default generates default names of transformed features output by a ### Specifying input features -The features to be used by a model-dependent transformation function can be specified by providing the feature names (from the feature view / feature group) as input to the transformation functions. - +The features to be used by a model-dependent transformation function can be specified by providing the feature names (from the feature view / feature group) as input to the transformation functions. === "Python" @@ -63,7 +73,8 @@ The features to be used by a model-dependent transformation function can be spec ### Using built-in transformations -Built-in transformation functions are attached in the same way. The only difference is that they can either be retrieved from the Hopsworks or imported from the `hopsworks` module. +Built-in transformation functions are attached in the same way. +The only difference is that they can either be retrieved from the feature store or imported from the `hopsworks` module. === "Python" @@ -73,7 +84,7 @@ Built-in transformation functions are attached in the same way. 
The only differe standard_scaler = fs.get_transformation_function(name="standard_scaler") robust_scaler = fs.get_transformation_function(name="robust_scaler") label_encoder = fs.get_transformation_function(name="label_encoder") - + feature_view = fs.create_feature_view( name='transactions_view', query=query, @@ -94,7 +105,7 @@ To attach built-in transformation functions from the `hopsworks` module they can !!! example "Creating model-dependent transformation using built-in transformation functions imported from hopsworks" ```python from hopsworks.hsfs.builtin_transformations import min_max_scaler, label_encoder, robust_scaler, standard_scaler - + feature_view = fs.create_feature_view( name='transactions_view', query=query, @@ -108,12 +119,13 @@ To attach built-in transformation functions from the `hopsworks` module they can ) ``` - ## Using Model Dependent Transformations -Model-dependent transformations attached to a feature view are automatically applied when you [create training data](./training-data.md#creation), [read training data](./training-data.md#read-training-data), [read batch inference data](./batch-data.md#creation-with-transformation), or [get feature vectors](./feature-vectors.md#retrieval-with-transformation). The generated data includes untransformed features, on-demand features, if any, and the transformed features. The transformed features are organized by their output column names in alphabetical order and are positioned after the untransformed and on-demand features. +Model-dependent transformations attached to a feature view are automatically applied when you [create training data](./training-data.md#creation), [read training data](./training-data.md#read-training-data), [read batch inference data](./batch-data.md#creation-with-transformation), or [get feature vectors](./feature-vectors.md#retrieval-with-transformation). +The generated data includes untransformed features, on-demand features, if any, and the transformed features. +The transformed features are organized by their output column names in alphabetical order and are positioned after the untransformed and on-demand features. -Model-dependent transformation functions can also be manually applied to a feature vector using the `transform` function. +Model-dependent transformation functions can also be manually applied to a feature vector using the `transform` function. === "Python" @@ -129,11 +141,12 @@ Model-dependent transformation functions can also be manually applied to a featu encoded_feature_vector = fv.transform(feature_vector) ``` -#### Retrieving untransformed feature vector and batch inference data +### Retrieving untransformed feature vector and batch inference data -The `get_feature_vector`, `get_feature_vectors`, and `get_batch_data` methods can return untransformed feature vectors and batch data without applying model-dependent transformations while still including on-demand features. To achieve this, set the `transform` parameter to False. +The `get_feature_vector`, `get_feature_vectors`, and `get_batch_data` methods can return untransformed feature vectors and batch data without applying model-dependent transformations while still including on-demand features. +To achieve this, set the `transform` parameter to False. -=== "Python" +=== "Python" !!! example "Returning untransformed feature vectors and batch data." ```python # Fetching untransformed feature vector. 
@@ -151,4 +164,3 @@ The `get_feature_vector`, `get_feature_vectors`, and `get_batch_data` methods ca transform=False ) ``` - diff --git a/docs/user_guides/fs/feature_view/overview.md b/docs/user_guides/fs/feature_view/overview.md index 1a4da7632..e40cd1615 100644 --- a/docs/user_guides/fs/feature_view/overview.md +++ b/docs/user_guides/fs/feature_view/overview.md @@ -1,11 +1,20 @@ # Feature View -A feature view is a set of features that come from one or more feature groups. It is a logical view over the feature groups, as the feature data is only stored in feature groups. Feature views are used to read feature data for both training and serving (online and batch). You can create [training datasets](training-data.md), create [batch data](batch-data.md) and get [feature vectors](feature-vectors.md). +A feature view is a set of features that come from one or more feature groups. +It is a logical view over the feature groups, as the feature data is only stored in feature groups. +Feature views are used to read feature data for both training and serving (online and batch). +You can create [training datasets](training-data.md), create [batch data](batch-data.md) and get [feature vectors](feature-vectors.md). -If you want to understand more about the concept of feature view, you can refer to [here](../../../concepts/fs/feature_view/fv_overview.md). +If you want to understand more about the concept of a feature view, you can refer to the [Feature View Overview](../../../concepts/fs/feature_view/fv_overview.md). ## Feature View Creation -[Query](./query.md) and [transformation function](./model-dependent-transformations.md) are the building blocks of a feature view. You can define your set of features by building a `query`. You can also define which columns in your feature view are the `labels`, which is useful for supervised machine learning tasks. Furthermore, in python client, each feature can be attached to its own transformation function. This way, when a feature is read (for training or scoring), the transformation is executed on-demand - just before the feature data is returned. For example, when a client reads a numerical feature, the feature value could be normalized by a StandardScalar transformation function before it is returned to the client. + +[Query](./query.md) and [transformation function](./model-dependent-transformations.md) are the building blocks of a feature view. +You can define your set of features by building a `query`. +You can also define which columns in your feature view are the `labels`, which is useful for supervised machine learning tasks. +Furthermore, in the Python client, each feature can be attached to its own transformation function. +This way, when a feature is read (for training or scoring), the transformation is executed on-demand, just before the feature data is returned. +For example, when a client reads a numerical feature, the feature value could be normalized by a StandardScaler transformation function before it is returned to the client. 
=== "Python" @@ -15,7 +24,7 @@ If you want to understand more about the concept of feature view, you can refer name='transactions_view', query=query ) - + # create a feature view with transformation and label feature_view = fs.create_feature_view( name='transactions_view', @@ -44,9 +53,11 @@ If you want to understand more about the concept of feature view, you can refer .build(); ``` -You can refer to [query](./query.md) and [transformation function](./model-dependent-transformations.md) for creating `query` and `transformation_function`. To see a full example of how to create a feature view, you can read [this notebook](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/2_fraud_batch_training_pipeline.ipynb). +You can refer to [query](./query.md) and [transformation function](./model-dependent-transformations.md) for creating `query` and `transformation_function`. +To see a full example of how to create a feature view, you can read [this notebook](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/2_fraud_batch_training_pipeline.ipynb). ## Retrieval + Once you have created a feature view, you can retrieve it by its name and version. === "Python" @@ -59,7 +70,9 @@ Once you have created a feature view, you can retrieve it by its name and versio ``` ## Deletion -If there are some feature view instances which you do not use anymore, you can delete a feature view. It is important to mention that all training datasets (include all materialised hopsfs training data) will be deleted along with the feature view. + +If there are some feature view instances which you do not use anymore, you can delete a feature view. +It is important to mention that all training datasets (include all materialised hopsfs training data) will be deleted along with the feature view. === "Python" ```python @@ -72,16 +85,18 @@ If there are some feature view instances which you do not use anymore, you can d ## Tags -Feature views also support tags. You can attach, get, and remove tags. You can refer to [here]() if you want to learn more about how tags work. +Feature views also support tags. +You can attach, get, and remove tags. +You can learn more in [Tags Guide](../tags/tags.md). === "Python" ```python # attach feature_view.add_tag(name="tag_schema", value={"key", "value"} - + # get feature_view.get_tag(name="tag_schema") - + #remove feature_view.delete_tag(name="tag_schema") ``` @@ -100,4 +115,5 @@ Feature views also support tags. You can attach, get, and remove tags. You can r ``` ## Next -Once you have created a feature view, you can now [create training data](./training-data.md) \ No newline at end of file + +Once you have created a feature view, you can now [create training data](./training-data.md) diff --git a/docs/user_guides/fs/feature_view/query.md b/docs/user_guides/fs/feature_view/query.md index 70f8fb294..775dac9c8 100644 --- a/docs/user_guides/fs/feature_view/query.md +++ b/docs/user_guides/fs/feature_view/query.md @@ -1,10 +1,14 @@ # Query vs DataFrame -HSFS provides a DataFrame API to ingest data into the Hopsworks Feature Store. You can also retrieve feature data in a DataFrame, that can either be used directly to train models or [materialized to file(s)](./training-data.md) for later use to train models. +HSFS provides a DataFrame API to ingest data into the Hopsworks Feature Store. 
+You can also retrieve feature data in a DataFrame, which can either be used directly to train models or [materialized to file(s)](./training-data.md) for later use to train models. -The idea of the Feature Store is to have pre-computed features available for both training and serving models. The key functionality required to generate training datasets from reusable features are: feature selection, joins, filters, and point in time queries. The Query object enables you to select features from different feature groups to join together to be used in a feature view. +The idea of the Feature Store is to have pre-computed features available for both training and serving models. +The key functionalities required to generate training datasets from reusable features are: feature selection, joins, filters, and point-in-time queries. +The Query object enables you to select features from different feature groups to join together to be used in a feature view. -The joining functionality is heavily inspired by the APIs used by Pandas to merge DataFrames. The APIs allow you to specify which features to select from which feature group, how to join them and which features to use in join conditions. +The joining functionality is heavily inspired by the APIs used by Pandas to merge DataFrames. +The APIs allow you to specify which features to select from which feature group, how to join them, and which features to use in join conditions. === "Python" ```python @@ -20,7 +24,7 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg # save the query to feature view feature_view = fs.create_feature_view( - version=1, + version=1, name='credit_card_fraud', labels=["is_fraud"], query=selected_features @@ -54,7 +58,9 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg val query = featureView.getQuery() ``` -If a data scientist wants to modify a new feature that is not available in the feature store, she can write code to compute the new feature (using existing features or external data) and ingest the new feature values into the feature store. If the new feature is based solely on existing feature values in the Feature Store, we call it a derived feature. The same HSFS APIs can be used to compute derived features as well as features using external data sources. +If a data scientist wants to create a new feature that is not available in the feature store, she can write code to compute the new feature (using existing features or external data) and ingest the new feature values into the feature store. +If the new feature is based solely on existing feature values in the Feature Store, we call it a derived feature. +The same HSFS APIs can be used to compute derived features as well as features using external data sources. ## The Query Abstraction @@ -75,14 +81,15 @@ Selecting features from a feature group is a lazy operation, returning a query w === "Scala" ```Scala val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions") - + # Returns Query val selectedFeatures = creditCardTransactionsFg.select(Seq("amount", "latitude", "longitude")) ``` #### Join -Similarly, joins return query objects. 
+The simplest join is one where we join all of the features together from two different feature groups without specifying a join key - `HSFS` will infer the join key as a common primary key between the two feature groups. By default, Hopsworks will use the maximal matching subset of the primary keys of the two feature groups as joining key(s), if not specified otherwise. === "Python" ```python @@ -97,8 +104,11 @@ By default, Hopsworks will use the maximal matching subset of the primary keys o val selectedFeatures = creditCardTransactionsFg.join(accountDetailsFg) ``` More complex joins are possible by selecting subsets of features from the joined feature groups and by specifying a join key and type. -Possible join types are "inner", "left" or "right". By default`join_type` is `"left". Furthermore, it is possible to specify different -features for the join key of the left and right feature group. The join key lists should contain the names of the features to join on. +Possible join types are "inner", "left" or "right". +By default, `join_type` is `"left"`. +Furthermore, it is possible to specify different +features for the join key of the left and right feature group. +The join key lists should contain the names of the features to join on. === "Python" ```python @@ -115,9 +125,9 @@ features for the join key of the left and right feature group. The join key list ``` !!! warning - If there is feature name clash in the query then prefixes will be automatically generated and applied. Generated prefix is feature group - alias in the query (e.g. fg1, fg2). Prefix is applied to the right feature group of the query. - + If there is a feature name clash in the query, then prefixes will be automatically generated and applied. + The generated prefix is the feature group alias in the query (e.g., fg1, fg2). + The prefix is applied to the right feature group of the query. ### Data modeling in Hopsworks @@ -125,7 +135,7 @@ Since v4.0 Hopsworks Feature selection API supports both Star and Snowflake Sche #### Star schema data model -When choosing Star Schema data model all tables are children of the parent (the left most) feature group, which has all +When choosing the Star Schema data model, all tables are children of the parent (the leftmost) feature group, which has all foreign keys for its child feature groups.

@@ -144,7 +154,7 @@ foreign keys for its child feature groups. .join(cc_issuer_details.select_all()) ``` -In online inference, when you want to retrieve features in your online model, you have to provide all foreign key values, +In online inference, when you want to retrieve features for your online model, you have to provide all foreign key values, known as the serving_keys, from the parent feature group to retrieve your precomputed feature values using the feature view. === "Python" @@ -157,10 +167,10 @@ known as the serving_keys, from the parent feature group to retrieve your precom }) ``` -#### Snowflake schema -Hopsworks also provides the possibility to define a feature view that consists of a nested tree of children (to up to a depth of 20) -from the root (left most) feature group. This is called Snowflake Schema data model where you need to build nested tables (subtrees) using joins, and then join the -subtrees to their parents iteratively until you reach the root node (the leftmost feature group in the feature selection): +#### Snowflake schema + +Hopsworks also provides the possibility to define a feature view that consists of a nested tree of children (up to a depth of 20) from the root (leftmost) feature group. +This is called the Snowflake Schema data model, where you need to build nested tables (subtrees) using joins, and then join the subtrees to their parents iteratively until you reach the root node (the leftmost feature group in the feature selection):

@@ -180,13 +190,12 @@ subtrees to their parents iteratively until you reach the root node (the leftmos .join(merchant_details.select_all()) ``` -Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group) -to retrieve the precomputed features: +Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group) to retrieve the precomputed features: === "Python" ```python feature vector = feature_view.get_feature_vector({ - ‘cc_num’: “1234 5555 3333 8888”, + ‘cc_num’: “1234 5555 3333 8888”, ‘merchant_id’: 44208484, }) ``` @@ -195,7 +204,8 @@ to retrieve the precomputed features: In the same way as joins, applying filters to feature groups creates a query with the applied filter. -Filters are constructed with Python Operators `==`, `>=`, `<=`, `!=`, `>`, `<` and additionally with the methods `isin` and `like`. Bitwise Operators `&` and `|` are used to construct conjunctions. +Filters are constructed with Python Operators `==`, `>=`, `<=`, `!=`, `>`, `<` and additionally with the methods `isin` and `like`. +Bitwise Operators `&` and `|` are used to construct conjunctions. For the Scala part of the API, equivalent methods are available in the `Feature` and `Filter` classes. === "Python" @@ -247,7 +257,8 @@ The filters can be applied at any point of the query: #### Joins and/or Filters on feature view query The query retrieved from a feature view can be extended with new joins and/or new filters. -However, this operation will not update the metadata and persist the updated query of the feature view itself. This query can then be used to create a new feature view. +However, this operation will not update the metadata and persist the updated query of the feature view itself. +This query can then be used to create a new feature view. 
=== "Python" ```python @@ -290,7 +301,7 @@ However, this operation will not update the metadata and persist the updated que feature_view.query.join(merchant_details_fg.select_all()) \ .filter((credit_card_transactions_fg.category == "Cash Withdrawal") - # to apply new logic independent of purchase type from above + # to apply new logic independent of purchase type from above # re-fetch new feature view and its query instance feature_view = fs.get_feature_view(“credit_card_fraud”, version=1) @@ -305,18 +316,18 @@ However, this operation will not update the metadata and persist the updated que merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1) accountDetailsFg = fs.getFeatureGroup("account_details", 1) creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1) - + // fetch new feature view and its query instance val featureView = fs.getFeatureView(“credit_card_fraud”, version=1) - + // apply join/filter logic based on purchase type featureView.getQuery.join(merchantDetailsFg.selectAll()) .filter(creditCardTransactionsFg.getFeature("category").eq("Cash Withdrawal")) - - // to apply new logic independent of purchase type from above + + // to apply new logic independent of purchase type from above // re-fetch new feature view and its query instance val featureView = fs.getFeatureView(“credit_card_fraud”, 1) - + // apply new join/filter logic based on account details featureView.getQuery.join(merchantDetailsFg.selectAll()) .filter(accountDetailsFg.getFeature("gender").eq("F")) diff --git a/docs/user_guides/fs/feature_view/training-data.md b/docs/user_guides/fs/feature_view/training-data.md index e5692cd07..82e20155e 100644 --- a/docs/user_guides/fs/feature_view/training-data.md +++ b/docs/user_guides/fs/feature_view/training-data.md @@ -2,13 +2,19 @@ Training data can be created from the feature view and used by different ML libraries for training different models. -You can read [training data concepts](../../../concepts/fs/feature_view/offline_api.md) for more details. To see a full example of how to create training data, you can read [this notebook](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/2_fraud_batch_training_pipeline.ipynb). +You can read [training data concepts](../../../concepts/fs/feature_view/offline_api.md) for more details. +To see a full example of how to create training data, you can read [this notebook](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/2_fraud_batch_training_pipeline.ipynb). For Python-clients, handling small or moderately-sized data, we recommend enabling the [ArrowFlight Server with DuckDB](../../../setup_installation/common/arrow_flight_duckdb.md) service, which will provide significant speedups over Spark/Hive for reading and creating in-memory training datasets. ## Creation -It can be created as in-memory DataFrames or materialised as `tfrecords`, `parquet`, `csv`, or `tsv` files to HopsFS or in all other locations, for example, S3, GCS. If you materialise a training dataset, a `PySparkJob` will be launched. By default, `create_training_data` waits for the job to finish. However, you can run the job asynchronously by passing `write_options={"wait_for_job": False}`. You can monitor the job status in the [jobs overview UI](../../projects/jobs/pyspark_job.md#step-1-jobs-overview). 
+ +It can be created as in-memory DataFrames or materialised as `tfrecords`, `parquet`, `csv`, or `tsv` files to HopsFS or in all other locations, for example, S3, GCS. +If you materialise a training dataset, a `PySparkJob` will be launched. +By default, `create_training_data` waits for the job to finish. +However, you can run the job asynchronously by passing `write_options={"wait_for_job": False}`. +You can monitor the job status in the [jobs overview UI](../../projects/jobs/pyspark_job.md#step-1-jobs-overview). ```python # create a training dataset as dataframe @@ -25,11 +31,16 @@ version, job = feature_view.create_training_data( print(job.id) # get the job's id and view the job status in the UI ``` - ### Extra filters -Sometimes data scientists need to train different models using subsets of a dataset. For example, there can be different models for different countries, seasons, and different groups. One way is to create different feature views for training different models. Another way is to add extra filters on top of the feature view when creating training data. -In the [transaction fraud example](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/1_fraud_batch_feature_pipeline.ipynb), there are different transaction categories, for example: "Health/Beauty", "Restaurant/Cafeteria", "Holliday/Travel" etc. Examples below show how to create training data for different transaction categories. +Sometimes data scientists need to train different models using subsets of a dataset. +For example, there can be different models for different countries, seasons, and different groups. +One way is to create different feature views for training different models. +Another way is to add extra filters on top of the feature view when creating training data. + +In the [transaction fraud example](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/batch-ai-systems/fraud_batch/1_fraud_batch_feature_pipeline.ipynb), there are different transaction categories, for example: "Health/Beauty", "Restaurant/Cafeteria", "Holliday/Travel" etc. +Examples below show how to create training data for different transaction categories. + ```python # Create a training dataset for Health/Beauty df_health = feature_view.training_data( @@ -43,13 +54,15 @@ df_restaurant_travel = feature_view.training_data( ) ``` - ### Train/Validation/Test Splits -In most cases, ML practitioners want to slice a dataset into multiple splits, most commonly train-test splits or train-validation-test splits, so that they can train and test their models. Feature view provides a sklearn-like API for this purpose, so it is very easy to create a training dataset with different splits. + +In most cases, ML practitioners want to slice a dataset into multiple splits, most commonly train-test splits or train-validation-test splits, so that they can train and test their models. +Feature view provides a sklearn-like API for this purpose, so it is very easy to create a training dataset with different splits. Create a training dataset (as in-memory DataFrames) or materialise a training dataset with train and test splits. + ```python -# create a training dataset +# create a training dataset X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2) # materialise a training dataset @@ -61,13 +74,14 @@ version, job = feature_view.create_train_test_split( ``` Create a training dataset (as in-memory DataFrames) or materialise a training dataset with train, validation, and test splits. 
+ ```python # create a training dataset as DataFrame X_train, X_val, X_test, y_train, y_val, y_test = feature_view.train_validation_test_split(validation_size=0.3, test_size=0.2) # materialise a training dataset version, job = feature_view.create_train_validation_test_split( - validation_size = 0.3, + validation_size = 0.3, test_size = 0.2 description = 'transactions fraud batch training dataset', data_format = 'csv' @@ -76,13 +90,19 @@ version, job = feature_view.create_train_validation_test_split( If the [ArrowFlight Server with DuckDB](../../../setup_installation/common/arrow_flight_duckdb.md) service is enabled, and you want to create a particular in-memory training dataset with Hive instead, you can set `read_options={"use_hive": True}`. + ```python # create a training dataset as DataFrame with Hive X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2, read_options={"use_hive: True}) ``` ## Read Training Data -Once you have created a training dataset, all its metadata are saved in Hopsworks. This enables you to reproduce exactly the same dataset at a later point in time. This holds for training data as both DataFrames or files. That is, you can delete the training data files (for example, to reduce storage costs), but still reproduce the training data files later on if you need to. + +Once you have created a training dataset, all its metadata are saved in Hopsworks. +This enables you to reproduce exactly the same dataset at a later point in time. +This holds for training data as both DataFrames or files. +That is, you can delete the training data files (for example, to reduce storage costs), but still reproduce the training data files later on if you need to. + ```python # get a training dataset feature_df, label_df = feature_view.get_training_data(training_dataset_version=1) @@ -95,23 +115,23 @@ X_train, X_val, X_test, y_train, y_val, y_test = feature_view.get_train_validati ``` ## Passing Context Variables to Transformation Functions + Once you have [defined a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the required context variables using the `transformation_context` parameter when generating IN-MEMORY training data or materializing a training dataset. !!! note Passing context variables for materializing a training dataset is only supported in the PySpark Kernel. - -=== "Python" +=== "Python" !!! example "Passing context variables while creating training data." ```python # Passing context variable to IN-MEMORY Training Dataset. - X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1, + X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1, primary_key=True, event_time=True, transformation_context={"context_parameter":10}) # Passing context variable to Materialized Training Dataset. - version, job = feature_view.get_train_test_split(training_dataset_version=1, + version, job = feature_view.get_train_test_split(training_dataset_version=1, primary_key=True, event_time=True, transformation_context={"context_parameter":10}) @@ -119,26 +139,30 @@ Once you have [defined a transformation function using a context variable](../tr ``` ## Read training data with primary key(s) and event time -For certain use cases, e.g. time series models, the input data needs to be sorted according to the primary key(s) and event time combination. 
+ +For certain use cases, e.g., time series models, the input data needs to be sorted according to the primary key(s) and event time combination. Primary key(s) and event time are not usually included in the feature view query as they are not features used for training. To retrieve the primary key(s) and/or event time when retrieving training data, you need to set the parameters `primary_key=True` and/or `event_time=True`. - ```python # get a training dataset -X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1, +X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1, primary_key=True, event_time=True) ``` !!! note - All primary and event time columns of all the feature groups included in the feature view will be returned. If they have the same names across feature groups and the join prefix was not provided then reading operation will fail with ambiguous column exception. - Make sure to define the join prefix if primary key and event time columns have the same names across feature groups. + All primary and event time columns of all the feature groups included in the feature view will be returned. + If they have the same names across feature groups and the join prefix was not provided, then the read operation will fail with an ambiguous column exception. + Make sure to define the join prefix if primary key and event time columns have the same names across feature groups. - To use primary key(s) and event time column with materialized training datasets it needs to be created with `primary_key=True` and/or `with_event_time=True`. + To use primary key(s) and event time columns with materialized training datasets, the training dataset needs to be created with `primary_key=True` and/or `with_event_time=True`. ## Deletion -To clean up unused training data, you can delete all training data or for a particular version. Note that all metadata of training data and materialised files stored in HopsFS will be deleted and cannot be recreated anymore. + +To clean up unused training data, you can delete all training data or only a particular version. +Note that all metadata of training data and materialised files stored in HopsFS will be deleted and cannot be recreated anymore. + ```python # delete a training data version feature_view.delete_training_dataset(training_dataset_version=1) @@ -146,7 +170,11 @@ feature_view.delete_training_dataset(training_dataset_version=1) # delete all training datasets feature_view.delete_all_training_datasets() ``` -It is also possible to keep the metadata and delete only the materialised files. Then you can recreate the deleted files by just specifying a version, and you get back the exact same dataset again. This is useful when you are running out of storage. + +It is also possible to keep the metadata and delete only the materialised files. +Then you can recreate the deleted files by just specifying a version, and you get back the exact same dataset again. +This is useful when you are running out of storage. + ```python # delete files of a training data version feature_view.purge_training_data(training_dataset_version=1) @@ -154,18 +182,23 @@ feature_view.purge_training_data(training_dataset_version=1) # delete files of all training datasets feature_view.purge_all_training_data() ``` + To recreate a training dataset: + ```python feature_view.recreate_training_dataset(training_dataset_version =1) ``` ## Tags -Similar to feature view, You can attach, get, and remove tags.
You can refer to [here](../tags/tags.md) if you want to learn more about how tags work. + +Similar to feature view, You can attach, get, and remove tags. +You can learn more in [Tags Guide](../tags/tags.md). + ```python # attach feature_view.add_training_dataset_tag( - training_dataset_version=1, - name="tag_schema", + training_dataset_version=1, + name="tag_schema", value={"key", "value"} ) @@ -177,4 +210,6 @@ feature_view.delete_training_dataset_tag(training_dataset_version=1, name="tag_s ``` ## Next -Once you have created a training dataset and trained your model, you can deploy your model in a "batch" or "online" setting. Next, you can learn how to create [batch data](./batch-data.md) and get [feature vectors](./feature-vectors.md). \ No newline at end of file + +Once you have created a training dataset and trained your model, you can deploy your model in a "batch" or "online" setting. +Next, you can learn how to create [batch data](./batch-data.md) and get [feature vectors](./feature-vectors.md). diff --git a/docs/user_guides/fs/index.md b/docs/user_guides/fs/index.md index 4d6ad1f0b..1d4d0fbb4 100644 --- a/docs/user_guides/fs/index.md +++ b/docs/user_guides/fs/index.md @@ -8,4 +8,4 @@ This section serves to provide guides and examples for the common usage of abstr - [Vector Similarity Search](vector_similarity_search.md) - [Compute Engines](compute_engines.md) - [Integrations](../integrations/index.md) -- [Transformations](transformation_functions.md) \ No newline at end of file +- [Transformations](transformation_functions.md) diff --git a/docs/user_guides/fs/provenance/provenance.md b/docs/user_guides/fs/provenance/provenance.md index 4362d9247..272cb10bb 100644 --- a/docs/user_guides/fs/provenance/provenance.md +++ b/docs/user_guides/fs/provenance/provenance.md @@ -14,19 +14,22 @@ In the provenance pages we will call a provenance artifact or shortly artifact, With the following provenance graph: -``` +```plaintext data source -> feature group -> feature group -> feature view -> training dataset -> model ``` -we will call the parent, the artifact to the left, and the child, the artifact to the right. So a feature view has a number of feature groups as parents and can have a number of training datasets as children. +we will call the parent, the artifact to the left, and the child, the artifact to the right. +So a feature view has a number of feature groups as parents and can have a number of training datasets as children. -Tracking provenance allows users to determine where and if an artifact is being used. You can track, for example, if feature groups are being used to create additional (derived) feature groups or feature views, or if their data is eventually used to train models. +Tracking provenance allows users to determine where and if an artifact is being used. +You can track, for example, if feature groups are being used to create additional (derived) feature groups or feature views, or if their data is eventually used to train models. You can interact with the provenance graph using the UI or the APIs. ## Step 1: Data Source lineage -The relationship between data sources and feature groups is captured automatically when you create an external feature group. You can inspect the relationship between data sources and feature groups using the APIs. +The relationship between data sources and feature groups is captured automatically when you create an external feature group. +You can inspect the relationship between data sources and feature groups using the APIs. 
=== "Python" @@ -44,9 +47,10 @@ The relationship between data sources and feature groups is captured automatical user_profiles_fg.save() ``` -### Using the APIs +### Step 1, Using Python -Starting from a feature group metadata object, you can traverse upstream the provenance graph to retrieve the metadata objects of the data sources that are part of the feature group. To do so, you can use the [get_storage_connector_provenance](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_storage_connector_provenance) method. +Starting from a feature group metadata object, you can traverse upstream the provenance graph to retrieve the metadata objects of the data sources that are part of the feature group. +To do so, you can use the [`FeatureGroup.get_storage_connector_provenance`][hsfs.feature_group.FeatureGroup.get_storage_connector_provenance] method. === "Python" @@ -71,7 +75,8 @@ Starting from a feature group metadata object, you can traverse upstream the pro user_profiles_fg.get_storage_connector() ``` -To traverse the provenance graph in the opposite direction (i.e. from the data source to the feature group), you can use the [get_feature_groups_provenance](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/storage_connector_api/#get_feature_groups_provenance) method. When navigating the provenance graph downstream, the `deleted` feature groups are not tracked by provenance, as such, the `deleted` property will always return an empty list. +To traverse the provenance graph in the opposite direction (i.e., from the data source to the feature group), you can use the [`StorageConnector.get_feature_groups_provenance`][hsfs.storage_connector.StorageConnector.get_feature_groups_provenance] method. +When navigating the provenance graph downstream, the `deleted` feature groups are not tracked by provenance, as such, the `deleted` property will always return an empty list. === "Python" @@ -97,7 +102,9 @@ To traverse the provenance graph in the opposite direction (i.e. from the data s ### Assign parents to a feature group -When creating a feature group, it is possible to specify a list of feature groups used to create the derived features. For example, you could have an external feature group defined over a Snowflake or Redshift table, which you use to compute the features and save them in a feature group. You can mark the external feature group as parent of the feature group you are creating by using the `parents` parameter in the [get_or_create_feature_group](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_or_create_feature_group) or [create_feature_group](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#create_feature_group) methods: +When creating a feature group, it is possible to specify a list of feature groups used to create the derived features. +For example, you could have an external feature group defined over a Snowflake or Redshift table, which you use to compute the features and save them in a feature group. 
+You can mark the external feature group as parent of the feature group you are creating by using the `parents` parameter in the [`FeatureStore.get_or_create_feature_group`][hsfs.feature_store.FeatureStore.get_or_create_feature_group] or [`FeatureStore.create_feature_group`][hsfs.feature_store.FeatureStore.create_feature_group] methods: === "Python" @@ -150,7 +157,8 @@ Another example use case for derived feature group is if you have a feature grou ### List feature group parents -You can query the provenance graph of a feature group using the UI and the APIs. From the APIs you can list the parent feature groups by calling the method [get_parent_feature_groups](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_parent_feature_groups) +You can query the provenance graph of a feature group using the UI and the APIs. +From the APIs you can list the parent feature groups by calling the method [`FeatureGroup.get_parent_feature_groups`][hsfs.feature_group.FeatureGroup.get_parent_feature_groups] === "Python" @@ -167,9 +175,10 @@ You can query the provenance graph of a feature group using the UI and the APIs. lineage.inaccessible ``` -A parent is marked as `deleted` (and added to the deleted list) if the parent feature group was deleted. `inaccessible` if you no longer have access to the parent feature group (e.g. the parent feature group belongs to a project you no longer have access to). +A parent is marked as `deleted` (and added to the deleted list) if the parent feature group was deleted. `inaccessible` if you no longer have access to the parent feature group (e.g., the parent feature group belongs to a project you no longer have access to). -To traverse the provenance graph in the opposite direction (i.e. from the parent feature group to the child), you can use the [get_generate_feature_groups](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_generated_feature_groups) method. When navigating the provenance graph downstream, the `deleted` feature groups are not tracked by provenance, as such, the `deleted` property will always return an empty list. +To traverse the provenance graph in the opposite direction (i.e., from the parent feature group to the child), you can use the [`FeatureGroup.get_generated_feature_groups`][hsfs.feature_group.FeatureGroup.get_generated_feature_groups] method. +When navigating the provenance graph downstream, the `deleted` feature groups are not tracked by provenance, as such, the `deleted` property will always return an empty list. === "Python" @@ -183,7 +192,8 @@ To traverse the provenance graph in the opposite direction (i.e. from the parent lineage.inaccessible ``` -You can also visualize the relationship between the parent and child feature groups in the UI. In each feature group overview page you can find a provenance section with the graph of parent data source/feature groups and child feature groups/feature views. +You can also visualize the relationship between the parent and child feature groups in the UI. +In each feature group overview page you can find a provenance section with the graph of parent data source/feature groups and child feature groups/feature views.

@@ -194,11 +204,13 @@ You can also visualize the relationship between the parent and child feature gro ## Step 3: Feature view lineage -The relationship between feature groups and feature views is captured automatically when you create a feature view. You can inspect the relationship between feature groups and feature views using the APIs or the UI. +The relationship between feature groups and feature views is captured automatically when you create a feature view. +You can inspect the relationship between feature groups and feature views using the APIs or the UI. -### Using the APIs +### Step 3, Using Python -Starting from a feature view metadata object, you can traverse upstream the provenance graph to retrieve the metadata objects of the feature groups that are part of the feature view. To do so, you can use the [get_parent_feature_groups](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#get_parent_feature_groups) method. +Starting from a feature view metadata object, you can traverse upstream the provenance graph to retrieve the metadata objects of the feature groups that are part of the feature view. +To do so, you can use the [`FeatureView.get_parent_feature_groups`][hsfs.feature_view.FeatureView.get_parent_feature_groups] method. === "Python" @@ -215,7 +227,10 @@ Starting from a feature view metadata object, you can traverse upstream the prov lineage.inaccessible ``` -You can also traverse the provenance graph in the opposite direction. Starting from a feature group you can navigate downstream and list all the feature views the feature group is used in. As for the derived feature group example above, when navigating the provenance graph downstream `deleted` feature views are not tracked. As such, the `deleted` property will always be empty. +You can also traverse the provenance graph in the opposite direction. +Starting from a feature group you can navigate downstream and list all the feature views the feature group is used in. +As for the derived feature group example above, when navigating the provenance graph downstream `deleted` feature views are not tracked. +As such, the `deleted` property will always be empty. === "Python" @@ -229,7 +244,7 @@ You can also traverse the provenance graph in the opposite direction. Starting f lineage.inaccessible ``` -Users can call the [get_models_provenance](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_view_api/#get_models_provenance) method which will return a [Link](#provenance-links) object. +Users can call the [`FeatureView.get_models_provenance`][hsfs.feature_view.FeatureView.get_models_provenance] method which will return a [provenance Link object](#provenance-links). You can also retrive directly the accessible models, without the need to extract them from the provenance links object: === "Python" @@ -242,7 +257,8 @@ You can also retrive directly the accessible models, without the need to extract models = fraud_fv.get_models(training_dataset_version: 1) ``` -Also we added a utility method to retrieve from the user's accessible models, the last trained one. Last is determined based on timestamp when it was saved into the model registry. +Also we added a utility method to retrieve from the user's accessible models, the last trained one. +Last is determined based on timestamp when it was saved into the model registry. 
=== "Python" ```python @@ -252,7 +268,7 @@ Also we added a utility method to retrieve from the user's accessible models, th model = fraud_fv.get_newest_model(training_dataset_version: 1) ``` -### Using the UI +### Step 3, Using UI In the feature view overview UI you can explore the provenance graph of the feature view: @@ -268,5 +284,7 @@ In the feature view overview UI you can explore the provenance graph of the feat All the `_provenance` methods return a `Link` dictionary object that contains `accessible`, `inaccesible`, `deleted` lists. - `accessible` - contains any artifact from the result, that the user has access to. -- `inaccessible` - contains any artifacts that might have been shared at some point in the past, but where this sharing was retracted. Since the relation between artifacts is still maintained in the provenance, the user will only have access to limited metadata and the artifacts will be included in this `inaccessible` list. -- `deleted` - contains artifacts that are deleted with children stil present in the system. There is minimum amount of metadata for the deleted allowing for some limited human readable identification. +- `inaccessible` - contains any artifacts that might have been shared at some point in the past, but where this sharing was retracted. +Since the relation between artifacts is still maintained in the provenance, the user will only have access to limited metadata and the artifacts will be included in this `inaccessible` list. +- `deleted` - contains artifacts that are deleted with children stil present in the system. +There is minimum amount of metadata for the deleted allowing for some limited human readable identification. diff --git a/docs/user_guides/fs/sharing/sharing.md b/docs/user_guides/fs/sharing/sharing.md index 206845f4c..56f83ec0a 100644 --- a/docs/user_guides/fs/sharing/sharing.md +++ b/docs/user_guides/fs/sharing/sharing.md @@ -2,18 +2,21 @@ ## Introduction -Hopsworks allows artifacts (e.g. feature groups, feature views) to be shared between projects. +Hopsworks allows artifacts (e.g., feature groups, feature views) to be shared between projects. There are two main use cases for sharing features between projects: -- If you have multiple teams working on the same Hopsworks deployment. Each team works within its own set of projects. +- If you have multiple teams working on the same Hopsworks deployment. + Each team works within its own set of projects. If team A wants to leverage features built by team B, they can do so by sharing the feature groups from a team A project to a team B project. -- By creating different projects for the different stages of the development lifecycle (e.g. a dev project, a testing project, and a production project), - you can make sure that changes on the development project don't impact the features in the production project. At the same time, you might want to - leverage production features to develop new models or additional features. In this case, you can share the production feature store with the +- By creating different projects for the different stages of the development lifecycle (e.g., a dev project, a testing project, and a production project), + you can make sure that changes on the development project don't impact the features in the production project. + At the same time, you might want to + leverage production features to develop new models or additional features. + In this case, you can share the production feature store with the development feature store in `read-only` mode. 
-### Step 1: Open the project of the feature store that you would like to share on Hopsworks. +### Step 1: Open the project of the feature store that you would like to share on Hopsworks In the `Project Settings` navigate to the `Shared with other projects` section. @@ -35,14 +38,14 @@ In the `Project` section choose project you wish to share the feature store with

-Feature stores can be shared exclusively using `read-only` permission. This means that a member is not capable of enacting any changes on the shared project. +Feature stores can be shared exclusively using `read-only` permission. +This means that a member is not capable of enacting any changes on the shared project. ### Step 3: Accept the Invitation In the project where the feature store was shared (step 2) go to `Project Settings` and navigate to the `Shared from other projects` section. Click `accept`. -

Accept @@ -57,14 +60,15 @@ After accepting the share, the shared feature store is listed under the `Shared

-## Use features from a shared feature store +## Use features from a shared feature store + +### Step 1: Get feature store handles -### Step 1: Get feature store handles -To access features from a shared feature store you need to first retrieve the handle for the shared feature store. +To access features from a shared feature store you need to first retrieve the handle for the shared feature store. To retrieve the handle use the get_feature_store() method and provide the name of the shared feature store ```python -import hopsworks +import hopsworks project = hopsworks.login() diff --git a/docs/user_guides/fs/tags/tags.md b/docs/user_guides/fs/tags/tags.md index c4a07f8b8..10c43e763 100644 --- a/docs/user_guides/fs/tags/tags.md +++ b/docs/user_guides/fs/tags/tags.md @@ -1,27 +1,35 @@ -# Tags +# Tags ## Introduction Hopsworks feature store enables users to attach tags to artifacts, such as feature groups, feature views or training datasets. -A tag is a `{key: value}` pair which provides additional information about the data managed by Hopsworks. Tags allow you to design custom metadata for your artifacts. For example, you could design a tag schema that encodes governance rules for your feature store, such as classifying data as personally identifiable, defining a data retention period for the data, and defining who signed off on the creation of some feature. +A tag is a `{key: value}` pair which provides additional information about the data managed by Hopsworks. +Tags allow you to design custom metadata for your artifacts. +For example, you could design a tag schema that encodes governance rules for your feature store, such as classifying data as personally identifiable, defining a data retention period for the data, and defining who signed off on the creation of some feature. ## Prerequisites -Tags have a schema. Before you can attach a tag to an artifact and fill in the tag values, you first need to select an existing tag schema or create a new tag schema. +Tags have a schema. +Before you can attach a tag to an artifact and fill in the tag values, you first need to select an existing tag schema or create a new tag schema. -Tag schemas can be defined by Hopsworks administrator in the `Cluster settings` section of the platform. Schemas are defined globally across all projects. When users attach tags to an artifact, the tag will be validated against a specific schema. This allows tags to be consistent no matter the project or the team generating them. +Tag schemas can be defined by Hopsworks administrator in the `Cluster settings` section of the platform. +Schemas are defined globally across all projects. +When users attach tags to an artifact, the tag will be validated against a specific schema. +This allows tags to be consistent no matter the project or the team generating them. !!! warning "Immutable" - Tag schemas are immutable. Once defined, a tag schema cannot be edited nor deleted. + Tag schemas are immutable. + Once defined, a tag schema cannot be edited nor deleted. ## Step 1: Define a tag schema -Tag schemas can be defined using the UI wizard in the `Cluster settings` > `Tag schemas` section. -Tag schemas have a name, the name is used to uniquely identify the schema. You can also provide an optional description. +Tag schemas can be defined using the UI wizard in the `Cluster settings` > `Tag schemas` section. +Tag schemas have a name, the name is used to uniquely identify the schema. +You can also provide an optional description. 
You can define a schema by using the UI tool or by providing the schema in JSON format. -If you use the UI tool, you should provide the name of the property in the schema, the type of the property, whether or not the property is required and an optional description. +If you use the UI tool, you should provide the name of the property in the schema, the type of the property, whether or not the property is required and an optional description.

@@ -30,10 +38,13 @@ If you use the UI tool, you should provide the name of the property in the schem

-The UI tool allows you to define simple not-nested schemas. For more advanced use cases, more complex schemas (e.g. nested schemas) might be required to fully express the content of a given artifact. -In such cases it is possible to provide the schema directly as JSON string. The JSON should follow the standard [https://json-schema.org](https://json-schema.org). An example of complex schema is the following: +The UI tool allows you to define simple not-nested schemas. +For more advanced use cases, more complex schemas (e.g., nested schemas) might be required to fully express the content of a given artifact. +In such cases it is possible to provide the schema directly as JSON string. +The JSON should follow the standard [https://json-schema.org](https://json-schema.org). +An example of complex schema is the following: -``` +```json { "type" : "object", "properties" : @@ -51,9 +62,10 @@ In such cases it is possible to provide the schema directly as JSON string. The } ``` -Additionally it is also possible to define a single property as tag. You can achieve this by defining a JSON schema like the following: +Additionally it is also possible to define a single property as tag. +You can achieve this by defining a JSON schema like the following: -``` +```json { "type" : "string" } ``` @@ -81,7 +93,7 @@ You can attach tags to feature groups and feature views by using the `add_tag()` } # Attach the tag - fg.add_tag("data_privacy", tag) + fg.add_tag("data_privacy", tag) ``` You can see the list of tags attached to a given artifact by using the `get_tags()` method: @@ -92,7 +104,7 @@ You can see the list of tags attached to a given artifact by using the `get_tags # Retrieve the feature group fg = fs.get_feature_group("transactions_4h_aggs_fraud_batch_fg", version=1) - # Retrieve the tags for this feature group + # Retrieve the tags for this feature group fg.get_tags() ``` @@ -104,7 +116,7 @@ Finally you can remove a tag from a given artifact by calling the `delete_tag()` # Retrieve the feature group fg = fs.get_feature_group("transactions_4h_aggs_fraud_batch_fg", version=1) - # Retrieve the tags for this feature group + # Retrieve the tags for this feature group fg.delete_tag("data_privacy") ``` @@ -112,7 +124,9 @@ The same APIs work for feature views and training dataset alike. ### Using the UI -You can attach tags to feature groups and feature views directly from the UI. You can navigate on the artifact page and click on the `Add tags` button. From there you can select the tag schema of the tag you want to attach and populate the values as shown in the gif below. +You can attach tags to feature groups and feature views directly from the UI. +You can navigate on the artifact page and click on the `Add tags` button. +From there you can select the tag schema of the tag you want to attach and populate the values as shown in the gif below.

@@ -123,11 +137,12 @@ You can attach tags to feature groups and feature views directly from the UI. Yo ## Step 3: Search -Hopsworks indexes the tags attached to feature groups, feature views and training datasets. The tags will then be searchable using the free text search box located at the top of the UI. +Hopsworks indexes the tags attached to feature groups, feature views and training datasets. +The tags will then be searchable using the free text search box located at the top of the UI.

Search for tags in the feature store
Search for tags in the feature store
-

\ No newline at end of file +

diff --git a/docs/user_guides/fs/transformation_functions.md b/docs/user_guides/fs/transformation_functions.md index fb0f5beec..db060ceb5 100644 --- a/docs/user_guides/fs/transformation_functions.md +++ b/docs/user_guides/fs/transformation_functions.md @@ -1,32 +1,47 @@ -# Transformation Functions +# Transformation Functions -In AI systems, [transformation functions](https://www.hopsworks.ai/dictionary/transformation) transform data to create features, the inputs to machine learning models (in both training and inference). The [taxonomy of data transformations](../../concepts/mlops/data_transformations.md) introduces three types of data transformation prevalent in all AI systems. Hopsworks offers simple Python APIs to define custom transformation functions. These can be used along with [feature groups](./feature_group/index.md) and [feature views](./feature_view/overview.md) to create [on-demand transformations](./feature_group/on_demand_transformations.md) and [model-dependent transformations](./feature_view/model-dependent-transformations.md), producing modular AI pipelines that are skew-free. +In AI systems, [transformation functions](https://www.hopsworks.ai/dictionary/transformation) transform data to create features, the inputs to machine learning models (in both training and inference). +The [taxonomy of data transformations](../../concepts/mlops/data_transformations.md) introduces three types of data transformation prevalent in all AI systems. +Hopsworks offers simple Python APIs to define custom transformation functions. +These can be used along with [feature groups](./feature_group/index.md) and [feature views](./feature_view/overview.md) to create [on-demand transformations](./feature_group/on_demand_transformations.md) and [model-dependent transformations](./feature_view/model-dependent-transformations.md), producing modular AI pipelines that are skew-free. ## Custom Transformation Function Creation -User-defined transformation functions can be created in Hopsworks using the [`@udf`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/generated/api/udf/) decorator. These functions can be either implemented as pure Python UDFs or Pandas UDFs (User-Defined Functions). +User-defined transformation functions can be created in Hopsworks using the [`@udf`][hsfs.hopsworks_udf.udf] decorator. +These functions can be either implemented as pure Python UDFs or Pandas UDFs (User-Defined Functions). -Hopsworks offers three execution modes to control the execution of transformation functions during training dataset creation, batch inference, and online inference. By default, Hopsworks executes transformation functions as Python UDFs for [feature vector retrieval](feature_view/feature-vectors.md) in online inference pipelines and as Pandas UDFs for both [batch data retrieval](feature_view/batch-data.md) in batch inference pipelines and [training dataset creation](feature_view/training-data.md) in training pipelines. Python UDFs are optimized for smaller data volumes, while Pandas UDFs provide better performance on larger datasets. This execution mode provides the optimal balance based on the data size across training dataset generations, batch inference, and online inference. Additionally, Hopsworks allows you to explicitly set the execution mode for a transformation function to `python` or `pandas`, forcing the transformation function to always run as either a Python or Pandas UDF as specified. 
+Hopsworks offers three execution modes to control the execution of transformation functions during training dataset creation, batch inference, and online inference. +By default, Hopsworks executes transformation functions as Python UDFs for [feature vector retrieval](feature_view/feature-vectors.md) in online inference pipelines and as Pandas UDFs for both [batch data retrieval](feature_view/batch-data.md) in batch inference pipelines and [training dataset creation](feature_view/training-data.md) in training pipelines. +Python UDFs are optimized for smaller data volumes, while Pandas UDFs provide better performance on larger datasets. +This execution mode provides the optimal balance based on the data size across training dataset generations, batch inference, and online inference. +Additionally, Hopsworks allows you to explicitly set the execution mode for a transformation function to `python` or `pandas`, forcing the transformation function to always run as either a Python or Pandas UDF as specified. -A Pandas UDF in Hopsworks accepts one or more Pandas Series as input and can return either one or more Series or a Pandas DataFrame. When integrated with PySpark applications, Hopsworks automatically executes Pandas UDFs using PySpark’s [`pandas_udf`](https://spark.apache.org/docs/3.4.1/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html), enabling the transformation functions to efficiently scale for large datasets. +A Pandas UDF in Hopsworks accepts one or more Pandas Series as input and can return either one or more Series or a Pandas DataFrame. +When integrated with PySpark applications, Hopsworks automatically executes Pandas UDFs using PySpark’s [`pandas_udf`](https://spark.apache.org/docs/3.4.1/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html), enabling the transformation functions to efficiently scale for large datasets. !!! warning "Java/Scala support" - Hopsworks supports transformations functions in Python (Pandas UDFs, Python UDFs). Transformations functions can also be executed in Python-based DataFrame frameworks (PySpark, Pandas). There is currently no support for transformation functions in SQL or Java-based feature pipelines. + Hopsworks supports transformation functions in Python (Pandas UDFs, Python UDFs). + Transformation functions can also be executed in Python-based DataFrame frameworks (PySpark, Pandas). + There is currently no support for transformation functions in SQL or Java-based feature pipelines. -Transformation functions created in Hopsworks can be directly attached to feature views or feature groups or stored in the feature store for later retrieval. These functions can be part of a library [installed](../../user_guides/projects/python/python_install.md) in Hopsworks or be defined in a [Jupyter notebook](../../user_guides/projects/jupyter/python_notebook.md) running a Python kernel or added when starting a Jupyter notebook or [Hopsworks job](../../user_guides/projects/jobs/spark_job.md). +Transformation functions created in Hopsworks can be directly attached to feature views or feature groups or stored in the feature store for later retrieval. +These functions can be part of a library [installed](../../user_guides/projects/python/python_install.md) in Hopsworks or be defined in a [Jupyter notebook](../../user_guides/projects/jupyter/python_notebook.md) running a Python kernel or added when starting a Jupyter notebook or [Hopsworks job](../../user_guides/projects/jobs/spark_job.md). !!!
warning "PySpark Kernels" - Definition transformation function within a Jupyter notebook is only supported in Python Kernel. In a PySpark Kernel transformation function have to defined as modules or added when starting a Jupyter notebook. + Definition transformation function within a Jupyter notebook is only supported in Python Kernel. + In a PySpark Kernel transformation function have to defined as modules or added when starting a Jupyter notebook. - -The `@udf` decorator in Hopsworks creates a metadata class called [`HopsworksUdf`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/generated/api/hopsworks_udf/). This class manages the necessary operations to execute the transformation function. +The `@udf` decorator in Hopsworks creates a metadata class called [`HopsworksUdf`][hsfs.hopsworks_udf.HopsworksUdf]. +This class manages the necessary operations to execute the transformation function. The decorator accepts three parameters: -- **`return_type`** (required): Specifies the data type(s) of the features returned by the transformation function. It can be a single Python type if the function returns one transformed feature, or a list of Python types if it returns multiple transformed features. The supported Python types that be used with the `return_type` argument are provided in the table below: +- **`return_type`** (required): Specifies the data type(s) of the features returned by the transformation function. + It can be a single Python type if the function returns one transformed feature, or a list of Python types if it returns multiple transformed features. + The supported Python types that be used with the `return_type` argument are provided in the table below: | Supported Python Types | |:----------------------------------:| @@ -38,9 +53,13 @@ The decorator accepts three parameters: | datetime.date | | datetime.time | -- **`drop`** (optional): Identifies input arguments to exclude from the output after transformations are applied. By default, all inputs are retained in the output. Further details on this argument can be found [below](#dropping-input-features). +- **`drop`** (optional): Identifies input arguments to exclude from the output after transformations are applied. + By default, all inputs are retained in the output. + Further details on this argument can be found [below](#dropping-input-features). -- **`mode`** (optional): Determines the execution mode of the transformation function. The argument accepts three values: `default`, `python`, or `pandas`. By default, the `mode` is set to `default`. Further details on this argument can be found [below](#specifying-execution-modes). +- **`mode`** (optional): Determines the execution mode of the transformation function. + The argument accepts three values: `default`, `python`, or `pandas`. + By default, the `mode` is set to `default`. Further details on this argument can be found [below](#specifying-execution-modes). Hopsworks supports four types of transformation functions across all execution modes: @@ -49,10 +68,10 @@ Hopsworks supports four types of transformation functions across all execution m 3. Many-to-one: Transforms multiple features into one transformed feature. 4. Many-to-many: Transforms multiple features into multiple transformed features. - ### One-to-one transformations -To create a one-to-one transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a single Python type. 
The transformation function should take one argument as input and return a Pandas Series. +To create a one-to-one transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a single Python type. +The transformation function should take one argument as input and return a Pandas Series. === "Python" @@ -81,7 +100,8 @@ The creation of many-to-one transformation functions is similar to that of a one ### One-to-many transformations -To create a one-to-many transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a list of Python types, and the transformation function should take one argument as input and return multiple features as a Pandas DataFrame. The return types provided to the decorator must match the types of each column in the returned Pandas DataFrame. +To create a one-to-many transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a list of Python types, and the transformation function should take one argument as input and return multiple features as a Pandas DataFrame. +The return types provided to the decorator must match the types of each column in the returned Pandas DataFrame. === "Python" !!! example "Creation of a one-to-many transformation function in Hopsworks." @@ -98,7 +118,7 @@ To create a one-to-many transformation function, the Hopsworks `@udf` decorato The creation of a many-to-many transformation function is similar to that of a one-to-many transformation function, the only difference being that the transformation function accepts multiple features as input. -=== "Python" +=== "Python" !!! example "Creation of a many-to-many transformation function in Hopsworks." ```python from hopsworks import udf @@ -111,18 +131,21 @@ The creation of a many-to-many transformation function is similar to that of a o ### Specifying execution modes -The `mode` parameter of the `@udf` decorator can be used to specify the execution mode of the transformation function. It accepts three possible values `default`, `python` and `pandas`. Each mode is explained in more detail below: +The `mode` parameter of the `@udf` decorator can be used to specify the execution mode of the transformation function. +It accepts three possible values `default`, `python` and `pandas`. Each mode is explained in more detail below: -#### Default -This execution mode assumes that the transformation function can be executed as either a Pandas UDF or a Python UDF. It serves as the default mode used when the `mode` parameter is not specified. In this mode, the transformation function is executed as a Pandas UDF during training and in the batch inference pipeline, while it operates as a Python UDF during online inference. +#### Default +This execution mode assumes that the transformation function can be executed as either a Pandas UDF or a Python UDF. +It serves as the default mode used when the `mode` parameter is not specified. +In this mode, the transformation function is executed as a Pandas UDF during training and in the batch inference pipeline, while it operates as a Python UDF during online inference. -=== "Python" +=== "Python" !!! example "Creating a many to many transformations function using the default execution mode" ```python from hopsworks import udf import pandas as pd - + # "default" mode is used if the parameter `mode` is not explicitly set. 
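+ # In this default mode, the UDF below runs as a Pandas UDF when creating training datasets or batch inference data, and as a Python UDF during online inference.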
@udf(return_type=[int, int, int]) def add_one_multiple(feature1, feature2, feature3): @@ -133,11 +156,11 @@ This execution mode assumes that the transformation function can be executed as return feature1 + 2, feature2 + 2, feature3 + 2 ``` -#### Python -The transformation function can be configured to always execute as a Python UDF by setting the `mode` parameter of the `@udf` decorator to `python`. +#### Python +The transformation function can be configured to always execute as a Python UDF by setting the `mode` parameter of the `@udf` decorator to `python`. -=== "Python" +=== "Python" !!! example "Creating a many to many transformation function as a Python UDF" ```python from hopsworks import udf @@ -148,11 +171,11 @@ The transformation function can be configured to always execute as a Python UDF return feature1 + 1, feature2 + 1, feature3 + 1 ``` -#### Pandas -The transformation function can be configured to always execute as a Pandas UDF by setting the `mode` parameter of the `@udf` decorator to `pandas`. +#### Pandas +The transformation function can be configured to always execute as a Pandas UDF by setting the `mode` parameter of the `@udf` decorator to `pandas`. -=== "Python" +=== "Python" !!! example "Creating a many to many transformations function as a Pandas UDF" ```python from hopsworks import udf @@ -171,10 +194,10 @@ The transformation function can be configured to always execute as a Pandas UDF ### Dropping input features -The `drop` parameter of the `@udf` decorator is used to drop specific columns in the input DataFrame after transformation. If any argument of the transformation function is passed to the `drop` parameter, then the column mapped to the argument is dropped after the transformation functions are applied. In the example below, the columns mapped to the arguments `feature1` and `feature3` are dropped after the application of all transformation functions. - +The `drop` parameter of the `@udf` decorator is used to drop specific columns in the input DataFrame after transformation. If any argument of the transformation function is passed to the `drop` parameter, then the column mapped to the argument is dropped after the transformation functions are applied. +In the example below, the columns mapped to the arguments `feature1` and `feature3` are dropped after the application of all transformation functions. -=== "Python" +=== "Python" !!! example "Specify arguments to drop after transformation" ```python from hopsworks import udf @@ -187,10 +210,11 @@ The `drop` parameter of the `@udf` decorator is used to drop specific column ### Specifying output features names for transformation functions -The [`alias`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/generated/api/transformation_functions_api/#alias) function of a transformation function allows the specification of names of transformed features generated by the transformation function. Each name must be uniques and should be at-most 63 characters long. If no name is provided via the `alias` function, Hopsworks generates default output feature names when [on-demand](./feature_group/on_demand_transformations.md) or [model-dependent](./feature_view/model-dependent-transformations.md) transformation functions are created. +The [`TransformationFunction.alias`][hsfs.transformation_function.TransformationFunction.alias] function of a transformation function allows the specification of names of transformed features generated by the transformation function. 
+Each name must be unique and should be at most 63 characters long. +If no name is provided via the `alias` function, Hopsworks generates default output feature names when [on-demand](./feature_group/on_demand_transformations.md) or [model-dependent](./feature_view/model-dependent-transformations.md) transformation functions are created. - -=== "Python" +=== "Python" !!! example "Specifying output column names for transformation functions." ```python from hopsworks import udf @@ -206,22 +230,26 @@ The [`alias`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/gen ### Training dataset statistics -A keyword argument `statistics` can be defined in the transformation function if it requires training dataset statistics for any of its arguments. The `statistics` argument must be assigned an instance of the class [`TransformationStatistics`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/generated/api/transformation_statistics/) as the default value. The `TransformationStatistics` instance must be initialized using the names of the arguments requiring statistics. +A keyword argument `statistics` can be defined in the transformation function if it requires training dataset statistics for any of its arguments. +The `statistics` argument must be assigned an instance of the class [`TransformationStatistics`][hsfs.transformation_statistics.TransformationStatistics] as the default value. +The `TransformationStatistics` instance must be initialized using the names of the arguments requiring statistics. !!! warning "Transformation Statistics" - The statistics provided to the transformation function is the statistics computed using [the train set](https://www.hopsworks.ai/dictionary/train-training-set). Training dataset statistics are not available for on-demand transformations. + The statistics provided to the transformation function are the statistics computed using [the train set](https://www.hopsworks.ai/dictionary/train-training-set). + Training dataset statistics are not available for on-demand transformations. -The `TransformationStatistics` instance contains separate objects with the same name as the arguments used to initialize it. These objects encapsulate statistics related to the argument as instances of the class [`FeatureTransformationStatistics`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/generated/api/feature_transformation_statistics/). Upon instantiation, instances of `FeatureTransformationStatistics` contain `None` values and are updated with the required statistics after the creation of a training dataset. +The `TransformationStatistics` instance contains separate objects with the same name as the arguments used to initialize it. +These objects encapsulate statistics related to the argument as instances of the class [`FeatureTransformationStatistics`][hsfs.transformation_statistics.FeatureTransformationStatistics]. +Upon instantiation, instances of `FeatureTransformationStatistics` contain `None` values and are updated with the required statistics after the creation of a training dataset. - -=== "Python" +=== "Python" !!!
example "Creation of a transformation function in Hopsworks that uses training dataset statistics" ```python from hopsworks import udf from hopsworks.transformation_statistics import TransformationStatistics - stats = TransformationStatistics("argument1", "argument2", "argument3") + stats = TransformationStatistics("argument1", "argument2", "argument3") @udf(int) def add_features(argument1, argument2, argument3, statistics=stats): @@ -230,10 +258,11 @@ The `TransformationStatistics` instance contains separate objects with the sam ### Passing context variables to transformation function -The `context` keyword argument can be defined in a transformation function to access shared context variables. These variables contain common data used across transformation functions. By including the context argument, you can pass the necessary data as a dictionary into the into the `context` argument of the transformation function during [training dataset creation](feature_view/training-data.md#passing-context-variables-to-transformation-functions) or [feature vector retrieval](feature_view/feature-vectors.md#passing-context-variables-to-transformation-functions) or [batch data retrieval](feature_view/batch-data.md#passing-context-variables-to-transformation-functions). - +The `context` keyword argument can be defined in a transformation function to access shared context variables. +These variables contain common data used across transformation functions. +By including the context argument, you can pass the necessary data as a dictionary into the into the `context` argument of the transformation function during [training dataset creation](feature_view/training-data.md#passing-context-variables-to-transformation-functions) or [feature vector retrieval](feature_view/feature-vectors.md#passing-context-variables-to-transformation-functions) or [batch data retrieval](feature_view/batch-data.md#passing-context-variables-to-transformation-functions). -=== "Python" +=== "Python" !!! example "Creation of a transformation function in Hopsworks that accepts context variables" ```python from hopsworks import udf @@ -243,10 +272,10 @@ The `context` keyword argument can be defined in a transformation function to ac return argument + context["value_to_add"] ``` - ## Saving to the Feature Store -To save a transformation function to the feature store, use the function `create_transformation_function`. It creates a [`TransformationFunction`](http://docs.hopsworks.ai/hopsworks-api/{{{hopsworks_version}}}/generated/api/transformation_functions_api/) object which can then be saved by calling the save function. The save function will throw an error if another transformation function with the same name and version is already saved in the feature store. +To save a transformation function to the feature store, use the function `create_transformation_function`. It creates a [`TransformationFunction`][hsfs.transformation_function.TransformationFunction] object which can then be saved by calling the save function. +The save function will throw an error if another transformation function with the same name and version is already saved in the feature store. === "Python" @@ -260,9 +289,10 @@ To save a transformation function to the feature store, use the function `creat ## Retrieval from the Feature Store -To retrieve all transformation functions from the feature store, use the function `get_transformation_functions`, which returns the list of `TransformationFunction` objects. 
+To retrieve all transformation functions from the feature store, use the function `get_transformation_functions`, which returns the list of `TransformationFunction` objects. -A specific transformation function can be retrieved using its `name` and `version` with the function `get_transformation_function`. If only the `name` is provided, then the version will default to 1. +A specific transformation function can be retrieved using its `name` and `version` with the function `get_transformation_function`. +If only the `name` is provided, then the version will default to 1. === "Python" @@ -280,4 +310,4 @@ A specific transformation function can be retrieved using its `name` and `versio ## Using transformation functions -Transformation functions can be used by attaching it to a feature view to [create model-dependent transformations](./feature_view/model-dependent-transformations.md) or attached to feature groups to [create on-demand transformations](./feature_group/on_demand_transformations.md) +Transformation functions can be used by attaching them to a feature view to [create model-dependent transformations](./feature_view/model-dependent-transformations.md) or to feature groups to [create on-demand transformations](./feature_group/on_demand_transformations.md). diff --git a/docs/user_guides/fs/vector_similarity_search.md b/docs/user_guides/fs/vector_similarity_search.md index 135885c32..5b722c046 100644 --- a/docs/user_guides/fs/vector_similarity_search.md +++ b/docs/user_guides/fs/vector_similarity_search.md @@ -2,11 +2,20 @@ description: User guide for how to use vector similarity search in Hopsworks --- -# Introduction -Vector similarity search (also called similarity search) is a technique enabling the retrieval of similar items based on their vector embeddings or representations. Its applications range across various domains, from recommendation systems to image similarity and beyond. In Hopsworks, vector similarity search is enabled by extending an online feature group with approximate nearest neighbor search capabilities through a vector database, such as Opensearch. This guide provides a detailed walkthrough on how to leverage Hopsworks for vector similarity search. +## Introduction -# Extending Feature Groups with Similarity Search -In Hopsworks, each vector embedding in a feature group is stored in an index within the backing vector database. By default, vector embeddings are stored in the default index for the project (created for every project in Hopsworks), but you have the option to create a new index for a feature group if needed. Creating a separate index per feature group is particularly useful for large volumes of data, ensuring that when a feature group is deleted, its associated index is also removed. For feature groups that use the default project index, the index will only be removed when the project is deleted - not when the feature group is deleted. The index will store all the vector embeddings defined in that feature group, if you have more than one vector embedding in the feature group. +Vector similarity search (also called similarity search) is a technique enabling the retrieval of similar items based on their vector embeddings or representations. +Its applications range across various domains, from recommendation systems to image similarity and beyond. +In Hopsworks, vector similarity search is enabled by extending an online feature group with approximate nearest neighbor search capabilities through a vector database, such as Opensearch.
+This guide provides a detailed walkthrough on how to leverage Hopsworks for vector similarity search. + +## Extending Feature Groups with Similarity Search + +In Hopsworks, each vector embedding in a feature group is stored in an index within the backing vector database. +By default, vector embeddings are stored in the default index for the project (created for every project in Hopsworks), but you have the option to create a new index for a feature group if needed. +Creating a separate index per feature group is particularly useful for large volumes of data, ensuring that when a feature group is deleted, its associated index is also removed. +For feature groups that use the default project index, the index will only be removed when the project is deleted - not when the feature group is deleted. +If you have more than one vector embedding in the feature group, the index will store all of them. In the following example, we explicitly define an index for the feature group: @@ -17,13 +26,21 @@ from hsfs import embedding emb = embedding.EmbeddingIndex(index_name="news_fg") ``` -Then, add one or more embedding features to the index. Name and dimension of the embedding features are required for identifying which features should be indexed for k-nearest neighbor (KNN) search. In this example, we get the dimension of the embedding by taking the length of the value of the `embedding_heading` column in the first row of the dataframe `df`. Optionally, you can specify the similarity function among `l2_norm`, `cosine`, and `dot_product`. Refer to [add_embedding](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/embedding_index_api/#add_embedding) for the full list of arguments. +Then, add one or more embedding features to the index. +The name and dimension of the embedding features are required for identifying which features should be indexed for k-nearest neighbor (KNN) search. +In this example, we get the dimension of the embedding by taking the length of the value of the `embedding_heading` column in the first row of the dataframe `df`. +Optionally, you can specify the similarity function among `l2_norm`, `cosine`, and `dot_product`. +Refer to [`EmbeddingIndex.add_embedding`][hsfs.embedding.EmbeddingIndex.add_embedding] for the full list of arguments. + ```aidl # Add embedding feature to the index emb.add_embedding("embedding_heading", len(df["embedding_heading"][0])) ``` -Next, you create a feature group with the `embedding_index` and ingest data to the feature group. When the `embedding_index` is provided, the vector database is used as online feature store. That is, all the features in the feature group are stored **exclusively** in the vector database. The advantage of storing all features in the vector database is that it enables similarity search, and push-down filtering for all feature values. +Next, you create a feature group with the `embedding_index` and ingest data to the feature group. +When the `embedding_index` is provided, the vector database is used as the online feature store. +That is, all the features in the feature group are stored **exclusively** in the vector database. +The advantage of storing all features in the vector database is that it enables similarity search and push-down filtering for all feature values.
```aidl # Create a feature group with the embedding index @@ -39,12 +56,16 @@ news_fg = fs.get_or_create_feature_group( news_fg.insert(df) ``` -# Similarity Search for Feature Groups using Vector Embeddings -You provide a vector embedding as a parameter to the search query using [`find_neighbors`](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#find_neighbors), and it returns the rows in the online feature group that have vector embedding values most similar to the provided vector embedding. +## Similarity Search for Feature Groups using Vector Embeddings + +You provide a vector embedding as a parameter to the search query using [`FeatureGroup.find_neighbors`][hsfs.feature_group.FeatureGroup.find_neighbors], and it returns the rows in the online feature group that have vector embedding values most similar to the provided vector embedding. + +It is also possible to filter rows by specifying a filter on any of the features in the feature group. +The filter is pushed down to the vector database to improve query performance. -It is also possible to filter rows by specifying a filter on any of the features in the feature group. The filter is pushed down to the vector database to improve query performance. +In the first code snippet below, `find_neighbors` returns 3 rows in `news_fg` that have the closest `news_description` values to the provided `news_description`. +In the second code snippet below, we only return news articles with a `newstype` of `sports`. -In the first code snippet below, `find_neighbor`s returns 3 rows in `news_fg` that have the closest `news_description` values to the provided `news_description`. In the second code snippet below, we only return news articles with a `newstype` of `sports`. ```aidl # Search neighbor embedding with k=3 news_fg.find_neighbors(model.encode(news_description), k=3) @@ -54,15 +75,18 @@ news_fg.find_neighbors(model.encode(news_description), k=3, filter=news_fg.newst ``` To analyze feature values at specific points in time, you can utilize time travel functionality: + ```aidl # Time travel and read from the offline feature store news_fg.as_of(time_in_past).read() ``` -# Querying Similar Embeddings with Additional features +## Querying Similar Embeddings with Additional features You can also use similarity search for vector embedding features in feature views. -In the code snippet below, we create a feature view by selecting features from the earlier `news_fg` and a new feature group `view_fg`. If you include a feature group with vector embedding features in a feature view, **whether or not the vector embedding features are selected**, you can call `find_neighbors` on the feature view, and it will return rows containing all the feature values in the feature view. In the example below, a list of `heading` and `view_cnt` will be returned for the news articles which are closet to provided `news_description`. +In the code snippet below, we create a feature view by selecting features from the earlier `news_fg` and a new feature group `view_fg`. +If you include a feature group with vector embedding features in a feature view, **whether or not the vector embedding features are selected**, you can call `find_neighbors` on the feature view, and it will return rows containing all the feature values in the feature view. +In the example below, a list of `heading` and `view_cnt` will be returned for the news articles which are closest to the provided `news_description`.
```aidl view_fg = fs.get_or_create_feature_group( @@ -80,7 +104,9 @@ fv = fs.get_or_create_feature_view( fv.find_neighbors(model.encode(news_description), k=5) ``` -Note that you can use similarity search from the feature view **only if** the feature group which you are querying with `find_neighbors` has **all** the primary keys of the other feature groups. In the example above, you are querying against the feature group `news_fg` which has the vector embedding features, and it has the feature "news_id" which is the primary key of the feature group `view_fg`. But if `page_fg` is used as illustrated below, `find_neighbors` will fail to return any features because primary key `page_id` does not exist in `news_fg`. +Note that you can use similarity search from the feature view **only if** the feature group which you are querying with `find_neighbors` has **all** the primary keys of the other feature groups. +In the example above, you are querying against the feature group `news_fg` which has the vector embedding features, and it has the feature "news_id" which is the primary key of the feature group `view_fg`. +But if `page_fg` is used as illustrated below, `find_neighbors` will fail to return any features because primary key `page_id` does not exist in `news_fg`.
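+A minimal, hypothetical sketch of such a failing setup could look as follows; the `page_fg` name, its `page_id` primary key, the `page_views` feature and the feature view name are assumptions for illustration only:
+
+```python
+# Hypothetical feature group whose primary key (`page_id`) is NOT present in `news_fg`
+page_fg = fs.get_or_create_feature_group(
+    name="page_fg",
+    version=1,
+    primary_key=["page_id"],
+    online_enabled=True,
+)
+
+# Feature view joining the embedding feature group with `page_fg`
+fv_pages = fs.get_or_create_feature_view(
+    name="news_pages",
+    version=1,
+    query=news_fg.select(["heading"]).join(page_fg.select(["page_views"])),
+)
+
+# Fails to return the joined features because `news_fg` does not contain `page_id`
+fv_pages.find_neighbors(model.encode(news_description), k=5)
+```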

@@ -89,23 +115,34 @@ Note that you can use similarity search from the feature view **only if** the fe

-It is also possible to get back feature vector by providing the primary keys, but it is not recommended as explained in the next section. The client fetches feature vector from the vector store and the online store for `news_fg` and `view_fg` respectively. +It is also possible to get back a feature vector by providing the primary keys, but this is not recommended, as explained in the next section. +The client fetches the feature vector from the vector store and the online store for `news_fg` and `view_fg`, respectively. + ```aidl fv.get_feature_vector({"news_id": 1}) ``` -# Performance considerations for Feature Groups with Embeddings -## Choose Features for Vector Store +## Performance considerations for Feature Groups with Embeddings -While it is possible to update feature value in vector store, updating feature value in online store is more efficient. If you have features which are frequently being updated and do not require for filtering, consider storing them separately in a different feature group. As shown in the previous example, `view_cnt` is updated frequently and stored separately. You can then get all the required features by using feature view. +### Choose Features for Vector Store -## Choose the Appropriate Online Feature Stores +While it is possible to update feature values in the vector store, updating feature values in the online store is more efficient. +If you have features which are frequently updated and are not required for filtering, consider storing them separately in a different feature group. +As shown in the previous example, `view_cnt` is updated frequently and stored separately. +You can then get all the required features by using a feature view. -There are 2 types of online feature stores in Hopsworks: online store (RonDB) and vector store (Opensearch). Online store is designed for retrieving feature vectors efficiently with low latency. Vector store is designed for finding similar embedding efficiently. If similarity search is not required, using online store is recommended for low latency retrieval of feature values including embedding. +### Choose the Appropriate Online Feature Stores -## Use New Index per Feature Group +There are 2 types of online feature stores in Hopsworks: the online store (RonDB) and the vector store (Opensearch). +The online store is designed for retrieving feature vectors efficiently with low latency. +The vector store is designed for finding similar embeddings efficiently. +If similarity search is not required, using the online store is recommended for low-latency retrieval of feature values, including embeddings. + +### Use a New Index per Feature Group Create a new index per feature group to optimize retrieval performance. -# Next step -Explore the [news search example](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/vector_similarity_search/1_feature_group_embeddings_api.ipynb), demonstrating how to use Hopsworks for implementing a news search application using natural language in the application. Additionally, you can see the application of querying similar embeddings with additional features in this [news rank example](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/vector_similarity_search/2_feature_view_embeddings_api.ipynb).
+## Next steps + +Explore the [news search example](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/vector_similarity_search/1_feature_group_embeddings_api.ipynb), which demonstrates how to use Hopsworks to implement a news search application driven by natural language. +Additionally, you can see the application of querying similar embeddings with additional features in this [news rank example](https://github.com/logicalclocks/hopsworks-tutorials/blob/master/api_examples/vector_similarity_search/2_feature_view_embeddings_api.ipynb). diff --git a/docs/user_guides/index.md b/docs/user_guides/index.md index 55e0146a7..0b0678dbb 100644 --- a/docs/user_guides/index.md +++ b/docs/user_guides/index.md @@ -3,7 +3,9 @@ This section serves to provide guides and examples for the common usage of abstractions and functionality of the Hopsworks platform through the Hopsworks UI and APIs. - [Client Installation](client_installation/index.md): How to get started with the Hopsworks Client libraries. -- [Feature Store](fs/index.md): Learn about the common usage of the core Hopsworks Feature Store abstractions, such as Feature Groups, Feature Views, Data Validation and Data Sources. Also, learn from the [Client Integrations](integrations/index.md) guides how to connect to the Feature Store from external environments such as a local Python environment, Databricks, or AWS Sagemaker +- [Feature Store](fs/index.md): Learn about the common usage of the core Hopsworks Feature Store abstractions, such as Feature Groups, Feature Views, Data Validation and Data Sources. + Also, learn from the [Client Integrations](integrations/index.md) guides how to connect to the Feature Store from external environments such as a local Python environment, Databricks, or AWS Sagemaker. - [MLOps](mlops/index.md): Learn about the common usage of Hopsworks MLOps abstractions, such as the Model Registry or Model Serving. -- [Projects](projects/index.md): The core abstraction on Hopsworks are [Projects](../concepts/projects/governance.md). Learn in this section how to manage your projects and the services therein. +- [Projects](projects/index.md): The core abstraction in Hopsworks is [Projects](../concepts/projects/governance.md). + Learn in this section how to manage your projects and the services therein. - [Migration](migration/40_migration.md): Learn how to migrate to newer versions of Hopsworks. diff --git a/docs/user_guides/integrations/beam.md b/docs/user_guides/integrations/beam.md index fc95e93b6..237e9dc56 100644 --- a/docs/user_guides/integrations/beam.md +++ b/docs/user_guides/integrations/beam.md @@ -3,8 +3,10 @@ description: Documentation on how to configure an Apache Beam Dataflow Runner jo --- # Apache Beam Dataflow Runner -Connecting to the Feature Store from an Apache Beam Dataflow Runner, requires configuring the Hopsworks certificates. For this in your Beam Java application `pom.xml` file include following snippet: -``` +Connecting to the Feature Store from an Apache Beam Dataflow Runner requires configuring the Hopsworks certificates. +For this, include the following snippet in your Beam Java application's `pom.xml` file: + +```xml java.io.tmpdir @@ -17,7 +19,8 @@ Connecting to the Feature Store from an Apache Beam Dataflow Runner, requires co ## Generating an API Key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md).
For the Beam integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Beam integration to work correctly, make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -33,7 +36,7 @@ You are now ready to connect to the Hopsworks Feature Store from Beam: HopsworksConnection hopsworksConnection = HopsworksConnection.builder() .host("my_instance") // DNS of your Feature Store instance .port(443) // Port to reach your Hopsworks instance, defaults to 443 - .project("my_project") // Name of your Hopsworks Feature Store project + .project("my_project") // Name of your Hopsworks Feature Store project .apiKeyValue("api_key") // The API key to authenticate with the feature store .hostnameVerification(false) // Disable for self-signed certificates .build(); @@ -44,4 +47,4 @@ FeatureStore fs = hopsworksConnection.getFeatureStore(); ## Next Steps -For more information and how to integrate Beam feature pipeline to the Hopsworks Feature store follow the [tutorial](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/beam). \ No newline at end of file +For more information on how to integrate a Beam feature pipeline with the Hopsworks Feature Store, follow the [tutorial](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/beam). diff --git a/docs/user_guides/integrations/databricks/api_key.md b/docs/user_guides/integrations/databricks/api_key.md index dc7b4fa84..659b14c0a 100644 --- a/docs/user_guides/integrations/databricks/api_key.md +++ b/docs/user_guides/integrations/databricks/api_key.md @@ -4,7 +4,8 @@ In order for the Databricks cluster to be able to communicate with Hopsworks, cl ## Generate an API key -For instructions on how to generate an API key follow this [user guide](../../projects/api_key/create_api_key.md). For the Databricks integration to work make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../../projects/api_key/create_api_key.md). +For the Databricks integration to work, make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -16,9 +17,8 @@ For instructions on how to generate an API key follow this [user guide](../../pr !!! hint "API key as Argument" To get started quickly, without saving the Hopsworks API in a secret storage, you can simply supply it as an argument when instantiating a connection: - ```python hl_lines="6" - import hopsworks + import hopsworks project = hopsworks.login( host='my_instance', # DNS of your Feature Store instance port=443, # Port to reach your Hopsworks instance, defaults to 443 diff --git a/docs/user_guides/integrations/databricks/configuration.md b/docs/user_guides/integrations/databricks/configuration.md index 137779edf..6e8d8b113 100644 --- a/docs/user_guides/integrations/databricks/configuration.md +++ b/docs/user_guides/integrations/databricks/configuration.md @@ -4,7 +4,8 @@ description: Documentation on how to configure a Databricks cluster to read and # Databricks Integration Users can configure their Databricks clusters to write the results of feature engineering pipelines in the Hopsworks Feature Store using HSFS. -Configuring a Databricks cluster can be done from the Hopsworks Feature Store UI. This guide explains each step. +Configuring a Databricks cluster can be done from the Hopsworks Feature Store UI.
+This guide explains each step. ## Prerequisites @@ -16,7 +17,8 @@ If you haven't done so already, follow the networking guides for either [AWS](ne ### Hopsworks API key -In order for the Feature Store API to be able to communicate with the user's Hopsworks instance, the client library (HSFS) needs to have access to a previously generated API key from Hopsworks. For ways to setup and store the Hopsworks API key, please refer to the [API key guide for Databricks](api_key.md). +In order for the Feature Store API to be able to communicate with the user's Hopsworks instance, the client library (HSFS) needs to have access to a previously generated API key from Hopsworks. +For ways to set up and store the Hopsworks API key, please refer to the [API key guide for Databricks](api_key.md). ## Databricks API key @@ -27,7 +29,7 @@ Users can get a valid Databricks API key by following the [Databricks Documentat !!! warning "Cluster access control" - If users have enabled [Databricks Cluster access control](https://docs.databricks.com/security/access-control/cluster-acl.html#cluster-access-control), it is important that the users running the cluster configuration (i.e. the user generating the API key) has `Can Manage` privileges on the cluster they are trying to configure. + If users have enabled [Databricks Cluster access control](https://docs.databricks.com/security/access-control/cluster-acl.html#cluster-access-control), it is important that the user running the cluster configuration (i.e., the user generating the API key) has `Can Manage` privileges on the cluster they are trying to configure. ## Register a new Databricks Instance @@ -51,11 +53,13 @@ The instance name corresponds to the address of the Databricks instance and shou

-The API key will be stored in the Hopsworks secret store for the user and will be available only for that user. If multiple users need to configure Databricks clusters, each has to generate an API key and register an instance. The Databricks instance registration does not have a project scope, meaning that once registered, the user can configure clusters for all projects they are part of. +The API key will be stored in the Hopsworks secret store for the user and will be available only for that user. If multiple users need to configure Databricks clusters, each has to generate an API key and register an instance. +The Databricks instance registration does not have a project scope, meaning that once registered, the user can configure clusters for all projects they are part of. ## Databricks Cluster -A cluster needs to exist before users can configure it using the Hopsworks UI. The cluster can be in any state prior to the configuration. +A cluster needs to exist before users can configure it using the Hopsworks UI. +The cluster can be in any state prior to the configuration. !!! warning "Runtime limitation" @@ -64,7 +68,10 @@ A cluster needs to exist before users can configure it using the Hopsworks UI. T ## Configure a cluster Clusters are configured for a project user, which, in Hopsworks terms, means a user operating within the scope of a project. -To configure a cluster, click on the `Configure` button. By default the cluster will be configured for the user making the request. If the user doesn't have `Can Manage` privilege on the cluster, they can ask a project `Data Owner` to configure it for them. Hopsworks `Data Owners` are allowed to configure clusters for other project users, as long as they have the required Databricks privileges. +To configure a cluster, click on the `Configure` button. +By default the cluster will be configured for the user making the request. +If the user doesn't have `Can Manage` privilege on the cluster, they can ask a project `Data Owner` to configure it for them. +Hopsworks `Data Owners` are allowed to configure clusters for other project users, as long as they have the required Databricks privileges.

@@ -81,9 +88,11 @@ During the cluster configuration the following steps will be taken: - Configure the necessary Spark properties to authenticate and communicate with the Feature Store !!! note "HopsFS configuration" - It is not necessary to configure HopsFS if data is stored outside the Hopsworks file system. To do this define [Data Sources](../../fs/data_source/index.md) and link them to [Feature Groups](../../fs/feature_group/create.md) and [Training Datasets](../../fs/feature_view/training-data.md). + It is not necessary to configure HopsFS if data is stored outside the Hopsworks file system. + To do this define [Data Sources](../../fs/data_source/index.md) and link them to [Feature Groups](../../fs/feature_group/create.md) and [Training Datasets](../../fs/feature_view/training-data.md). -When a cluster is configured for a specific project user, all the operations with the Hopsworks Feature Store will be executed as that project user. If another user needs to re-use the same cluster, the cluster can be reconfigured by following the same steps above. +When a cluster is configured for a specific project user, all the operations with the Hopsworks Feature Store will be executed as that project user. +If another user needs to re-use the same cluster, the cluster can be reconfigured by following the same steps above. ## Connecting to the Feature Store @@ -91,7 +100,7 @@ At the end of the configuration, Hopsworks will start the cluster. Once the cluster is running users can establish a connection to the Hopsworks Feature Store from Databricks: ```python -import hopsworks +import hopsworks project = hopsworks.login( host='my_instance', # DNS of your Hopsworks instance port=443, # Port to reach your Hopsworks instance, defaults to 443 @@ -103,4 +112,5 @@ fs = project.get_feature_store() # Get the project's default feature s ## Next Steps -For more information about how to connect, see the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/) API reference. Or continue with the Data Source guide to import your own data to the Feature Store. +For more information about how to connect, see the [Login API][hopsworks.login]. +Or continue with the Data Source guide to import your own data to the Feature Store. diff --git a/docs/user_guides/integrations/databricks/networking.md b/docs/user_guides/integrations/databricks/networking.md index 065de136d..94bbbaaa7 100644 --- a/docs/user_guides/integrations/databricks/networking.md +++ b/docs/user_guides/integrations/databricks/networking.md @@ -1,6 +1,7 @@ # Networking -In order for Spark to communicate with the Feature Store from Databricks, networking needs to be set up correctly. This includes deploying the Hopsworks Instance to either the same VPC or enable VPC/VNet peering between the VPC/VNet of the Databricks Cluster and the Hopsworks Cluster. +In order for Spark to communicate with the Feature Store from Databricks, networking needs to be set up correctly. +This includes deploying the Hopsworks Instance to either the same VPC or enable VPC/VNet peering between the VPC/VNet of the Databricks Cluster and the Hopsworks Cluster. 
## AWS @@ -10,7 +11,7 @@ The DataFrame API needs to be able to connect directly to the IP on which the Fe This means that if you deploy the Feature Store on AWS you will either need to deploy the Feature Store in the same VPC as your Databricks cluster or to set up [VPC Peering](https://docs.databricks.com/administration-guide/cloud-configurations/aws/vpc-peering.html) between your Databricks VPC and the Feature Store VPC. -**Option 1: Deploy the Feature Store in the Databricks VPC** +#### Option 1: Deploy the Feature Store in the Databricks VPC When you deploy the Feature Store Hopsworks instance, select the Databricks *VPC* and *Availability Zone* as the VPC and Availability Zone of your Feature Store cluster. Identify your Databricks VPC by searching for VPCs containing Databricks in their name in your Databricks AWS region in the AWS Management Console: @@ -22,9 +23,10 @@ Identify your Databricks VPC by searching for VPCs containing Databricks in thei

-**Option 2: Set up VPC peering** +#### Option 2: Set up VPC peering -Follow the guide [VPC Peering](https://docs.databricks.com/administration-guide/cloud-configurations/aws/vpc-peering.html) to set up VPC peering between the Feature Store cluster and Databricks. Get your Feature Store *VPC ID* and *CIDR* by searching for the Feature Store VPC in the AWS Management Console: +Follow the guide [VPC Peering](https://docs.databricks.com/administration-guide/cloud-configurations/aws/vpc-peering.html) to set up VPC peering between the Feature Store cluster and Databricks. +Get your Feature Store *VPC ID* and *CIDR* by searching for the Feature Store VPC in the AWS Management Console: !!! info "managed.hopsworks.ai" On **[managed.hopsworks.ai](https://managed.hopsworks.ai)**, the VPC is shown in the cluster details. @@ -52,7 +54,8 @@ Open your feature store instance under EC2 in the AWS Management Console and ens

-Connectivity from the Databricks Security Group can be allowed by opening the Security Group, adding a port to the Inbound rules and searching for *dbe-worker* in the source field. Selecting any of the *dbe-worker* Security Groups will be sufficient: +Connectivity from the Databricks Security Group can be allowed by opening the Security Group, adding a port to the Inbound rules and searching for *dbe-worker* in the source field. +Selecting any of the *dbe-worker* Security Groups will be sufficient:

@@ -86,9 +89,12 @@ Select *Add Peering*:

-Name the peering and select the virtual network used by your Hopsworks cluster. The virtual network -is shown in the cluster details on [managed.hopsworks.ai](https://managed.hopsworks.ai) (see the next picture). Ensure to press the copy button -on the bottom of the page and save the value somewhere. Press *Add* and create the peering: +Name the peering and select the virtual network used by your Hopsworks cluster. +The virtual network +is shown in the cluster details on [managed.hopsworks.ai](https://managed.hopsworks.ai) (see the next picture). +Ensure to press the copy button +on the bottom of the page and save the value somewhere. +Press *Add* and create the peering:

@@ -143,8 +149,10 @@ Choose to add a peering connection:

-Name the peering connection and select *I know my resource ID*. Paste the string copied when creating -the peering from Databricks Azure. If you haven't copied that string, then manually select the virtual +Name the peering connection and select *I know my resource ID*. +Paste the string copied when creating +the peering from Databricks Azure. +If you haven't copied that string, then manually select the virtual network used by Databricks and press *OK* to create the peering:

@@ -163,7 +171,8 @@ The peering should now be *Updating*:

-Wait for the peering to show up as *Connected*. There should now be bi-directional network connectivity between the Feature Store and Databricks: +Wait for the peering to show up as *Connected*. +There should now be bi-directional network connectivity between the Feature Store and Databricks:

@@ -174,7 +183,8 @@ Wait for the peering to show up as *Connected*. There should now be bi-direction ### Step 2: Configure the Network Security Group -The virtual network peering will allow full access between the Hopsworks virtual network and the Databricks virtual network by default. However, if you have a different setup, ensure that the *Network Security Group* of the Feature Store is configured to allow traffic from your Databricks clusters. +The virtual network peering will allow full access between the Hopsworks virtual network and the Databricks virtual network by default. +However, if you have a different setup, ensure that the *Network Security Group* of the Feature Store is configured to allow traffic from your Databricks clusters. Ensure that ports *443*, *9083*, *9085*, *8020*, *50010*, and *9092* are reachable from the Databricks cluster *Network Security Group*. diff --git a/docs/user_guides/integrations/emr/emr_configuration.md b/docs/user_guides/integrations/emr/emr_configuration.md index d1984ed1d..da25a9494 100644 --- a/docs/user_guides/integrations/emr/emr_configuration.md +++ b/docs/user_guides/integrations/emr/emr_configuration.md @@ -2,6 +2,7 @@ description: Documentation on how to configure an EMR cluster to read and write features from the Hopsworks Feature Store --- # Configure EMR for the Hopsworks Feature Store + To enable EMR to access the Hopsworks Feature Store, you need to set up a Hopsworks API key, add a bootstrap action and configurations to your EMR cluster. !!! info @@ -9,7 +10,8 @@ To enable EMR to access the Hopsworks Feature Store, you need to set up a Hopswo ## Step 1: Set up a Hopsworks API key -For instructions on how to generate an API key follow this [user guide](../../projects/api_key/create_api_key.md). For the EMR integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../../projects/api_key/create_api_key.md). +For the EMR integration to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -19,8 +21,10 @@ For instructions on how to generate an API key follow this [user guide](../../pr ### Store the API key in the AWS Secrets Manager In the AWS management console ensure that your active region is the region you use for EMR. -Go to the *AWS Secrets Manager* and select *Store new secret*. Select *Other type of secrets* and add *api-key* -as the key and paste the API key created in the previous step as the value. Click next. +Go to the *AWS Secrets Manager* and select *Store new secret*. +Select *Other type of secrets* and add *api-key* +as the key and paste the API key created in the previous step as the value. +Click next.

@@ -29,7 +33,8 @@ as the key and paste the API key created in the previous step as the value. Clic

-As a secret name, enter *hopsworks/featurestore*. Select next twice and finally store the secret. +As a secret name, enter *hopsworks/featurestore*. +Select next twice and finally store the secret. Then click on the secret in the secrets list and take note of the *Secret ARN*.
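+If you want to verify the secret or read the API key back programmatically, a minimal `boto3` sketch could look as follows; the region name is an assumption and should match the region in which the secret was created:
+
+```python
+import json
+
+import boto3
+
+# Assumption: the region is the one used for your EMR cluster and the secret
+client = boto3.client("secretsmanager", region_name="us-east-1")
+
+# "hopsworks/featurestore" is the secret name chosen above; "api-key" is the key used when storing it
+secret = client.get_secret_value(SecretId="hopsworks/featurestore")
+api_key = json.loads(secret["SecretString"])["api-key"]
+```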

@@ -49,10 +54,11 @@ Identify your EMR EC2 instance profile in the EMR cluster summary:

- In the AWS Management Console, go to *IAM*, select *Roles* and then the EC2 instance profile used by your EMR cluster. -Select *Add inline policy*. Choose *Secrets Manager* as a service, expand the *Read* access level and check *GetSecretValue*. -Expand Resources and select *Add ARN*. Paste the ARN of the secret created in the previous step. +Select *Add inline policy*. +Choose *Secrets Manager* as a service, expand the *Read* access level and check *GetSecretValue*. +Expand Resources and select *Add ARN*. +Paste the ARN of the secret created in the previous step. Click on *Review*, give the policy a name and click on *Create policy*.

@@ -65,6 +71,7 @@ Click on *Review*, give the policy a name and click on *Create policy*. ## Step 2: Configure your EMR cluster ### Add the Hopsworks Feature Store configuration to your EMR cluster + In order for EMR to be able to talk to the Feature Store, you need to update the Hadoop and Spark configurations. Copy the configuration below and replace ip-XXX-XX-XX-XXX.XX-XXXX-X.compute.internal with the private DNS name of your Hopsworks master node. @@ -123,8 +130,11 @@ When you create your EMR cluster, add the configuration: ### Add the Bootstrap Action to your EMR cluster -EMR requires Hopsworks connectors to be able to communicate with the Hopsworks Feature Store. These connectors can be installed with the -bootstrap action shown below. Copy the content into a file and name the file `hopsworks.sh`. Copy that file into any S3 bucket that +EMR requires Hopsworks connectors to be able to communicate with the Hopsworks Feature Store. +These connectors can be installed with the +bootstrap action shown below. +Copy the content into a file and name the file `hopsworks.sh`. +Copy that file into any S3 bucket that is readable by your EMR clusters and take note of the S3 URI of that file e.g., `s3://my-emr-init/hopsworks.sh`. ```bash @@ -169,6 +179,7 @@ chmod -R o-rwx /usr/lib/hopsworks sudo pip3 install --upgrade hopsworks~=X.X.0 ``` + !!! attention "Matching Hopsworks version" We recommend that the major and minor version of the Python library match the major and minor version of the Hopsworks deployment. @@ -180,7 +191,8 @@ sudo pip3 install --upgrade hopsworks~=X.X.0

-Add the bootstrap actions when configuring your EMR cluster. Provide 3 arguments to the bootstrap action: The name of the API key secret e.g., `hopsworks/featurestore`, +Add the bootstrap actions when configuring your EMR cluster. +Provide 3 arguments to the bootstrap action: The name of the API key secret e.g., `hopsworks/featurestore`, the public DNS name of your Hopsworks cluster, such as `ad005770-33b5-11eb-b5a7-bfabd757769f.cloud.hopsworks.ai`, and the name of your Hopsworks project, e.g. `demo_fs_meb10179`.
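+If you prefer to create the cluster programmatically instead of through the console, the same bootstrap action and arguments can be attached with `boto3`; the sketch below is illustrative only, and the region, cluster name, release label, instance settings and roles are placeholder assumptions:
+
+```python
+import boto3
+
+emr = boto3.client("emr", region_name="us-east-1")  # assumption: your EMR region
+
+response = emr.run_job_flow(
+    Name="hopsworks-feature-pipeline",   # assumption: any cluster name
+    ReleaseLabel="emr-6.2.0",            # assumption: your EMR release
+    Applications=[{"Name": "Spark"}],
+    Instances={
+        "MasterInstanceType": "m5.xlarge",
+        "SlaveInstanceType": "m5.xlarge",
+        "InstanceCount": 3,
+        "KeepJobFlowAliveWhenNoSteps": True,
+    },
+    JobFlowRole="EMR_EC2_DefaultRole",   # assumption: default EMR roles
+    ServiceRole="EMR_DefaultRole",
+    BootstrapActions=[
+        {
+            "Name": "hopsworks",
+            "ScriptBootstrapAction": {
+                "Path": "s3://my-emr-init/hopsworks.sh",
+                "Args": [
+                    "hopsworks/featurestore",                                   # API key secret name
+                    "ad005770-33b5-11eb-b5a7-bfabd757769f.cloud.hopsworks.ai",  # Hopsworks public DNS
+                    "demo_fs_meb10179",                                         # Hopsworks project name
+                ],
+            },
+        }
+    ],
+)
+```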

@@ -194,4 +206,5 @@ Your EMR cluster will now be able to access your Hopsworks Feature Store. ## Next Steps -Use the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/) to connect to the Hopsworks Feature Store. For more information about how to use the Feature Store, see the [Quickstart Guide](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"}. +Use the [Login API][hopsworks.login] to connect to the Hopsworks Feature Store. +For more information about how to use the Feature Store, see the [Quickstart Guide](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb){:target="_blank"}. diff --git a/docs/user_guides/integrations/emr/networking.md b/docs/user_guides/integrations/emr/networking.md index da468b945..205149f99 100644 --- a/docs/user_guides/integrations/emr/networking.md +++ b/docs/user_guides/integrations/emr/networking.md @@ -1,6 +1,7 @@ # Networking -In order for Spark to communicate with the Hopsworks Feature Store from EMR, networking needs to be set up correctly. This includes deploying the Hopsworks Feature Store to either the same VPC or enable VPC peering between the VPC of the EMR cluster and the Hopsworks Feature Store. +In order for Spark to communicate with the Hopsworks Feature Store from EMR, networking needs to be set up correctly. +This includes deploying the Hopsworks Feature Store to either the same VPC or enable VPC peering between the VPC of the EMR cluster and the Hopsworks Feature Store. ## Step 1: Ensure network connectivity @@ -8,7 +9,7 @@ The DataFrame API needs to be able to connect directly to the IP on which the Fe This means that if you deploy the Feature Store on AWS you will either need to deploy the Feature Store in the same VPC as your EMR cluster or to set up [VPC Peering](https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html) between your EMR VPC and the Feature Store VPC. -**Option 1: Deploy the Feature Store in the EMR VPC** +### Option 1: Deploy the Feature Store in the EMR VPC When deploying the Hopsworks Feature Store, select the EMR *VPC* and *Availability Zone* as the VPC and Availability Zone of your Feature Store. Identify your EMR VPC in the Summary of your EMR cluster: @@ -27,9 +28,10 @@ Identify your EMR VPC in the Summary of your EMR cluster:

-**Option 2: Set up VPC peering** +### Option 2: Set up VPC peering -Follow the guide [VPC Peering](https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html) to set up VPC peering between the Feature Store and EMR. Get your Feature Store *VPC ID* and *CIDR* by searching for the Feature Store VPC in the AWS Management Console: +Follow the guide [VPC Peering](https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html) to set up VPC peering between the Feature Store and EMR. +Get your Feature Store *VPC ID* and *CIDR* by searching for the Feature Store VPC in the AWS Management Console:

diff --git a/docs/user_guides/integrations/flink.md b/docs/user_guides/integrations/flink.md index 09cb6ac58..8587c57d1 100644 --- a/docs/user_guides/integrations/flink.md +++ b/docs/user_guides/integrations/flink.md @@ -3,11 +3,13 @@ description: Documentation on how to configure an external Flink cluster to writ --- # Flink Integration -Connecting to the Feature Store from an external Flink cluster, such as AWS EMR and GCP DataProc requires configuring it with the Hopsworks certificates, done automatically when using Hopsworks API. This guide explains how to connect to the Feature Store from an external Flink cluster. +Connecting to the Feature Store from an external Flink cluster, such as AWS EMR and GCP DataProc requires configuring it with the Hopsworks certificates, done automatically when using Hopsworks API. +This guide explains how to connect to the Feature Store from an external Flink cluster. ## Generating an API Key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the Flink integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Flink integration to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -23,7 +25,7 @@ You are now ready to connect to the Hopsworks Feature Store from Flink: HopsworksConnection hopsworksConnection = HopsworksConnection.builder() .host("my_instance") // DNS of your Feature Store instance .port(443) // Port to reach your Hopsworks instance, defaults to 443 - .project("my_project") // Name of your Hopsworks Feature Store project + .project("my_project") // Name of your Hopsworks Feature Store project .apiKeyValue("api_key") // The API key to authenticate with the feature store .hostnameVerification(false) // Disable for self-signed certificates .build(); @@ -34,4 +36,4 @@ FeatureStore fs = hopsworksConnection.getFeatureStore(); ## Next Steps -For more information and how to integrate Flink streaming feature pipeline to the Hopsworks Feature store follow the [tutorial](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/flink). \ No newline at end of file +For more information and how to integrate Flink streaming feature pipeline to the Hopsworks Feature store follow the [tutorial](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/flink). diff --git a/docs/user_guides/integrations/hdinsight.md b/docs/user_guides/integrations/hdinsight.md index 50f57cb42..8295b9329 100644 --- a/docs/user_guides/integrations/hdinsight.md +++ b/docs/user_guides/integrations/hdinsight.md @@ -2,18 +2,23 @@ description: Documentation on how to configure an HDInsight cluster to read and write features from the Hopsworks Feature Store --- # Configure HDInsight for the Hopsworks Feature Store + To enable HDInsight to access the Hopsworks Feature Store, you need to set up a Hopsworks API key, add a script action and configurations to your HDInsight cluster. !!! info "Prerequisites" - A HDInsight cluster with cluster type Spark is required to connect to the Feature Store. You can either use an existing cluster or create a new one. + A HDInsight cluster with cluster type Spark is required to connect to the Feature Store. + You can either use an existing cluster or create a new one. !!! 
info "Network Connectivity" - To be able to connect to the Feature Store, please ensure that your HDInsight cluster and the Hopsworks Feature Store are either in the same [Virtual Network](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-networks-overview) or [Virtual Network Peering](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-network-manage-peering) is set up between the different networks. In addition, ensure that the Network Security Group of your Hopsworks instance is configured to allow incoming traffic from your HDInsight cluster on ports 443, 3306, 8020, 30010, 9083 and 9085 (443,3306,8020,30010,9083,9085). See [Network security groups](https://docs.microsoft.com/en-us/azure/virtual-network/network-security-groups-overview) for more information. + To be able to connect to the Feature Store, please ensure that your HDInsight cluster and the Hopsworks Feature Store are either in the same [Virtual Network](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-networks-overview) or [Virtual Network Peering](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-network-manage-peering) is set up between the different networks. + In addition, ensure that the Network Security Group of your Hopsworks instance is configured to allow incoming traffic from your HDInsight cluster on ports 443, 3306, 8020, 30010, 9083 and 9085 (443,3306,8020,30010,9083,9085). + See [Network security groups](https://docs.microsoft.com/en-us/azure/virtual-network/network-security-groups-overview) for more information. ## Step 1: Set up a Hopsworks API key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the HDInsight integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the HDInsight integration to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -22,9 +27,14 @@ For instructions on how to generate an API key follow this [user guide](../proje ## Step 2: Use a script action to install the Feature Store connector -HDInsight requires Hopsworks connectors to be able to communicate with the Hopsworks Feature Store. These connectors can be installed with the script action shown below. Copy the content into a file, name the file `hopsworks.sh` and replace MY_INSTANCE, MY_PROJECT, MY_VERSION, MY_API_KEY and MY_CONDA_ENV with your values. Copy the `hopsworks.sh` file into any storage that is readable by your HDInsight clusters and take note of the URI of that file e.g., `https://account.blob.core.windows.net/scripts/hopsworks.sh`. +HDInsight requires Hopsworks connectors to be able to communicate with the Hopsworks Feature Store. +These connectors can be installed with the script action shown below. +Copy the content into a file, name the file `hopsworks.sh` and replace MY_INSTANCE, MY_PROJECT, MY_VERSION, MY_API_KEY and MY_CONDA_ENV with your values. +Copy the `hopsworks.sh` file into any storage that is readable by your HDInsight clusters and take note of the URI of that file e.g., `https://account.blob.core.windows.net/scripts/hopsworks.sh`. -The script action needs to be applied head and worker nodes and can be applied during cluster creation or to an existing cluster. Ensure to persist the script action so that it is run on newly created nodes. 
For more information about how to use script actions, see [Customize Azure HDInsight clusters by using script actions](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-linux). +The script action needs to be applied head and worker nodes and can be applied during cluster creation or to an existing cluster. +Ensure to persist the script action so that it is run on newly created nodes. +For more information about how to use script actions, see [Customize Azure HDInsight clusters by using script actions](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-linux). !!! attention "Matching Hopsworks version" @@ -38,6 +48,7 @@ The script action needs to be applied head and worker nodes and can be applied d

Feature Store script action: + ```bash set -e @@ -77,19 +88,24 @@ chown -R root:hadoop /usr/lib/hopsworks ## Step 3: Configure HDInsight for Feature Store access -The Hadoop and Spark installations of the HDInsight cluster need to be configured in order to access the Feature Store. This can be achieved either by using a [bootstrap script](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-bootstrap) when creating clusters or using [Ambari](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-manage-ambari) on existing clusters. Apply the following configurations to your HDInsight cluster. +The Hadoop and Spark installations of the HDInsight cluster need to be configured in order to access the Feature Store. +This can be achieved either by using a [bootstrap script](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-bootstrap) when creating clusters or using [Ambari](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-manage-ambari) on existing clusters. +Apply the following configurations to your HDInsight cluster. !!! attention "Using Hive and the Feature Store" - HDInsight clusters cannot use their local Hive when being configured for the Feature Store as the Feature Store relies on custom Hive binaries and its own Metastore which will overwrite the local one. If you rely on Hive for feature engineering then it is advised to write your data to an external data storage such as ADLS from your main HDInsight cluster and in the Feature Store, create an [on-demand](https://docs.hopsworks.ai/overview/#feature-groups) Feature Group on the storage container in ADLS. + HDInsight clusters cannot use their local Hive when being configured for the Feature Store as the Feature Store relies on custom Hive binaries and its own Metastore which will overwrite the local one. + If you rely on Hive for feature engineering then it is advised to write your data to an external data storage such as ADLS from your main HDInsight cluster and in the Feature Store, create an [on-demand](../../concepts/fs/feature_group/on_demand_feature.md) Feature Group on the storage container in ADLS. Hadoop hadoop-env.sh: -``` + +```sh export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hopsworks/client/* ``` Hadoop core-site.xml: -``` + +```ini hops.ipc.server.ssl.enabled=true fs.hopsfs.impl=io.hops.hopsfs.client.HopsFileSystem client.rpc.ssl.enabled.protocol=TLSv1.2 @@ -101,7 +117,8 @@ hops.ssl.trustore.name=/usr/lib/hopsworks/trustStore.jks ``` Spark spark-defaults.conf: -``` + +```ini spark.executor.extraClassPath=/usr/lib/hopsworks/client/* spark.driver.extraClassPath=/usr/lib/hopsworks/client/* spark.sql.hive.metastore.jars=path @@ -109,7 +126,8 @@ spark.sql.hive.metastore.jars.path=/usr/lib/hopsworks/apache-hive-bin/lib/* ``` Spark hive-site.xml: -``` + +```ini hive.metastore.uris=thrift://MY_HOPSWORKS_INSTANCE_PRIVATE_IP:9083 ``` @@ -142,4 +160,4 @@ fs = project.get_feature_store() ## Next Steps -For more information on how to use the Hopsworks API check out the other guides or the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/). \ No newline at end of file +For more information on how to use the Hopsworks API check out the other guides or the [Login API][hopsworks.login]. 
diff --git a/docs/user_guides/integrations/index.md b/docs/user_guides/integrations/index.md index fb9d212f8..7cb83ed8d 100644 --- a/docs/user_guides/integrations/index.md +++ b/docs/user_guides/integrations/index.md @@ -1,13 +1,14 @@ # Client Integrations -Hopsworks is an open platform aiming to be accessible from a variety of tools. Learn in this section how to connect to Hopsworks from +Hopsworks is an open platform aiming to be accessible from a variety of tools. +Learn in this section how to connect to Hopsworks from -- [Python, AWS SageMaker, Google Colab, Kubeflow](python) -- [Java](java) -- [Databricks](databricks/networking) -- [AWS EMR](emr/emr_configuration) -- [Azure HDInsight](hdinsight) -- [Azure Machine Learning](mlstudio_designer) -- [Apache Spark](spark) -- [Apache Flink](flink) -- [Apache Beam](beam) +- [Python, AWS SageMaker, Google Colab, Kubeflow](./python.md) +- [Java](./java.md) +- [Databricks](./databricks/networking.md) +- [AWS EMR](./emr/emr_configuration.md) +- [Azure HDInsight](./hdinsight.md) +- [Azure Machine Learning](./mlstudio_designer.md) +- [Apache Spark](./spark.md) +- [Apache Flink](./flink.md) +- [Apache Beam](./beam.md) diff --git a/docs/user_guides/integrations/java.md b/docs/user_guides/integrations/java.md index f9c62c7bd..9b2905b29 100644 --- a/docs/user_guides/integrations/java.md +++ b/docs/user_guides/integrations/java.md @@ -6,10 +6,10 @@ description: Documentation on how to connect to Hopsworks from a Java client. This guide explains step by step how to connect to Hopsworks from a Java client. - ## Generate an API key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the Java client to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Java client to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -30,7 +30,7 @@ import com.logicalclocks.hsfs.HopsworksConnection; HopsworksConnection hopsworksConnection = HopsworksConnection.builder() .host("my_instance") // DNS of your Feature Store instance .port(443) // Port to reach your Hopsworks instance, defaults to 443 - .project("my_project") // Name of your Hopsworks Feature Store project + .project("my_project") // Name of your Hopsworks Feature Store project .apiKeyValue("api_key") // The API key to authenticate with the feature store .hostnameVerification(false) // Disable for self-signed certificates .build(); @@ -48,4 +48,5 @@ List singleVector = fv.getFeatureVector(new HashMap() {{ ``` ## Next Steps -For more information how to interact from Java client with the Hopsworks Feature store follow this [tutorial](https://github.com/logicalclocks/hopsworks-tutorials/tree/java_engine/java). \ No newline at end of file + +For more information how to interact from Java client with the Hopsworks Feature store follow this [tutorial](https://github.com/logicalclocks/hopsworks-tutorials/tree/java_engine/java). 
diff --git a/docs/user_guides/integrations/mlstudio_designer.md b/docs/user_guides/integrations/mlstudio_designer.md index 38d41bbaf..5acd2c7a3 100644 --- a/docs/user_guides/integrations/mlstudio_designer.md +++ b/docs/user_guides/integrations/mlstudio_designer.md @@ -1,21 +1,25 @@ # Azure Machine Learning Designer Integration -Connecting to Hopsworks from the Azure Machine Learning Designer requires setting up a Hopsworks API key for the Designer and installing the **Hopsworks** Python library on the Designer. This guide explains step by step how to connect to the Feature Store from Azure Machine Learning Designer. +Connecting to Hopsworks from the Azure Machine Learning Designer requires setting up a Hopsworks API key for the Designer and installing the **Hopsworks** Python library on the Designer. +This guide explains step by step how to connect to the Feature Store from Azure Machine Learning Designer. !!! info "Network Connectivity" - To be able to connect to the Feature Store, please ensure that the Network Security Group of your Hopsworks instance on Azure is configured to allow incoming traffic from your compute target on ports 443, 9083 and 9085 (443,9083,9085). See [Network security groups](https://docs.microsoft.com/en-us/azure/virtual-network/network-security-groups-overview) for more information. If your compute target is not in the same VNet as your Hopsworks instance and the Hopsworks instance is not accessible from the internet then you will need to configure [Virtual Network Peering](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-network-manage-peering). + To be able to connect to the Feature Store, please ensure that the Network Security Group of your Hopsworks instance on Azure is configured to allow incoming traffic from your compute target on ports 443, 9083 and 9085 (443,9083,9085). + See [Network security groups](https://docs.microsoft.com/en-us/azure/virtual-network/network-security-groups-overview) for more information. + If your compute target is not in the same VNet as your Hopsworks instance and the Hopsworks instance is not accessible from the internet then you will need to configure [Virtual Network Peering](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-network-manage-peering). ## Generate an API key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the Azure ML Designer integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Azure ML Designer integration to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project 3. job 4. kafka -## Connect to Hopsworks +## Connect to Hopsworks To connect to Hopsworks from the Azure Machine Learning Designer, create a new pipeline or open an existing one: @@ -37,7 +41,9 @@ In the pipeline, add a new `Execute Python Script` step and replace the Python s !!! info "Updating the script" - Replace MY_VERSION, MY_API_KEY, MY_INSTANCE, MY_PROJECT and MY_FEATURE_GROUP with the respective values. The major version set for MY_VERSION needs to match the major version of Hopsworks. Check [PyPI](https://pypi.org/project/hopsworks/#history) for available releases. + Replace MY_VERSION, MY_API_KEY, MY_INSTANCE, MY_PROJECT and MY_FEATURE_GROUP with the respective values. 
+ The major version set for MY_VERSION needs to match the major version of Hopsworks. + Check [PyPI](https://pypi.org/project/hopsworks/#history) for available releases.
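For orientation, here is a minimal sketch of what the connection portion of the script can look like once the placeholders are substituted. The host, project, API key and feature group names below are hypothetical stand-ins for MY_INSTANCE, MY_PROJECT, MY_API_KEY and MY_FEATURE_GROUP, and the version pin stands in for MY_VERSION; adapt them to your own deployment.

```python
# Hedged sketch only: hypothetical values replacing the MY_* placeholders.
# First pin the library to your Hopsworks major version, e.g.
#   pip install "hopsworks[python]~=<MY_VERSION>"
import hopsworks

project = hopsworks.login(
    host="my_instance.cloud.hopsworks.ai",  # MY_INSTANCE
    port=443,
    project="my_project",                   # MY_PROJECT
    api_key_value="api_key",                # MY_API_KEY
)
fs = project.get_feature_store()

# MY_FEATURE_GROUP, read back as a DataFrame
df = fs.get_feature_group("my_feature_group", version=1).read()
```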

@@ -81,7 +87,8 @@ def azureml_main(dataframe1 = None, dataframe2 = None): return fs.get_feature_group('MY_FEATURE_GROUP', version=1).read(), ``` -Select a compute target and save the step. The step is now ready to use: +Select a compute target and save the step. +The step is now ready to use:

@@ -90,7 +97,8 @@ Select a compute target and save the step. The step is now ready to use:

-As a next step, you have to connect the previously created `Execute Python Script` step with the next step in the pipeline. For instance, to export the features to a CSV file, create a `Export Data` step: +As a next step, you have to connect the previously created `Execute Python Script` step with the next step in the pipeline. +For instance, to export the features to a CSV file, create an `Export Data` step:

@@ -121,7 +129,8 @@ Finally, submit the pipeline and wait for it to finish: !!! info "Performance on the first execution" - The `Execute Python Script` step can be slow when being executed for the first time as the Hopsworks library needs to be installed on the compute target. Subsequent executions on the same compute target should use the already installed library. + The `Execute Python Script` step can be slow when being executed for the first time as the Hopsworks library needs to be installed on the compute target. + Subsequent executions on the same compute target should use the already installed library.

@@ -132,4 +141,4 @@ Finally, submit the pipeline and wait for it to finish: ## Next Steps -For more information on how to use the Hopsworks API check out the other guides or the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/). \ No newline at end of file +For more information on how to use the Hopsworks API check out the other guides or the [Login API][hopsworks.login]. diff --git a/docs/user_guides/integrations/mlstudio_notebooks.md b/docs/user_guides/integrations/mlstudio_notebooks.md index d1f847a69..182b1784f 100644 --- a/docs/user_guides/integrations/mlstudio_notebooks.md +++ b/docs/user_guides/integrations/mlstudio_notebooks.md @@ -1,22 +1,27 @@ # Azure Machine Learning Notebooks Integration -Connecting to the Hopsworks from Azure Machine Learning Notebooks requires setting up a Hopsworks API key for Azure Machine Learning Notebooks and installing the **Hopsworks** Python library on the notebook. This guide explains step by step how to connect to the Hopsworks from Azure Machine Learning Notebooks. +Connecting to the Hopsworks from Azure Machine Learning Notebooks requires setting up a Hopsworks API key for Azure Machine Learning Notebooks and installing the **Hopsworks** Python library on the notebook. +This guide explains step by step how to connect to the Hopsworks from Azure Machine Learning Notebooks. !!! info "Network Connectivity" - To be able to connect to the Feature Store, please ensure that the Network Security Group of your Hopsworks instance on Azure is configured to allow incoming traffic from your compute target on ports 443, 9083 and 9085 (443,9083,9085). See [Network security groups](https://docs.microsoft.com/en-us/azure/virtual-network/network-security-groups-overview) for more information. If your compute target is not in the same VNet as your Hopsworks instance and the Hopsworks instance is not accessible from the internet then you will need to configure [Virtual Network Peering](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-network-manage-peering). + To be able to connect to the Feature Store, please ensure that the Network Security Group of your Hopsworks instance on Azure is configured to allow incoming traffic from your compute target on ports 443, 9083 and 9085 (443,9083,9085). + See [Network security groups](https://docs.microsoft.com/en-us/azure/virtual-network/network-security-groups-overview) for more information. + If your compute target is not in the same VNet as your Hopsworks instance and the Hopsworks instance is not accessible from the internet then you will need to configure [Virtual Network Peering](https://docs.microsoft.com/en-us/azure/virtual-network/virtual-network-manage-peering). ## Install Hopsworks Python Library -To be able to interact with Hopsworks from a Python environment you need to install the `Hopsworks` Python library. The library is available on [PyPi](https://pypi.org/project/hopsworks/) and can be installed using `pip`: +To be able to interact with Hopsworks from a Python environment you need to install the `Hopsworks` Python library. +The library is available on [PyPi](https://pypi.org/project/hopsworks/) and can be installed using `pip`: -``` +```sh pip install hopsworks[python]~=[HOPSWORKS_VERSION] ``` !!! attention "Python Profile" - By default, `pip install hopsworks` does not install all the necessary dependencies required to use the Hopsworks library from a local Python environment. 
To ensure that all the dependencies are installed, you should install the library using with the Python profile `pip install hopsworks[python]`. + By default, `pip install hopsworks` does not install all the necessary dependencies required to use the Hopsworks library from a local Python environment. + To ensure that all the dependencies are installed, you should install the library using with the Python profile `pip install hopsworks[python]`. !!! attention "Matching Hopsworks version" @@ -31,7 +36,8 @@ pip install hopsworks[python]~=[HOPSWORKS_VERSION] ## Generate an API key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the Azure ML Notebooks integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Azure ML Notebooks integration to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -49,12 +55,12 @@ To access Hopsworks from Azure Machine Learning, open a Python notebook and proc

-### Connect to Hopsworks +### Connect to Hopsworks You are now ready to connect to Hopsworks Feature Store from the notebook: ```python -import hopsworks +import hopsworks # Put the API key into Key Vault for any production setup: # See, https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-secrets-in-runs @@ -79,4 +85,4 @@ fs = project.get_feature_store() ## Next Steps -For more information on how to use the Hopsworks API check out the other guides or the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/). \ No newline at end of file +For more information on how to use the Hopsworks API check out the other guides or the [Login API][hopsworks.login]. diff --git a/docs/user_guides/integrations/python.md b/docs/user_guides/integrations/python.md index fd9769c0d..5e1bc2e66 100644 --- a/docs/user_guides/integrations/python.md +++ b/docs/user_guides/integrations/python.md @@ -1,5 +1,5 @@ --- -description: Documentation on how to connect to Hopsworks from a Python environment (e.g. from Sagemaker, Google Colab, Kubeflow or local environment) +description: Documentation on how to connect to Hopsworks from a Python environment (e.g., from Sagemaker, Google Colab, Kubeflow or local environment) --- # Python Environments (Local, AWS SageMaker, Google Colab or Kubeflow) @@ -8,15 +8,17 @@ This guide explains step by step how to connect to Hopsworks from any Python env ## Install Python Library -To be able to interact with Hopsworks from a Python environment you need to install the `Hopsworks` Python library. The library is available on [PyPi](https://pypi.org/project/hopsworks/) and can be installed using `pip`: +To be able to interact with Hopsworks from a Python environment you need to install the `Hopsworks` Python library. +The library is available on [PyPi](https://pypi.org/project/hopsworks/) and can be installed using `pip`: -``` +```sh pip install hopsworks[python]~=[HOPSWORKS_VERSION] ``` !!! attention "Python Profile" - By default, `pip install hopsworks`, does not install all the necessary dependencies required to use the Hopsworks library from a pure Python environment. To ensure that all the dependencies are installed, you should install the library using with the Python profile `pip install hopsworks[python]`. + By default, `pip install hopsworks`, does not install all the necessary dependencies required to use the Hopsworks library from a pure Python environment. + To ensure that all the dependencies are installed, you should install the library using with the Python profile `pip install hopsworks[python]`. !!! attention "Matching Hopsworks version" @@ -31,7 +33,8 @@ pip install hopsworks[python]~=[HOPSWORKS_VERSION] ## Generate an API key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the Python client to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Python client to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. 
project @@ -43,7 +46,7 @@ For instructions on how to generate an API key follow this [user guide](../proje You are now ready to connect to Hopsworks from your Python environment: ```python -import hopsworks +import hopsworks project = hopsworks.login( host='my_instance', # DNS of your Hopsworks instance port=443, # Port to reach your Hopsworks instance, defaults to 443 @@ -56,10 +59,11 @@ fs = project.get_feature_store() # Get the project's default feature stor !!! note "Engine" - `Hopsworks` leverages several engines depending on whether you are running using Apache Spark or Pandas/Polars. The default behaviour of the library is to use the `spark` engine if you do not specify any `engine` option in the `login` method and if the `PySpark` library is available in the environment. + `Hopsworks` leverages several engines depending on whether you are running using Apache Spark or Pandas/Polars. + The default behaviour of the library is to use the `spark` engine if you do not specify any `engine` option in the `login` method and if the `PySpark` library is available in the environment. Please refer to the [Spark integration guide](spark.md) to configure your PySpark cluster to interact with Hopsworks. ## Next Steps -For more information on how to use the Hopsworks API check out the other guides or the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/). +For more information on how to use the Hopsworks API check out the other guides or the [Login API][hopsworks.login]. diff --git a/docs/user_guides/integrations/spark.md b/docs/user_guides/integrations/spark.md index d282fecfa..9dc774738 100644 --- a/docs/user_guides/integrations/spark.md +++ b/docs/user_guides/integrations/spark.md @@ -3,11 +3,18 @@ description: Documentation on how to configure an external Spark cluster to read --- # Spark Integration -Connecting to the Feature Store from an external Spark cluster, such as Cloudera, requires configuring it with the Hopsworks client jars and configuration. This guide explains step by step how to connect to the Feature Store from an external Spark cluster. +Connecting to the Feature Store from an external Spark cluster, such as Cloudera, requires configuring it with the Hopsworks client jars and configuration. +This guide explains step by step how to connect to the Feature Store from an external Spark cluster. ## Download the Hopsworks Client Jars -In the *Project Settings*, select the *integration* tab and scroll to the *Configure Spark Integration* section. Click on *Download client Jars*. This will start the download of the *client.tar.gz* archive. The archive contains two jar files for HopsFS, the Apache Hudi jar and the Java version of the HSFS library. You should upload these libraries to your Spark cluster and attach them as local resources to your Job. If you are using `spark-submit`, you should specify the `--jar` option. For more details see: [Spark Dependency Management](https://spark.apache.org/docs/latest/submitting-applications.html#advanced-dependency-management). +In the *Project Settings*, select the *integration* tab and scroll to the *Configure Spark Integration* section. +Click on *Download client Jars*. +This will start the download of the *client.tar.gz* archive. +The archive contains two jar files for HopsFS, the Apache Hudi jar and the Java version of the HSFS library. +You should upload these libraries to your Spark cluster and attach them as local resources to your Job. 
+If you are using `spark-submit`, you should specify the `--jars` option. +For more details see: [Spark Dependency Management](https://spark.apache.org/docs/latest/submitting-applications.html#advanced-dependency-management).

@@ -18,8 +25,11 @@ In the *Project Settings*, select the *integration* tab and scroll to the *Confi ## Download the certificates -Download the certificates from the same section as above. Hopsworks uses X.509 certificates for authentication and authorization. If you are interested in the Hopsworks security model, you can read more about it in this [blog post](https://www.logicalclocks.com/blog/how-we-secure-your-data-with-hopsworks). -The certificates are composed of three different components: the `keyStore.jks` containing the private key and the certificate for your project user, the `trustStore.jks` containing the certificates for the Hopsworks certificates authority, and a password to unlock the private key in the `keyStore.jks`. The password is displayed in a pop-up when downloading the certificate and should be saved in a file named `material_passwd`. +Download the certificates from the same section as above. +Hopsworks uses X.509 certificates for authentication and authorization. +If you are interested in the Hopsworks security model, you can read more about it in this [blog post](https://www.logicalclocks.com/blog/how-we-secure-your-data-with-hopsworks). +The certificates are composed of three different components: the `keyStore.jks` containing the private key and the certificate for your project user, the `trustStore.jks` containing the certificates for the Hopsworks certificates authority, and a password to unlock the private key in the `keyStore.jks`. +The password is displayed in a pop-up when downloading the certificate and should be saved in a file named `material_passwd`. !!! warning When you copy-paste the password to the `material_passwd` file, pay attention to not introduce additional empty spaces or new lines. @@ -32,10 +42,9 @@ The three files (`keyStore.jks`, `trustStore.jks` and `material_passwd`) should Currently Spark version 3.3.x is suggested to be able to use the full suite of Hopsworks Feature Store capabilities. - Add the following configuration to the Spark application: -``` +```plaintext spark.hadoop.fs.hopsfs.impl io.hops.hopsfs.client.HopsFileSystem spark.hadoop.hops.ipc.server.ssl.enabled true spark.hadoop.hops.ssl.hostname.verifier ALLOW_ALL @@ -60,7 +69,8 @@ To use PySpark, install the HSFS Python library which can be found on [PyPi](htt ## Generating an API Key -For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). For the Spark integration to work correctly make sure you add the following scopes to your API key: +For instructions on how to generate an API key follow this [user guide](../projects/api_key/create_api_key.md). +For the Spark integration to work correctly make sure you add the following scopes to your API key: 1. featurestore 2. project @@ -72,7 +82,7 @@ For instructions on how to generate an API key follow this [user guide](../proje You are now ready to connect to the Hopsworks Feature Store from Spark: ```python -import hopsworks +import hopsworks project = hopsworks.login( host='my_instance', # DNS of your Feature Store instance port=443, # Port to reach your Hopsworks instance, defaults to 443 @@ -85,8 +95,10 @@ fs = project.get_feature_store() # Get the project's default feature s !!! note "Engine" - `Hopsworks` leverages several engines depending on whether you are running using Apache Spark or Pandas/Polars. 
The default behaviour of the library is to use the `spark` engine if you do not specify any `engine` option in the `login` method and if the `PySpark` library is available in the environment. + `Hopsworks` leverages several engines depending on whether you are running using Apache Spark or Pandas/Polars. + The default behaviour of the library is to use the `spark` engine if you do not specify any `engine` option in the `login` method and if the `PySpark` library is available in the environment. ## Next Steps -For more information about how to connect, see the [Login API](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/) API reference. Or continue with the Data Source guide to import your own data to the Feature Store. +For more information about how to connect, see the [Login API][hopsworks.login]. +Or continue with the Data Source guide to import your own data to the Feature Store. diff --git a/docs/user_guides/migration/30_migration.md b/docs/user_guides/migration/30_migration.md deleted file mode 100644 index 8f1eda08f..000000000 --- a/docs/user_guides/migration/30_migration.md +++ /dev/null @@ -1,219 +0,0 @@ -# 3.0 Migration Guide - -## Breaking Changes - -### Feature View - -Feature View is a new core abstraction introduced in version 3.0. The Feature View extends and replaces the old Training Dataset. Feature views are now the gateway for users to access feature data from the feature store. - -=== "Pre-3.0" - ```python - td = fs.create_training_dataset() - ``` - -=== "3.0" - First create the feature view using a Query object: - ```python - fv = fs.create_feature_view() - ``` - - You can then create training data from the feature view by writing it as new files with `.create_` methods: - ```python - td_df = fv.create_training_data() - fv.create_train_test_split() - fv.create_train_validation_test_split() - ``` - - Or you can directly create the training data in memory by below methods: - ```python - fv.training_data() - fv.train_test_split() - fv.train_validation_test_split() - ``` - -This means that the process for generating training data changes slightly and training data is grouped by feature views. -This has the following advantages: - -1. It will be easier to create new versions of training data for re-training of models in the future. -2. You do not have to keep references to training data in your model serving code, instead you use the Feature View object. -3. The feature view offers the same interface for retrieving batch data for batch scoring, so you don’t have to execute SQL against the feature store explicitly anymore. - -A feature view uses the Query abstraction to define the schema of the view, and therefore, the Query is a mandatory argument, this effectively removes the possibility to create a training dataset directly from a Spark Dataframe. - -#### Required changes -After the upgrade all existing training datasets will be located under a Feature View with the same name. The versions of the training datasets stay untouched. You can use `feature_view.get_training_data` to get back existing training datasets. Except that training datasets created directly from Spark Dataframe are not migrated to feature view. In this case, you can use the old APIs for retrieving these training datasets, so that your existing training pipelines are still functional. - -However, if you have any pipelines, that automatically create training datasets, you will have to adjust these to the above workflow. `fs.create_training_dataset()` has been removed. 
- -To learn more about the new Feature View, have a look at the dedicated [concept](../../concepts/fs/feature_view/fv_overview.md) and [guide](../fs/feature_view/overview.md) section in the documentation. - -### Deequ-based Data Validation in favour of Great Expectations -Unfortunately, the [Deequ data validation library](https://github.com/awslabs/deequ) is no longer actively maintained which makes it impossible for us to maintain the functionality within Hopsworks. Therefore, we are dropping the entire support for Deequ as validation engine in Hopsworks 3.0 in favour of [Great Expectations](https://greatexpectations.io/) (GE). - -This has the following advantages: - -1. Great Expectations has become the defacto standard for data validation within the community. -2. Hopsworks is fully compatible with GE native objects, that means you can bring your existing expectation suites without the need for rewriting them. -3. GE is both available for Spark and for Pandas Dataframes, whereas Deequ was only supporting Spark. - -#### Required changes -All APIs regarding data validation have been redesigned to accommodate the functionality of GE. This means that you will have to redesign your previous expectations in the form of GE expectation suites that you can attach to Feature Groups. Please refer to the [data validation guide](../fs/feature_group/data_validation.md) for a full specification of the functionality. - -#### Limitations -GE is a Python library and therefore we can support synchronous data validation only in Python and PySpark kernels and not on Java/Scala Spark kernels. However, you have the possibility to launch a job asynchronously after writing with Java/Scala in order to perform data validation. - -## Deprecated Features - -These changes or new features introduce changes in APIs which might break your pipelines in the future. We try to keep old APIs around until the next major release in order to give you some time to adapt your pipelines, however, this is not always possible, and these methods might be removed in any upcoming release, so we recommend addressing these changes as soon as possible. For this reason, we list some of the changes as breaking change, even though they are still backwards compatible. - -### On-Demand Feature Groups are now called External Feature Groups - -Most data engineers but also many data scientists have a background where they at least partially where exposed to database terminology. Therefore, we decided to rename On-Demand Feature Groups to simply External Feature Groups. We think this makes the abstraction clearer, as practitioners are usually familiar with the concept of External Tables in a database. - -This lead to a change in HSFS APIs: - -=== "Pre-3.0" - ```python - fs.create_on_demand_feature_group() - fs.get_on_demand_feature_group() - fs.get_on_demand_feature_groups() - ``` - -=== "3.0" - ```python - fs.create_external_feature_group() - fs.get_external_feature_group() - fs.get_external_feature_groups() - ``` - -Note, pre-3.0 methods are marked as deprecated and still available in the library for backwards compatibility. - -### Streaming API for writing becomes the Python Default - -Hopsworks provides [three write APIs](../../concepts/fs/feature_group/write_apis.md) to the Feature Store to accommodate the different use cases: - -1. **Batch Write:** This was the default mode prior to version 3.0. It involves writing a DataFrame in batch either to the offline feature store, or the online one, or both. 
This mode is still the default when you are writing Spark DataFrames on Hopsworks. -2. **External connectors:** This mode allows users to mount external tables existing in DataWarehouses like Snowflake, Redshift and BigQuery as feature groups in Hopsworks. In this case the data is not moved and remains on the external data storage. -3. **Stream Write:** This mode was introduced in version 2.0 and expanded in version 3.0. This mode has a "Kappa-style" architecture, where the DataFrame gets streamed into a Kafka topic and, as explained later in the post, the data is picked up from Kafka by Hopsworks and written into the desired stores. In Hopsworks 3.0 this is the default mode for Python clients. - -With 3.0 the stream API becomes the default for Feature Groups created from pure Python environments with Pandas Dataframes. - -This has the following advantages: - -1. **Reduced write amplification:** Instead of uploading data to Hopsworks and subsequently starting a Spark job to upsert the data on offline storage and writing it to Kafka for the online storage upsert, the data is directly written to Kafka and from there it’s being upserted directly to offline and/or online. - -2. **Fresher features:** Since new data gets written to Kafka directly without prior upload, the data ends up in the online feature store with subsecond latency, which is a massive improvement given it is written from Python without any Streaming framework. - -3. **Batching of offline upserts:** You can control now yourself how often the Spark application that performs the upsert on the offline feature store is running. Either you run it synchronously with every new Dataframe ingestion, or you batch multiple Dataframes by launching the job less regularly. - -#### Required changes -Your existing feature groups will not be affected by this change, that means all existing feature groups will continue to use the old upload path for ingestion. However, we strongly recommend creating new versions of your existing feature groups that use ingest to using Python, in order to leverage the above advantages. - -### Built-in transformation functions don’t have to be registered explicitly for every project -In Hopsworks 2.5 users had to register the built-in transformation functions (min-max scaler, standard scaler, label encoder and robust scaler) explicitly for every project by calling `fs.register_builtin_transformation_functions()`. This is no longer necessary, as all new projects will have the functions registered by default. - -### Hive installation extra deprecated in favour of Python extra -In the past when using HSFS in pure Python environments without PySpark, users had to install the hive extra when installing the PyPi package. This extra got deprecated and users now have to install an extra called python to reflect the environment: - -=== "Pre-3.0" - ```bash - pip install hsfs[hive] - ``` - -=== "3.0" - ```bash - pip install hsfs[python] - ``` - -### More restrictive feature types -With Hopsworks 3.0 we made feature types more strict and therefore made ingestion pipelines more robust. Both Spark and Pandas are quite forgiving when it comes to types, which often led to schema incompatibilities when ingesting to a feature group. - -In this release we narrowed down the allowed Python types, and defined a clear mapping to Spark and Online Feature Store types. Please refer to the [feature type guide](../fs/feature_group/data_types.md) in the documentation for the exact mapping. 
- -#### Required Changes -The most common Python/Pandas types are still supported, we recommend you double check your feature groups with the type mapping above. - -### Deprecation of .save methods in favour of .insert together with .get_or_create_ -The `.save()` methods to create feature store entities has been deprecated in favour of `.insert()`. That means if there is no metadata for an entity in the feature store, a call to `.insert()` will create it. - -Together with the new `.get_or_create_ APIs` this will avoid that users have to change their code between creating entities and deploying the same code into production. - -=== "Pre-3.0" - ```bash - try: - fg = fs.get_feature_group(...) - fg.insert(df) - except RESTError as e: - fg = fs.create_feature_group(...) - fg.save(df) - ``` - -=== "3.0" - ```bash - fg = fs.get_or_create_feature_group(...) - fg.insert(df) - ``` - -### hops python library superseded by Hopsworks library - -The [hops](https://pypi.org/project/hops/) python library is now deprecated and is superseded by the [hopsworks](https://pypi.org/project/hopsworks/) python library. `hopsworks` is essentially a reimplementation of `hops`, but with an object-oriented API, similar in style with [hsfs](https://pypi.org/project/hsfs/). For guides on how to use the API follow the [Projects guides](../../user_guides/projects/index.md). - -Furthermore, the functionality provided by the `model` and `serving` module in `hops`, is now ported to the [hsml](https://pypi.org/project/hsml/) python library. To create models and serving follow the [MLOps guides](../../user_guides/mlops/index.md). - -## New Feature Highlights - -This list is meant to serve as a starting point to explore the new features of the Hopsworks 3.0 release, which can significantly improve your workflows. - -### Added new Data Sources: GCS, BigQuery and Kafka -With the added support for Google Cloud, we added also two new [data sources](../fs/data_source/index.md): [Google Cloud Storage](../fs/data_source/creation/gcs.md) and [Google BigQuery](../fs/data_source/creation/bigquery.md). Users can use these connectors to create external feature groups or write out training data. - -Additionally, to make it easier for users to get started with Spark Streaming applications, we added a [Kafka connector](../fs/data_source/creation/kafka.md), which let’s you easily read a Kafka topic into a Spark Streaming Dataframe. - -### Optimized Default Hudi Options -By default, Hudi tends to over-partition input, and therefore the layout of Feature Groups. The default parallelism is 200, to ensure each Spark partition stays within the 2GB limit for inputs up to 500GB. The new default is the following for all insert/upsert operations: - -```python -extra_write_options = { - 'hoodie.bulkinsert.shuffle.parallelism': '5', - 'hoodie.insert.shuffle.parallelism': '5', - 'hoodie.upsert.shuffle.parallelism': '5' -} -``` - -In most of the cases, you will not have to change this default. For very large inputs you should bump this up accordingly, by passing it to the `write_options` argument of the Feature Group `.insert()` method: - -```python -fg.insert(df, write_options=extra_write_options) -``` - -We recommend having shuffle parallelism `hoodie.[insert|upsert|bulkinsert].shuffle.parallelism` such that its at least input_data_size/500MB. 
- -### Feature View passed features -With the introduction of the [Feature View abstraction](../../concepts/fs/feature_view/fv_overview.md), we added APIs to allow users to overwrite features with so-called [passed features](../fs/feature_view/feature-vectors.md#passed-features) when calling `fv.get_feature_vector()`: - -```python -# get a single vector -feature_view.get_feature_vector( - entry = {"pk1": 1, "pk2": 2}, - passed_features = {"feature_a": "value_a"} -) -# get multiple vectors -feature_view.get_feature_vectors( - entry = [ - {"pk1": 1, "pk2": 2}, - {"pk1": 3, "pk2": 4}, - {"pk1": 5, "pk2": 6} - ], - passed_features = [ - {"feature_a": "value_a"}, - {"feature_a": "value_b"}, - {"feature_a": "value_c"}, - ] -) -``` - -This is useful, if some of the features values are only known at prediction time and cannot be computed and cached in the online feature store, you can provide those values as `passed_features` option. The `get_feature_vector` method is going to use the passed values to construct the final feature vector to submit to the model, it will also apply any transformations to the passed features attached to the respective features. - -### Reading Training Data directly from HopsFS in Python environments - -In Hopsworks 3.0 we added support for [reading training data](../fs/feature_view/training-data.md#read-training-data) from Hopsworks directly into external Python environments. Previously, users had to write the training data to external storage like S3 in order to access it from external environment. diff --git a/docs/user_guides/migration/40_migration.md b/docs/user_guides/migration/40_migration.md index 596c87c7e..691bacdab 100644 --- a/docs/user_guides/migration/40_migration.md +++ b/docs/user_guides/migration/40_migration.md @@ -2,10 +2,8 @@ ## Breaking Changes -With the release of Hopsworks 4.0, a number of necessary breaking -changes have been put in place to improve the overall experience of -using the Hopsworks platform. These breaking changes can be categorized -in the following areas: +With the release of Hopsworks 4.0, a number of necessary breaking changes have been put in place to improve the overall experience of using the Hopsworks platform. +These breaking changes can be categorized in the following areas: - Python API @@ -15,26 +13,15 @@ in the following areas: ### Python API -A number of significant changes have been made in the Python API -Hopsworks 4.0. Previously, in Hopsworks 3.X, there were 3 python -libraries used (“hopsworks”, “hsfs” & “hsml”) to develop feature, -training & inference pipelines, with the 4.0 release there is now -one single “hopsworks” python library that should be used. For -backwards compatibility, it is still possible to import both -the “hsfs” & “hsml” packages directly, but the proper way to -import them is to use “hopsworks.hsfs” & “hopsworks.hsml”. +A number of significant changes have been made in the Python API Hopsworks 4.0. Previously, in Hopsworks 3.X, there were 3 python libraries used (“hopsworks”, “hsfs” & “hsml”) to develop feature, training & inference pipelines, with the 4.0 release there is now one single “hopsworks” python library that should be used. +For backwards compatibility, it is still possible to import both the “hsfs” & “hsml” packages directly, but the proper way to import them is to use “hopsworks.hsfs” & “hopsworks.hsml”. The direct imports will be deprecated later. 
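As a small illustration of the import change described above (module paths follow the text; treat this as a sketch rather than an exhaustive API listing):

```python
# Hopsworks 4.0+: import the sub-packages through the single `hopsworks` library
import hopsworks.hsfs as hsfs
import hopsworks.hsml as hsml

# Still accepted for backwards compatibility, but the direct form is slated for deprecation:
# import hsfs
# import hsml
```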
-Another significant change in the Hopsworks Python API is the use of -optional extras to allow a developer to easily import exactly what is -needed as part of their work. The main ones are great-expectations and -polars. It is arguable whether this is a breaking change but it is -important to note depending on how a particular pipeline has been -written which may encounter a problem when executing using Hopsworks -4.0. +Another significant change in the Hopsworks Python API is the use of optional extras to allow a developer to easily import exactly what is needed as part of their work. +The main ones are great-expectations and polars. +It is arguable whether this is a breaking change but it is important to note depending on how a particular pipeline has been written which may encounter a problem when executing using Hopsworks 4.0. -Finally, there are a number of relatively small breaking changes and -deprecated methods to improve the developer experience, these include: +Finally, there are a number of relatively small breaking changes and deprecated methods to improve the developer experience, these include: - connection.init() is now considered deprecated @@ -42,93 +29,79 @@ deprecated methods to improve the developer experience, these include: - DatasetApi's zip and unzip will now return False when a timeout is exceeded instead of previously throwing an Exception - ### Multi-Environment Docker Images -As part of the Hopsworks 4.0 release, an engineering team using -Hopsworks can now customize the docker images that they use for their -feature, training and inference pipelines. By adding this flexibility, -a set of breaking changes are necessary. Instead of having one common -docker image for fti pipelines, with the release of 4.0 a number of -specific docker images are provided to allow an engineering team using -Hopsworks to install exactly what they need to get their feature, -training and inference pipelines up and running. This breaking change -will require existing customers running Hopsworks 3.X to test their -existing pipelines using Hopsworks 4.0 before upgrading their -production environments. - +As part of the Hopsworks 4.0 release, an engineering team using Hopsworks can now customize the docker images that they use for their feature, training and inference pipelines. +By adding this flexibility, a set of breaking changes are necessary. +Instead of having one common docker image for fti pipelines, with the release of 4.0 a number of specific docker images are provided to allow an engineering team using Hopsworks to install exactly what they need to get their feature, training and inference pipelines up and running. +This breaking change will require existing customers running Hopsworks 3.X to test their existing pipelines using Hopsworks 4.0 before upgrading their production environments. ### On-Demand Transformation Functions -A number of changes have been made to transformation functions in the -last releases of Hopsworks. With 4.0, On-Demand Transformation Functions -are now better supported which has resulted in some breaking changes. -The following is how transformation functions were used in previous -versions of Hopsworks and the how transformation functions are used -in the 4.0 release. - +A number of changes have been made to transformation functions in the last releases of Hopsworks. +With 4.0, On-Demand Transformation Functions are now better supported which has resulted in some breaking changes. 
+The following is how transformation functions were used in previous versions of Hopsworks and the how transformation functions are used in the 4.0 release. === "Pre-4.0" ```python - ################################################# - # Creating transformation funciton Hopsworks 3.8# - ################################################# + ################################################# + # Creating transformation funciton Hopsworks 3.8# + ################################################# - # Define custom transformation function - def add_one(feature): - return feature + 1 + # Define custom transformation function + def add_one(feature): + return feature + 1 - # Create transformation function - add_one = fs.create_transformation_function(add_one, - output_type=int, + # Create transformation function + add_one = fs.create_transformation_function(add_one, + output_type=int, version=1, ) - # Save transformation function - add_one.save() - - # Retrieve transformation function - scaler = fs.get_transformation_function( - name="add_one", - version=1, - ) - - # Create feature view - feature_view = fs.get_or_create_feature_view( - name='serving_fv', - version=1, - query=selected_features, - # Apply your custom transformation functions to the feature `feature_1` - transformation_functions={ - "feature_1": add_one, - }, - labels=['target'], - ) + # Save transformation function + add_one.save() + + # Retrieve transformation function + scaler = fs.get_transformation_function( + name="add_one", + version=1, + ) + + # Create feature view + feature_view = fs.get_or_create_feature_view( + name='serving_fv', + version=1, + query=selected_features, + # Apply your custom transformation functions to the feature `feature_1` + transformation_functions={ + "feature_1": add_one, + }, + labels=['target'], + ) ``` === "4.0" ```python - ################################################# - # Creating transformation funciton Hopsworks 4.0# - ################################################# - - # Define custom transformation function - @hopsworks.udf(int) - def add_one(feature): - return feature + 1 - - # Create feature view - feature_view = fs.get_or_create_feature_view( - name='serving_fv', - version=1, - query=selected_features, - # Apply the custom transformation functions defined to the feature `feature_1` - transformation_functions=[ - add_one("feature_1"), - ], - labels=['target'], - ) + ################################################# + # Creating transformation funciton Hopsworks 4.0# + ################################################# + + # Define custom transformation function + @hopsworks.udf(int) + def add_one(feature): + return feature + 1 + + # Create feature view + feature_view = fs.get_or_create_feature_view( + name='serving_fv', + version=1, + query=selected_features, + # Apply the custom transformation functions defined to the feature `feature_1` + transformation_functions=[ + add_one("feature_1"), + ], + labels=['target'], + ) ``` -Note that the number of lines of code required has been significantly -reduced using the “@hopsworks.udf” python decorator. +Note that the number of lines of code required has been significantly reduced using the “@hopsworks.udf” python decorator. 
diff --git a/docs/user_guides/mlops/provenance/provenance.md b/docs/user_guides/mlops/provenance/provenance.md index 344d3d8c1..842706b7d 100644 --- a/docs/user_guides/mlops/provenance/provenance.md +++ b/docs/user_guides/mlops/provenance/provenance.md @@ -14,19 +14,22 @@ In the provenance pages we will call a provenance artifact or shortly artifact, With the following provenance graph: -``` +```plaintext data source -> feature group -> feature group -> feature view -> training dataset -> model ``` -we will call the parent, the artifact to the left, and the child, the artifact to the right. So a feature view has a number of feature groups as parents and can have a number of training datasets as children. +we will call the parent, the artifact to the left, and the child, the artifact to the right. +So a feature view has a number of feature groups as parents and can have a number of training datasets as children. -Tracking provenance allows users to determine where and if an artifact is being used. You can track, for example, if feature groups are being used to create additional (derived) feature groups or feature views, or if their data is eventually used to train models. +Tracking provenance allows users to determine where and if an artifact is being used. +You can track, for example, if feature groups are being used to create additional (derived) feature groups or feature views, or if their data is eventually used to train models. You can interact with the provenance graph using the UI or the APIs. ## Model provenance -The relationship between feature views and models is captured in the model [constructor](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model_registry/model_api/#create_model). If you do not provide at least the feature view object to the constructor, the provenance will not capture this relation and you will not be able to navigate from model to the feature view it used or from the feature view to this model. +The relationship between feature views and models is captured in the [model][hsml.model.Model] constructor. +If you do not provide at least the feature view object to the constructor, the provenance will not capture this relation and you will not be able to navigate from model to the feature view it used or from the feature view to this model. You can provide the feature view object and have the training dataset version be inferred. @@ -66,9 +69,9 @@ You can of course explicitly provide the training dataset version. Once the relation is stored in the provenance graph, you can navigate the graph from model to feature view or training dataset and the other way around. -Users can call the [get_feature_view_provenance(https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model_registry/model_api/#get_feature_view_provenance) method or the [get_training_dataset_provenance(https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model_registry/model_api/#get_training_dataset_provenance) method which will each return a [Link](#provenance-links) object. +Users can call the [`Model.get_feature_view_provenance`][hsml.model.Model.get_feature_view_provenance] method or the [`Model.get_training_dataset_provenance`][hsml.model.Model.get_training_dataset_provenance] method which will each return a [provenance Link object](#provenance-links). 
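A minimal sketch of navigating that relation from the model side (the registry handle `mr` and the model name are assumptions, and the `accessible` field follows the provenance links description later in this guide):

```python
# Assumes an existing model registry handle `mr` and a registered model (name is hypothetical)
model = mr.get_model("fraud_model", version=1)

# Parent feature view(s) captured in the provenance graph
fv_link = model.get_feature_view_provenance()
for fv in fv_link.accessible:   # feature views the caller can still access
    print(fv.name, fv.version)

# The training dataset side works the same way
td_link = model.get_training_dataset_provenance()
```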
-You can also retrieve directly the parent feature view object, without the need to extract them from the provenance links object, using the [get_feature_view(https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model_registry/model_api/#get_feature_view ) method +You can also retrieve directly the parent feature view object, without the need to extract them from the provenance links object, using the [`Model.get_feature_view`][hsml.model.Model.get_feature_view] method. === "Python" @@ -84,7 +87,8 @@ This utility method also has the options to initialize the required components f model.get_feature_view(init: bool = True, online: Optional[bool]: None) ``` -By default, the base init for feature vector retrieval is enabled. In case you have a workflow that requires more particular options, you can disable this base init by setting the `init` to `false`. +By default, the base init for feature vector retrieval is enabled. +In case you have a workflow that requires more particular options, you can disable this base init by setting the `init` to `false`. The method detects if it is running within a deployment and will initialize the feature vector retrieval for the serving. If the `online` argument is provided and `true` it will initialize for online feature vector retrieval. If the `online` argument is provided and `false` it will initialize the feature vector retrieval for batch scoring. @@ -105,5 +109,7 @@ In the model overview UI you can explore the provenance graph of the model: All the `_provenance` methods return a `Link` dictionary object that contains `accessible`, `inaccesible`, `deleted` lists. - `accessible` - contains any artifact from the result, that the user has access to. -- `inaccessible` - contains any artifacts that might have been shared at some point in the past, but where this sharing was retracted. Since the relation between artifacts is still maintained in the provenance, the user will only have access to limited metadata and the artifacts will be included in this `inaccessible` list. -- `deleted` - contains artifacts that are deleted with children stil present in the system. There is minimum amount of metadata for the deleted allowing for some limited human readable identification. +- `inaccessible` - contains any artifacts that might have been shared at some point in the past, but where this sharing was retracted. + Since the relation between artifacts is still maintained in the provenance, the user will only have access to limited metadata and the artifacts will be included in this `inaccessible` list. +- `deleted` - contains artifacts that are deleted with children stil present in the system. + There is minimum amount of metadata for the deleted allowing for some limited human readable identification. diff --git a/docs/user_guides/mlops/registry/frameworks/llm.md b/docs/user_guides/mlops/registry/frameworks/llm.md index ee9e29a05..9d7edf162 100644 --- a/docs/user_guides/mlops/registry/frameworks/llm.md +++ b/docs/user_guides/mlops/registry/frameworks/llm.md @@ -24,7 +24,8 @@ In this guide you will learn how to export a [Large Language Model (LLM)](https: ### Step 2: Download the LLM -Download your base or fine-tuned LLM. LLMs can typically be downloaded using the official frameworks provided by their creators (e.g., HuggingFace, Ollama, ...) +Download your base or fine-tuned LLM. +LLMs can typically be downloaded using the official frameworks provided by their creators (e.g., HuggingFace, Ollama, ...) 
=== "Python" ```python @@ -39,7 +40,8 @@ Download your base or fine-tuned LLM. LLMs can typically be downloaded using the ### Step 3: (Optional) Fine-tune LLM -If necessary, fine-tune your LLM with an [instruction set](https://www.hopsworks.ai/dictionary/instruction-datasets-for-fine-tuning-llms). A LLM can be fine-tuned fully or using [Parameter Efficient Fine Tuning (PEFT)](https://www.hopsworks.ai/dictionary/parameter-efficient-fine-tuning-of-llms) methods such as LoRA or QLoRA. +If necessary, fine-tune your LLM with an [instruction set](https://www.hopsworks.ai/dictionary/instruction-datasets-for-fine-tuning-llms). +A LLM can be fine-tuned fully or using [Parameter Efficient Fine Tuning (PEFT)](https://www.hopsworks.ai/dictionary/parameter-efficient-fine-tuning-of-llms) methods such as LoRA or QLoRA. === "Python" ```python @@ -49,7 +51,8 @@ If necessary, fine-tune your LLM with an [instruction set](https://www.hopsworks ### Step 4: Register model in registry -Use the `ModelRegistry.llm.create_model(..)` function to register a model as LLM. Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. +Use the `ModelRegistry.llm.create_model(..)` function to register a model as LLM. +Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. === "Python" ```python diff --git a/docs/user_guides/mlops/registry/frameworks/python.md b/docs/user_guides/mlops/registry/frameworks/python.md index 8a7544aa9..a7e6ba89d 100644 --- a/docs/user_guides/mlops/registry/frameworks/python.md +++ b/docs/user_guides/mlops/registry/frameworks/python.md @@ -48,7 +48,8 @@ Export the XGBoost model to a directory on the local filesystem. ### Step 4: Register model in registry -Use the `ModelRegistry.python.create_model(..)` function to register a model as a Python model. Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. +Use the `ModelRegistry.python.create_model(..)` function to register a model as a Python model. +Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. === "Python" ```python diff --git a/docs/user_guides/mlops/registry/frameworks/skl.md b/docs/user_guides/mlops/registry/frameworks/skl.md index 81d8254bf..7969a3050 100644 --- a/docs/user_guides/mlops/registry/frameworks/skl.md +++ b/docs/user_guides/mlops/registry/frameworks/skl.md @@ -47,7 +47,8 @@ Export the Scikit-learn model to a directory on the local filesystem. ### Step 4: Register model in registry -Use the `ModelRegistry.sklearn.create_model(..)` function to register a model as a Scikit-learn model. Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. +Use the `ModelRegistry.sklearn.create_model(..)` function to register a model as a Scikit-learn model. +Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. 
=== "Python" ```python diff --git a/docs/user_guides/mlops/registry/frameworks/tch.md b/docs/user_guides/mlops/registry/frameworks/tch.md index 040cf61be..7e0a2aa52 100644 --- a/docs/user_guides/mlops/registry/frameworks/tch.md +++ b/docs/user_guides/mlops/registry/frameworks/tch.md @@ -8,7 +8,6 @@ description: Documentation on how to export a Pytorch model to the model registr In this guide you will learn how to export a Torch model and register it in the Model Registry. - ## Code ### Step 1: Connect to Hopsworks @@ -62,7 +61,8 @@ Export the Torch model to a directory on the local filesystem. ### Step 4: Register model in registry -Use the `ModelRegistry.torch.create_model(..)` function to register a model as a Torch model. Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. +Use the `ModelRegistry.torch.create_model(..)` function to register a model as a Torch model. +Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. === "Python" ```python diff --git a/docs/user_guides/mlops/registry/frameworks/tf.md b/docs/user_guides/mlops/registry/frameworks/tf.md index 64a5e7225..5de153544 100644 --- a/docs/user_guides/mlops/registry/frameworks/tf.md +++ b/docs/user_guides/mlops/registry/frameworks/tf.md @@ -11,7 +11,6 @@ In this guide you will learn how to export a TensorFlow model and register it in !!! notice "Save in SavedModel format" Make sure the model is saved in the [SavedModel](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) format to be able to deploy it on TensorFlow Serving. - ## Code ### Step 1: Connect to Hopsworks @@ -40,7 +39,7 @@ Define your TensorFlow model and run the training loop. # Compile the model. model.compile(..) - + # Train the model model.fit(..) ``` @@ -58,7 +57,8 @@ Export the TensorFlow model to a directory on the local filesystem. ### Step 4: Register model in registry -Use the `ModelRegistry.tensorflow.create_model(..)` function to register a model as a TensorFlow model. Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. +Use the `ModelRegistry.tensorflow.create_model(..)` function to register a model as a TensorFlow model. +Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. === "Python" ```python diff --git a/docs/user_guides/mlops/registry/index.md b/docs/user_guides/mlops/registry/index.md index ddc2adab6..ffc781c9c 100644 --- a/docs/user_guides/mlops/registry/index.md +++ b/docs/user_guides/mlops/registry/index.md @@ -1,30 +1,31 @@ # Model Registry Guides -**Hopsworks Model Registry** is a centralized repository, within an organization, to manage machine learning models. A model is the product of training a machine learning algorithm with training data. +**Hopsworks Model Registry** is a centralized repository, within an organization, to manage machine learning models. +A model is the product of training a machine learning algorithm with training data. 
This section provides guides for creating models and publish them to the Model Registry to make them available for download for batch predictions, or deployed to serve realtime applications. - ## Exporting a model Follow these framework-specific guides to export a Model to the Model Registry. -* [TensorFlow](frameworks/tf.md) - -* [Torch](frameworks/tch.md) +- [TensorFlow](frameworks/tf.md) -* [Scikit-learn](frameworks/skl.md) +- [Torch](frameworks/tch.md) -* [LLM](frameworks/llm.md) +- [Scikit-learn](frameworks/skl.md) -* [Other Python frameworks](frameworks/python.md) +- [LLM](frameworks/llm.md) +- [Other Python frameworks](frameworks/python.md) ## Model Schema -A [Model schema](model_schema.md) describes the input and outputs for a model. It provides a functional description of the model which makes it simpler to get started working with it. For example if the model inputs a tensor, the model schema can define the shape and data type of the tensor. - +A [Model schema](model_schema.md) describes the input and outputs for a model. +It provides a functional description of the model which makes it simpler to get started working with it. +For example if the model inputs a tensor, the model schema can define the shape and data type of the tensor. ## Input Example -An [Input example](input_example.md) provides an instance of a valid model input. Input examples are stored with the model as separate artifacts. +An [Input example](input_example.md) provides an instance of a valid model input. +Input examples are stored with the model as separate artifacts. diff --git a/docs/user_guides/mlops/registry/input_example.md b/docs/user_guides/mlops/registry/input_example.md index 18bf05e3a..86908443f 100644 --- a/docs/user_guides/mlops/registry/input_example.md +++ b/docs/user_guides/mlops/registry/input_example.md @@ -6,7 +6,9 @@ description: Documentation on how to attach an input example to a model. ## Introduction -In this guide you will learn how to attach an input example to a model. An input example is simply an instance of a valid model input. Attaching an input example to your model will give other users a better understanding of what data it expects. +In this guide you will learn how to attach an input example to a model. +An input example is simply an instance of a valid model input. +Attaching an input example to your model will give other users a better understanding of what data it expects. ## Code @@ -24,7 +26,8 @@ In this guide you will learn how to attach an input example to a model. An input ### Step 2: Generate an input example -Generate an input example which corresponds to a valid input to your model. Currently we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list` to be passed as input example. +Generate an input example which corresponds to a valid input to your model. +Currently we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list` to be passed as input example. === "Python" ```python diff --git a/docs/user_guides/mlops/registry/model_evaluation_images.md b/docs/user_guides/mlops/registry/model_evaluation_images.md index 3577d4ca5..cf166bb15 100644 --- a/docs/user_guides/mlops/registry/model_evaluation_images.md +++ b/docs/user_guides/mlops/registry/model_evaluation_images.md @@ -6,7 +6,10 @@ description: Documentation on how to attach model evaluation images to a model. ## Introduction -In this guide, you will learn how to attach ==model evaluation images== to a model. Model evaluation images are images that visually describe model performance metrics. 
For example, **confusion matrices**, **ROC curves**, **model bias tests**, and **training loss curves** are examples of common model evaluation images. By attaching model evaluation images to your versioned model, other users can better understand the model performance and evaluation metrics. +In this guide, you will learn how to attach ==model evaluation images== to a model. +Model evaluation images are images that visually describe model performance metrics. +For example, **confusion matrices**, **ROC curves**, **model bias tests**, and **training loss curves** are examples of common model evaluation images. +By attaching model evaluation images to your versioned model, other users can better understand the model performance and evaluation metrics. ## Code @@ -42,7 +45,7 @@ Generate an image that visualizes model performance and evaluation metrics # Create a DataFrame for the confusion matrix results df_confusion_matrix = pd.DataFrame( - results, + results, ['True Normal', 'True Fraud'], ['Pred Normal', 'Pred Fraud'], ) diff --git a/docs/user_guides/mlops/registry/model_schema.md b/docs/user_guides/mlops/registry/model_schema.md index dfa2effa6..e7c8fe5c5 100644 --- a/docs/user_guides/mlops/registry/model_schema.md +++ b/docs/user_guides/mlops/registry/model_schema.md @@ -6,7 +6,9 @@ description: Documentation on how to attach a model schema to a model. ## Introduction -In this guide you will learn how to attach a model schema to your model. A model schema, describes the type and shape of inputs and outputs (predictions) for your model. Attaching a model schema to your model will give other users a better understanding of what data it expects. +In this guide you will learn how to attach a model schema to your model. +A model schema, describes the type and shape of inputs and outputs (predictions) for your model. +Attaching a model schema to your model will give other users a better understanding of what data it expects. ## Code @@ -24,7 +26,8 @@ In this guide you will learn how to attach a model schema to your model. A model ### Step 2: Create ModelSchema -Create a ModelSchema for your inputs and outputs by passing in an example that your model is trained on and a valid prediction. Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`. +Create a ModelSchema for your inputs and outputs by passing in an example that your model is trained on and a valid prediction. +Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`. === "Python" ```python diff --git a/docs/user_guides/mlops/serving/api-protocol.md b/docs/user_guides/mlops/serving/api-protocol.md index 684b84ad1..cea116f26 100644 --- a/docs/user_guides/mlops/serving/api-protocol.md +++ b/docs/user_guides/mlops/serving/api-protocol.md @@ -2,10 +2,12 @@ ## Introduction -Hopsworks supports both REST and gRPC as API protocols for sending inference requests to model deployments. While REST API protocol is supported in all types of model deployments, support for gRPC is only available for models served with [KServe](predictor.md#serving-tool). +Hopsworks supports both REST and gRPC as API protocols for sending inference requests to model deployments. +While REST API protocol is supported in all types of model deployments, support for gRPC is only available for models served with [KServe](predictor.md#serving-tool). !!! warning - At the moment, the gRPC API protocol is only supported for **Python model deployments** (e.g., scikit-learn, xgboost). 
Support for Tensorflow model deployments is coming soon. + At the moment, the gRPC API protocol is only supported for **Python model deployments** (e.g., scikit-learn, xgboost). + Support for Tensorflow model deployments is coming soon. ## GUI @@ -20,11 +22,14 @@ If you have at least one model already trained and saved in the Model Registry,

-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form. +Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner. +Both options will open the deployment creation form. ### Step 2: Go to advanced options -A simplified creation form will appear including the most common deployment fields from all available configurations. Resource allocation is part of the advanced options of a deployment. To navigate to the advanced creation form, click on `Advanced options`. +A simplified creation form will appear including the most common deployment fields from all available configurations. +The API protocol is part of the advanced options of a deployment. +To navigate to the advanced creation form, click on `Advanced options`.

@@ -35,7 +40,8 @@ A simplified creation form will appear including the most common deployment fiel ### Step 3: Select the API protocol -Enabling gRPC as the API protocol for a model deployment requires KServe as the serving platform for the deployment. Make sure that KServe is enabled by activating the corresponding checkbox. +Enabling gRPC as the API protocol for a model deployment requires KServe as the serving platform for the deployment. +Make sure that KServe is enabled by activating the corresponding checkbox.
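The programmatic counterpart of this step is sketched below; it complements the Code section later in this guide. The `serving_tool="KSERVE"` argument is an assumption about the hsml predictor API, while the remaining calls follow the examples shown in this diff.

```python
import hopsworks

project = hopsworks.login()
mr = project.get_model_registry()
ms = project.get_model_serving()

my_model = mr.get_model("my_model", version=1)

# gRPC requires KServe as the serving platform; the serving_tool value is assumed
my_predictor = ms.create_predictor(
    my_model,
    serving_tool="KSERVE",
    api_protocol="GRPC",  # defaults to "REST"
)
my_predictor.deploy()
```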

@@ -54,7 +60,9 @@ Then, you can select the API protocol to be enabled in your model deployment.

!!! info "Only one API protocol can be enabled in a model deployment (they cannot support both gRPC and REST)" - Currently, KServe model deployments are limited to one API protocol at a time. Therefore, only one of REST or gRPC API protocols can be enabled at the same time on the same model deployment. You cannot change the API protocol of existing deployments. + Currently, KServe model deployments are limited to one API protocol at a time. + Therefore, only one of REST or gRPC API protocols can be enabled at the same time on the same model deployment. + You cannot change the API protocol of existing deployments. Once you are done with the changes, click on `Create new deployment` at the bottom of the page to create the deployment for your model. @@ -63,6 +71,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -78,12 +87,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 2: Create a deployment with a specific API protocol === "Python" + ```python my_model = mr.get_model("my_model", version=1) - my_predictor = ms.create_predictor(my_model, - api_protocol="GRPC" # defaults to "REST" - ) + my_predictor = ms.create_predictor( + my_model, + api_protocol="GRPC" # defaults to "REST" + ) my_predictor.deploy() # or @@ -94,4 +105,4 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### API Reference -[API Protocol](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/deployment_api/#api_protocol) +[API Protocol][hsml.deployment.Deployment.api_protocol] diff --git a/docs/user_guides/mlops/serving/deployment-state.md b/docs/user_guides/mlops/serving/deployment-state.md index 35f037670..d73068398 100644 --- a/docs/user_guides/mlops/serving/deployment-state.md +++ b/docs/user_guides/mlops/serving/deployment-state.md @@ -4,7 +4,8 @@ In this guide, you will learn how to inspect the state of a deployment. -A state can be seen as a snapshot of the current inner workings of a deployment. The following is the state transition diagram for deployments. +A state can be seen as a snapshot of the current inner workings of a deployment. +The following is the state transition diagram for deployments.

@@ -13,7 +14,8 @@ A state can be seen as a snapshot of the current inner workings of a deployment.

-States are composed of a [status](#deployment-status) and a [condition](#deployment-conditions). While a status represents a high-level view of the state, conditions contain more detailed information closely related to infrastructure terms. +States are composed of a [status](#deployment-status) and a [condition](#deployment-conditions). +While a status represents a high-level view of the state, conditions contain more detailed information closely related to infrastructure terms. ## GUI @@ -28,13 +30,17 @@ If you have at least one deployment already created, navigate to the deployments

-Once in the deployments page, find the deployment you want to inspect. Next to the actions buttons, you can find an indicator showing the current status of the deployment. This indicator changes its color based on the status. +Once in the deployments page, find the deployment you want to inspect. +Next to the actions buttons, you can find an indicator showing the current status of the deployment. +This indicator changes its color based on the status. To inspect the condition of the deployment, click on the name of the deployment to open the deployment overview page. ### Step 2: Inspect condition -Once in the deployment overview page, you can find the aforementioned status indicator at the top of page. Below it, a one-line message is shown with a more detailed description of the deployment status. This message is built using the current [condition](#deployment-conditions) of the deployment. +Once in the deployment overview page, you can find the aforementioned status indicator at the top of page. +Below it, a one-line message is shown with a more detailed description of the deployment status. +This message is built using the current [condition](#deployment-conditions) of the deployment.

@@ -55,13 +61,15 @@ Additionally, you can find the nº of instances currently running by scrolling d

!!! info "Scale-to-zero capabilities" - If scale-to-zero capabilities are enabled, you can see how the nº of instances of a running deployment goes to zero and the status changes to `idle`. To enable scale-to-zero in a deployment, see [Resource Allocation Guide](resources.md) + If scale-to-zero capabilities are enabled, you can see how the nº of instances of a running deployment goes to zero and the status changes to `idle`. + To enable scale-to-zero in a deployment, see [Resource Allocation Guide](resources.md) ## Code ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -74,6 +82,7 @@ Additionally, you can find the nº of instances currently running by scrolling d ### Step 2: Retrieve an existing deployment === "Python" + ```python deployment = ms.get_deployment("mydeployment") ``` @@ -81,6 +90,7 @@ Additionally, you can find the nº of instances currently running by scrolling d ### Step 3: Inspect deployment state === "Python" + ```python state = deployment.get_state() @@ -90,6 +100,7 @@ Additionally, you can find the nº of instances currently running by scrolling d ### Step 4: Check nº of running instances === "Python" + ```python # nº of predictor instances deployment.resources.describe() @@ -100,9 +111,9 @@ Additionally, you can find the nº of instances currently running by scrolling d ### API Reference -[Deployment](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/deployment_api/) +[`Deployment`][hsml.deployment.Deployment] -[PredictorState](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/predictor_api/) +[`PredictorState`][hsml.predictor_state.PredictorState] ## Deployment status @@ -123,9 +134,12 @@ The status of a deployment is a high-level description of its current state. ## Deployment conditions -A condition contains more specific information about the status of the deployment. They are mainly useful to track the progress of starting or stopping deployments. +A condition contains more specific information about the status of the deployment. +They are mainly useful to track the progress of starting or stopping deployments. -Status conditions contain three pieces of information: type, status and reason. While the type describes the purpose of the condition, the status represents its progress. Additionally, a reason field is provided with a more descriptive message of the status. +Status conditions contain three pieces of information: type, status and reason. +While the type describes the purpose of the condition, the status represents its progress. +Additionally, a reason field is provided with a more descriptive message of the status. ??? info "Show deployment conditions" @@ -146,7 +160,6 @@ Status conditions contain three pieces of information: type, status and reason. | | `False` | Connectivity failed to be set up, mainly due to networking issues. | | | `True` | Connectivity has been set up and the deployment is ready | - The following are two diagrams with the state transitions of conditions in starting and stopping deployments, respectively.

diff --git a/docs/user_guides/mlops/serving/deployment.md b/docs/user_guides/mlops/serving/deployment.md index 4459dc08b..488989143 100644 --- a/docs/user_guides/mlops/serving/deployment.md +++ b/docs/user_guides/mlops/serving/deployment.md @@ -9,9 +9,11 @@ description: Documentation on how to deployment Machine Learning (ML) models and In this guide, you will learn how to create a new deployment for a trained model. !!! warning - This guide assumes that a model has already been trained and saved into the Model Registry. To learn how to create a model in the Model Registry, see [Model Registry Guide](../registry/index.md#exporting-a-model) + This guide assumes that a model has already been trained and saved into the Model Registry. + To learn how to create a model in the Model Registry, see [Model Registry Guide](../registry/index.md#exporting-a-model) -Deployments are used to unify the different components involved in making one or more trained models online and accessible to compute predictions on demand. For each deployment, there are four concepts to consider: +Deployments are used to unify the different components involved in making one or more trained models online and accessible to compute predictions on demand. +For each deployment, there are four concepts to consider: !!! info "" 1. [Model files](#model-files) @@ -32,15 +34,20 @@ If you have at least one model already trained and saved in the Model Registry,

-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form. +Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner. +Both options will open the deployment creation form. ### Step 2: Basic deployment configuration -A simplified creation form will appear including the most common deployment fields from all available configurations. We provide default values for the rest of the fields, adjusted to the type of deployment you want to create. +A simplified creation form will appear including the most common deployment fields from all available configurations. +We provide default values for the rest of the fields, adjusted to the type of deployment you want to create. -In the simplified form, select the model framework used to train your model. Then, select the model you want to deploy from the list of available models under `pick a model`. +In the simplified form, select the model framework used to train your model. +Then, select the model you want to deploy from the list of available models under `pick a model`. -After selecting the model, the rest of fields are filled automatically. We pick the last model version and model artifact version available in the Model Registry. Moreover, we infer the deployment name from the model name. +After selecting the model, the rest of the fields are filled automatically. +We pick the last model version and model artifact version available in the Model Registry. +Moreover, we infer the deployment name from the model name. !!! notice "Deployment name validation rules" A valid deployment name can only contain characters a-z, A-Z and 0-9. @@ -48,7 +55,8 @@ After selecting the model, the rest of fields are filled automatically. We pick !!! info "Predictor script for Python models" For Python models, you must select a custom [predictor script](#predictor) that loads and runs the trained model by clicking on `From project` or `Upload new file`, to choose an existing script in the project file system or upload a new script, respectively. -If you prefer, change the name of the deployment, model version or [artifact version](#model-artifact). Then, click on `Create new deployment` to create the deployment for your model. +If you prefer, change the name of the deployment, model version or [artifact version](#artifact-files). +Then, click on `Create new deployment` to create the deployment for your model.
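The deployment name rule in the notice above can be checked client-side before submitting the form; the snippet below only mirrors the rule as stated, and the server performs its own validation.

```python
import re


def is_valid_deployment_name(name: str) -> bool:
    # Only the characters a-z, A-Z and 0-9 are allowed
    # (no spaces, dashes, underscores or dots)
    return re.fullmatch(r"[A-Za-z0-9]+", name) is not None


print(is_valid_deployment_name("fraudmodel1"))    # True
print(is_valid_deployment_name("fraud_model-1"))  # False
```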

@@ -75,7 +83,8 @@ Optionally, you can access and adjust other parameters of the deployment configu

-You will be redirected to a full-page deployment creation form where you can see all the default configuration values we selected for your deployment and adjust them according to your use case. Apart from the aforementioned simplified configuration, in this form you can setup the following components: +You will be redirected to a full-page deployment creation form where you can see all the default configuration values we selected for your deployment and adjust them according to your use case. +Apart from the aforementioned simplified configuration, in this form you can setup the following components: !!! info "Deployment advanced options" 1. [Predictor](#predictor) @@ -89,7 +98,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 4: (Kueue enabled) Select a Queue -If the cluster is installed with Kueue enabled, you will need to select a queue in which the deployment should run. This can be done from `Advance configuration -> Scheduler section`. +If the cluster is installed with Kueue enabled, you will need to select a queue in which the deployment should run. +This can be done from `Advance configuration -> Scheduler section`. ![Default queue for job](../../../assets/images/guides/project/scheduler/job_queue.png) @@ -106,7 +116,8 @@ Wait for the deployment creation process to finish. ### Step 6: Deployment overview -Once the deployment is created, you will be redirected to the list of all your existing deployments in the project. You can use the filters on the top of the page to easily locate your new deployment. +Once the deployment is created, you will be redirected to the list of all your existing deployments in the project. +You can use the filters on the top of the page to easily locate your new deployment.
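If you prefer to locate the new deployment programmatically instead of using the filters, something like the sketch below should work; `get_deployments()` is an assumption about the hsml `ModelServing` API, while `get_deployment(name)` is the call used elsewhere in these guides.

```python
import hopsworks

project = hopsworks.login()
ms = project.get_model_serving()

# List all deployments in the project (method name assumed)
for deployment in ms.get_deployments():
    print(deployment.name, deployment.get_state().status)

# Or fetch a specific deployment by name
my_deployment = ms.get_deployment("mymodel")
```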

@@ -129,6 +140,7 @@ After that, click on the new deployment to access the overview page. ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -143,6 +155,7 @@ After that, click on the new deployment to access the overview page. Retrieve the trained model you want to deploy. === "Python" + ```python my_model = mr.get_model("my_model", version=1) ``` @@ -150,6 +163,7 @@ Retrieve the trained model you want to deploy. #### Option A: Using the model object === "Python" + ```python my_deployment = my_model.deploy() ``` @@ -157,6 +171,7 @@ Retrieve the trained model you want to deploy. #### Option B: Using the Model Serving handle === "Python" + ```python # get Hopsworks Model Serving handle ms = project.get_model_serving() @@ -171,44 +186,57 @@ Retrieve the trained model you want to deploy. ### API Reference -[Model Serving](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/model_serving_api/) +[`ModelServing`][hsml.model_serving.ModelServing] ## Model Files -Model files are the files exported when a specific version of a model is saved to the model registry (see [Model Registry](../registry/index.md)). These files are ==unique for each model version, but shared across model deployments== created for the same version of the model. +Model files are the files exported when a specific version of a model is saved to the model registry (see [Model Registry](../registry/index.md)). +These files are ==unique for each model version, but shared across model deployments== created for the same version of the model. -Inside a model deployment, the local path to the model files is stored in the `MODEL_FILES_PATH` environment variable (see [environment variables](../serving/predictor.md#environment-variables)). Moreover, you can explore the model files under the `/Models///Files` directory using the File Browser. +Inside a model deployment, the local path to the model files is stored in the `MODEL_FILES_PATH` environment variable (see [environment variables](../serving/predictor.md#environment-variables)). +Moreover, you can explore the model files under the `/Models///Files` directory using the File Browser. !!! warning - All files under `/Models` are managed by Hopsworks. Changes to model files cannot be reverted and can have an impact on existing model deployments. + All files under `/Models` are managed by Hopsworks. + Changes to model files cannot be reverted and can have an impact on existing model deployments. ## Artifact Files -Artifact files are files involved in the correct startup and running of the model deployment. The most important files are the **predictor** and **transformer scripts**. The former is used to load and run the model for making predictions. The latter is typically used to apply transformations on the model inputs at inference time before making predictions. Predictor and transformer scripts run on separate components and, therefore, scale independently of each other. +Artifact files are files involved in the correct startup and running of the model deployment. +The most important files are the **predictor** and **transformer scripts**. +The former is used to load and run the model for making predictions. +The latter is typically used to apply transformations on the model inputs at inference time before making predictions. +Predictor and transformer scripts run on separate components and, therefore, scale independently of each other. !!! 
tip Whenever you provide a predictor script, you can include the transformations of model inputs in the same script as far as they don't need to be scaled independently from the model inference process. -Additionally, artifact files can also contain a **server configuration file** that helps detach configuration used within the model deployment from the model server or the implementation of the predictor and transformer scripts. Inside a model deployment, the local path to the configuration file is stored in the `CONFIG_FILE_PATH` environment variable (see [environment variables](../serving/predictor.md#environment-variables)). +Additionally, artifact files can also contain a **server configuration file** that helps detach configuration used within the model deployment from the model server or the implementation of the predictor and transformer scripts. +Inside a model deployment, the local path to the configuration file is stored in the `CONFIG_FILE_PATH` environment variable (see [environment variables](../serving/predictor.md#environment-variables)). -Every model deployment runs a specific version of the artifact files, commonly referred to as artifact version. ==One or more model deployments can use the same artifact version== (i.e., same predictor and transformer scripts). Artifact versions are unique for the same model version. +Every model deployment runs a specific version of the artifact files, commonly referred to as artifact version. ==One or more model deployments can use the same artifact version== (i.e., same predictor and transformer scripts). +Artifact versions are unique for the same model version. When a new deployment is created, a new artifact version is generated in two cases: -- the artifact version in the predictor is set to `CREATE` (see [Artifact Version](../predictor/#artifact_version)) +- the artifact version in the predictor is set to `CREATE` (see [Artifact Version](./predictor.md#environment-variables)) - no model artifact with the same files has been created before. -Inside a model deployment, the local path to the artifact files is stored in the `ARTIFACT_FILES_PATH` environment variable (see [environment variables](../serving/predictor.md#environment-variables)). Moreover, you can explore the artifact files under the `/Models///Artifacts/` directory using the File Browser. +Inside a model deployment, the local path to the artifact files is stored in the `ARTIFACT_FILES_PATH` environment variable (see [environment variables](../serving/predictor.md#environment-variables)). +Moreover, you can explore the artifact files under the `/Models///Artifacts/` directory using the File Browser. !!! warning - All files under `/Models` are managed by Hopsworks. Changes to artifact files cannot be reverted and can have an impact on existing model deployments. + All files under `/Models` are managed by Hopsworks. + Changes to artifact files cannot be reverted and can have an impact on existing model deployments. !!! tip "Additional files" - Currently, the artifact files can only include predictor and transformer scripts, and a configuration file. Support for additional files (e.g., other resources) is coming soon. + Currently, the artifact files can only include predictor and transformer scripts, and a configuration file. + Support for additional files (e.g., other resources) is coming soon. ## Predictor -Predictors are responsible for running the model server that loads the trained model, listens to inference requests and returns prediction results. 
To learn more about predictors, see the [Predictor Guide](predictor.md) +Predictors are responsible for running the model server that loads the trained model, listens to inference requests and returns prediction results. +To learn more about predictors, see the [Predictor Guide](predictor.md) !!! note Only one predictor is supported in a deployment. @@ -218,7 +246,8 @@ Predictors are responsible for running the model server that loads the trained m ## Transformer -Transformers are used to apply transformations on the model inputs before sending them to the predictor for making predictions using the model. To learn more about transformers, see the [Transformer Guide](transformer.md). +Transformers are used to apply transformations on the model inputs before sending them to the predictor for making predictions using the model. +To learn more about transformers, see the [Transformer Guide](transformer.md). !!! warning Transformers are only supported in KServe deployments. diff --git a/docs/user_guides/mlops/serving/external-access.md b/docs/user_guides/mlops/serving/external-access.md index d32130dea..a02c8c250 100644 --- a/docs/user_guides/mlops/serving/external-access.md +++ b/docs/user_guides/mlops/serving/external-access.md @@ -8,10 +8,12 @@ description: Documentation on how to configure external access to a model deploy Hopsworks supports role-based access control (RBAC) for project members within a project, where a project ML assets can only be accessed by Hopsworks users that are members of that project (See [governance](../../../concepts/projects/governance.md)). -However, there are cases where you might want to grant ==external users== with access to specific model deployments without them having to register into Hopsworks or to join the project which will give them access to all project ML assets. For these cases, Hopsworks supports fine-grained access control to model deployments based on ==user groups== managed by an external Identity Provider. +However, there are cases where you might want to grant ==external users== with access to specific model deployments without them having to register into Hopsworks or to join the project which will give them access to all project ML assets. +For these cases, Hopsworks supports fine-grained access control to model deployments based on ==user groups== managed by an external Identity Provider. !!! info "Authentication methods" - Hopsworks can be configured to use different types of authentication methods including OAuth2, LDAP and Kerberos. See the [Authentication Methods Guide](../../../setup_installation/admin/auth.md) for more information. + Hopsworks can be configured to use different types of authentication methods including OAuth2, LDAP and Kerberos. + See the [Authentication Methods Guide](../../../setup_installation/admin/auth.md) for more information. ## GUI (for Hopsworks users) @@ -48,11 +50,12 @@ You can find the external access configuration by clicking on `External access` ### Step 3: Add or remove user groups -In this section, you can add and remove user groups by clicking on `edit external user groups` and typing the group name in the **text-free** input field or **selecting** one of the existing ones in the dropdown list. After that, click on the `save` button to persist the changes. - +In this section, you can add and remove user groups by clicking on `edit external user groups` and typing the group name in the **text-free** input field or **selecting** one of the existing ones in the dropdown list. 
+After that, click on the `save` button to persist the changes. !!! Warn "Case sensitivity" - Inference requests are authorized using a ==case-sensitive exact match== between the group names of the user making the request and the group names granted access to the model deployment. Therefore, a user assigned to the group `lab1` won't have access to a model deployment accessible by group `LAB1`. + Inference requests are authorized using a ==case-sensitive exact match== between the group names of the user making the request and the group names granted access to the model deployment. + Therefore, a user assigned to the group `lab1` won't have access to a model deployment accessible by group `LAB1`.
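For reference, an inference request from an external client (the flow described in the steps that follow) might look like the sketch below. The ingress host, `Host` header value, request path and API key are placeholders to be copied from the model deployment card, and the payload format depends on the model server.

```python
import requests

# Placeholders: copy the real values from the model deployment card
INGRESS_URL = "https://hopsworks.example.com"        # Hopsworks ingress
DEPLOYMENT_HOST = "mymodel.myproject.example.com"    # Host header shown on the card
SERVING_API_KEY = "<serving_api_key>"                # API key created in the steps below
URI = "/predict"  # e.g. /chat/completions for LLM deployments

response = requests.post(
    f"{INGRESS_URL}{URI}",
    headers={
        "Authorization": f"ApiKey {SERVING_API_KEY}",  # format described below
        "Host": DEPLOYMENT_HOST,                       # used by the ingress for routing
    },
    json={"instances": [[1.0, 2.0, 3.0]]},  # payload format depends on the model server
)
print(response.status_code, response.json())
```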

@@ -98,7 +101,8 @@ You can find the current groups you are assigned to at the top of the page. ### Step 3: Get an API key -Inference requests to model deployments are authenticated and authorized based on your external user and user groups. You can create API keys to authenticate your inference requests by clicking on the `Create API Key` button. +Inference requests to model deployments are authenticated and authorized based on your external user and user groups. +You can create API keys to authenticate your inference requests by clicking on the `Create API Key` button. !!! info "Authorization header" API keys are set in the `Authorization` header following the format `ApiKey ` @@ -112,9 +116,12 @@ Inference requests to model deployments are authenticated and authorized based o ### Step 4: Send inference requests -Depending on the type of model deployment, the URI of the model server can differ (e.g., `/chat/completions` for LLM deployments or `/predict` for traditional model deployments). You can find the corresponding URI on every model deployment card. +Depending on the type of model deployment, the URI of the model server can differ (e.g., `/chat/completions` for LLM deployments or `/predict` for traditional model deployments). +You can find the corresponding URI on every model deployment card. -In addition to the `Authorization` header containing the API key, the `Host` header needs to be set according to the model deployment where the inference requests are sent to. This header is used by the ingress to route the inference requests to the corresponding model deployment. You can find the `Host` header value in the model deployment card. +In addition to the `Authorization` header containing the API key, the `Host` header needs to be set according to the model deployment where the inference requests are sent to. +This header is used by the ingress to route the inference requests to the corresponding model deployment. +You can find the `Host` header value in the model deployment card. !!! tip "Code snippets" For clients sending inference requests using libraries similar to curl or OpenAI API-compatible libraries (e.g., LangChain), you can find code snippet examples by clicking on the `Curl >_` and `LangChain >_` buttons. @@ -128,11 +135,13 @@ In addition to the `Authorization` header containing the API key, the `Host` hea ## Refreshing External User Groups -Every time an external user signs in to Hopsworks using a pre-configured [authentication method](../../../setup_installation/admin/auth.md), Hopsworks fetches the external user groups and updates the internal state accordingly. Given that groups can be added/removed from users at any time by the Identity Provider, Hopsworks needs to periodically fetch the external user groups to keep the state updated. +Every time an external user signs in to Hopsworks using a pre-configured [authentication method](../../../setup_installation/admin/auth.md), Hopsworks fetches the external user groups and updates the internal state accordingly. +Given that groups can be added/removed from users at any time by the Identity Provider, Hopsworks needs to periodically fetch the external user groups to keep the state updated. -Therefore, external users that want to access model deployments are **required to login periodically** to ensure they are still part of the allowed groups. The timespan between logins is controlled by the configuration parameter `requireExternalUserLoginAfterHours` available during the Hopsworks installation and upgrade. 
+Therefore, external users that want to access model deployments are **required to login periodically** to ensure they are still part of the allowed groups. +The timespan between logins is controlled by the configuration parameter `requireExternalUserLoginAfterHours` available during the Hopsworks installation and upgrade. -The `requireExternalUserLoginAfterHours` configuration parameter controls the ==number of hours== after which external users are required to sign in to Hopsworks to refresh their external user groups. +The `requireExternalUserLoginAfterHours` configuration parameter controls the ==number of hours== after which external users are required to sign in to Hopsworks to refresh their external user groups. !!! info "Configuring `requireExternalUserLoginAfterHours`" Allowed values are -1, 0 and greater than 0, where -1 disables the periodic login requirement and 0 disables external access completely for every model deployment. diff --git a/docs/user_guides/mlops/serving/index.md b/docs/user_guides/mlops/serving/index.md index 3f1ce0e92..bc248e01f 100644 --- a/docs/user_guides/mlops/serving/index.md +++ b/docs/user_guides/mlops/serving/index.md @@ -2,7 +2,8 @@ ## Deployment -Assuming you have already created a model in the [Model Registry](../registry/index.md), a deployment can now be created to prepare a model artifact for this model and make it accessible for running predictions behind a REST or gRPC endpoint. Follow the [Deployment Creation Guide](deployment.md) to create a Deployment for your model. +Assuming you have already created a model in the [Model Registry](../registry/index.md), a deployment can now be created to prepare a model artifact for this model and make it accessible for running predictions behind a REST or gRPC endpoint. +Follow the [Deployment Creation Guide](deployment.md) to create a Deployment for your model. ### Predictor @@ -34,4 +35,4 @@ Inspect the model server logs to troubleshoot your model deployments, see the [T ### External access -Grant users authenticated by an external Identity Provider access to model deployments, see the [External Access Guide](external-access.md). \ No newline at end of file +Grant users authenticated by an external Identity Provider access to model deployments, see the [External Access Guide](external-access.md). diff --git a/docs/user_guides/mlops/serving/inference-batcher.md b/docs/user_guides/mlops/serving/inference-batcher.md index 665078d80..8dc94ed58 100644 --- a/docs/user_guides/mlops/serving/inference-batcher.md +++ b/docs/user_guides/mlops/serving/inference-batcher.md @@ -2,7 +2,9 @@ ## Introduction -Inference batching can be enabled to increase inference request throughput at the cost of higher latencies. The configuration of the inference batcher depends on the serving tool and the model server used in the deployment. See the [compatibility matrix](#compatibility-matrix). +Inference batching can be enabled to increase inference request throughput at the cost of higher latencies. +The configuration of the inference batcher depends on the serving tool and the model server used in the deployment. +See the [compatibility matrix](#compatibility-matrix). ## GUI @@ -17,11 +19,14 @@ If you have at least one model already trained and saved in the Model Registry,

-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form. +Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner. +Both options will open the deployment creation form. ### Step 2: Go to advanced options -A simplified creation form will appear including the most common deployment fields from all available configurations. Inference batching is part of the advanced options of a deployment. To navigate to the advanced creation form, click on `Advanced options`. +A simplified creation form will appear including the most common deployment fields from all available configurations. +Inference batching is part of the advanced options of a deployment. +To navigate to the advanced creation form, click on `Advanced options`.

@@ -50,6 +55,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -65,6 +71,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 2: Define an inference logger === "Python" + ```python from hsml.inference_batcher import InferenceBatcher @@ -79,6 +86,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 3: Create a deployment with the inference batcher === "Python" + ```python my_model = mr.get_model("my_model", version=1) @@ -96,7 +104,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### API Reference -[Inference Batcher](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/inference_batcher_api/) +[`InferenceBatcher`][hsml.inference_batcher.InferenceBatcher] ## Compatibility matrix diff --git a/docs/user_guides/mlops/serving/inference-logger.md b/docs/user_guides/mlops/serving/inference-logger.md index 809da1867..47f8ebaea 100644 --- a/docs/user_guides/mlops/serving/inference-logger.md +++ b/docs/user_guides/mlops/serving/inference-logger.md @@ -21,11 +21,14 @@ If you have at least one model already trained and saved in the Model Registry,

-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form. +Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner. +Both options will open the deployment creation form. ### Step 2: Go to advanced options -A simplified creation form will appear including the most common deployment fields from all available configurations. Inference logging is part of the advanced options of a deployment. To navigate to the advanced creation form, click on `Advanced options`. +A simplified creation form will appear including the most common deployment fields from all available configurations. +Inference logging is part of the advanced options of a deployment. +To navigate to the advanced creation form, click on `Advanced options`.

@@ -36,7 +39,8 @@ A simplified creation form will appear including the most common deployment fiel ### Step 3: Configure inference logging -To enable inference logging, choose `CREATE` as Kafka topic name to create a new topic, or select an existing topic. If you prefer, you can disable inference logging by selecting `NONE`. +To enable inference logging, choose `CREATE` as Kafka topic name to create a new topic, or select an existing topic. +If you prefer, you can disable inference logging by selecting `NONE`. If you decide to create a new topic, select the number of partitions and number of replicas for your topic, or use the default values. @@ -56,6 +60,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -71,6 +76,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 2: Define an inference logger === "Python" + ```python from hsml.inference_logger import InferenceLogger @@ -96,6 +102,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 3: Create a deployment with the inference logger === "Python" + ```python my_model = mr.get_model("my_model", version=1) @@ -112,11 +119,13 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### API Reference -[Inference Logger](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/inference_logger_api/) +[`InferenceLogger`][hsml.inference_logger.InferenceLogger] ## Topic schema -The schema of Kafka events varies depending on the serving tool. In KServe deployments, model inputs and predictions are logged in separate events, but sharing the same `requestId` field. In non-KServe deployments, the same event contains both the model input and prediction related to the same inference request. +The schema of Kafka events varies depending on the serving tool. +In KServe deployments, model inputs and predictions are logged in separate events, but sharing the same `requestId` field. +In non-KServe deployments, the same event contains both the model input and prediction related to the same inference request. ??? example "Show kafka topic schemas" diff --git a/docs/user_guides/mlops/serving/predictor.md b/docs/user_guides/mlops/serving/predictor.md index 1e5bd65b2..885590c70 100644 --- a/docs/user_guides/mlops/serving/predictor.md +++ b/docs/user_guides/mlops/serving/predictor.md @@ -9,9 +9,13 @@ description: Documentation on how to configure a predictor for a model deploymen In this guide, you will learn how to configure a predictor for a trained model. !!! warning - This guide assumes that a model has already been trained and saved into the Model Registry. To learn how to create a model in the Model Registry, see [Model Registry Guide](../registry/frameworks/tf.md) + This guide assumes that a model has already been trained and saved into the Model Registry. + To learn how to create a model in the Model Registry, see [Model Registry Guide](../registry/frameworks/tf.md) -Predictors are the main component of deployments. They are responsible for running a model server that loads a trained model, handles inference requests and returns predictions. They can be configured to use different model servers, serving tools, log specific inference data or scale differently. In each predictor, you can configure the following components: +Predictors are the main component of deployments. 
+They are responsible for running a model server that loads a trained model, handles inference requests and returns predictions. +They can be configured to use different model servers, serving tools, log specific inference data or scale differently. +In each predictor, you can configure the following components: !!! info "" 1. [Model server](#model-server) @@ -38,11 +42,14 @@ If you have at least one model already trained and saved in the Model Registry,

-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form. +Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner. +Both options will open the deployment creation form. ### Step 2: Choose a backend -A simplified creation form will appear, including the most common deployment fields from all available configurations. The first step is to choose a ==backend== for your model deployment. The backend will filter the models shown below according to the framework that the model was registered with in the model registry. +A simplified creation form will appear, including the most common deployment fields from all available configurations. +The first step is to choose a ==backend== for your model deployment. +The backend will filter the models shown below according to the framework that the model was registered with in the model registry. For example if you registered the model as a TensorFlow model using `ModelRegistry.tensorflow.create_model(...)` you select `Tensorflow Serving` in the dropdown. @@ -62,11 +69,12 @@ All models compatible with the selected backend will be listed in the model drop

-Moreover, you can optionally select a predictor script (see [Step 3](#step-3-optional-select-a-predictor-script)), enable KServe (see [Step 4](#step-4-optional-enable-kserve)) or change other advanced configuration (see [Step 5](#step-5-optional-other-advanced-options)). Otherwise, click on `Create new deployment` to create the deployment for your model. +Moreover, you can optionally select a predictor script (see [Step 3 (Optional): Select a predictor script](#step-3-optional-select-a-predictor-script)), enable KServe (see [Step 4 (Optional): Enable KServe](#step-6-optional-enable-kserve)) or change other advanced configuration (see [Step 5 (Optional): Other advanced options](#step-7-optional-other-advanced-options)). +Otherwise, click on `Create new deployment` to create the deployment for your model. ### Step 3 (Optional): Select a predictor script -For python models, if you want to use your own [predictor script](#step-2-optional-implement-predictor-script) click on `From project` and navigate through the file system to find it, or click on `Upload new file` to upload a predictor script now. +For python models, if you want to use your own [predictor script](#step-2-optional-implement-a-predictor-script) click on `From project` and navigate through the file system to find it, or click on `Upload new file` to upload a predictor script now.
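For orientation, a minimal predictor script for a scikit-learn style model might look like the sketch below. The exact class/method template Hopsworks expects is the one shown in Step 2 of the Code section further down; the class name, model file name and use of joblib here are illustrative assumptions.

```python
import os

import joblib  # assumes the model was exported with joblib


class Predict(object):
    def __init__(self):
        # Model files are mounted at the path stored in MODEL_FILES_PATH
        model_dir = os.environ["MODEL_FILES_PATH"]
        self.model = joblib.load(os.path.join(model_dir, "model.pkl"))

    def predict(self, inputs):
        # Return predictions for a batch of inputs
        return self.model.predict(inputs).tolist()
```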

@@ -77,7 +85,8 @@ For python models, if you want to use your own [predictor script](#step-2-option ### Step 4 (Optional): Change predictor environment -If you are using a predictor script it is also required to configure the inference environment for the predictor. This environment needs to have all the necessary dependencies installed to run your predictor script. +If you are using a predictor script it is also required to configure the inference environment for the predictor. +This environment needs to have all the necessary dependencies installed to run your predictor script. By default, we provide a set of environments like `tensorflow-inference-pipeline`, `torch-inference-pipeline` and `pandas-inference-pipeline` that serves this purpose for common machine learning frameworks. @@ -90,13 +99,15 @@ To create your own it is recommended to [clone](../../projects/python/python_env

- ### Step 5 (Optional): Select a configuration file !!! note Only available for LLM deployments. -You can select a configuration file to be added to the [artifact files](deployment.md#artifact-files). If a predictor script is provided, this configuration file will be available inside the model deployment at the local path stored in the `CONFIG_FILE_PATH` environment variable. If a predictor script is **not** provided, this configuration file will be directly passed to the vLLM server. You can find all configuration parameters supported by the vLLM server in the [vLLM documentation](https://docs.vllm.ai/en/v0.7.1/serving/openai_compatible_server.html). +You can select a configuration file to be added to the [artifact files](deployment.md#artifact-files). +If a predictor script is provided, this configuration file will be available inside the model deployment at the local path stored in the `CONFIG_FILE_PATH` environment variable. +If a predictor script is **not** provided, this configuration file will be directly passed to the vLLM server. +You can find all configuration parameters supported by the vLLM server in the [vLLM documentation](https://docs.vllm.ai/en/v0.7.1/serving/openai_compatible_server.html).
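Inside a predictor script, the selected configuration file could be read roughly as follows; this assumes a YAML configuration and the PyYAML package, and relies only on the `CONFIG_FILE_PATH` environment variable described above.

```python
import os

import yaml  # PyYAML, assumed to be available in the inference environment

config = {}
config_path = os.environ.get("CONFIG_FILE_PATH")
if config_path and os.path.exists(config_path):
    with open(config_path) as f:
        config = yaml.safe_load(f) or {}

# Example (hypothetical) key; the available keys depend on your configuration file
chat_template = config.get("chat_template")
```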

@@ -107,7 +118,8 @@ You can select a configuration file to be added to the [artifact files](deployme ### Step 6 (Optional): Enable KServe -Other configuration such as the serving tool, is part of the advanced options of a deployment. To navigate to the advanced creation form, click on `Advanced options`. +Other configuration such as the serving tool, is part of the advanced options of a deployment. +To navigate to the advanced creation form, click on `Advanced options`.

@@ -143,6 +155,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -184,14 +197,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott """ Asynchronously serve predictions using the trained model""" # Preform async operations that required # result = await some_async_preprocessing(inputs) - + # Use the model to make predictions # return self.model.predict(result) ``` === "Predictor (vLLM deployments only)" ``` python import os - from vllm import __version__, AsyncEngineArgs, AsyncLLMEngine + from vllm import **version**, AsyncEngineArgs, AsyncLLMEngine from typing import Iterable, AsyncIterator, Union, Optional from kserve.protocol.rest.openai import ( CompletionRequest, @@ -201,15 +214,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott from kserve.protocol.rest.openai.types import Completion from kserve.protocol.rest.openai.types.openapi import ChatCompletionTool - class Predictor(): def __init__(self): """ Initialization code goes here""" - + # (optional) if any, access the configuration file via os.environ["CONFIG_FILE_PATH"] config = ... - + print("Starting vLLM backend...") engine_args = AsyncEngineArgs( model=os.environ["MODEL_FILES_PATH"], @@ -239,7 +251,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott # self, request: CompletionRequest #) -> Union[Completion, AsyncIterator[Completion]]: # """Generate responses using the vLLM engine""" - # + # # generators = self.vllm_engine.generate(...) # # # Completion: used for returning a single answer (batch) @@ -252,9 +264,10 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 3 (Optional): Upload the script to your project -!!! info "You can also use the UI to upload your predictor script. See [above](#step-3-advanced-deployment-form)" +!!! info "You can also use the UI to upload your predictor script. See [above](#step-3-optional-select-a-predictor-script)" === "Python" + ```python uploaded_file_path = dataset_api.upload("my_predictor.py", "Resources", overwrite=True) predictor_script_path = os.path.join("/Projects", project.name, uploaded_file_path) @@ -263,6 +276,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 4: Define predictor === "Python" + ```python my_model = mr.get_model("my_model", version=1) @@ -277,6 +291,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 5: Create a deployment with the predictor === "Python" + ```python my_deployment = my_predictor.deploy() @@ -287,11 +302,12 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### API Reference -[Predictor](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/predictor_api/) +[`Predictor`][hsml.predictor.Predictor] ## Model Server -Hopsworks Model Serving supports deploying models with a Flask server for python-based models, TensorFlow Serving for TensorFlow / Keras models and vLLM for Large Language Models (LLMs). Today, you can deploy PyTorch models as python-based models. +Hopsworks Model Serving supports deploying models with a Flask server for python-based models, TensorFlow Serving for TensorFlow / Keras models and vLLM for Large Language Models (LLMs). +Today, you can deploy PyTorch models as python-based models. ??? 
info "Show supported model servers" @@ -304,7 +320,8 @@ Hopsworks Model Serving supports deploying models with a Flask server for python ## Serving tool -In Hopsworks, model servers are deployed on Kubernetes. There are two options for deploying models on Kubernetes: using [KServe](https://kserve.github.io/website/latest/) inference services or Kubernetes built-in deployments. ==KServe is the recommended way to deploy models in Hopsworks==. +In Hopsworks, model servers are deployed on Kubernetes. +There are two options for deploying models on Kubernetes: using [KServe](https://kserve.github.io/website/latest/) inference services or Kubernetes built-in deployments. ==KServe is the recommended way to deploy models in Hopsworks==. The following is a comparative table showing the features supported by each of them. @@ -327,7 +344,8 @@ The following is a comparative table showing the features supported by each of t Depending on the model server and serving platform used in the model deployment, you can (or need) to provide your own python script to load the model and make predictions. This script is referred to as **predictor script**, and is included in the [artifact files](../serving/deployment.md#artifact-files) of the model deployment. -The predictor script needs to implement a given template depending on the model server of the model deployment. See the templates in [Step 2](#step-2-optional-implement-a-predictor-script). +The predictor script needs to implement a given template depending on the model server of the model deployment. +See the templates in [Step 2](#step-2-optional-implement-a-predictor-script). ??? info "Show supported user-provided predictors" @@ -341,13 +359,17 @@ The predictor script needs to implement a given template depending on the model ### Server configuration file -Depending on the model server, a **server configuration file** can be selected to help detach configuration used within the model deployment from the model server or the implementation of the predictor and transformer scripts. In other words, by modifying the configuration file of an existing model deployment you can adjust its settings without making changes to the predictor or transformer scripts. Inside a model deployment, the local path to the configuration file is stored in the `CONFIG_FILE_PATH` environment variable (see [environment variables](#environment-variables)). +Depending on the model server, a **server configuration file** can be selected to help detach configuration used within the model deployment from the model server or the implementation of the predictor and transformer scripts. +In other words, by modifying the configuration file of an existing model deployment you can adjust its settings without making changes to the predictor or transformer scripts. +Inside a model deployment, the local path to the configuration file is stored in the `CONFIG_FILE_PATH` environment variable (see [environment variables](#environment-variables)). !!! warning "Configuration file format" The configuration file can be of any format, except in vLLM deployments **without a predictor script** for which a YAML file is ==required==. !!! note "Passing arguments to vLLM via configuration file" - For vLLM deployments **without a predictor script**, the server configuration file is ==required== and it is used to configure the vLLM server. For example, you can use this configuration file to specify the chat template or LoRA modules to be loaded by the vLLM server. 
See all available parameters in the [official documentation](https://docs.vllm.ai/en/v0.7.1/serving/openai_compatible_server.html#command-line-arguments-for-the-server). + For vLLM deployments **without a predictor script**, the server configuration file is ==required== and it is used to configure the vLLM server. + For example, you can use this configuration file to specify the chat template or LoRA modules to be loaded by the vLLM server. + See all available parameters in the [official documentation](https://docs.vllm.ai/en/v0.7.1/serving/openai_compatible_server.html#command-line-arguments-for-the-server). ### Environment variables @@ -367,7 +389,8 @@ A number of different environment variables is available in the predictor to eas ## Python environments -Depending on the model server and serving tool used in the model deployment, you can select the Python environment where the predictor and transformer scripts will run. To create a new Python environment see [Python Environments](../../projects/python/python_env_overview.md). +Depending on the model server and serving tool used in the model deployment, you can select the Python environment where the predictor and transformer scripts will run. +To create a new Python environment see [Python Environments](../../projects/python/python_env_overview.md). ??? info "Show supported Python environments" @@ -379,28 +402,35 @@ Depending on the model server and serving tool used in the model deployment, you | | TensorFlow Serving | ✅ | (official) tensorflow serving image | any `inference-pipeline` image | | | vLLM | ✅ | `vllm-inference-pipeline` or `vllm-openai` | any `inference-pipeline` image | -!!! note - The selected Python environment is used for both predictor and transformer. Support for selecting a different Python environment for the predictor and transformer is coming soon. +!!! note + The selected Python environment is used for both predictor and transformer. + Support for selecting a different Python environment for the predictor and transformer is coming soon. ## Transformer -Transformers are used to apply transformations on the model inputs before sending them to the predictor for making predictions using the model. To learn more about transformers, see the [Transformer Guide](transformer.md). +Transformers are used to apply transformations on the model inputs before sending them to the predictor for making predictions using the model. +To learn more about transformers, see the [Transformer Guide](transformer.md). !!! note Transformers are only supported in KServe deployments. ## Inference logger -Inference loggers are deployment components that log inference requests into a Kafka topic for later analysis. To learn about the different logging modes, see the [Inference Logger Guide](inference-logger.md) +Inference loggers are deployment components that log inference requests into a Kafka topic for later analysis. +To learn about the different logging modes, see the [Inference Logger Guide](inference-logger.md) ## Inference batcher -Inference batcher are deployment component that apply batching to the incoming inference requests for a better throughput-latency trade-off. To learn about the different configuration available for the inference batcher, see the [Inference Batcher Guide](inference-batcher.md). +Inference batcher are deployment component that apply batching to the incoming inference requests for a better throughput-latency trade-off. 
+To learn about the different configuration available for the inference batcher, see the [Inference Batcher Guide](inference-batcher.md). ## Resources -Resources include the number of replicas for the deployment as well as the resources (i.e., memory, CPU, GPU) to be allocated per replica. To learn about the different combinations available, see the [Resources Guide](resources.md). +Resources include the number of replicas for the deployment as well as the resources (i.e., memory, CPU, GPU) to be allocated per replica. +To learn about the different combinations available, see the [Resources Guide](resources.md). ## API protocol -Hopsworks supports both REST and gRPC as the API protocols to send inference requests to model deployments. In general, you use gRPC when you need lower latency inference requests. To learn more about the REST and gRPC API protocols for model deployments, see the [API Protocol Guide](api-protocol.md). +Hopsworks supports both REST and gRPC as the API protocols to send inference requests to model deployments. +In general, you use gRPC when you need lower latency inference requests. +To learn more about the REST and gRPC API protocols for model deployments, see the [API Protocol Guide](api-protocol.md). diff --git a/docs/user_guides/mlops/serving/resources.md b/docs/user_guides/mlops/serving/resources.md index 27a22ba02..32d99adba 100644 --- a/docs/user_guides/mlops/serving/resources.md +++ b/docs/user_guides/mlops/serving/resources.md @@ -6,7 +6,9 @@ description: Documentation on how to allocate resources to a model deployment ## Introduction -Depending on the serving tool used to deploy a trained model, resource allocation can be configured at different levels. While deployments on Docker containers only support a fixed number of resources (CPU and memory), using Kubernetes or KServe allows a better exploitation of the resources available in the platform, by enabling you to specify how many CPUs, GPUs, and memory are allocated to a deployment. See the [compatibility matrix](#compatibility-matrix). +Depending on the serving tool used to deploy a trained model, resource allocation can be configured at different levels. +While deployments on Docker containers only support a fixed number of resources (CPU and memory), using Kubernetes or KServe allows a better exploitation of the resources available in the platform, by enabling you to specify how many CPUs, GPUs, and memory are allocated to a deployment. +See the [compatibility matrix](#compatibility-matrix). ## GUI @@ -21,11 +23,14 @@ If you have at least one model already trained and saved in the Model Registry,

-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form.
+Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner.
+Both options will open the deployment creation form.

### Step 2: Go to advanced options

-A simplified creation form will appear including the most common deployment fields from all available configurations. Resource allocation is part of the advanced options of a deployment. To navigate to the advanced creation form, click on `Advanced options`.
+A simplified creation form will appear including the most common deployment fields from all available configurations.
+Resource allocation is part of the advanced options of a deployment.
+To navigate to the advanced creation form, click on `Advanced options`.

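For readers who prefer code over the form, the same allocation can be expressed with the `hsml.resources` classes that the Code section of this page imports further down. The following is a minimal sketch; the constructor arguments (`num_instances`, `requests`, `limits` and the core/memory/GPU fields) are assumptions and may differ slightly between Hopsworks versions.

```python
from hsml.resources import PredictorResources, Resources

# Assumed fields per replica: CPU cores, memory in MB and number of GPUs.
minimum_resources = Resources(cores=0.5, memory=1024, gpus=0)
maximum_resources = Resources(cores=2, memory=2048, gpus=0)

# One predictor replica; on KServe, 0 instances would enable scale-to-zero.
predictor_resources = PredictorResources(
    num_instances=1,
    requests=minimum_resources,
    limits=maximum_resources,
)
```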
@@ -36,7 +41,8 @@ A simplified creation form will appear including the most common deployment fiel ### Step 3: Configure resource allocation -In the `Resource allocation` section of the form, you can optionally set the resources to be allocated to the predictor and/or the transformer (if available). Moreover, you can choose the minimum number of replicas for each of these components. +In the `Resource allocation` section of the form, you can optionally set the resources to be allocated to the predictor and/or the transformer (if available). +Moreover, you can choose the minimum number of replicas for each of these components. ??? note "Scale-to-zero capabilities" Deployments with KServe enabled can scale to zero by choosing `0` as the number of instances. @@ -55,6 +61,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -70,6 +77,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 2: Define the predictor resource configuration === "Python" + ```python from hsml.resources import PredictorResources, Resources @@ -82,6 +90,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 3 (Optional): Define the transformer resource configuration === "Python" + ```python from hsml.resources import TransformerResources @@ -94,6 +103,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 4: Create a deployment with the resource configuration === "Python" + ```python my_model = mr.get_model("my_model", version=1) @@ -112,7 +122,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### API Reference -[Resource Allocation](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/resources_api/) +[`Resources`][hsml.resources.Resources] ## Compatibility matrix diff --git a/docs/user_guides/mlops/serving/rest-api.md b/docs/user_guides/mlops/serving/rest-api.md index 93461c9c3..7d3df7456 100644 --- a/docs/user_guides/mlops/serving/rest-api.md +++ b/docs/user_guides/mlops/serving/rest-api.md @@ -2,15 +2,16 @@ ## Introduction -Hopsworks provides model serving capabilities by leveraging [KServe](https://kserve.github.io/website/) as the model serving platform and [Istio](https://istio.io/) as the ingress gateway to the model deployments. +Hopsworks provides model serving capabilities by leveraging [KServe](https://kserve.github.io/website/) as the model serving platform and [Istio](https://istio.io/) as the ingress gateway to the model deployments. This document explains how to interact with a model deployment via REST API. ## Base URL -Deployed models are accessible through the Istio ingress gateway. The URL to interact with a model deployment is provided on the model deployment page in the Hopsworks UI. +Deployed models are accessible through the Istio ingress gateway. +The URL to interact with a model deployment is provided on the model deployment page in the Hopsworks UI. -The URL follows the format `http:///`, where `RESOURCE_PATH` depends on the [model server](https://docs.hopsworks.ai/latest/user_guides/mlops/serving/predictor/#model-server) (e.g. vLLM, TensorFlow Serving, SKLearn ModelServer). +The URL follows the format `http:///`, where `RESOURCE_PATH` depends on the [`Predictor.model_server`][hsml.predictor.Predictor.model_server] (e.g., vLLM, TensorFlow Serving, SKLearn ModelServer).

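Putting the base URL above together with the authentication headers and payload format described next, a hedged sketch of an inference request with the Python `requests` library could look as follows; the gateway address, resource path, host name and API key are placeholders rather than values taken from this guide.

```python
import requests

# Placeholders: copy the real values from the deployment page in the Hopsworks UI.
ISTIO_GATEWAY = "http://<istio_gateway_ip>"   # ingress gateway address
RESOURCE_PATH = "/v1/models/fraud:predict"    # depends on the model server
API_KEY = "<your_api_key>"

response = requests.post(
    ISTIO_GATEWAY + RESOURCE_PATH,
    headers={
        "Host": "fraud.test.hopsworks.ai",    # model hostname shown in the UI
        "Authorization": "ApiKey " + API_KEY,
        "Content-Type": "application/json",
    },
    json={"instances": [[4.7, 3.2, 1.3, 0.2]]},  # or {"inputs": [...]}
)
print(response.json())
```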
@@ -19,12 +20,13 @@ The URL follows the format `http:///`, where `R

- ## Authentication -All requests must include an API Key for authentication. You can create an API by following this [guide](../../projects/api_key/create_api_key.md). +All requests must include an API Key for authentication. +You can create an API by following this [guide](../../projects/api_key/create_api_key.md). Include the key in the Authorization header: + ```text Authorization: ApiKey ``` @@ -33,7 +35,7 @@ Authorization: ApiKey | Header | Description | Example Value | | --------------- | ------------------------------------------- | ------------------------------------ | -| `Host` | Model’s hostname, provided in Hopsworks UI. | `fraud.test.hopsworks.ai` | +| `Host` | Model’s hostname, provided in Hopsworks UI. | `fraud.test.hopsworks.ai` | | `Authorization` | API key for authentication. | `ApiKey ` | | `Content-Type` | Request payload type (always JSON). | `application/json` | @@ -41,7 +43,10 @@ Authorization: ApiKey The request format depends on the model sever being used. -For predictive inference (i.e. for Tensorflow or SkLearn or Python Serving). The request must be sent as a JSON object containing an `inputs` or `instances` field. You can find more information on the request format [here](https://kserve.github.io/website/docs/concepts/architecture/data-plane/v1-protocol#request-format). An example for this is given below. +For predictive inference (i.e., for Tensorflow or SkLearn or Python Serving). +The request must be sent as a JSON object containing an `inputs` or `instances` field. +See [more information on the request format](https://kserve.github.io/website/docs/concepts/architecture/data-plane/v1-protocol#request-format). +An example for this is given below. === "Python" @@ -90,9 +95,10 @@ For predictive inference (i.e. for Tensorflow or SkLearn or Python Serving). The }' ``` -For generative inference (i.e vLLM) the response follows the [OpenAI specification](https://platform.openai.com/docs/api-reference/chat/create). - +For generative inference (i.e vLLM) the response follows the [OpenAI specification](https://platform.openai.com/docs/api-reference/chat/create). ## Response -The model returns predictions in a JSON object. The response depends on the model server implementation. You can find more information regarding specific model servers in the [Kserve documentation](https://kserve.github.io/website/docs/intro). +The model returns predictions in a JSON object. +The response depends on the model server implementation. +You can find more information regarding specific model servers in the [Kserve documentation](https://kserve.github.io/website/docs/intro). diff --git a/docs/user_guides/mlops/serving/transformer.md b/docs/user_guides/mlops/serving/transformer.md index 6d3466932..607734b20 100644 --- a/docs/user_guides/mlops/serving/transformer.md +++ b/docs/user_guides/mlops/serving/transformer.md @@ -8,7 +8,8 @@ description: Documentation on how to configure a KServe transformer for a model In this guide, you will learn how to configure a transformer in a deployment. -Transformers are used to apply transformations on the model inputs before sending them to the predictor for making predictions using the model. They run on a built-in Flask server provided by Hopsworks and require a user-provided python script implementing the [Transformer class](#step-2-implement-transformer-script). +Transformers are used to apply transformations on the model inputs before sending them to the predictor for making predictions using the model. 
+They run on a built-in Flask server provided by Hopsworks and require a user-provided python script implementing the [Transformer class](#step-2-implement-transformer-script). ???+ warning Transformers are only supported in deployments using KServe as serving tool. @@ -34,11 +35,14 @@ If you have at least one model already trained and saved in the Model Registry,


-Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` it the top-right corner. Both options will open the deployment creation form.
+Once in the deployments page, you can create a new deployment by either clicking on `New deployment` (if there are no existing deployments) or on `Create new deployment` in the top-right corner.
+Both options will open the deployment creation form.

### Step 2: Go to advanced options

-A simplified creation form will appear including the most common deployment fields from all available configurations. Transformers are part of the advanced options of a deployment. To navigate to the advanced creation form, click on `Advanced options`.
+A simplified creation form will appear including the most common deployment fields from all available configurations.
+Transformers are part of the advanced options of a deployment.
+To navigate to the advanced creation form, click on `Advanced options`.

@@ -49,7 +53,8 @@ A simplified creation form will appear including the most common deployment fiel ### Step 3: Select a transformer script -Transformers require KServe as the serving platform for the deployment. Make sure that KServe is enabled for this deployment by activating the corresponding checkbox. +Transformers require KServe as the serving platform for the deployment. +Make sure that KServe is enabled for this deployment by activating the corresponding checkbox.

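For orientation, the transformer script selected in this step normally defines a `Transformer` class with hooks that run before and after the predictor is called. The sketch below illustrates the general shape only and is an assumption rather than the exact template bundled with your Hopsworks version.

```python
class Transformer(object):
    def __init__(self):
        """ Initialization code goes here, e.g. loading encoders or feature statistics. """
        pass

    def preprocess(self, inputs):
        """ Transform the request payload before it is sent to the predictor. """
        # inputs holds the JSON body of the inference request, e.g. {"instances": [...]}
        return inputs

    def postprocess(self, outputs):
        """ Transform the model output before the response is returned to the client. """
        return outputs
```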
@@ -68,7 +73,8 @@ Otherwise, you can click on `Upload new file` to upload the transformer script n

-After selecting the transformer script, you can optionally configure resource allocation for your transformer (see [Step 4](#step-4-optional-configure-resource-allocation)). Otherwise, click on `Create new deployment` to create the deployment for your model. +After selecting the transformer script, you can optionally configure resource allocation for your transformer (see [Step 4](#step-4-optional-configure-resource-allocation)). +Otherwise, click on `Create new deployment` to create the deployment for your model. ### Step 4 (Optional): Configure resource allocation @@ -91,6 +97,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -126,9 +133,10 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 3: Upload the script to your project -!!! info "You can also use the UI to upload your transformer script. See [above](#step-3-advanced-deployment-form)" +!!! info "You can also use the UI to upload your transformer script. See [above](#step-3-select-a-transformer-script)" === "Python" + ```python uploaded_file_path = dataset_api.upload("my_transformer.py", "Resources", overwrite=True) transformer_script_path = os.path.join("/Projects", project.name, uploaded_file_path) @@ -137,6 +145,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 4: Define a transformer === "Python" + ```python my_transformer = ms.create_transformer(script_file=uploaded_file_path) @@ -150,6 +159,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 5: Create a deployment with the transformer === "Python" + ```python my_predictor = ms.create_predictor(transformer=my_transformer) my_deployment = my_predictor.deploy() @@ -161,11 +171,12 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### API Reference -[Transformer](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/transformer_api/) +[`Transformer`][hsml.transformer.Transformer] ## Resources -Resources include the number of replicas for the deployment as well as the resources (i.e., memory, CPU, GPU) to be allocated per replica. To learn about the different combinations available, see the [Resources Guide](resources.md). +Resources include the number of replicas for the deployment as well as the resources (i.e., memory, CPU, GPU) to be allocated per replica. +To learn about the different combinations available, see the [Resources Guide](resources.md). ## Environment variables diff --git a/docs/user_guides/mlops/serving/troubleshooting.md b/docs/user_guides/mlops/serving/troubleshooting.md index df18c156c..d18d87c15 100644 --- a/docs/user_guides/mlops/serving/troubleshooting.md +++ b/docs/user_guides/mlops/serving/troubleshooting.md @@ -6,9 +6,11 @@ description: Documentation on how to troubleshoot a model deployment ## Introduction -In this guide, you will learn how to troubleshoot a deployment that is having issues to serve a trained model. But before that, it is important to understand how [deployment states](deployment-state.md) are defined and the possible transitions between conditions. +In this guide, you will learn how to troubleshoot a deployment that is having issues to serve a trained model. +But before that, it is important to understand how [deployment states](deployment-state.md) are defined and the possible transitions between conditions. 
-When a deployment is starting, it follows an ordered sequence of [states](deployment-state.md#deployment-conditions) before becoming ready for serving predictions. Similarly, it follows an ordered sequence of states when being stopped, although with fewer steps. +When a deployment is starting, it follows an ordered sequence of [states](deployment-state.md#deployment-conditions) before becoming ready for serving predictions. +Similarly, it follows an ordered sequence of states when being stopped, although with fewer steps. ## GUI @@ -23,15 +25,20 @@ If you have at least one deployment already created, navigate to the deployments


-Once in the deployments page, find the deployment you want to inspect. Next to the actions buttons, you can find an indicator showing the current status of the deployment. For a more descriptive representation, this indicator changes its color based on the status.
+Once in the deployments page, find the deployment you want to inspect.
+Next to the action buttons, you can find an indicator showing the current status of the deployment.
+For a more descriptive representation, this indicator changes its color based on the status.

To inspect the condition of the deployment, click on the name of the deployment to open the deployment overview page.

### Step 2: Inspect condition

-At the top of page, you can find the same status indicator mentioned in the previous step. Below it, a one-line message is shown with a more detailed description of the deployment status. This message is built using the current status [condition](deployment-state.md#deployment-conditions) of the deployment.
+At the top of the page, you can find the same status indicator mentioned in the previous step.
+Below it, a one-line message is shown with a more detailed description of the deployment status.
+This message is built using the current status [condition](deployment-state.md#deployment-conditions) of the deployment.

-Oftentimes, the status and the one-line description are enough to understand the current state of a deployment. For instance, when the cluster lacks enough allocatable resources to meet the deployment requirements, a meaningful error message will be shown with the root cause.
+Oftentimes, the status and the one-line description are enough to understand the current state of a deployment.
+For instance, when the cluster lacks enough allocatable resources to meet the deployment requirements, a meaningful error message will be shown with the root cause.

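The same status can also be read programmatically with the Python client calls shown in the Code section further down; a short sketch, using a hypothetical deployment name and assuming the returned state object exposes a `status` attribute:

```python
import hopsworks

project = hopsworks.login()
ms = project.get_model_serving()

deployment = ms.get_deployment("mydeployment")  # hypothetical deployment name

state = deployment.get_state()
print(state.status)  # e.g. "Running" or "Failed"
```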
@@ -40,7 +47,9 @@ Oftentimes, the status and the one-line description are enough to understand the


-However, when the deployment fails to start futher details might be needed depending on the source of failure. For example, failures in the initialization or starting steps will show a less relevant message. In those cases, you can explore the deployments logs in search of the cause of the problem.
+However, when the deployment fails to start, further details might be needed depending on the source of failure.
+For example, failures in the initialization or starting steps will show a less relevant message.
+In those cases, you can explore the deployment logs in search of the cause of the problem.

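In that situation, the transient logs described in the next step can also be pulled from the Python client; a minimal sketch, where the deployment name and tail size are placeholders, might be:

```python
import hopsworks

project = hopsworks.login()
deployment = project.get_model_serving().get_deployment("mydeployment")  # placeholder name

# Tail the latest log lines of one component while it is still reachable;
# "predictor" can be swapped for "transformer".
deployment.get_logs(component="predictor", tail=50)
```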
@@ -51,7 +60,9 @@ However, when the deployment fails to start futher details might be needed depen
### Step 3: Explore transient logs

-Each deployment is composed of several components depending on its configuration and the model being served. Transient logs refer to component-specific logs that are directly retrieved from the component itself. Therefore, these logs can only be retrieved as long as the deployment components are reachable.
+Each deployment is composed of several components depending on its configuration and the model being served.
+Transient logs refer to component-specific logs that are directly retrieved from the component itself.
+Therefore, these logs can only be retrieved as long as the deployment components are reachable.

!!! info ""
Transient logs are informative and fast to retrieve, facilitating the troubleshooting of deployment components at a glance

@@ -62,16 +73,20 @@ Transient logs are convenient when access to the most recent logs of a deploymen
When a deployment is in idle state, there are no components running (i.e., scaled to zero) and, thus, no transient logs are available.

!!! note
- In the current version of Hopsworks, transient logs can only be accessed using the Hopsworks Machine Learning Python library. See an example [here](#step-4-explore-transient-logs).
+ In the current version of Hopsworks, transient logs can only be accessed using the Hopsworks Machine Learning Python library.
+ See [an example](#step-4-explore-transient-logs).

### Step 4: Explore historical logs

-Transient logs are continuously collected and stored in OpenSearch, where they become historical logs accessible using the integrated OpenSearch Dashboards. Therefore, historical logs contain the same information than transient logs. However, there might be cases where transient logs could not be collected in time for a specific component and, thus, not included in the historical logs.
+Transient logs are continuously collected and stored in OpenSearch, where they become historical logs accessible using the integrated OpenSearch Dashboards.
+Therefore, historical logs contain the same information as transient logs.
+However, there might be cases where transient logs could not be collected in time for a specific component and, thus, not included in the historical logs.

!!! info ""
Historical logs are persisted transient logs that can be queried, filtered and sorted using OpenSearch Dashboards, facilitating a more sophisticated exploration of past records.

-Historical logs are convenient when a deployment fails occasionally, either at inference time or without a clear reason. In this case, narrowing the inspection of component-specific logs at a concrete point in time and searching for keywords can be helpful.
+Historical logs are convenient when a deployment fails occasionally, either at inference time or without a clear reason.
+In this case, narrowing the inspection of component-specific logs at a concrete point in time and searching for keywords can be helpful.

To access the OpenSearch Dashboards, click on the `See logs` button at the top of the deployment overview page.

@@ -82,7 +97,6 @@ To access the OpenSearch Dashboards, click on the `See logs` button at the top o

- !!! note In case you are not familiar with the interface, you may find the [official documentation](https://opensearch.org/docs/latest/dashboards/index/) useful. @@ -104,6 +118,7 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -116,6 +131,7 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f ### Step 2: Retrieve an existing deployment === "Python" + ```python deployment = ms.get_deployment("mydeployment") ``` @@ -123,6 +139,7 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f ### Step 3: Get current deployment's predictor state === "Python" + ```python state = deployment.get_state() @@ -132,12 +149,13 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f ### Step 4: Explore transient logs === "Python" + ```python deployment.get_logs(component="predictor|transformer", tail=10) ``` ### API Reference -[Deployment](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/deployment_api/) +[`Deployment`][hsml.deployment.Deployment] -[PredictorState](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/model-serving/predictor_state_api/) +[`PredictorState`][hsml.predictor_state.PredictorState] diff --git a/docs/user_guides/projects/airflow/airflow.md b/docs/user_guides/projects/airflow/airflow.md index b4e878c8e..bead5c033 100644 --- a/docs/user_guides/projects/airflow/airflow.md +++ b/docs/user_guides/projects/airflow/airflow.md @@ -1,72 +1,98 @@ --- description: Documentation on how to orchestrate Hopsworks jobs using Apache Airflow ---- +--- # Orchestrate Jobs using Apache Airflow -## Introduction +## Introduction -Hopsworks jobs can be orchestrated using [Apache Airflow](https://airflow.apache.org/). You can define a Airflow DAG (Directed Acyclic Graph) containing the dependencies between Hopsworks jobs. +Hopsworks jobs can be orchestrated using [Apache Airflow](https://airflow.apache.org/). +You can define a Airflow DAG (Directed Acyclic Graph) containing the dependencies between Hopsworks jobs. You can then schedule the DAG to be executed at a specific schedule using a [cron](https://en.wikipedia.org/wiki/Cron) expression. -Airflow DAGs are defined as Python files. Within the Python file, different operators can be used to trigger different actions. Hopsworks provides an operator to execute jobs on Hopsworks and a sensor to wait for a specific job to finish. +Airflow DAGs are defined as Python files. +Within the Python file, different operators can be used to trigger different actions. +Hopsworks provides an operator to execute jobs on Hopsworks and a sensor to wait for a specific job to finish. -### Use Apache Airflow in Hopsworks +### Use Apache Airflow in Hopsworks -Hopsworks deployments include a deployment of Apache Airflow. You can access it from the Hopsworks UI by clicking on the _Airflow_ button on the left menu. +Hopsworks deployments include a deployment of Apache Airflow. +You can access it from the Hopsworks UI by clicking on the _Airflow_ button on the left menu. -Airfow is configured to enforce Role Based Access Control (RBAC) to the Airflow DAGs. Admin users on Hopsworks have access to all the DAGs in the deployment. Regular users can access all the DAGs of the projects they are a member of. +Airfow is configured to enforce Role Based Access Control (RBAC) to the Airflow DAGs. 
+Admin users on Hopsworks have access to all the DAGs in the deployment.
+Regular users can access all the DAGs of the projects they are a member of.

!!! note "Access Control"
- Airflow does not have any knowledge of the Hopsworks project you are currently working on. As such, when opening the Airflow UI, you will see all the DAGs all of the projects you are a member of.
+ Airflow does not have any knowledge of the Hopsworks project you are currently working on.
+ As such, when opening the Airflow UI, you will see all the DAGs of all the projects you are a member of.

-#### Hopsworks DAG Builder
+#### Hopsworks DAG Builder
Airflow DAG Builder
Airflow DAG Builder
-You can create a new Airflow DAG to orchestrate jobs using the Hopsworks DAG builder tool. Click on _New Workflow_ to create a new Airflow DAG. You should provide a name for the DAG as well as a schedule interval. You can define the schedule using the dropdown menus or by providing a cron expression. +You can create a new Airflow DAG to orchestrate jobs using the Hopsworks DAG builder tool. +Click on _New Workflow_ to create a new Airflow DAG. +You should provide a name for the DAG as well as a schedule interval. +You can define the schedule using the dropdown menus or by providing a cron expression. You can add to the DAG Hopsworks operators and sensors: -- **Operator**: The operator is used to trigger a job execution. When configuring the operator you select the job you want to execute and you can optionally provide execution arguments. You can decide whether or not the operator should wait for the execution to be completed. If you select the _wait_ option, the operator will block and Airflow will not execute any parallel task. If you select the _wait_ option the Airflow task fails if the job fails. If you want to execute tasks in parallel, you should not select the _wait_ option but instead use the sensor. When configuring the operator, you can can also provide which other Airflow tasks it depends on. If you add a dependency, the task will be executed only after the upstream tasks have been executed successfully. +- **Operator**: The operator is used to trigger a job execution. +When configuring the operator you select the job you want to execute and you can optionally provide execution arguments. +You can decide whether or not the operator should wait for the execution to be completed. +If you select the _wait_ option, the operator will block and Airflow will not execute any parallel task. +If you select the _wait_ option the Airflow task fails if the job fails. +If you want to execute tasks in parallel, you should not select the _wait_ option but instead use the sensor. +When configuring the operator, you can can also provide which other Airflow tasks it depends on. +If you add a dependency, the task will be executed only after the upstream tasks have been executed successfully. -- **Sensor**: The sensor can be used to wait for executions to be completed. Similarly to the _wait_ option of the operator, the sensor blocks until the job execution is completed. The sensor can be used to launch several jobs in parallel and wait for their execution to be completed. Please note that the sensor is defined at the job level rather than the execution level. The sensor will wait for the most recent execution to be completed and it will fail the Airflow task if the execution was not successful. +- **Sensor**: The sensor can be used to wait for executions to be completed. +Similarly to the _wait_ option of the operator, the sensor blocks until the job execution is completed. +The sensor can be used to launch several jobs in parallel and wait for their execution to be completed. +Please note that the sensor is defined at the job level rather than the execution level. +The sensor will wait for the most recent execution to be completed and it will fail the Airflow task if the execution was not successful. -You can then create the DAG and Hopsworks will generate the Python file. +You can then create the DAG and Hopsworks will generate the Python file. -#### Write your own DAG +#### Write your own DAG -If you prefer to code the DAGs or you want to edit a DAG built with the builder tool, you can do so. 
The Airflow DAGs are stored in the _Airflow_ dataset which you can access using the file browser in the project settings. +If you prefer to code the DAGs or you want to edit a DAG built with the builder tool, you can do so. +The Airflow DAGs are stored in the _Airflow_ dataset which you can access using the file browser in the project settings. When writing the code for the DAG you can invoke the operator as follows: ```python HopsworksLaunchOperator(dag=dag, - task_id="profiles_fg_0", - project_name="airflow_doc", - job_name="profiles_fg", - job_arguments="", - wait_for_completion=True) + task_id="profiles_fg_0", + project_name="airflow_doc", + job_name="profiles_fg", + job_arguments="", + wait_for_completion=True) ``` -You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`, `job_arguments`). You can set the `wait_for_completion` flag to `True` if you want the operator to block and wait for the job execution to be finished. +You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`, `job_arguments`). +You can set the `wait_for_completion` flag to `True` if you want the operator to block and wait for the job execution to be finished. -Similarly, you can invoke the sensor as shown below. You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`) +Similarly, you can invoke the sensor as shown below. +You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`) ```python HopsworksJobSuccessSensor(dag=dag, task_id='wait_for_profiles_fg', - project_name="airflow_doc", + project_name="airflow_doc", job_name='profiles_fg') ``` -When writing the DAG file, you should also add the `access_control` parameter to the DAG configuration. The `access_control` parameter specifies which projects have access to the DAG and which actions the project members can perform on it. If you do not specify the `access_control` option, project members will not be able to see the DAG in the Airflow UI. +When writing the DAG file, you should also add the `access_control` parameter to the DAG configuration. +The `access_control` parameter specifies which projects have access to the DAG and which actions the project members can perform on it. +If you do not specify the `access_control` option, project members will not be able to see the DAG in the Airflow UI. !!! warning "Admin access" - The `access_control` configuration does not apply to Hopsworks admin users which have full access to all the DAGs even if they are not member of the project. + The `access_control` configuration does not apply to Hopsworks admin users which have full access to all the DAGs even if they are not member of the project. ```python dag = DAG( @@ -85,5 +111,6 @@ When writing the DAG file, you should also add the `access_control` parameter to #### Manage Airflow DAGs using Git -You can leverage the [Git integration](../git/clone_repo.md) to track your Airflow DAGs in a git repository. Airflow will only consider the DAG files which are stored in the _Airflow_ Dataset in Hopsworks. -After cloning the git repository in Hopsworks, you can automate the process of copying the DAG file in the _Airflow_ Dataset using the [copy method](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/datasets/#copy) of the Hopsworks API. 
\ No newline at end of file +You can leverage the [Git integration](../git/clone_repo.md) to track your Airflow DAGs in a git repository. +Airflow will only consider the DAG files which are stored in the _Airflow_ Dataset in Hopsworks. +After cloning the git repository in Hopsworks, you can automate the process of copying the DAG file in the _Airflow_ Dataset using [`DatasetApi.copy`][hopsworks_common.core.dataset_api.DatasetApi.copy] of the Hopsworks API. diff --git a/docs/user_guides/projects/api_key/create_api_key.md b/docs/user_guides/projects/api_key/create_api_key.md index 9a3654ad4..eb2e5b6d2 100644 --- a/docs/user_guides/projects/api_key/create_api_key.md +++ b/docs/user_guides/projects/api_key/create_api_key.md @@ -2,7 +2,8 @@ ## Introduction -An API key allows a user or a program to make API calls without having to authenticate with a username and password. To access an endpoint using an API key, a client should send the access token using the ApiKey authentication scheme. +An API key allows a user or a program to make API calls without having to authenticate with a username and password. +To access an endpoint using an API key, a client should send the access token using the ApiKey authentication scheme. The API Key can now be used when connecting to your Hopsworks instance using the `hopsworks`, `hsfs` or `hsml` python library or set in the `ApiKey` header for the REST API. @@ -11,7 +12,9 @@ GET /resource HTTP/1.1 Host: server.hopsworks.ai Authorization: ApiKey ``` + ## UI + In this guide, you will learn how to create an API key. ### Step 1: Navigate to API Keys @@ -27,7 +30,7 @@ In the _Account Settings_ page you can find the _API_ section showing a list of ### Step 2: Create an API Key -Click `New Api key`, select the required scopes and create it by clicking `Create Api Key`. +Click `New Api key`, select the required scopes and create it by clicking `Create Api Key`. Copy the value and save it in a secure location, such as a password manager. @@ -40,4 +43,5 @@ Copy the value and save it in a secure location, such as a password manager. ## Login with API Key using SDK -In this guide you learned how to create an API Key. You can now use the API Key to [login](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/login/) using the `hopsworks` python SDK. \ No newline at end of file +In this guide you learned how to create an API Key. +You can now use the API Key to [login][hopsworks.login] using the `hopsworks` python SDK. diff --git a/docs/user_guides/projects/auth/krb.md b/docs/user_guides/projects/auth/krb.md index 7498bce19..a937faf4a 100644 --- a/docs/user_guides/projects/auth/krb.md +++ b/docs/user_guides/projects/auth/krb.md @@ -1,30 +1,35 @@ # Login using Kerberos ## Introduction -Hopsworks supports different methods of authentication. Here we will look at authentication using Kerberos. + +Hopsworks supports different methods of authentication. +Here we will look at authentication using Kerberos. ## Prerequisites -A Hopsworks cluster with Kerberos authentication. -See [Configure Kerberos](../../../../setup_installation/admin/ldap/configure-krb) on how to configure Kerberos on your cluster. +A Hopsworks cluster with Kerberos authentication. +See [Configure Kerberos](../../../setup_installation/admin/ldap/configure-krb.md) on how to configure Kerberos on your cluster. ### Step 1: Log in with Kerberos -If Kerberos is configured you will see a _Log in using_ alternative on the login page. Choose Kerberos and click on -**Go to Hopsworks** to login. 
+ +If Kerberos is configured you will see a _Log in using_ alternative on the login page. +Choose Kerberos and click on +**Go to Hopsworks** to login.
Log in using Kerberos
Log in using Kerberos
-If password login is disabled you only see the _Log in using Kerberos/SSO_ alternative. Click on +If password login is disabled you only see the _Log in using Kerberos/SSO_ alternative. +Click on **Go to Hopsworks** to login.
Kerberos only
Kerberos only authentication
-To be able to authenticate with Kerberos you need to configure your browser to use Kerberos. +To be able to authenticate with Kerberos you need to configure your browser to use Kerberos. Note that without a properly configured browser, the Kerberos token is not sent to the server and so SSO will not work. If Kerberos is not configured properly you will see **Wrong credentials** message when trying to log in. @@ -34,11 +39,14 @@ If Kerberos is not configured properly you will see **Wrong credentials** messag
### Step 2: Give consent + When logging in with Kerberos for the first time Hopsworks will retrieve and save consented claims (firstname, lastname -and email), about the logged in end-user. If you have multiple email addresses registered in Kerberos you can choose +and email), about the logged in end-user. +If you have multiple email addresses registered in Kerberos you can choose one to use with Hopsworks. -If you do not want your information to be saved in Hopsworks you can click **Cancel**. This will redirect you back +If you do not want your information to be saved in Hopsworks you can click **Cancel**. +This will redirect you back to the login page.
@@ -52,5 +60,6 @@ After clicking on **Register** you will be redirected to the landing page:
Landing page
-In the landing page, you will find two buttons. Use these buttons to either create a -_demo project_ or [a new project](../../../projects/project/create_project). +In the landing page, you will find two buttons. +Use these buttons to either create a +_demo project_ or [a new project](../../projects/project/create_project.md). diff --git a/docs/user_guides/projects/auth/ldap.md b/docs/user_guides/projects/auth/ldap.md index aedf1bf23..721666767 100644 --- a/docs/user_guides/projects/auth/ldap.md +++ b/docs/user_guides/projects/auth/ldap.md @@ -1,14 +1,19 @@ # Login using LDAP ## Introduction -Hopsworks supports different methods of authentication. Here we will look at authentication using LDAP. + +Hopsworks supports different methods of authentication. +Here we will look at authentication using LDAP. ## Prerequisites -A Hopsworks cluster with LDAP authentication. -See [Configure LDAP](../../../../setup_installation/admin/ldap/configure-ldap) on how to configure LDAP on your cluster. + +A Hopsworks cluster with LDAP authentication. +See [Configure LDAP](../../../setup_installation/admin/ldap/configure-ldap.md) on how to configure LDAP on your cluster. ### Step 1: Log in with LDAP -If LDAP is configured you will see a _Log in using_ alternative on the login page. Choose LDAP and type in your + +If LDAP is configured you will see a _Log in using_ alternative on the login page. +Choose LDAP and type in your _username_ and _password_ then click on **Login**. Note that you need to use your LDAP credentials. @@ -18,11 +23,14 @@ Note that you need to use your LDAP credentials.
### Step 2: Give consent + When logging in with LDAP for the first time Hopsworks will retrieve and save consented claims (firstname, lastname -and email), about the logged in end-user. If you have multiple email addresses registered in LDAP you can choose one to -use with Hopsworks. +and email), about the logged in end-user. +If you have multiple email addresses registered in LDAP you can choose one to +use with Hopsworks. -If you do not want your information to be saved in Hopsworks you can click **Cancel**. This will redirect you back +If you do not want your information to be saved in Hopsworks you can click **Cancel**. +This will redirect you back to the login page.
@@ -36,5 +44,6 @@ After clicking on **Register** you will be redirected to the landing page:
Landing page
-In the landing page, you will find two buttons. Use these buttons to either create a -_demo project_ or [a new project](../../../projects/project/create_project). +In the landing page, you will find two buttons. +Use these buttons to either create a +_demo project_ or [a new project](../../projects/project/create_project.md). diff --git a/docs/user_guides/projects/auth/login.md b/docs/user_guides/projects/auth/login.md index c0bf4c4c4..9e34dcbf1 100644 --- a/docs/user_guides/projects/auth/login.md +++ b/docs/user_guides/projects/auth/login.md @@ -1,12 +1,16 @@ # Log in To Hopsworks ## Introduction -Hopsworks supports different methods of authentication. Here we will look at authentication using username and password. + +Hopsworks supports different methods of authentication. +Here we will look at authentication using username and password. ## Prerequisites + An account on a Hopsworks cluster. ### Step 1: Log in with email and password + After your account is validated by an administrator you can use your email and password to login.
@@ -16,8 +20,9 @@ After your account is validated by an administrator you can use your email and p ### Step 2: Two-factor authentication -If two-factor authentication is enabled you will be presented with a two-factor authentication window after you -enter your password. Use your authenticator app +If two-factor authentication is enabled you will be presented with a two-factor authentication window after you +enter your password. +Use your authenticator app (example. [Google Authenticator](https://play.google.com/store/apps/details?id=com.google.android.apps.authenticator2&hl=en&gl=US)) on your phone to get a one-time password. @@ -33,5 +38,6 @@ Upon successful login, you will arrive at the landing page:
Landing page
-In the landing page, you will find two buttons. Use these buttons to either create a -_demo project_ or [a new project](../../../projects/project/create_project). +In the landing page, you will find two buttons. +Use these buttons to either create a +_demo project_ or [a new project](../../projects/project/create_project.md). diff --git a/docs/user_guides/projects/auth/oauth.md b/docs/user_guides/projects/auth/oauth.md index 8d7a52a23..0cb683a77 100644 --- a/docs/user_guides/projects/auth/oauth.md +++ b/docs/user_guides/projects/auth/oauth.md @@ -1,14 +1,19 @@ # Login Using A Third-party Identity Provider ## Introduction -Hopsworks supports different methods of authentication. Here we will look at authentication using Third-party Identity Provider. + +Hopsworks supports different methods of authentication. +Here we will look at authentication using Third-party Identity Provider. ## Prerequisites -A Hopsworks cluster with OAuth authentication. -See [Configure OAuth2](../../../../setup_installation/admin/oauth2/create-client) on how to configure OAuth on your cluster. + +A Hopsworks cluster with OAuth authentication. +See [Configure OAuth2](../../../setup_installation/admin/oauth2/create-client.md) on how to configure OAuth on your cluster. ### Step 1: Log in with OAuth -If OAuth is configured a **Login with ** button will appear in the login page. Use this button to log in to Hopsworks + +If OAuth is configured a **Login with** button will appear in the login page. +Use this button to log in to Hopsworks using your OAuth credentials.
@@ -17,7 +22,8 @@ using your OAuth credentials.
### Step 2: Give consent -When logging in with OAuth for the first time Hopsworks will retrieve and save consented claims (firstname, lastname + +When logging in with OAuth for the first time Hopsworks will retrieve and save consented claims (firstname, lastname and email), about the logged in end-user.
@@ -31,5 +37,6 @@ After clicking on **Register** you will be redirected to the landing page:
Landing page
-In the landing page, you will find two buttons. Use these buttons to either create a -_demo project_ or [a new project](../../../projects/project/create_project). +In the landing page, you will find two buttons. +Use these buttons to either create a +_demo project_ or [a new project](../../projects/project/create_project.md). diff --git a/docs/user_guides/projects/auth/profile.md b/docs/user_guides/projects/auth/profile.md index 194f2d3c2..3553f5ea7 100644 --- a/docs/user_guides/projects/auth/profile.md +++ b/docs/user_guides/projects/auth/profile.md @@ -1,18 +1,26 @@ # Update Your Profile and Credentials ## Introduction -A profile is required to access Hopsworks. A profile is created when a user registers and can be updated via Account settings. + +A profile is required to access Hopsworks. +A profile is created when a user registers and can be updated via Account settings. ## Prerequisites + An account on a Hopsworks cluster. -Updating profile and credentials is not supported if you are using Third-party Identity Providers like Kerberos, LDAP, or OAuth +Updating profile and credentials is not supported if you are using Third-party Identity Providers like Kerberos, LDAP, or OAuth to authenticate to Hopsworks. ### Step 1: Go to your Account settings -After you have logged in, in the upper right-hand corner of the screen, you will see your name. Click on your name, -then click on the menu item **Account settings**. The account settings page will open with profile tab selected. In this tab -you can change your first and last name. You cannot change your email address and will need to create a new + +After you have logged in, in the upper right-hand corner of the screen, you will see your name. +Click on your name, +then click on the menu item **Account settings**. +The account settings page will open with profile tab selected. +In this tab +you can change your first and last name. +You cannot change your email address and will need to create a new account if you wish to change your email address.
@@ -21,6 +29,7 @@ account if you wish to change your email address.
### Step 2: Update credential + To update your credential go to the **Authentication** tab as shown in the image below.
Update credentials @@ -28,7 +37,9 @@ To update your credential go to the **Authentication** tab as shown in the image
### Step 3: Enable/Reset Two-factor Authentication -You can also change your two-factor setting in the **Authentication** tab. Two-factor authentication + +You can also change your two-factor setting in the **Authentication** tab. +Two-factor authentication is only available if it is enabled from the cluster administration page.
@@ -36,9 +47,10 @@ is only available if it is enabled from the cluster administration page.
Enable Two-factor Authentication
-After enabling or resetting two-factor you will be presented with a QR Code. You will then need to scan the QR code -to add it on your phone's authenticator application -(example. [Google Authenticator](https://play.google.com/store/apps/details?id=com.google.android.apps.authenticator2&hl=en&gl=US)). +After enabling or resetting two-factor you will be presented with a QR Code. +You will then need to scan the QR code +to add it on your phone's authenticator application +(example. [Google Authenticator](https://play.google.com/store/apps/details?id=com.google.android.apps.authenticator2&hl=en&gl=US)). If you miss this step, you will have to recover your smartphone credentials at a later stage. diff --git a/docs/user_guides/projects/auth/recovery.md b/docs/user_guides/projects/auth/recovery.md index 2d36d30c5..a6834ff52 100644 --- a/docs/user_guides/projects/auth/recovery.md +++ b/docs/user_guides/projects/auth/recovery.md @@ -1,13 +1,17 @@ # Password Recovery ## Introduction + This topic describes how to recover a forgotten password. ## Prerequisites + An account on a Hopsworks cluster. ### Step 1: Request password reset -If you forget your password start by clicking on **Forgot password** on the login page. Enter your email and click on the + +If you forget your password start by clicking on **Forgot password** on the login page. +Enter your email and click on the **Send reset link** button.
Recover password @@ -15,5 +19,6 @@ If you forget your password start by clicking on **Forgot password** on the logi
### Step 2: Use the password reset link + A password reset link will be sent to the email address you entered if the email is found in the system. Click on the reset link to set your new password. diff --git a/docs/user_guides/projects/auth/registration.md b/docs/user_guides/projects/auth/registration.md index 67db5f857..12e18b195 100644 --- a/docs/user_guides/projects/auth/registration.md +++ b/docs/user_guides/projects/auth/registration.md @@ -1,10 +1,12 @@ # Register A New Account On Hopsworks ## Introduction -Hopsworks supports different methods of authentication. + +Hopsworks supports different methods of authentication. To use username and password as the method of authentication, you first need to register. ## Prerequisites + Registration enabled Hopsworks cluster. The process for registering a new account is as follows @@ -20,9 +22,10 @@ Click on the _Register_ button on the login page and register your email address ### Step 2: Enable Two-Factor Authentication -If two-factor authentication is required you will be presented with a page like in the figure below. Scan the QR -code or type the code in bold to register your account in your authenticator app -(example. [Google Authenticator](https://play.google.com/store/apps/details?id=com.google.android.apps.authenticator2&hl=en&gl=US)). +If two-factor authentication is required you will be presented with a page like in the figure below. +Scan the QR +code or type the code in bold to register your account in your authenticator app +(example. [Google Authenticator](https://play.google.com/store/apps/details?id=com.google.android.apps.authenticator2&hl=en&gl=US)).
two-factor diff --git a/docs/user_guides/projects/git/clone_repo.md b/docs/user_guides/projects/git/clone_repo.md index 8189ca3e1..342ed7b07 100644 --- a/docs/user_guides/projects/git/clone_repo.md +++ b/docs/user_guides/projects/git/clone_repo.md @@ -2,12 +2,17 @@ ## Introduction -Repositories are cloned and managed within the scope of a project. The content of the repository will reside on the Hopsworks File System. The content of the repository can be edited from Jupyter notebooks and can for example be used to configure Jobs. -Repositories can be managed from the Git section in the project settings. The Git overview in the project settings provides a list of repositories currently cloned within the project, the location of their content as well which branch and commit their HEAD is currently at. +Repositories are cloned and managed within the scope of a project. +The content of the repository will reside on the Hopsworks File System. +The content of the repository can be edited from Jupyter notebooks and can for example be used to configure Jobs. +Repositories can be managed from the Git section in the project settings. +The Git overview in the project settings provides a list of repositories currently cloned within the project, the location of their content as well which branch and commit their HEAD is currently at. ## Prerequisites -- For cloning a private repository, you should configure a [Git Provider](configure_git_provider.md) with your git credentials. You can clone a GitHub and GitLab public repository without configuring the provider. However, for BitBucket you always need to configure the username and token to clone a repository. +- For cloning a private repository, you should configure a [Git Provider](configure_git_provider.md) with your git credentials. +You can clone a GitHub and GitLab public repository without configuring the provider. +However, for BitBucket you always need to configure the username and token to clone a repository. ## UI @@ -35,11 +40,18 @@ To clone a new repository, click on the `Clone repository` button on the Git ove

-You should first choose the git provider e.g., GitHub, GitLab or BitBucket. If you are cloning a private repository, remember to configure the username and token for the provider first in [Git Provider](configure_git_provider.md). The clone dialog also asks you to specify the URL of the repository to clone. The supported protocol is HTTPS. As an example, if the repository is hosted on GitHub, the URL should look like: `https://github.com/logicalclocks/hops-examples.git`. +You should first choose the git provider e.g., GitHub, GitLab or BitBucket. +If you are cloning a private repository, remember to configure the username and token for the provider first in [Git Provider](configure_git_provider.md). +The clone dialog also asks you to specify the URL of the repository to clone. +The supported protocol is HTTPS. +As an example, if the repository is hosted on GitHub, the URL should look like: `https://github.com/logicalclocks/hops-examples.git`. -Then specify which branch you want to clone. By default the `main` branch will be used, however a different branch or commit can be specified by selecting `Clone from a specific branch`. +Then specify which branch you want to clone. +By default the `main` branch will be used, however a different branch or commit can be specified by selecting `Clone from a specific branch`. -You can select the folder, within your project, in which the repository should be cloned. By default, the repository is going to be cloned within the `Jupyter` dataset. However, by clicking on the location button, a different location can be selected. +You can select the folder, within your project, in which the repository should be cloned. +By default, the repository is going to be cloned within the `Jupyter` dataset. +However, by clicking on the location button, a different location can be selected. Finally, click on the `Clone repository` button to trigger the cloning of the repository. @@ -56,7 +68,8 @@ The progress of the git clone can be tracked under `Git Executions`. ### Step 4: Browse repository files -In the `File browser` page you can now browse the files of the cloned repository. In the figure below, the repository is located in `Jupyter/hops-examples` directory. +In the `File browser` page you can now browse the files of the cloned repository. +In the figure below, the repository is located in `Jupyter/hops-examples` directory.

@@ -66,7 +79,9 @@ In the `File browser` page you can now browse the files of the cloned repository

## Code + You can also clone a repository through the hopsworks git API in python. + ### Step 1: Get the git API ```python @@ -91,26 +106,36 @@ BRANCH="master" # optional branch to clone examples_repo = git_api.clone(REPO_URL, HOPSWORKS_FOLDER, PROVIDER, branch=BRANCH) ``` + ### API Reference + Api reference for git repositories is available here: -[GitRepo](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/git_repo/) +[`GitRepo`][hopsworks_common.git_repo.GitRepo] -A notebook for managing git can be found [here](https://github.com/logicalclocks/hops-examples/blob/master/notebooks/services/git.ipynb). +A notebook for managing git can be found in the [Git Management Tutorial](https://github.com/logicalclocks/hops-examples/blob/master/notebooks/services/git.ipynb). ## Errors and Troubleshooting -### Invalid credentials -This might happen when the credentials entered for the provider are incorrect. Try the following: -- Confirm that the settings for the provider ( in Account Settings > Git providers) are correct. You must enter both your Git provider username and token. +### Invalid credentials + +This might happen when the credentials entered for the provider are incorrect. +Try the following: + +- Confirm that the settings for the provider ( in Account Settings > Git providers) are correct. +You must enter both your Git provider username and token. - Confirm that you have selected the correct Git provider when cloning the repository. - Ensure your personal access token has the correct repository access rights. - Ensure your personal access token has not expired. ### Timeout errors -Cloning a large repo or checking out a large branch may hit timeout errors. You can try again later if the system was under heavy load at the time. + +Cloning a large repo or checking out a large branch may hit timeout errors. +You can try again later if the system was under heavy load at the time. ### Symlink errors -Git repositories with symlinks are not yet supported, therefore cloning repositories with symlinks will fail. You can create a separate branch to remove the symlinks, and clone from this branch. + +Git repositories with symlinks are not yet supported, therefore cloning repositories with symlinks will fail. +You can create a separate branch to remove the symlinks, and clone from this branch. ## Going Further diff --git a/docs/user_guides/projects/git/configure_git_provider.md b/docs/user_guides/projects/git/configure_git_provider.md index 44966274e..585d1bb8a 100644 --- a/docs/user_guides/projects/git/configure_git_provider.md +++ b/docs/user_guides/projects/git/configure_git_provider.md @@ -2,11 +2,13 @@ ## Introduction -When you perform Git operations on Hopsworks that need to interact with the remote repository, Hopsworks relies on the Git HTTPS protocol to perform those operations. Authentication with the remote repository happens through a token generated by the Git repository hosting service (GitHub, GitLab, BitBucket). +When you perform Git operations on Hopsworks that need to interact with the remote repository, Hopsworks relies on the Git HTTPS protocol to perform those operations. +Authentication with the remote repository happens through a token generated by the Git repository hosting service (GitHub, GitLab, BitBucket). !!! notice "Token permissions" - The token permissions should grant access to public and private repositories including read and write access to repository contents and commit statuses. 
- If you are using the new GitHub access tokens, make sure you choose the correct `Resource owner` when generating the token for the repositories you will want to clone. For the `Repository permissions` of the new GitHub fine-grained token, you should atleast give read and write access to `Commit statuses` and `Contents`. + The token permissions should grant access to public and private repositories including read and write access to repository contents and commit statuses. + If you are using the new GitHub access tokens, make sure you choose the correct `Resource owner` when generating the token for the repositories you will want to clone. + For the `Repository permissions` of the new GitHub fine-grained token, you should at least give read and write access to `Commit statuses` and `Contents`. ## UI @@ -15,9 +17,11 @@ Documentation on how to generate a token for the supported Git hosting services - [GitHub](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) - [GitLab](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) - [BitBucket](https://confluence.atlassian.com/bitbucketserver/http-access-tokens-939515499.html) + ### Step 1: Navigate to Git Providers -You can access the `Git Providers` page of your Hopsworks cluster by clicking on your name, in the top right corner, and choosing `Account Settings` from the dropdown menu. The `Git providers` section displays which providers have been already configured and can be used to clone new repositories. +You can access the `Git Providers` page of your Hopsworks cluster by clicking on your name, in the top right corner, and choosing `Account Settings` from the dropdown menu. +The `Git providers` section displays which providers have already been configured and can be used to clone new repositories.

@@ -53,7 +57,9 @@ The configured provider should now be marked as configured.

## Code + You can also configure a git provider using the hopsworks git API in python. + ### Step 1: Get the git API ```python @@ -80,7 +86,7 @@ git_api.set_provider(PROVIDER, GITHUB_USER, API_TOKEN) ### API Reference -[GitProvider](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/git_provider/) +[`GitProvider`][hopsworks_common.git_provider.GitProvider] ## Going Further diff --git a/docs/user_guides/projects/git/repository_actions.md b/docs/user_guides/projects/git/repository_actions.md index 81bef3cf1..c7c0095ca 100644 --- a/docs/user_guides/projects/git/repository_actions.md +++ b/docs/user_guides/projects/git/repository_actions.md @@ -1,10 +1,16 @@ # Repository actions + ## Introduction -This section explains the git operations or commands you can perform on hopsworks git repositories. These commands include commit, pull, push, create branches and many more. + +This section explains the git operations or commands you can perform on hopsworks git repositories. +These commands include commit, pull, push, create branches and many more. !!! notice "Repository permissions" - Git repositories are private. Only the owner of the repository can perform git actions on the repository such as commit, push, pull e.t.c. + Git repositories are private. + Only the owner of the repository can perform git actions on the repository such as commit, push, pull e.t.c. + ## UI + The operations to perform on the cloned repository can be found in the dropdown as shown below.

@@ -14,7 +20,10 @@ The operations to perform on the cloned repository can be found in the dropdown

-Note that some repository actions will require the username and token to be configured first depending on the provider. For example to be able to perform a push action in any repository, you must configure the provider for the repository first. To be able to perform a pull action for the for a GitLab repository, you must configure the GitLab provider first. You will see the dialog below in the case you need to configure the provider first to perform the repository action. +Note that some repository actions will require the username and token to be configured first, depending on the provider. +For example, to be able to perform a push action in any repository, you must configure the provider for the repository first. +To be able to perform a pull action for a GitLab repository, you must configure the GitLab provider first. +You will see the dialog below if you need to configure the provider first to perform the repository action.

@@ -24,10 +33,15 @@ Note that some repository actions will require the username and token to be conf

## Read only repositories -In read only repositories, the following actions are disabled: commit, push and file checkout. The read only property can be enabled or disabled in the Cluster settings > Configuration, by updating the `enable_read_only_git_repositories` variable to true or false. Note that you need administrator privileges to update this property. + +In read only repositories, the following actions are disabled: commit, push and file checkout. +The read only property can be enabled or disabled in the Cluster settings > Configuration, by updating the `enable_read_only_git_repositories` variable to true or false. +Note that you need administrator privileges to update this property. ## Code -You can also perform the repository actions using the hopsworks git API in python. + +You can also perform the repository actions using the hopsworks git API in python. + ### Step 1: Get the git API ```python @@ -39,19 +53,22 @@ project = hopsworks.login() git_api = project.get_git_api() ``` + ### Step 2: Get the git repository + ```python git_repo = git_api.get_repo(REPOSITORY_NAME) ``` ### Step 3: Perform the git repository action e.g commit + ```python git_repo = git_api.commit("Test commit") ``` ### API Reference -Api reference for repository actions is available here: -[GitRepo](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/git_repo/) +Api reference for repository actions is available here: +[`GitRepo`][hopsworks_common.git_repo.GitRepo] diff --git a/docs/user_guides/projects/iam_role/iam_role_chaining.md b/docs/user_guides/projects/iam_role/iam_role_chaining.md index 654fac898..c8ba089f8 100644 --- a/docs/user_guides/projects/iam_role/iam_role_chaining.md +++ b/docs/user_guides/projects/iam_role/iam_role_chaining.md @@ -2,17 +2,20 @@ ## Introduction -When deploying Hopsworks on EC2 instances you might need to assume different roles to access resources on AWS. +When deploying Hopsworks on EC2 instances you might need to assume different roles to access resources on AWS. These roles can be configured in AWS and mapped to a project in Hopsworks. ## Prerequisites + Before you begin this guide you'll need the following: - A Hopsworks cluster running on EC2. - [Role chaining](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_terms-and-concepts.html#iam-term-role-chaining) setup in AWS. -- Configure role mappings in Hopsworks. For a guide on how to configure this see [AWS IAM Role Chaining](../../../../setup_installation/admin/roleChaining). +- Configure role mappings in Hopsworks. + For a guide on how to configure this see [AWS IAM Role Chaining](../../../setup_installation/admin/roleChaining.md). ## UI + In this guide, you will learn how to use a mapped IAM role in your project. ### Step 1: Navigate to your project's IAM Role Chaining tab @@ -24,6 +27,6 @@ In the _Project Settings_ page you can find the _IAM Role Chaining_ section show
Role Chaining
-### Step 2: Use the IAM role +### Step 2: Use the IAM role -You can now use the IAM roles listed in your project when creating a Data Source with [Temporary Credentials](../../../fs/data_source/creation/s3/#temporary-credentials). +You can now use the IAM roles listed in your project when creating a Data Source with [Temporary Credentials](../../fs/data_source/creation/s3.md#temporary-credentials). diff --git a/docs/user_guides/projects/jobs/notebook_job.md b/docs/user_guides/projects/jobs/notebook_job.md index 3d92ea2a4..6b671b441 100644 --- a/docs/user_guides/projects/jobs/notebook_job.md +++ b/docs/user_guides/projects/jobs/notebook_job.md @@ -15,8 +15,10 @@ All members of a project in Hopsworks can launch the following types of applicat - Ray Launching a job of any type is very similar process, what mostly differs between job types is -the various configuration parameters each job type comes with. Hopsworks support scheduling jobs to run on a regular basis, -e.g backfilling a Feature Group by running your feature engineering pipeline nightly. Scheduling can be done both through the UI and the python API, +the various configuration parameters each job type comes with. +Hopsworks supports scheduling jobs to run on a regular basis, +e.g. backfilling a Feature Group by running your feature engineering pipeline nightly. +Scheduling can be done both through the UI and the python API, checkout [our Scheduling guide](schedule_job.md). ## UI @@ -45,7 +47,8 @@ Click `New Job` and the following dialog will appear. ### Step 3: Set the job type -By default, the dialog will create a Spark job. To instead configure a Jupyter Notebook job, select `PYTHON`. +By default, the dialog will create a Spark job. +To instead configure a Jupyter Notebook job, select `PYTHON`.

@@ -56,7 +59,9 @@ By default, the dialog will create a Spark job. To instead configure a Jupyter N ### Step 4: Set the notebook -Next step is to select the Jupyter Notebook to run. You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. By default, the job name is the same as the file name, but you can customize it as shown. +Next step is to select the Jupyter Notebook to run. +You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. +By default, the job name is the same as the file name, but you can customize it as shown.

@@ -70,7 +75,8 @@ Then click `Create job` to create the job. ### Step 5 (optional): Set the Jupyter Notebook arguments In the job settings, you can specify arguments for your notebook script. -Arguments must be in the format of `-p arg1 value1 -p arg2 value2`. For each argument, you must first provide `-p`, followed by the parameter name (e.g. `arg1`), followed by its value (e.g. `value1`). +Arguments must be in the format of `-p arg1 value1 -p arg2 value2`. +For each argument, you must first provide `-p`, followed by the parameter name (e.g. `arg1`), followed by its value (e.g. `value1`). The next step is to read the arguments in the notebook which is explained in this [guide](https://papermill.readthedocs.io/en/latest/usage-parameterize.html).
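For illustration, a minimal sketch of the parameters cell such a notebook might contain is shown below. The parameter names `arg1` and `arg2` are purely hypothetical; the cell must carry the Jupyter cell tag `parameters` so that Papermill can inject the values passed with `-p`.

```python
# Cell tagged "parameters": Papermill overrides these defaults with the values
# passed to the job, e.g. `-p arg1 value1 -p arg2 5`.
arg1 = "default-value"  # hypothetical parameter, for illustration only
arg2 = 1                # hypothetical parameter, for illustration only

# Subsequent cells can use the injected values as normal Python variables.
print(f"Running with arg1={arg1} and arg2={arg2}")
```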

@@ -84,10 +90,11 @@ The next step is to read the arguments in the notebook which is explained in thi It is possible to also set following configuration settings for a `PYTHON` job. -* `Environment`: The python environment to use -* `Container memory`: The amount of memory in MB to be allocated to the Jupyter Notebook script -* `Container cores`: The number of cores to be allocated for the Jupyter Notebook script -* `Additional files`: List of files that will be locally accessible in the working directory of the application. Only recommended to use if project datasets are not mounted under `/hopsfs`. +- `Environment`: The python environment to use +- `Container memory`: The amount of memory in MB to be allocated to the Jupyter Notebook script +- `Container cores`: The number of cores to be allocated for the Jupyter Notebook script +- `Additional files`: List of files that will be locally accessible in the working directory of the application. +Only recommended to use if project datasets are not mounted under `/hopsfs`. You can always modify the arguments in the job settings.
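For reference, the same settings can also be adjusted programmatically before creating the job. The sketch below is a minimal example using the `JobsApi` handle from the Code section further down; the notebook path, job name and values are illustrative, and the available keys are listed in the Configuration table below.

```python
import hopsworks

project = hopsworks.login()
jobs_api = project.get_jobs_api()

# Start from the default PYTHON configuration and mirror the UI fields above.
notebook_config = jobs_api.get_configuration("PYTHON")
notebook_config["appPath"] = "Resources/notebook.ipynb"  # previously uploaded notebook
notebook_config["environmentName"] = "pandas-training-pipeline"
notebook_config["resourceConfig"]["memory"] = 4096  # container memory in MB
notebook_config["resourceConfig"]["cores"] = 2      # container cores

job = jobs_api.create_job("notebook_job_example", notebook_config)
```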

@@ -99,13 +106,15 @@ You can always modify the arguments in the job settings. ### Step 7: (Kueue enabled) Select a Queue -If the cluster is installed with Kueue enabled, you will need to select a queue in which the job should run. This can be done from `Advance configuration -> Scheduler section`. +If the cluster is installed with Kueue enabled, you will need to select a queue in which the job should run. +This can be done from `Advance configuration -> Scheduler section`. ![Default queue for job](../../../assets/images/guides/project/scheduler/job_queue.png) ### Step 8: Execute the job -Now click the `Run` button to start the execution of the job. You will be redirected to the `Executions` page where you can see the list of all executions. +Now click the `Run` button to start the execution of the job. +You will be redirected to the `Executions` page where you can see the list of all executions.

@@ -115,6 +124,7 @@ Now click the `Run` button to start the execution of the job. You will be redire

### Step 9: Visualize output notebook + Once the execution is finished, click `Logs` and then `notebook out` to see the logs for the execution.

@@ -130,7 +140,7 @@ You can directly edit and save the output notebook by clicking `Open Notebook`. ### Step 1: Upload the Jupyter Notebook script -This snippet assumes the Jupyter Notebook script is in the current working directory and named `notebook.ipynb`. +This snippet assumes the Jupyter Notebook script is in the current working directory and named `notebook.ipynb`. It will upload the Jupyter Notebook script to the `Resources` dataset in your project. @@ -146,7 +156,6 @@ uploaded_file_path = dataset_api.upload("notebook.ipynb", "Resources") ``` - ### Step 2: Create Jupyter Notebook job In this snippet we get the `JobsApi` object to get the default job configuration for a `PYTHON` job, set the jupyter notebook file and override the environment to run in, and finally create the `Job` object. @@ -178,37 +187,41 @@ execution = job.run(args='-p a 2 -p b 5', await_termination=True) ``` ## Configuration + The following table describes the job configuration parameters for a PYTHON job. `conf = jobs_api.get_configuration("PYTHON")` -| Field | Type | Description | Default | -|-------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------| -| `conf['type']` | string | Type of the job configuration | `"pythonJobConfiguration"` | -| `conf['appPath']` | string | Project relative path to notebook (e.g., `Resources/foo.ipynb`) | `null` | -| `conf['defaultArgs']` | string | Arguments to pass to the notebook.
Will be overridden if arguments are passed explicitly via `Job.run(args="...")`.
Must conform to Papermill format `-p arg1 val1` | `null` | -| `conf['environmentName']` | string | Name of the project Python environment to use | `"pandas-training-pipeline"` | -| `conf['resourceConfig']['cores']` | float | Number of CPU cores to be allocated | `1.0` | -| `conf['resourceConfig']['memory']` | int | Number of MBs to be allocated | `2048` | -| `conf['resourceConfig']['gpus']` | int | Number of GPUs to be allocated | `0` | -| `conf['logRedirection']` | boolean | Whether logs are redirected | `true` | -| `conf['jobType']` | string | Type of job | `"PYTHON"` | -| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | - +| Field | Type | Description | Default | +| --- | --- | --- | --- | +| `conf['type']` | string | Type of the job configuration | `"pythonJobConfiguration"` | +| `conf['appPath']` | string | Project relative path to notebook (e.g., `Resources/foo.ipynb`) | `null` | +| `conf['defaultArgs']` | string | Arguments to pass to the notebook.
Will be overridden if arguments are passed explicitly via `Job.run(args="...")`.
Must conform to Papermill format `-p arg1 val1` | `null` | +| `conf['environmentName']` | string | Name of the project Python environment to use | `"pandas-training-pipeline"` | +| `conf['resourceConfig']['cores']` | float | Number of CPU cores to be allocated | `1.0` | +| `conf['resourceConfig']['memory']` | int | Number of MBs to be allocated | `2048` | +| `conf['resourceConfig']['gpus']` | int | Number of GPUs to be allocated | `0` | +| `conf['logRedirection']` | boolean | Whether logs are redirected | `true` | +| `conf['jobType']` | string | Type of job | `"PYTHON"` | +| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | ## Accessing project data + !!! notice "Recommended approach if `/hopsfs` is mounted" If your Hopsworks installation is configured to mount the project datasets under `/hopsfs`, which it is in most cases, then please refer to this section instead of the `Additional files` property to reference file resources. ### Absolute paths + The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your notebook. ### Relative paths -The notebook's working directory is the folder it is located in. For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. +The notebook's working directory is the folder it is located in. +For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. +Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. ## API Reference -[Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) +[`Job`][hopsworks_common.job.Job] -[Executions](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/executions/) +[`Execution`][hopsworks_common.execution.Execution] diff --git a/docs/user_guides/projects/jobs/pyspark_job.md b/docs/user_guides/projects/jobs/pyspark_job.md index c9581bd0c..79aad2173 100644 --- a/docs/user_guides/projects/jobs/pyspark_job.md +++ b/docs/user_guides/projects/jobs/pyspark_job.md @@ -15,11 +15,12 @@ All members of a project in Hopsworks can launch the following types of applicat - Ray Launching a job of any type is very similar process, what mostly differs between job types is -the various configuration parameters each job type comes with. Hopsworks clusters support scheduling to run jobs on a regular basis, -e.g backfilling a Feature Group by running your feature engineering pipeline nightly. Scheduling can be done both through the UI and the python API, +the various configuration parameters each job type comes with. +Hopsworks clusters support scheduling to run jobs on a regular basis, +e.g backfilling a Feature Group by running your feature engineering pipeline nightly. +Scheduling can be done both through the UI and the python API, checkout [our Scheduling guide](schedule_job.md). - PySpark program can either be a `.py` script or a `.ipynb` file, however be mindful of how to access/create the spark session based on the extension you provide. @@ -54,11 +55,14 @@ Click `New Job` and the following dialog will appear. 
### Step 3: Set the job type -By default, the dialog will create a Spark job. Make sure `SPARK` is chosen. +By default, the dialog will create a Spark job. +Make sure `SPARK` is chosen. ### Step 4: Set the script -Next step is to select the program to run. You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. By default, the job name is the same as the file name, but you can customize it as shown. +Next step is to select the program to run. +You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. +By default, the job name is the same as the file name, but you can customize it as shown.

@@ -85,18 +89,17 @@ Remember to handle the arguments inside your PySpark script. Resource allocation for the Spark driver and executors can be configured, also the number of executors and whether dynamic execution should be enabled. -* `Environment`: The python environment to use, must be based on `spark-feature-pipeline` - -* `Driver memory`: Number of cores to allocate for the Spark driver +- `Environment`: The python environment to use, must be based on `spark-feature-pipeline` -* `Driver virtual cores`: Number of MBs to allocate for the Spark driver +- `Driver memory`: Amount of memory in MB to allocate for the Spark driver -* `Executor memory`: Number of cores to allocate for each Spark executor +- `Driver virtual cores`: Number of cores to allocate for the Spark driver -* `Executor virtual cores`: Number of MBs to allocate for each Spark executor +- `Executor memory`: Amount of memory in MB to allocate for each Spark executor -* `Dynamic/Static`: Run the Spark application in static or dynamic allocation mode (see [spark docs](https://spark.apache.org/docs/latest/configuration.html#dynamic-allocation) for details). +- `Executor virtual cores`: Number of cores to allocate for each Spark executor +- `Dynamic/Static`: Run the Spark application in static or dynamic allocation mode (see [spark docs](https://spark.apache.org/docs/latest/configuration.html#dynamic-allocation) for details).
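These UI fields map to keys of the PYSPARK job configuration described in the Configuration table further down. A minimal sketch with illustrative values (memory in MB, cores as counts), using the `JobsApi` handle from the Code section below:

```python
import hopsworks

project = hopsworks.login()
jobs_api = project.get_jobs_api()

spark_config = jobs_api.get_configuration("PYSPARK")
spark_config["spark.driver.memory"] = 2048     # driver memory in MB
spark_config["spark.driver.cores"] = 1         # driver virtual cores
spark_config["spark.executor.memory"] = 4096   # executor memory in MB
spark_config["spark.executor.cores"] = 2       # executor virtual cores
spark_config["spark.dynamicAllocation.enabled"] = True
spark_config["spark.dynamicAllocation.maxExecutors"] = 4
```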

@@ -107,13 +110,13 @@ Resource allocation for the Spark driver and executors can be configured, also t Additional files or dependencies required for the Spark job can be configured. -* `Additional archives`: List of archives to be extracted into the working directory of each executor. +- `Additional archives`: List of archives to be extracted into the working directory of each executor. -* `Additional jars`: List of jars to be placed in the working directory of each executor. +- `Additional jars`: List of jars to be placed in the working directory of each executor. -* `Additional python dependencies`: List of python files and archives to be placed on each executor and added to PATH. +- `Additional python dependencies`: List of python files and archives to be placed on each executor and added to PATH. -* `Additional files`: List of files to be placed in the working directory of each executor. +- `Additional files`: List of files to be placed in the working directory of each executor.

@@ -122,7 +125,8 @@ Additional files or dependencies required for the Spark job can be configured.

-Line-separates [properties](https://spark.apache.org/docs/3.1.1/configuration.html) to be set for the Spark application. For example, changing the configuration variables for the Kryo Serializer or setting environment variables for the driver, you can set the properties as shown below. +Line-separated [properties](https://spark.apache.org/docs/3.1.1/configuration.html) to be set for the Spark application. +For example, to change the configuration variables for the Kryo Serializer or to set environment variables for the driver, you can set the properties as shown below.
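Programmatically, these line-separated properties correspond to the `properties` field of the job configuration (see the Configuration table below), one `name=value` pair per line. A small sketch with illustrative, standard Spark properties:

```python
import hopsworks

project = hopsworks.login()
jobs_api = project.get_jobs_api()

spark_config = jobs_api.get_configuration("PYSPARK")
# Newline-separated `name=value` pairs; the properties shown are illustrative.
spark_config["properties"] = "\n".join([
    "spark.serializer=org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max=512m",
    "spark.executorEnv.MY_ENV_VAR=my_value",
])
```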

@@ -133,11 +137,13 @@ Line-separates [properties](https://spark.apache.org/docs/3.1.1/configuration.ht ### Step 7: (Kueue enabled) Select a Queue -Currently we do not have Kueue support for Spark. You do not need to select a queue to run the job in. +Currently we do not have Kueue support for Spark. +You do not need to select a queue to run the job in. ### Step 8: Execute the job -Now click the `Run` button to start the execution of the job. You will be redirected to the `Executions` page where you can see the list of all executions. +Now click the `Run` button to start the execution of the job. +You will be redirected to the `Executions` page where you can see the list of all executions.

@@ -148,7 +154,7 @@ Now click the `Run` button to start the execution of the job. You will be redire ### Step 9: Application logs -To monitor logs while the execution is running, click `Spark UI` to open the Spark UI in a separate tab. +To monitor logs while the execution is running, click `Spark UI` to open the Spark UI in a separate tab. Once the execution is finished, you can click on `Logs` to see the full logs for execution. @@ -163,7 +169,7 @@ Once the execution is finished, you can click on `Logs` to see the full logs for ### Step 1: Upload the PySpark program -This snippet assumes the program to run is in the current working directory and named `script.py`. +This snippet assumes the program to run is in the current working directory and named `script.py`. It will upload the python script to the `Resources` dataset in your project. @@ -179,7 +185,6 @@ uploaded_file_path = dataset_api.upload("script.py", "Resources") ``` - ### Step 2: Create PySpark job In this snippet we get the `JobsApi` object to get the default job configuration for a `PYSPARK` job, set the pyspark script and override the environment to run in, and finally create the `Job` object. @@ -219,38 +224,40 @@ print(f_err.read()) ``` ## Configuration + The following table describes the job configuration parameters for a PYSPARK job. `conf = jobs_api.get_configuration("PYSPARK")` -| Field | Type | Description | Default | -|----------------------------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------| -| `conf['type']` | string | Type of the job configuration | `"sparkJobConfiguration"` | -| `conf['appPath']` | string | Project path to spark program (e.g `Resources/foo.py`) | `null` | -| `conf['defaultArgs']` | string | Arguments to pass to the program. Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | -| `conf['environmentName']` | string | Name of the project spark environment to use | `"spark-feature-pipeline"` | -| `conf['spark.driver.cores']` | float | Number of CPU cores allocated for the driver | `1.0` | -| `conf['spark.driver.memory']` | int | Memory allocated for the driver (in MB) | `2048` | -| `conf['spark.executor.instances']` | int | Number of executor instances | `1` | -| `conf['spark.executor.cores']` | float | Number of CPU cores per executor | `1.0` | -| `conf['spark.executor.memory']` | int | Memory allocated per executor (in MB) | `4096` | -| `conf['spark.dynamicAllocation.enabled']` | boolean | Enable dynamic allocation of executors | `true` | -| `conf['spark.dynamicAllocation.minExecutors']` | int | Minimum number of executors with dynamic allocation | `1` | -| `conf['spark.dynamicAllocation.maxExecutors']` | int | Maximum number of executors with dynamic allocation | `2` | -| `conf['spark.dynamicAllocation.initialExecutors']` | int | Initial number of executors with dynamic allocation | `1` | -| `conf['spark.blacklist.enabled']` | boolean | Whether executor/node blacklisting is enabled | `false` | -| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | -| `conf['pyFiles']` | string | Comma-separated string of HDFS path(s) to python modules to be made available to the application. 
Example: `hdfs:///Project//Resources/file1.py,...` | `null` | -| `conf['jars']` | string | Comma-separated string of HDFS path(s) to jars to be included in CLASSPATH. Example: `hdfs:///Project//Resources/app.jar,...` | `null` | -| `conf['archives']` | string | Comma-separated string of HDFS path(s) to archives to be made available to the application. Example: `hdfs:///Project//Resources/archive.zip,...` | `null` | -| `conf['properties']` | string | A new line separated (`\n`) list of properties to pass to the Spark application. The properties should be in the format `name=value` | `null` | +| Field | Type | Description | Default | +| --- | --- | --- | --- | +| `conf['type']` | string | Type of the job configuration | `"sparkJobConfiguration"` | +| `conf['appPath']` | string | Project path to spark program (e.g `Resources/foo.py`) | `null` | +| `conf['defaultArgs']` | string | Arguments to pass to the program. Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | +| `conf['environmentName']` | string | Name of the project spark environment to use | `"spark-feature-pipeline"` | +| `conf['spark.driver.cores']` | float | Number of CPU cores allocated for the driver | `1.0` | +| `conf['spark.driver.memory']` | int | Memory allocated for the driver (in MB) | `2048` | +| `conf['spark.executor.instances']` | int | Number of executor instances | `1` | +| `conf['spark.executor.cores']` | float | Number of CPU cores per executor | `1.0` | +| `conf['spark.executor.memory']` | int | Memory allocated per executor (in MB) | `4096` | +| `conf['spark.dynamicAllocation.enabled']` | boolean | Enable dynamic allocation of executors | `true` | +| `conf['spark.dynamicAllocation.minExecutors']` | int | Minimum number of executors with dynamic allocation | `1` | +| `conf['spark.dynamicAllocation.maxExecutors']` | int | Maximum number of executors with dynamic allocation | `2` | +| `conf['spark.dynamicAllocation.initialExecutors']` | int | Initial number of executors with dynamic allocation | `1` | +| `conf['spark.blacklist.enabled']` | boolean | Whether executor/node blacklisting is enabled | `false` | +| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | +| `conf['pyFiles']` | string | Comma-separated string of HDFS path(s) to python modules to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | +| `conf['jars']` | string | Comma-separated string of HDFS path(s) to jars to be included in CLASSPATH. Example: `hdfs:///Project//Resources/app.jar,...` | `null` | +| `conf['archives']` | string | Comma-separated string of HDFS path(s) to archives to be made available to the application. Example: `hdfs:///Project//Resources/archive.zip,...` | `null` | +| `conf['properties']` | string | A new line separated (`\n`) list of properties to pass to the Spark application. The properties should be in the format `name=value` | `null` | ## Accessing project data ### Read directly from the filesystem (recommended) -To read a dataset in your project using Spark, use the full filesystem path where the data is stored. For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: +To read a dataset in your project using Spark, use the full filesystem path where the data is stored. 
+For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: ```python df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True) @@ -259,13 +266,13 @@ df.show() ### Additional files -Different file types can be attached to the spark job and made available in the `/srv/hops/artifacts` folder when the PySpark job is started. This configuration is mainly useful when you need to add additional setup, such as jars that needs to be added to the CLASSPATH. +Different file types can be attached to the spark job and made available in the `/srv/hops/artifacts` folder when the PySpark job is started. +This configuration is mainly useful when you need to add additional setup, such as jars that needs to be added to the CLASSPATH. When reading data in your Spark job it is recommended to use the Spark read API as previously demonstrated, since this reads from the filesystem directly, whereas `Additional files` configuration options will download the files in its entirety and is not a scalable option. - ## API Reference -[Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) +[`Job`][hopsworks_common.job.Job] -[Executions](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/executions/) +[`Execution`][hopsworks_common.execution.Execution] diff --git a/docs/user_guides/projects/jobs/python_job.md b/docs/user_guides/projects/jobs/python_job.md index d4cd0c657..5252f5081 100644 --- a/docs/user_guides/projects/jobs/python_job.md +++ b/docs/user_guides/projects/jobs/python_job.md @@ -15,8 +15,10 @@ All members of a project in Hopsworks can launch the following types of applicat - Ray Launching a job of any type is very similar process, what mostly differs between job types is -the various configuration parameters each job type comes with. Hopsworks support scheduling jobs to run on a regular basis, -e.g backfilling a Feature Group by running your feature engineering pipeline nightly. Scheduling can be done both through the UI and the python API, +the various configuration parameters each job type comes with. +Hopsworks support scheduling jobs to run on a regular basis, +e.g backfilling a Feature Group by running your feature engineering pipeline nightly. +Scheduling can be done both through the UI and the python API, checkout [our Scheduling guide](schedule_job.md). ## UI @@ -45,7 +47,8 @@ Click `New Job` and the following dialog will appear. ### Step 3: Set the job type -By default, the dialog will create a Spark job. To instead configure a Python job, select `PYTHON`. +By default, the dialog will create a Spark job. +To instead configure a Python job, select `PYTHON`.

@@ -56,7 +59,9 @@ By default, the dialog will create a Spark job. To instead configure a Python jo ### Step 4: Set the script -Next step is to select the python script to run. You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. By default, the job name is the same as the file name, but you can customize it as shown. +Next step is to select the python script to run. +You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. +By default, the job name is the same as the file name, but you can customize it as shown.

@@ -81,10 +86,11 @@ Remember to handle the arguments inside your Python script. It is possible to also set following configuration settings for a `PYTHON` job. -* `Environment`: The python environment to use -* `Container memory`: The amount of memory in MB to be allocated to the Python script -* `Container cores`: The number of cores to be allocated for the Python script -* `Additional files`: List of files that will be locally accessible in the working directory of the application. Only recommended to use if project datasets are not mounted under `/hopsfs`. +- `Environment`: The python environment to use +- `Container memory`: The amount of memory in MB to be allocated to the Python script +- `Container cores`: The number of cores to be allocated for the Python script +- `Additional files`: List of files that will be locally accessible in the working directory of the application. + Only recommended to use if project datasets are not mounted under `/hopsfs`. You can always modify the arguments in the job settings.
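As noted in Step 5, any arguments given to the job are passed straight to the script, so the script itself is responsible for parsing them. A minimal sketch, with purely illustrative argument names:

```python
import sys

# If the job is run with e.g. `job.run(args="sales_fg 2024-01-01")`, the values
# arrive as ordinary command-line arguments.
feature_group_name = sys.argv[1] if len(sys.argv) > 1 else "default_fg"
run_date = sys.argv[2] if len(sys.argv) > 2 else None

print(f"Processing {feature_group_name} for {run_date}")
```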

@@ -96,13 +102,15 @@ It is possible to also set following configuration settings for a `PYTHON` job. ### Step 7: (Kueue enabled) Select a Queue -If the cluster is installed with Kueue enabled, you will need to select a queue in which the job should run. This can be done from `Advance configuration -> Scheduler section`. +If the cluster is installed with Kueue enabled, you will need to select a queue in which the job should run. +This can be done from `Advance configuration -> Scheduler section`. ![Default queue for job](../../../assets/images/guides/project/scheduler/job_queue.png) ### Step 8: Execute the job -Now click the `Run` button to start the execution of the job. You will be redirected to the `Executions` page where you can see the list of all executions. +Now click the `Run` button to start the execution of the job. +You will be redirected to the `Executions` page where you can see the list of all executions. Once the execution is finished, click on `Logs` to see the logs for the execution. @@ -117,7 +125,7 @@ Once the execution is finished, click on `Logs` to see the logs for the executio ### Step 1: Upload the Python script -This snippet assumes the python script is in the current working directory and named `script.py`. +This snippet assumes the python script is in the current working directory and named `script.py`. It will upload the python script to the `Resources` dataset in your project. @@ -174,36 +182,41 @@ print(f_err.read()) ``` ## Configuration + The following table describes the job configuration parameters for a PYTHON job. `conf = jobs_api.get_configuration("PYTHON")` -| Field | Type | Description | Default | -|-------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------|---------| -| `conf['type']` | string | Type of the job configuration | `"pythonJobConfiguration"` | -| `conf['appPath']` | string | Project relative path to script (e.g., `Resources/foo.py`) | `null` | -| `conf['defaultArgs']` | string | Arguments to pass to the script. Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | -| `conf['environmentName']` | string | Name of the project Python environment to use | `"pandas-training-pipeline"` | -| `conf['resourceConfig']['cores']` | float | Number of CPU cores to be allocated | `1.0` | -| `conf['resourceConfig']['memory']` | int | Number of MBs to be allocated | `2048` | -| `conf['resourceConfig']['gpus']` | int | Number of GPUs to be allocated | `0` | -| `conf['logRedirection']` | boolean | Whether logs are redirected | `true` | -| `conf['jobType']` | string | Type of job | `"PYTHON"` | -| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | +| Field | Type | Description | Default | +| --- | --- | --- | --- | +| `conf['type']` | string | Type of the job configuration | `"pythonJobConfiguration"` | +| `conf['appPath']` | string | Project relative path to script (e.g., `Resources/foo.py`) | `null` | +| `conf['defaultArgs']` | string | Arguments to pass to the script. 
Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | +| `conf['environmentName']` | string | Name of the project Python environment to use | `"pandas-training-pipeline"` | +| `conf['resourceConfig']['cores']` | float | Number of CPU cores to be allocated | `1.0` | +| `conf['resourceConfig']['memory']` | int | Number of MBs to be allocated | `2048` | +| `conf['resourceConfig']['gpus']` | int | Number of GPUs to be allocated | `0` | +| `conf['logRedirection']` | boolean | Whether logs are redirected | `true` | +| `conf['jobType']` | string | Type of job | `"PYTHON"` | +| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | ## Accessing project data + !!! notice "Recommended approach if `/hopsfs` is mounted" If your Hopsworks installation is configured to mount the project datasets under `/hopsfs`, which it is in most cases, then please refer to this section instead of the `Additional files` property to reference file resources. ### Absolute paths + The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your script. ### Relative paths -The script's working directory is the folder it is located in. For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. +The script's working directory is the folder it is located in. +For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. +Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. ## API Reference -[Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) +[`Job`][hopsworks_common.job.Job] -[Executions](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/executions/) +[`Execution`][hopsworks_common.execution.Execution] diff --git a/docs/user_guides/projects/jobs/ray_job.md b/docs/user_guides/projects/jobs/ray_job.md index 24bdb1506..e558ff58a 100644 --- a/docs/user_guides/projects/jobs/ray_job.md +++ b/docs/user_guides/projects/jobs/ray_job.md @@ -15,14 +15,16 @@ All members of a project in Hopsworks can launch the following types of applicat - Ray Launching a job of any type is very similar process, what mostly differs between job types is -the various configuration parameters each job type comes with. Hopsworks support scheduling to run jobs on a regular basis, -e.g backfilling a Feature Group by running your feature engineering pipeline nightly. Scheduling can be done both through the UI and the python API, +the various configuration parameters each job type comes with. +Hopsworks support scheduling to run jobs on a regular basis, +e.g backfilling a Feature Group by running your feature engineering pipeline nightly. +Scheduling can be done both through the UI and the python API, checkout [our Scheduling guide](schedule_job.md). !!!warning "Enable Ray" Support for Ray needs to be explicitly enabled by adding the following option in the `values.yaml` file for the deployment: - + ```yaml global: ray: @@ -55,11 +57,15 @@ Click `New Job` and the following dialog will appear. 
### Step 3: Set the job type -By default, the dialog will create a Spark job. Make sure `RAY` is chosen. +By default, the dialog will create a Spark job. +Make sure `RAY` is chosen. ### Step 4: Set the script -Next step is to select the program to run. You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. After that set the name for the job. By default, the job name is the same as the file name, but you can customize it here. +Next step is to select the program to run. +You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. +After that set the name for the job. +By default, the job name is the same as the file name, but you can customize it here.

@@ -73,25 +79,25 @@ Next step is to select the program to run. You can either select `From project`, Resource allocation for the Driver and Workers can be configured. !!! notice "Using the resources in the Ray script" - The resource configurations describe the cluster that will be provisioned when launching the Ray job. User can still + The resource configurations describe the cluster that will be provisioned when launching the Ray job. + User can still provide extra configurations in the job script using `ScalingConfig`, i.e. `ScalingConfig(num_workers=4, trainer_resources={"CPU": 1}, use_gpu=True)`. -* `Driver memory`: Memory in MBs to allocate for Driver - -* `Driver virtual cores`: Number of cores to allocate for the Driver +- `Driver memory`: Memory in MBs to allocate for Driver -* `Worker memory`: Memory in MBs to allocate for each worker +- `Driver virtual cores`: Number of cores to allocate for the Driver -* `Worker cores`: Number of cores to allocate for each worker +- `Worker memory`: Memory in MBs to allocate for each worker -* `Min workers`: Minimum number of workers to start with +- `Worker cores`: Number of cores to allocate for each worker -* `Max workers`: Maximum number of workers to scale up to +- `Min workers`: Minimum number of workers to start with +- `Max workers`: Maximum number of workers to scale up to
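As the note above mentions, the provisioned resources can be further shaped inside the job script itself with `ScalingConfig`. A minimal sketch, assuming Ray Train is available in the job's environment; the values are illustrative and must fit within the worker limits configured above:

```python
from ray.train import ScalingConfig

# Two workers with one CPU each and no GPUs (illustrative values).
scaling_config = ScalingConfig(
    num_workers=2,
    use_gpu=False,
    resources_per_worker={"CPU": 1},
)

# The scaling config is then passed to a Ray Train trainer, for example:
# trainer = TorchTrainer(train_loop_per_worker=train_fn, scaling_config=scaling_config)
# result = trainer.fit()
```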

[Image: Resource configuration for the Ray Job]
@@ -99,13 +105,17 @@ for Ray Job"> Runtime environment and Additional files required for the Ray job can also be provided. -* `Runtime Environment (Optional)`: A [runtime environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments) describes the dependencies required for the Ray job including files, packages, environment variables, and more. This is useful when you need to install specific packages and set environment variables for this particular Ray job. It should be provided as a YAML file. You can select the file from the project or upload a new one. +- `Runtime Environment (Optional)`: A [runtime environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments) describes the dependencies required for the Ray job including files, packages, environment variables, and more. + This is useful when you need to install specific packages and set environment variables for this particular Ray job. + It should be provided as a YAML file. + You can select the file from the project or upload a new one. -* `Additional files`: List of other files required for the Ray job. These files will be placed in `/srv/hops/ray/job`. +- `Additional files`: List of other files required for the Ray job. + These files will be placed in `/srv/hops/ray/job`.

[Image: Runtime configuration and additional files for Ray job]
@@ -113,7 +123,8 @@ environment and additional files"> ### Step 6: (Kueue enabled) Select a Queue -If the cluster is installed with Kueue enabled, you will need to select a queue in which the job should run. This can be done from `Advance configuration -> Scheduler section`. +If the cluster is installed with Kueue enabled, you will need to select a queue in which the job should run. +This can be done from `Advance configuration -> Scheduler section`. ![Default queue for job](../../../assets/images/guides/project/scheduler/job_queue.png) @@ -129,8 +140,12 @@ Now click the `Run` button to start the execution of the job, and then click on

### Step 8: Ray Dashboard -When the Ray job is running, you can access the Ray dashboard to monitor the job. The Ray dashboard is accessible from the -`Executions` page. Please note that the Ray dashboard is only available when the job execution is running. In the Ray Dashboard, + +When the Ray job is running, you can access the Ray dashboard to monitor the job. +The Ray dashboard is accessible from the +`Executions` page. +Please note that the Ray dashboard is only available when the job execution is running. +In the Ray Dashboard, you can monitor the resources used by the job, the number of workers, logs, and the tasks that are running.

@@ -140,6 +155,7 @@ you can monitor the resources used by the job, the number of workers, logs, and

### Step 9: Application logs + Once the execution is finished, you can click on `Logs` to see the full logs for execution.

@@ -153,7 +169,8 @@ Once the execution is finished, you can click on `Logs` to see the full logs for ### Step 1: Upload the Ray script -This snippet assumes the Ray program is in the current working directory and named `ray_job.py`. If the file is already in the project, you can skip this step. +This snippet assumes the Ray program is in the current working directory and named `ray_job.py`. +If the file is already in the project, you can skip this step. It will upload the jar to the `Resources` dataset in your project. @@ -169,7 +186,6 @@ uploaded_file_path = dataset_api.upload("ray_job.py", "Resources") ``` - ### Step 2: Create Ray job In this snippet we get the `JobsApi` object to get the default job configuration for a `RAY` job, set the python script to run and create the `Job` object. @@ -212,35 +228,34 @@ print(f_err.read()) ``` ## Configuration + The following table describes the job configuration parameters for a RAY job. `conf = jobs_api.get_configuration("RAY")` -| Field | Type | Description | Default | -|----------------------------------------|-------|------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------| -| `conf['type']` | string | Type of the job configuration | `"rayJobConfiguration"` | -| `conf['appPath']` | string | Project relative path to script (e.g., `Resources/foo.py`) | `null` | -| `conf['defaultArgs']` | string | Arguments to pass to the script. Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | -| `conf['environmentName']` | string | Name of the project Python environment to use | `"pandas-training-pipeline"` | -| `conf['driverCores']` | float | Number of CPU cores to be allocated for the Ray head process | `1.0` | -| `conf['driverMemory']` | int | Number of MBs to be allocated for the Ray head process | `2048` | -| `conf['driverGpus']` | int | Number of GPUs to be allocated for the Ray head process | `0` | -| `conf['workerCores']` | float | Number of CPU cores to be allocated for each Ray worker process | `1.0` | -| `conf['workerMemory']` | int | Number of MBs to be allocated for each Ray worker process | `4096` | -| `conf['workerGpus']` | int | Number of GPUs to be allocated for each Ray worker process | `0` | -| `conf['workerMinInstances']` | int | Minimum number of Ray workers | `1` | -| `conf['workerMaxInstances']` | int | Maximum number of Ray workers | `1` | -| `conf['jobType']` | string | Type of job | `"RAY"` | -| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | - +| Field | Type | Description | Default | +| --- | --- | --- | --- | +| `conf['type']` | string | Type of the job configuration | `"rayJobConfiguration"` | +| `conf['appPath']` | string | Project relative path to script (e.g., `Resources/foo.py`) | `null` | +| `conf['defaultArgs']` | string | Arguments to pass to the script. 
Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | +| `conf['environmentName']` | string | Name of the project Python environment to use | `"pandas-training-pipeline"` | +| `conf['driverCores']` | float | Number of CPU cores to be allocated for the Ray head process | `1.0` | +| `conf['driverMemory']` | int | Number of MBs to be allocated for the Ray head process | `2048` | +| `conf['driverGpus']` | int | Number of GPUs to be allocated for the Ray head process | `0` | +| `conf['workerCores']` | float | Number of CPU cores to be allocated for each Ray worker process | `1.0` | +| `conf['workerMemory']` | int | Number of MBs to be allocated for each Ray worker process | `4096` | +| `conf['workerGpus']` | int | Number of GPUs to be allocated for each Ray worker process | `0` | +| `conf['workerMinInstances']` | int | Minimum number of Ray workers | `1` | +| `conf['workerMaxInstances']` | int | Maximum number of Ray workers | `1` | +| `conf['jobType']` | string | Type of job | `"RAY"` | +| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | ## Accessing project data The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your script. - ## API Reference -[Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) +[`Job`][hopsworks_common.job.Job] -[Executions](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/executions/) \ No newline at end of file +[`Execution`][hopsworks_common.execution.Execution] diff --git a/docs/user_guides/projects/jobs/schedule_job.md b/docs/user_guides/projects/jobs/schedule_job.md index 0cd57f9ee..f168ebec6 100644 --- a/docs/user_guides/projects/jobs/schedule_job.md +++ b/docs/user_guides/projects/jobs/schedule_job.md @@ -2,25 +2,30 @@ description: Documentation on how to schedule a job on Hopsworks. --- -# How To Schedule a Job +# How To Schedule a Job ## Introduction -Hopsworks clusters can run jobs on a schedule, allowing you to automate the execution. Whether you need to backfill your feature groups on a nightly basis or run a model training pipeline every week, the Hopsworks scheduler will help you automate these tasks. Each job can be configured to have a single schedule. For more advanced use cases, Hopsworks integrates with any DAG manager and directly with the open-source [Apache Airflow](https://airflow.apache.org/use-cases/), check out our [Airflow Guide](../airflow/airflow.md). +Hopsworks clusters can run jobs on a schedule, allowing you to automate the execution. +Whether you need to backfill your feature groups on a nightly basis or run a model training pipeline every week, the Hopsworks scheduler will help you automate these tasks. +Each job can be configured to have a single schedule. +For more advanced use cases, Hopsworks integrates with any DAG manager and directly with the open-source [Apache Airflow](https://airflow.apache.org/use-cases/), check out our [Airflow Guide](../airflow/airflow.md). -Schedules can be defined using the drop down menus in the UI or a Quartz [cron](https://en.wikipedia.org/wiki/Cron) expression. +Schedules can be defined using the drop down menus in the UI or a Quartz [cron](https://en.wikipedia.org/wiki/Cron) expression. !!! note "Schedule frequency" - The Hopsworks scheduler runs every minute. 
As such, the scheduling frequency should be of at least 1 minute. + The Hopsworks scheduler runs every minute. + As such, the scheduling frequency should be of at least 1 minute. !!! note "Parallel executions" - If a job execution needs to be scheduled, the scheduler will first check that there are no active executions for that job. If there is an execution running, the scheduler will postpone the execution until the running one is done. + If a job execution needs to be scheduled, the scheduler will first check that there are no active executions for that job. + If there is an execution running, the scheduler will postpone the execution until the running one is done. -## UI +## UI ### Scheduling Jobs -You can define a schedule for a job during the creation of the job itself or after the job has been created from the job overview UI. +You can define a schedule for a job during the creation of the job itself or after the job has been created from the job overview UI.

@@ -30,10 +35,13 @@ You can define a schedule for a job during the creation of the job itself or aft

The *add schedule* prompt requires you to select a frequency either through the drop down menus or by using a cron expression. -You can also provide a start time to specify from when the schedule should have effect. -The start time can also be in the past. If that's the case, the scheduler will backfill the executions from the specified start time. As mentioned above, the execution backfilling will happen one execution at the time. +You can also provide a start time to specify from when the schedule should have effect. +The start time can also be in the past. +If that's the case, the scheduler will backfill the executions from the specified start time. +As mentioned above, the execution backfilling will happen one execution at a time. -You can optionally provide an end date time to specify until when the scheduling should continue. The end time can also be in the past. +You can optionally provide an end date and time to specify until when the scheduling should continue. +The end time can also be in the past. In the job overview, you can see the current scheduling configuration, whether or not it is enabled and when the next execution is planned for. @@ -46,19 +54,25 @@ All times will be considered as UTC time.

-#### Job argument +#### Job argument -When a job execution is triggered by the scheduler, a `-start_time` argument is added to the job arguments. The `-start_time` value will be the time of the scheduled execution in UTC in the ISO-8601 format (e.g.: `-start_time 2023-08-19T18:00:00Z`). +When a job execution is triggered by the scheduler, a `-start_time` argument is added to the job arguments. +The `-start_time` value will be the time of the scheduled execution in UTC in the ISO-8601 format (e.g.: `-start_time 2023-08-19T18:00:00Z`). -The `-start_time` value passed as argument represents the time when the execution was scheduled, not when the execution was started. For example, if the scheduled execution time was in the past (e.g. in the case of backfilling), the `-start_time` passed to the execution is the time in the past, not the current time when the execution is running. -Similarly, if the scheduler was not running for a period of time, when it comes back online, it will start the executions it missed to schedule while offline. Even in that case, the `-start_time` value will contain the time at which the execution was supposed to be started, not the current time. +The `-start_time` value passed as argument represents the time when the execution was scheduled, not when the execution was started. +For example, if the scheduled execution time was in the past (e.g., in the case of backfilling), the `-start_time` passed to the execution is the time in the past, not the current time when the execution is running. +Similarly, if the scheduler was not running for a period of time, when it comes back online, it will start the executions it missed to schedule while offline. +Even in that case, the `-start_time` value will contain the time at which the execution was supposed to be started, not the current time. ### Disable / Enable a schedule -You can decide to pause the scheduling of a job and avoid new executions to be started. You can later on re-enable the same scheduling configuration, and the scheduler will run the executions that were skipped while the schedule was disabled, if any, sequentially. In this way you will backfill the executions in between. +You can decide to pause the scheduling of a job and avoid new executions to be started. +You can later on re-enable the same scheduling configuration, and the scheduler will run the executions that were skipped while the schedule was disabled, if any, sequentially. +In this way you will backfill the executions in between. -You can skip the backfilling of the executions by editing the scheduling configuration and bringing forward the schedule start time for the job. +You can skip the backfilling of the executions by editing the scheduling configuration and bringing forward the schedule start time for the job. -### Delete a scheduling +### Delete a scheduling -You can remove the schedule for a job using the UI and by clicking on the trash icon on the schedule section of the job overview. If you re-schedule a job after having deleted the previous schedule, even with the same options, it will not take into account previous scheduled executions. \ No newline at end of file +You can remove the schedule for a job using the UI and by clicking on the trash icon on the schedule section of the job overview. +If you re-schedule a job after having deleted the previous schedule, even with the same options, it will not take into account previous scheduled executions. 
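For illustration, the following minimal sketch shows one way a scheduled job script could pick up the `-start_time` argument described above. The argument name and the ISO-8601 value come from this guide; the use of `argparse` and the handling of the trailing `Z` are assumptions about how the script chooses to parse it.

```python
import argparse
from datetime import datetime, timezone

# The scheduler appends e.g. "-start_time 2023-08-19T18:00:00Z" to the job arguments.
parser = argparse.ArgumentParser()
parser.add_argument("-start_time", dest="start_time")
args, _ = parser.parse_known_args()

if args.start_time:
    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 onwards,
    # so normalize it to an explicit UTC offset first.
    scheduled_at = datetime.fromisoformat(args.start_time.replace("Z", "+00:00"))
    print(f"This execution was scheduled for {scheduled_at.astimezone(timezone.utc).isoformat()}")
```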
diff --git a/docs/user_guides/projects/jobs/spark_job.md b/docs/user_guides/projects/jobs/spark_job.md index 592756a84..a3e744460 100644 --- a/docs/user_guides/projects/jobs/spark_job.md +++ b/docs/user_guides/projects/jobs/spark_job.md @@ -15,11 +15,12 @@ All members of a project in Hopsworks can launch the following types of applicat - Ray Launching a job of any type is very similar process, what mostly differs between job types is -the various configuration parameters each job type comes with. Hopsworks support scheduling to run jobs on a regular basis, -e.g backfilling a Feature Group by running your feature engineering pipeline nightly. Scheduling can be done both through the UI and the python API, +the various configuration parameters each job type comes with. +Hopsworks support scheduling to run jobs on a regular basis, +e.g backfilling a Feature Group by running your feature engineering pipeline nightly. +Scheduling can be done both through the UI and the python API, checkout [our Scheduling guide](schedule_job.md). - ## UI ### Step 1: Jobs overview @@ -46,11 +47,15 @@ Click `New Job` and the following dialog will appear. ### Step 3: Set the job type -By default, the dialog will create a Spark job. Make sure `SPARK` is chosen. +By default, the dialog will create a Spark job. +Make sure `SPARK` is chosen. ### Step 4: Set the jar -Next step is to select the program to run. You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. After that set the name for the job. By default, the job name is the same as the file name, but you can customize it here. +Next step is to select the program to run. +You can either select `From project`, if the file was previously uploaded to Hopsworks, or `Upload new file` which lets you select a file from your local filesystem as demonstrated below. +After that set the name for the job. +By default, the job name is the same as the file name, but you can customize it here.

@@ -61,7 +66,8 @@ Next step is to select the program to run. You can either select `From project`, ### Step 5: Set the main class -Next step is to set the main class for the application. Then specify [advanced configuration](#step-6-optional-advanced-configuration) or click `Create New Job` to create the job. +Next step is to set the main class for the application. +Then specify [advanced configuration](#step-7-optional-advanced-configuration) or click `Create New Job` to create the job.

@@ -88,18 +94,17 @@ Remember to handle the arguments inside your Spark script. Resource allocation for the Spark driver and executors can be configured, also the number of executors and whether dynamic execution should be enabled. -* `Environment`: The environment to use, must be based on `spark-feature-pipeline` - -* `Driver memory`: Number of cores to allocate for the Spark driver +- `Environment`: The environment to use, must be based on `spark-feature-pipeline` -* `Driver virtual cores`: Number of MBs to allocate for the Spark driver +- `Driver memory`: Number of MBs to allocate for the Spark driver -* `Executor memory`: Number of cores to allocate for each Spark executor +- `Driver virtual cores`: Number of cores to allocate for the Spark driver -* `Executor virtual cores`: Number of MBs to allocate for each Spark executor +- `Executor memory`: Number of MBs to allocate for each Spark executor -* `Dynamic/Static`: Run the Spark application in static or dynamic allocation mode (see [spark docs](https://spark.apache.org/docs/latest/configuration.html#dynamic-allocation) for details). +- `Executor virtual cores`: Number of cores to allocate for each Spark executor +- `Dynamic/Static`: Run the Spark application in static or dynamic allocation mode (see [spark docs](https://spark.apache.org/docs/latest/configuration.html#dynamic-allocation) for details).

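For readers using the Python API instead of the UI, the resource fields above correspond to keys of the job configuration dictionary listed in the Configuration table further down this page. A minimal sketch is shown below; the `hopsworks.login()` and `get_jobs_api()` calls mirror the Python examples later in this guide, and the concrete values are placeholders.

```python
import hopsworks

project = hopsworks.login()
jobs_api = project.get_jobs_api()

conf = jobs_api.get_configuration("SPARK")
conf["environmentName"] = "spark-feature-pipeline"  # Environment
conf["spark.driver.memory"] = 2048                  # Driver memory (MB)
conf["spark.driver.cores"] = 1.0                    # Driver virtual cores
conf["spark.executor.memory"] = 4096                # Executor memory (MB)
conf["spark.executor.cores"] = 1.0                  # Executor virtual cores
conf["spark.dynamicAllocation.enabled"] = True      # Dynamic/Static allocation mode
conf["spark.dynamicAllocation.minExecutors"] = 1
conf["spark.dynamicAllocation.maxExecutors"] = 2
```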
@@ -110,13 +115,13 @@ Resource allocation for the Spark driver and executors can be configured, also t Additional files or dependencies required for the Spark job can be configured. -* `Additional archives`: List of archives to be extracted into the working directory of each executor. +- `Additional archives`: List of archives to be extracted into the working directory of each executor. -* `Additional jars`: List of jars to be placed in the working directory of each executor. +- `Additional jars`: List of jars to be placed in the working directory of each executor. -* `Additional python dependencies`: List of python files and archives to be placed on each executor and added to PATH. +- `Additional python dependencies`: List of python files and archives to be placed on each executor and added to PATH. -* `Additional files`: List of files to be placed in the working directory of each executor. +- `Additional files`: List of files to be placed in the working directory of each executor.

@@ -125,7 +130,8 @@ Additional files or dependencies required for the Spark job can be configured.

-Line-separates [properties](https://spark.apache.org/docs/3.1.1/configuration.html) to be set for the Spark application. For example, changing the configuration variables for the Kryo Serializer or setting environment variables for the driver, you can set the properties as shown below. +Line-separated [properties](https://spark.apache.org/docs/3.1.1/configuration.html) to be set for the Spark application. +For example, to change the configuration variables for the Kryo Serializer or to set environment variables for the driver, you can set the properties as shown below.

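To make the expected format concrete, here is a small illustrative sketch. The Spark options shown are just standard examples rather than required values, and assembling the string in Python is only one convenient way to produce the line-separated `name=value` text that the UI field (or the `properties` key of the job configuration described below) expects.

```python
# Line-separated `name=value` Spark properties, e.g. Kryo serializer settings
# plus an example environment variable for the executors.
properties = "\n".join([
    "spark.serializer=org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max=128m",
    "spark.executorEnv.MY_ENV_VAR=my_value",
])
print(properties)
```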
@@ -136,13 +142,13 @@ Line-separates [properties](https://spark.apache.org/docs/3.1.1/configuration.ht ### Step 8: (Kueue enabled) Select a Queue -Currently we do not have Kueue support for Spark. You do not need to select a queue to run the job in. +Currently we do not have Kueue support for Spark. +You do not need to select a queue to run the job in. ### Step 9: Execute the job Now click the `Run` button to start the execution of the job, and then click on `Executions` to see the list of all executions. -

Start job execution @@ -167,7 +173,7 @@ Once the execution is finished, you can click on `Logs` to see the full logs for ### Step 1: Upload the Spark jar -This snippet assumes the Spark program is in the current working directory and named `sparkpi.jar`. +This snippet assumes the Spark program is in the current working directory and named `sparkpi.jar`. It will upload the jar to the `Resources` dataset in your project. @@ -183,7 +189,6 @@ uploaded_file_path = dataset_api.upload("sparkpi.jar", "Resources") ``` - ### Step 2: Create Spark job In this snippet we get the `JobsApi` object to get the default job configuration for a `SPARK` job, set the python script to run and create the `Job` object. @@ -220,39 +225,41 @@ print(f_err.read()) ``` ## Configuration + The following table describes the job configuration parameters for a SPARK job. `conf = jobs_api.get_configuration("SPARK")` -| Field | Type | Description | Default | -|----------------------------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------| -| `conf['type']` | string | Type of the job configuration | `"sparkJobConfiguration"` | -| `conf['appPath']` | string | Project path to spark program (e.g., `Resources/foo.jar`) | `null` | -| `conf['mainClass']` | string | Name of the main class to run (e.g., `org.company.Main`) | `null` | -| `conf['defaultArgs']` | string | Arguments to pass to the program. Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | -| `conf['environmentName']` | string | Name of the project spark environment to use | `"spark-feature-pipeline"` | -| `conf['spark.driver.cores']` | float | Number of CPU cores allocated for the driver | `1.0` | -| `conf['spark.driver.memory']` | int | Memory allocated for the driver (in MB) | `2048` | -| `conf['spark.executor.instances']` | int | Number of executor instances | `1` | -| `conf['spark.executor.cores']` | float | Number of CPU cores per executor | `1.0` | -| `conf['spark.executor.memory']` | int | Memory allocated per executor (in MB) | `4096` | -| `conf['spark.dynamicAllocation.enabled']` | boolean | Enable dynamic allocation of executors | `true` | -| `conf['spark.dynamicAllocation.minExecutors']` | int | Minimum number of executors with dynamic allocation | `1` | -| `conf['spark.dynamicAllocation.maxExecutors']` | int | Maximum number of executors with dynamic allocation | `2` | -| `conf['spark.dynamicAllocation.initialExecutors']` | int | Initial number of executors with dynamic allocation | `1` | -| `conf['spark.blacklist.enabled']` | boolean | Whether executor/node blacklisting is enabled | `false` | -| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | -| `conf['pyFiles']` | string | Comma-separated string of HDFS path(s) to Python modules to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | -| `conf['jars']` | string | Comma-separated string of HDFS path(s) to jars to be included in CLASSPATH. Example: `hdfs:///Project//Resources/app.jar,...` | `null` | -| `conf['archives']` | string | Comma-separated string of HDFS path(s) to archives to be made available to the application. 
Example: `hdfs:///Project//Resources/archive.zip,...` | `null` | -| `conf['properties']` | string | A new line separated (`\n`) list of properties to pass to the Spark application. The properties should be in the format `name=value` | `null` | +| Field | Type | Description | Default | +| --- | --- | --- | --- | +| `conf['type']` | string | Type of the job configuration | `"sparkJobConfiguration"` | +| `conf['appPath']` | string | Project path to spark program (e.g., `Resources/foo.jar`) | `null` | +| `conf['mainClass']` | string | Name of the main class to run (e.g., `org.company.Main`) | `null` | +| `conf['defaultArgs']` | string | Arguments to pass to the program. Will be overridden if arguments are passed explicitly via `Job.run(args="...")` | `null` | +| `conf['environmentName']` | string | Name of the project spark environment to use | `"spark-feature-pipeline"` | +| `conf['spark.driver.cores']` | float | Number of CPU cores allocated for the driver | `1.0` | +| `conf['spark.driver.memory']` | int | Memory allocated for the driver (in MB) | `2048` | +| `conf['spark.executor.instances']` | int | Number of executor instances | `1` | +| `conf['spark.executor.cores']` | float | Number of CPU cores per executor | `1.0` | +| `conf['spark.executor.memory']` | int | Memory allocated per executor (in MB) | `4096` | +| `conf['spark.dynamicAllocation.enabled']` | boolean | Enable dynamic allocation of executors | `true` | +| `conf['spark.dynamicAllocation.minExecutors']` | int | Minimum number of executors with dynamic allocation | `1` | +| `conf['spark.dynamicAllocation.maxExecutors']` | int | Maximum number of executors with dynamic allocation | `2` | +| `conf['spark.dynamicAllocation.initialExecutors']` | int | Initial number of executors with dynamic allocation | `1` | +| `conf['spark.blacklist.enabled']` | boolean | Whether executor/node blacklisting is enabled | `false` | +| `conf['files']` | string | Comma-separated string of HDFS path(s) to files to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | +| `conf['pyFiles']` | string | Comma-separated string of HDFS path(s) to Python modules to be made available to the application. Example: `hdfs:///Project//Resources/file1.py,...` | `null` | +| `conf['jars']` | string | Comma-separated string of HDFS path(s) to jars to be included in CLASSPATH. Example: `hdfs:///Project//Resources/app.jar,...` | `null` | +| `conf['archives']` | string | Comma-separated string of HDFS path(s) to archives to be made available to the application. Example: `hdfs:///Project//Resources/archive.zip,...` | `null` | +| `conf['properties']` | string | A new line separated (`\n`) list of properties to pass to the Spark application. The properties should be in the format `name=value` | `null` | ## Accessing project data ### Read directly from the filesystem (recommended) -To read a dataset in your project using Spark, use the full filesystem path where the data is stored. For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: +To read a dataset in your project using Spark, use the full filesystem path where the data is stored. 
+For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: ```java Dataset df = spark.read() @@ -265,12 +272,13 @@ df.show(); ### Additional files -Different file types can be attached to the spark job and made available in the `/srv/hops/artifacts` folder when the Spark job is started. This configuration is mainly useful when you need to add additional configuration such as jars that needs to be added to the CLASSPATH. +Different file types can be attached to the spark job and made available in the `/srv/hops/artifacts` folder when the Spark job is started. +This configuration is mainly useful when you need to add additional configuration such as jars that needs to be added to the CLASSPATH. When reading data in your Spark job it is recommended to use the Spark read API as previously demonstrated, since this reads from the filesystem directly, whereas `Additional files` configuration options will download the files in its entirety and is not a scalable option. ## API Reference -[Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) +[`Job`][hopsworks_common.job.Job] -[Executions](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/executions/) +[`Execution`][hopsworks_common.execution.Execution] diff --git a/docs/user_guides/projects/jupyter/python_notebook.md b/docs/user_guides/projects/jupyter/python_notebook.md index 47ff6e960..48c6ab7e0 100644 --- a/docs/user_guides/projects/jupyter/python_notebook.md +++ b/docs/user_guides/projects/jupyter/python_notebook.md @@ -1,11 +1,11 @@ # How To Run A Python Notebook -### Introduction +## Introduction Jupyter is provided as a service in Hopsworks, providing the same user experience and features as if run on your laptop. -* Supports JupyterLab and the classic Jupyter front-end -* Configured with Python3, PySpark and Ray kernels +- Supports JupyterLab and the classic Jupyter front-end +- Configured with Python3, PySpark and Ray kernels ## Step 1: Jupyter dashboard @@ -24,9 +24,9 @@ From this page, you can configure various options and settings to start Jupyter Next step is to configure Jupyter, Click `edit configuration` to get to the configuration page and select `Python`. -* `Container cores`: Number of cores to allocate for the Jupyter instance +- `Container cores`: Number of cores to allocate for the Jupyter instance -* `Container memory`: Number of MBs to allocate for the Jupyter instance +- `Container memory`: Number of MBs to allocate for the Jupyter instance !!! notice "Configured resource pool is shared by all running kernels. If a kernel crashes while executing a cell, try increasing the Container memory." @@ -43,7 +43,8 @@ Click `Save` to save the new configuration. Before starting the server there are three additional configurations that can be set next to the `Run Jupyter` button. -The environment that Jupyter should run in needs to be configured. Select the environment that contains the necessary dependencies for your code. +The environment that Jupyter should run in needs to be configured. +Select the environment that contains the necessary dependencies for your code.

@@ -52,7 +53,8 @@ The environment that Jupyter should run in needs to be configured. Select the en

-The runtime of the Jupyter instance can be configured, this is useful to ensure that idle instances will not be hanging around and keep allocating resources. If a limited runtime is not desirable, this can be disabled by setting `no limit`. +The runtime of the Jupyter instance can be configured; this is useful to ensure that idle instances will not hang around and keep allocating resources. +If a limited runtime is not desirable, this can be disabled by setting `no limit`.

@@ -61,7 +63,8 @@ The runtime of the Jupyter instance can be configured, this is useful to ensure

-The root path from which to start the Jupyter instance can be configured. By default it starts by setting the `/Jupyter` folder as the root. +The root path from which to start the Jupyter instance can be configured. +By default it starts by setting the `/Jupyter` folder as the root.

@@ -72,11 +75,11 @@ The root path from which to start the Jupyter instance can be configured. By def ## Step 4: (Kueue enabled) Select a Queue -If the cluster is installed with Kueue enabled, you will need to select a queue in which the notebook should run. This can be done from `Advance configuration -> Scheduler section`. +If the cluster is installed with Kueue enabled, you will need to select a queue in which the notebook should run. +This can be done from `Advance configuration -> Scheduler section`. ![Default queue for job](../../../assets/images/guides/project/scheduler/job_queue.png) - ## Step 5: Start Jupyter Start the Jupyter instance by clicking the `Run Jupyter` button. @@ -89,16 +92,20 @@ Start the Jupyter instance by clicking the `Run Jupyter` button.

## Accessing project data + !!! notice "Recommended approach if `/hopsfs` is mounted" If your Hopsworks installation is configured to mount the project datasets under `/hopsfs`, which it is in most cases, then please refer to this section. - If the file system is not mounted, then project files can be localized using the [download api](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/datasets/#download) to localize files in the current working directory. + If the file system is not mounted, then project files can be localized using the [download api][hopsworks_common.core.dataset_api.DatasetApi.download] to localize files in the current working directory. ### Absolute paths + The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your notebook. ### Relative paths -The notebook's working directory is the folder it is located in. For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. +The notebook's working directory is the folder it is located in. +For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. +Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. ## Going Further diff --git a/docs/user_guides/projects/jupyter/ray_notebook.md b/docs/user_guides/projects/jupyter/ray_notebook.md index 9e38dd6ae..a14a4cce1 100644 --- a/docs/user_guides/projects/jupyter/ray_notebook.md +++ b/docs/user_guides/projects/jupyter/ray_notebook.md @@ -3,17 +3,17 @@ description: Documentation on how to run Ray applications on Jupyter on Hopswork --- # How To Run A Ray Notebook -### Introduction +## Introduction Jupyter is provided as a service in Hopsworks, providing the same user experience and features as if run on your laptop. -* Supports JupyterLab and the classic Jupyter front-end -* Configured with Python3, PySpark and Ray kernels +- Supports JupyterLab and the classic Jupyter front-end +- Configured with Python3, PySpark and Ray kernels !!!warning "Enable Ray" Support for Ray needs to be explicitly enabled by adding the following option in the `values.yaml` file for the deployment: - + ```yaml global: ray: @@ -35,32 +35,32 @@ From this page, you can configure various options and settings to start Jupyter ## Step 2 (Optional): Configure Ray -Next step is to configure the Ray cluster configuration that will be created when you start Ray session later in -Jupyter. Click `edit configuration` to get to the configuration page and select `Ray`. +Next step is to configure the Ray cluster configuration that will be created when you start Ray session later in Jupyter. +Click `edit configuration` to get to the configuration page and select `Ray`. ### Resource and compute Resource allocation for the Driver and Workers can be configured. !!! notice "Using the resources in the Ray script" - The resource configurations describe the cluster that will be provisioned when launching the Ray job. User can still - provide extra configurations in the job script using `ScalingConfig`, i.e. `ScalingConfig(num_workers=4, trainer_resources={"CPU": 1}, use_gpu=True)`. + The resource configurations describe the cluster that will be provisioned when launching the Ray job. 
+ User can still provide extra configurations in the job script using `ScalingConfig`, i.e. `ScalingConfig(num_workers=4, trainer_resources={"CPU": 1}, use_gpu=True)`. -* `Driver memory`: Memory in MBs to allocate for Driver +- `Driver memory`: Memory in MBs to allocate for Driver -* `Driver virtual cores`: Number of cores to allocate for the Driver +- `Driver virtual cores`: Number of cores to allocate for the Driver -* `Worker memory`: Memory in MBs to allocate for each worker +- `Worker memory`: Memory in MBs to allocate for each worker -* `Worker cores`: Number of cores to allocate for each worker +- `Worker cores`: Number of cores to allocate for each worker -* `Min workers`: Minimum number of workers to start with +- `Min workers`: Minimum number of workers to start with -* `Max workers`: Maximum number of workers to scale up to +- `Max workers`: Maximum number of workers to scale up to

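For reference, the `ScalingConfig` mentioned in the note above comes from Ray's training API. A minimal sketch is shown below; the import path is an assumption and may differ between Ray versions.

```python
from ray.train import ScalingConfig  # older Ray releases expose this under ray.air.config

# Request four training workers with GPUs, plus one CPU reserved for the trainer itself,
# on top of the cluster resources configured for the Ray kernel above.
scaling_config = ScalingConfig(
    num_workers=4,
    trainer_resources={"CPU": 1},
    use_gpu=True,
)
```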
- Resource configuration for 
+    <img src=
Resource configuration for the Ray kernels
@@ -68,27 +68,30 @@ the Ray kernels"> Runtime environment and Additional files required for the Ray job can also be provided. -* `Runtime Environment (Optional)`: A [runtime environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments) describes the dependencies required for the Ray job including files, packages, environment variables, and more. This is useful when you need to install specific packages and set environment variables for this particular Ray job. It should be provided as a YAML file. You can select the file from the project or upload a new one. - +- `Runtime Environment (Optional)`: A [runtime environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments) describes the dependencies required for the Ray job including files, packages, environment variables, and more. + This is useful when you need to install specific packages and set environment variables for this particular Ray job. + It should be provided as a YAML file. + You can select the file from the project or upload a new one. -* `Additional files`: List of other files required for the Ray job. These files will be placed in `/srv/hops/ray/job`. +- `Additional files`: List of other files required for the Ray job. + These files will be placed in `/srv/hops/ray/job`.

- Runtime 
+    <img src=
Runtime configuration and additional files for Ray jupyter session

- Click `Save` to save the new configuration. ## Step 3 (Optional): Configure max runtime and root path Before starting the server there are two additional configurations that can be set next to the `Run Jupyter` button. -The runtime of the Jupyter instance can be configured, this is useful to ensure that idle instances will not be hanging around and keep allocating resources. If a limited runtime is not desirable, this can be disabled by setting `no limit`. +The runtime of the Jupyter instance can be configured; this is useful to ensure that idle instances will not hang around and keep allocating resources. +If a limited runtime is not desirable, this can be disabled by setting `no limit`.

@@ -97,7 +100,8 @@ The runtime of the Jupyter instance can be configured, this is useful to ensure

-The root path from which to start the Jupyter instance can be configured. By default it starts by setting the `/Jupyter` folder as the root. +The root path from which to start the Jupyter instance can be configured. +By default it starts by setting the `/Jupyter` folder as the root.

@@ -107,13 +111,16 @@ The root path from which to start the Jupyter instance can be configured. By def

## Step 4: Select the environment -Hopsworks provides a variety of environments to run Jupyter notebooks. Select the environment you want to use by clicking on the dropdown menu. -In order to be able to run a Ray notebook, you need to select the environment that has the Ray kernel installed. + +Hopsworks provides a variety of environments to run Jupyter notebooks. +Select the environment you want to use by clicking on the dropdown menu. +In order to be able to run a Ray notebook, you need to select the environment that has the Ray kernel installed. Environment with Ray kernel have a `Ray Enabled` label next to them. ## Step 5: (Kueue enabled) Select a Queue -If the cluster is installed with Kueue enabled, you will need to select a queue in which the notebook should run. This can be done from `Advance configuration -> Scheduler section`. +If the cluster is installed with Kueue enabled, you will need to select a queue in which the notebook should run. +This can be done from `Advance configuration -> Scheduler section`. ![Default queue for job](../../../assets/images/guides/project/scheduler/job_queue.png) @@ -122,8 +129,11 @@ If the cluster is installed with Kueue enabled, you will need to select a queue Start the Jupyter instance by clicking the `Run Jupyter` button. ## Running Ray code in Jupyter -Once the Jupyter instance is started, you can create a new notebook by clicking on the `New` button and selecting -`Ray` kernel. You can now write and run Ray code in the notebook. When you first run a cell with Ray code, a Ray session will be started and you can monitor the resources used by the job in the Ray dashboard. + +Once the Jupyter instance is started, you can create a new notebook by clicking on the `New` button and selecting +`Ray` kernel. +You can now write and run Ray code in the notebook. +When you first run a cell with Ray code, a Ray session will be started and you can monitor the resources used by the job in the Ray dashboard.

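As a minimal, generic illustration (not specific to Hopsworks) of what such a cell might contain, consider the sketch below; running it in a notebook with the Ray kernel is what starts the Ray session described above.

```python
import ray

# Define a trivial remote task; the first remote call starts (or connects to) the Ray session.
@ray.remote
def square(x):
    return x * x

futures = [square.remote(i) for i in range(4)]
print(ray.get(futures))  # [0, 1, 4, 9]
```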
@@ -135,9 +145,11 @@ Once the Jupyter instance is started, you can create a new notebook by clicking ## Step 7: Access Ray Dashboard When you start a Ray session in Jupyter, a new application will appear in the Jupyter page. -The notebook name from which the session was started is displayed. You can access the Ray UI by clicking on the `Ray Dashboard` and a new -tab will be opened. The Ray dashboard is only available while the Ray kernel is running. -You can kill the Ray session to free up resources by shutting down the kernel in Jupyter. +The notebook name from which the session was started is displayed. +You can access the Ray UI by clicking on the `Ray Dashboard` and a new +tab will be opened. +The Ray dashboard is only available while the Ray kernel is running. +You can kill the Ray session to free up resources by shutting down the kernel in Jupyter. In the Ray Dashboard, you can monitor the resources used by code you are running, the number of workers, logs, and the tasks that are running.

diff --git a/docs/user_guides/projects/jupyter/remote_filesystem_driver.md b/docs/user_guides/projects/jupyter/remote_filesystem_driver.md index 3d55a24dc..430fb39a3 100644 --- a/docs/user_guides/projects/jupyter/remote_filesystem_driver.md +++ b/docs/user_guides/projects/jupyter/remote_filesystem_driver.md @@ -1,14 +1,16 @@ # Configuring remote filesystem driver -### Introduction +## Introduction We provide two ways to access and persist files in HopsFS from a jupyter notebook: -* `hdfscontentsmanager`: With `hdfscontentsmanager` you interact with the project datasets using the dataset api. When you - start a notebook using the `hdfscontentsmanager` you will only see the files in the configured root path. -* `hopsfsmount`: With `hopsfsmount` all the project datasets are available in the jupyter notebook as a local filesystem. +- `hdfscontentsmanager`: With `hdfscontentsmanager` you interact with the project datasets using the dataset api. + When you start a notebook using the `hdfscontentsmanager` you will only see the files in the configured root path. +- `hopsfsmount`: With `hopsfsmount` all the project datasets are available in the jupyter notebook as a local filesystem. This means you can use native Python file I/O operations (copy, move, create, open, etc.) to interact with the project datasets. When you open the jupyter notebook you will see all the project datasets. ## Configuring the driver -To configure the driver you need to have admin role and set the `jupyter_remote_fs_driver` to either `hdfscontentsmanager` or `hopsfsmount`. The default driver is `hdfscontentsmanager`. + +To configure the driver you need to have admin role and set the `jupyter_remote_fs_driver` to either `hdfscontentsmanager` or `hopsfsmount`. +The default driver is `hdfscontentsmanager`. diff --git a/docs/user_guides/projects/jupyter/spark_notebook.md b/docs/user_guides/projects/jupyter/spark_notebook.md index 95ee6cdf3..17eeb2167 100644 --- a/docs/user_guides/projects/jupyter/spark_notebook.md +++ b/docs/user_guides/projects/jupyter/spark_notebook.md @@ -1,12 +1,11 @@ # How To Run A PySpark Notebook -### Introduction +## Introduction Jupyter is provided as a service in Hopsworks, providing the same user experience and features as if run on your laptop. -* Supports JupyterLab and the classic Jupyter front-end -* Configured with Python3, PySpark and Ray kernels - +- Supports JupyterLab and the classic Jupyter front-end +- Configured with Python3, PySpark and Ray kernels ## Step 1: Jupyter dashboard @@ -34,23 +33,22 @@ You can easily refer to the green ticks as to what kernels are available in whic ## Step 3 (Optional): Configure spark properties -Next step is to configure the Ray properties to be used in Jupyter, Click `edit configuration` to get to the +Next step is to configure the Ray properties to be used in Jupyter, Click `edit configuration` to get to the configuration page and select `Ray`. ### Resource and compute Resource allocation for the Spark driver and executors can be configured, also the number of executors and whether dynamic execution should be enabled. 
-* `Driver memory`: Number of cores to allocate for the Spark driver - -* `Driver virtual cores`: Number of MBs to allocate for the Spark driver +- `Driver memory`: Number of MBs to allocate for the Spark driver -* `Executor memory`: Number of cores to allocate for each Spark executor +- `Driver virtual cores`: Number of cores to allocate for the Spark driver -* `Executor virtual cores`: Number of MBs to allocate for each Spark executor +- `Executor memory`: Number of MBs to allocate for each Spark executor -* `Dynamic/Static`: Run the Spark application in static or dynamic allocation mode (see [spark docs](https://spark.apache.org/docs/latest/configuration.html#dynamic-allocation) for details). +- `Executor virtual cores`: Number of cores to allocate for each Spark executor +- `Dynamic/Static`: Run the Spark application in static or dynamic allocation mode (see [spark docs](https://spark.apache.org/docs/latest/configuration.html#dynamic-allocation) for details).

@@ -63,13 +61,13 @@ Resource allocation for the Spark driver and executors can be configured, also t Additional files or dependencies required for the Spark job can be configured. -* `Additional archives`: List of zip or .tgz files that will be locally accessible by the application +- `Additional archives`: List of zip or .tgz files that will be locally accessible by the application -* `Additional jars`: List of .jar files to add to the CLASSPATH of the application +- `Additional jars`: List of .jar files to add to the CLASSPATH of the application -* `Additional python dependencies`: List of .py, .zip or .egg files that will be locally accessible by the application +- `Additional python dependencies`: List of .py, .zip or .egg files that will be locally accessible by the application -* `Additional files`: List of files that will be locally accessible by the application +- `Additional files`: List of files that will be locally accessible by the application

@@ -78,7 +76,8 @@ Additional files or dependencies required for the Spark job can be configured.

-Line-separates [properties](https://spark.apache.org/docs/3.1.1/configuration.html) to be set for the Spark application. For example, changing the configuration variables for the Kryo Serializer or setting environment variables for the driver, you can set the properties as shown below. +Line-separated [properties](https://spark.apache.org/docs/3.1.1/configuration.html) to be set for the Spark application. +For example, to change the configuration variables for the Kryo Serializer or to set environment variables for the driver, you can set the properties as shown below.

@@ -87,14 +86,14 @@ Line-separates [properties](https://spark.apache.org/docs/3.1.1/configuration.ht

- Click `Save` to save the new configuration. ## Step 4 (Optional): Configure root folder and automatic shutdown Before starting the server there are two additional configurations that can be set next to the `Run Jupyter` button. -The runtime of the Jupyter instance can be configured, this is useful to ensure that idle instances will not be hanging around and keep allocating resources. If a limited runtime is not desirable, this can be disabled by setting `no limit`. +The runtime of the Jupyter instance can be configured; this is useful to ensure that idle instances will not hang around and keep allocating resources. +If a limited runtime is not desirable, this can be disabled by setting `no limit`.

@@ -103,7 +102,8 @@ The runtime of the Jupyter instance can be configured, this is useful to ensure

-The root path from which to start the Jupyter instance can be configured. By default it starts by setting the `/Jupyter` folder as the root. +The root path from which to start the Jupyter instance can be configured. +By default it starts by setting the `/Jupyter` folder as the root.

@@ -112,10 +112,10 @@ The root path from which to start the Jupyter instance can be configured. By def

- ## Step 5: (Kueue enabled) Select a Queue -Currently we do not have Kueue support for Spark. You do not need to select a queue to run the notebook in. +Currently we do not have Kueue support for Spark. +You do not need to select a queue to run the notebook in. ## Step 5: Start Jupyter @@ -143,7 +143,8 @@ Navigate back to Hopsworks and a Spark session will have appeared, click on the ### Read directly from the filesystem (recommended) -To read a dataset in your project using Spark, use the full filesystem path where the data is stored. For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: +To read a dataset in your project using Spark, use the full filesystem path where the data is stored. +For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: ```python df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True) @@ -152,7 +153,8 @@ df.show() ### Additional files -Different files can be attached to the jupyter session and made available in the `/srv/hops/artifacts` folder when the PySpark kernel is started. This configuration is mainly useful when you need to add additional configuration such as jars that needs to be added to the CLASSPATH. +Different files can be attached to the jupyter session and made available in the `/srv/hops/artifacts` folder when the PySpark kernel is started. +This configuration is mainly useful when you need to add additional configuration such as jars that needs to be added to the CLASSPATH. When reading data in your Spark application, it is recommended to use the Spark read API as previously demonstrated, since this reads from the filesystem directly, whereas `Additional files` configuration options will download the files in its entirety and is not a scalable option. diff --git a/docs/user_guides/projects/kafka/consume_messages.md b/docs/user_guides/projects/kafka/consume_messages.md index 3ae01aa76..6874d903a 100644 --- a/docs/user_guides/projects/kafka/consume_messages.md +++ b/docs/user_guides/projects/kafka/consume_messages.md @@ -2,7 +2,8 @@ ## Introduction -A Consumer is a process which reads messages from a Kafka topic. In Hopsworks, all user roles are capable of performing 'Read' and 'Describe' actions on Kafka topics within projects that they are a member of or are shared with them. +A Consumer is a process which reads messages from a Kafka topic. +In Hopsworks, all user roles are capable of performing 'Read' and 'Describe' actions on Kafka topics within projects that they are a member of or are shared with them. ## Prerequisites @@ -52,4 +53,4 @@ for i in range(0, 10): ### API Reference -[KafkaTopic](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/kafka_topic/) +[`KafkaTopic`][hopsworks_common.kafka_topic.KafkaTopic] diff --git a/docs/user_guides/projects/kafka/create_schema.md b/docs/user_guides/projects/kafka/create_schema.md index a3aa204a7..cebd9fe30 100644 --- a/docs/user_guides/projects/kafka/create_schema.md +++ b/docs/user_guides/projects/kafka/create_schema.md @@ -2,8 +2,6 @@ ## Introduction - - ## Code In this guide, you will learn how to create a Kafka Avro Schema in the Hopsworks Schema Registry. 
@@ -60,4 +58,4 @@ my_schema = kafka_api.create_schema(SCHEMA_NAME, schema) ### API Reference -[KafkaSchema](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/kafka_schema/) +[`KafkaSchema`][hopsworks_common.kafka_schema.KafkaSchema] diff --git a/docs/user_guides/projects/kafka/create_topic.md b/docs/user_guides/projects/kafka/create_topic.md index afa4b8411..7ab420b4b 100644 --- a/docs/user_guides/projects/kafka/create_topic.md +++ b/docs/user_guides/projects/kafka/create_topic.md @@ -2,13 +2,13 @@ ## Introduction -A Topic is a queue to which records are stored and published. Producer applications write data to topics and consumer applications read from topics. +A Topic is a queue to which records are stored and published. +Producer applications write data to topics and consumer applications read from topics. ## Prerequisites This guide requires that you have 'Data owner' role and have previously created a [Kafka Schema](create_schema.md) to be used for the topic. - ## Code In this guide, you will learn how to create a Kafka Topic. @@ -38,4 +38,4 @@ my_topic = kafka_api.create_topic(TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partit ### API Reference -[KafkaTopic](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/kafka_topic/) +[`KafkaTopic`][hopsworks_common.kafka_topic.KafkaTopic] diff --git a/docs/user_guides/projects/kafka/produce_messages.md b/docs/user_guides/projects/kafka/produce_messages.md index 602ac150c..a44a3804c 100644 --- a/docs/user_guides/projects/kafka/produce_messages.md +++ b/docs/user_guides/projects/kafka/produce_messages.md @@ -2,7 +2,8 @@ ## Introduction -A Producer is a process which produces messages to a Kafka topic. In Hopsworks, only users with the 'Data owner' role are capable of performing the 'Write' action on Kafka topics within the project that they are a member of. +A Producer is a process which produces messages to a Kafka topic. +In Hopsworks, only users with the 'Data owner' role are capable of performing the 'Write' action on Kafka topics within the project that they are a member of. ## Prerequisites @@ -54,7 +55,7 @@ producer.flush(10) ### API Reference -[KafkaTopic](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/kafka_topic/) +[`KafkaTopic`][hopsworks_common.kafka_topic.KafkaTopic] ## Going Further diff --git a/docs/user_guides/projects/opensearch/connect.md b/docs/user_guides/projects/opensearch/connect.md index 1e6f9f2ad..00d701f08 100644 --- a/docs/user_guides/projects/opensearch/connect.md +++ b/docs/user_guides/projects/opensearch/connect.md @@ -7,10 +7,9 @@ Text here !!! notice "Limited to internal Jobs and Notebooks" Currently it's only possible to configure the opensearch-py client in a job or jupyter notebook running inside the Hopsworks cluster. - ## Code -In this guide, you will learn how to connect to the OpenSearch cluster using an [opensearch-py](https://opensearch.org/docs/1.3/clients/python/) client. +In this guide, you will learn how to connect to the OpenSearch cluster using an [opensearch-py](https://opensearch.org/docs/1.3/clients/python/) client. 
### Step 1: Get the OpenSearch API @@ -36,8 +35,8 @@ client = OpenSearch(**opensearch_api.get_default_py_config()) ### API Reference -[OpenSearch](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/opensearch/) +[`OpenSearchApi`][hopsworks_common.core.opensearch_api.OpenSearchApi] ## Going Further -You can now use the client to interact directly with the OpenSearch cluster, such as [vector database](../../mlops/vector_database/index.md). +You can now use the client to interact directly with the OpenSearch cluster, such as [vector database](../../../concepts/mlops/opensearch.md). diff --git a/docs/user_guides/projects/opensearch/knn.md b/docs/user_guides/projects/opensearch/knn.md index ab8d0d2ab..a1c91db57 100644 --- a/docs/user_guides/projects/opensearch/knn.md +++ b/docs/user_guides/projects/opensearch/knn.md @@ -2,7 +2,8 @@ ## Introduction -The k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. To determine the neighbors, you can specify the space (the distance function) you want to use to measure the distance between points. +The k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. +To determine the neighbors, you can specify the space (the distance function) you want to use to measure the distance between points. Use cases include recommendations (for example, an “other songs you might like” feature in a music application), image recognition, and fraud detection. @@ -63,7 +64,8 @@ Create an index to use by calling `opensearch_api.get_project_index(..)`. ### Step 4: Bulk ingestion of vectors -Ingest 10 vectors in a bulk fashion to the index. These vectors represent the list of vectors to calculate the similarity for. +Ingest 10 vectors in a bulk fashion to the index. +These vectors represent the list of vectors to calculate the similarity for. === "Python" ```python @@ -123,6 +125,7 @@ Score the vector `[2.5, 3]` and find the 3 most similar vectors. `[4.798869166444522, 4.069064892468535]` is the most similar vector to `[2.5, 3]` with a score of `0.1346312`. === "Bash" + ```bash 2022-05-30 09:55:50,529 INFO: POST https://10.0.2.15:9200/my_project_demo_knn_index/_search [status:200 request:0.017s] @@ -155,4 +158,4 @@ Score the vector `[2.5, 3]` and find the 3 most similar vectors. [k-NN plugin](https://opensearch.org/docs/1.3/search-plugins/knn/knn-index/) -[OpenSearch](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/open_search/) +[`OpenSearchApi`][hopsworks_common.core.opensearch_api.OpenSearchApi] diff --git a/docs/user_guides/projects/project/add_members.md b/docs/user_guides/projects/project/add_members.md index 53e0c3516..98710a394 100644 --- a/docs/user_guides/projects/project/add_members.md +++ b/docs/user_guides/projects/project/add_members.md @@ -1,6 +1,6 @@ # How To Add Members To A Project -### Introduction +## Introduction In this guide, you will learn how to add new members to your project as well as the different roles within a project. @@ -17,7 +17,8 @@ On the `Project settings` page, you can find the `General` section, which lists ## Step 2: Add a new member -Next click `Add members` and a dialog where users can be invited will appear. Select the users to invite. +Next click `Add members` and a dialog where users can be invited will appear. +Select the users to invite.

@@ -33,17 +34,20 @@ Subsequently, the selected project members can be assigned to 2 different roles, Data owners hold the highest authority in the project, having full control of its contents. They are allowed to: + - Share a project - Manage the project and its members - Work with all feature store abstractions (such as Feature groups, Feature views, Data Sources, etc.) -It is worth mentioning that the project's creator (aka. `author`) is a special type of `Data owner`. He is the only user capable of deleting the project and it is impossible to change his role to `Data scientist`. +It is worth mentioning that the project's creator (aka. `author`) is a special type of `Data owner`. +He is the only user capable of deleting the project and it is impossible to change his role to `Data scientist`. ### Data scientist Data scientists can be viewed as the users of data. They are allowed to: + - Create feature views/training datasets using existing features - Manage the feature views/training datasets they have created diff --git a/docs/user_guides/projects/project/create_project.md b/docs/user_guides/projects/project/create_project.md index 7a3fa5664..6e9299437 100644 --- a/docs/user_guides/projects/project/create_project.md +++ b/docs/user_guides/projects/project/create_project.md @@ -5,13 +5,15 @@ In this guide, you will learn how to create a new project. !!! notice "Project name validation rules" - A valid project name can only contain characters a-z, A-Z, 0-9 and special characters ‘_’ and ‘.’ but not ‘__’ (double underscore). There is also a number of [reserved project names](#reserved-project-names) that can not be used. + A valid project name can only contain characters a-z, A-Z, 0-9 and special characters ‘_’ and ‘.’ but not ‘__’ (double underscore). + There is also a number of [reserved project names](#reserved-project-names) that can not be used. ## GUI ### Step 1: Create a project -If you log in to the platform and do not have any projects, you are presented with the following view. To run the Feature Store tour click `Run a demo project`, to create a new project click `Create new project`. +If you log in to the platform and do not have any projects, you are presented with the following view. +To run the Feature Store tour click `Run a demo project`, to create a new project click `Create new project`. For this guide click `Create new project` to continue. @@ -73,7 +75,7 @@ project = hopsworks.create_project("my_project") ### API Reference -[Projects](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/projects/) +[`Project`][hopsworks_common.project.Project] ## Reserved project names diff --git a/docs/user_guides/projects/python/custom_commands.md b/docs/user_guides/projects/python/custom_commands.md index a74e69753..892b5995a 100644 --- a/docs/user_guides/projects/python/custom_commands.md +++ b/docs/user_guides/projects/python/custom_commands.md @@ -1,9 +1,13 @@ # Adding extra configuration with generic bash commands ## Introduction -Hopsworks comes with several prepackaged Python environments that contain libraries for data engineering, machine learning, and more general data science use-cases. Hopsworks also offers the ability to install additional packages from various sources, such as using the pip or conda package managers and public or private git repository. -Some Python libraries require the installation of some OS-Level libraries. In some cases, you may need to add more complex configuration to your environment. 
This demands writing your own commands and executing them on top of the existing environment. +Hopsworks comes with several prepackaged Python environments that contain libraries for data engineering, machine learning, and more general data science use-cases. +Hopsworks also offers the ability to install additional packages from various sources, such as using the pip or conda package managers and public or private git repository. + +Some Python libraries require the installation of some OS-Level libraries. +In some cases, you may need to add more complex configuration to your environment. +This demands writing your own commands and executing them on top of the existing environment. In this guide, you will learn how to run custom bash commands that can be used to add more complex configuration to your environment e.g., installing OS-Level packages or configuring an oracle database. @@ -12,12 +16,20 @@ In this guide, you will learn how to run custom bash commands that can be used t In order to install a custom dependency one of the base environments must first be cloned, follow [this guide](python_env_clone.md) for that. ## Running bash commands + In this section, we will see how you can run custom bash commands in Hopsworks to configure your Python environment. -In Hopsworks, we maintain a docker image built on top of Ubuntu Linux distribution. You can run generic bash commands on top of the project environment from the UI or REST API. +In Hopsworks, we maintain a docker image built on top of Ubuntu Linux distribution. +You can run generic bash commands on top of the project environment from the UI or REST API. ### Setting up the bash script and artifacts from the UI -To use the UI, navigate to the Python environment in the Project settings. In the Python environment page, navigate to custom commands. From the UI, you can write the bash commands in the textbox provided. These bash commands will be uploaded and executed when building your new environment. You can include build artifacts e.g., binaries that you would like to execute or include when building the environment. See Figure 1. + +To use the UI, navigate to the Python environment in the Project settings. +In the Python environment page, navigate to custom commands. +From the UI, you can write the bash commands in the textbox provided. +These bash commands will be uploaded and executed when building your new environment. +You can include build artifacts e.g., binaries that you would like to execute or include when building the environment. +See Figure 1.

@@ -27,7 +39,11 @@ To use the UI, navigate to the Python environment in the Project settings. In th

## Code -You can also run the custom commands using the REST API. From the REST API, you should provide the path, in HOPSFS, to the bash script and the artifacts(comma separated string of paths in HopsFs). The REST API endpoint for running custom commands is: `hopsworks-api/api/project//python/environments//commands/custom` and the body should look like this: + +You can also run the custom commands using the REST API. +From the REST API, you should provide the path, in HOPSFS, to the bash script and the artifacts(comma separated string of paths in HopsFs). +The REST API endpoint for running custom commands is: `hopsworks-api/api/project//python/environments//commands/custom` and the body should look like this: + ```python { "commandsFile": "", @@ -37,9 +53,19 @@ You can also run the custom commands using the REST API. From the REST API, you ``` ## What to include in the bash script + There are few important things to be aware of when writing the bash script: -* The first line of your bash script should always be `#!/bin/bash` (known as shebang) so that the script can be interpreted and executed using the Bash shell. -* You can use `apt`, `apt-get` and `deb` commands to install packages. You should always run these commands with `sudo`. In some cases, these commands will ask for user input, therefore you should provide the input of what the command expects, e.g., `sudo apt -y install`, otherwise the build will fail. We have already configured `apt-get` to be non-interactive -* The build artifacts will be copied to `srv/hops/build`. You can use them in your script via this path. This path is also available via the environmental variable `BUILD_PATH`. If you want to use many artifacts it is advisable to create a zip file and upload it to HopsFS in one of your project datasets. You can then include the zip file as one of the artifacts. -* The conda environment is located in `/srv/hops/anaconda/envs/hopsworks_environment`. You can install or uninstall packages in the conda environment using pip like: `/srv/hops/anaconda/envs/hopsworks_environment/bin/pip install spotify==0.10.2`. If the command requires some input, write the command together with the expected input otherwise the build will fail. +- The first line of your bash script should always be `#!/bin/bash` (known as shebang) so that the script can be interpreted and executed using the Bash shell. +- You can use `apt`, `apt-get` and `deb` commands to install packages. + You should always run these commands with `sudo`. + In some cases, these commands will ask for user input, therefore you should provide the input of what the command expects, e.g., `sudo apt -y install`, otherwise the build will fail. + We have already configured `apt-get` to be non-interactive +- The build artifacts will be copied to `srv/hops/build`. + You can use them in your script via this path. + This path is also available via the environmental variable `BUILD_PATH`. + If you want to use many artifacts it is advisable to create a zip file and upload it to HopsFS in one of your project datasets. + You can then include the zip file as one of the artifacts. +- The conda environment is located in `/srv/hops/anaconda/envs/hopsworks_environment`. + You can install or uninstall packages in the conda environment using pip like: `/srv/hops/anaconda/envs/hopsworks_environment/bin/pip install spotify==0.10.2`. + If the command requires some input, write the command together with the expected input otherwise the build will fail. 
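Purely as an illustration of calling the endpoint above, the sketch below uses the `requests` library. The host, identifiers, file paths, HTTP method, and API-key header format are all assumptions to adapt to your own cluster, and the bash script and artifacts are expected to already be in HopsFS (for example, uploaded with the dataset API).

```python
import requests

# Hypothetical placeholders: adapt every value to your deployment.
HOST = "https://my-hopsworks-host"
PROJECT_ID = 119
ENV_NAME = "my_custom_environment"
API_KEY = "..."  # a Hopsworks API key

url = (
    f"{HOST}/hopsworks-api/api/project/{PROJECT_ID}"
    f"/python/environments/{ENV_NAME}/commands/custom"
)
payload = {
    "commandsFile": "/Projects/my_project/Resources/my_commands.sh",
    "artifacts": "/Projects/my_project/Resources/build_artifacts.zip",  # comma-separated HopsFS paths
}

response = requests.post(  # the HTTP method is an assumption
    url,
    json=payload,
    headers={"Authorization": f"ApiKey {API_KEY}"},  # header format is an assumption
)
response.raise_for_status()
```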
diff --git a/docs/user_guides/projects/python/environment_history.md b/docs/user_guides/projects/python/environment_history.md index e7f72bebe..17b574b65 100644 --- a/docs/user_guides/projects/python/environment_history.md +++ b/docs/user_guides/projects/python/environment_history.md @@ -1,12 +1,21 @@ # Environment History -Hopsworks comes with several prepackaged Python environments that contain libraries for data engineering, machine learning, and more general data science use-cases. Hopsworks also offers the ability to install additional packages from various sources, such as using the pip or conda package managers and public or private git repository. -The Python virtual environment is shared by different members of the project. When a member of the project introduces a change to the environment i.e., installs/uninstalls a library, a new environment is created and it becomes a defacto environment for everyone in the project. It is therefore important to track how the environment has been changing over time i.e., what libraries were installed, uninstalled, upgraded, or downgraded when the environment was created and who introduced the changes. +Hopsworks comes with several prepackaged Python environments that contain libraries for data engineering, machine learning, and more general data science use-cases. +Hopsworks also offers the ability to install additional packages from various sources, such as using the pip or conda package managers and public or private git repository. + +The Python virtual environment is shared by different members of the project. +When a member of the project introduces a change to the environment i.e., installs/uninstalls a library, a new environment is created and it becomes a defacto environment for everyone in the project. +It is therefore important to track how the environment has been changing over time i.e., what libraries were installed, uninstalled, upgraded, or downgraded when the environment was created and who introduced the changes. In this guide, you will learn how you can track the changes of your Python environment. ## Viewing python environment history in the UI -The Python environment evolves over time as libraries are installed, uninstalled, upgraded, and downgraded. To assist in tracking the changes in the environment, you can see the environment history in the UI. You can view what changes were introduced at each point a new environment was created. Hopsworks will keep a version of a YAML file for each environment so that if you want to restore an older environment you can use it. To see the differences between environments click on the button as shown in figure 1. You will then see the difference between the environment and the previous environment it was created from. + +The Python environment evolves over time as libraries are installed, uninstalled, upgraded, and downgraded. +To assist in tracking the changes in the environment, you can see the environment history in the UI. +You can view what changes were introduced at each point a new environment was created. +Hopsworks will keep a version of a YAML file for each environment so that if you want to restore an older environment you can use it. +To see the differences between environments click on the button as shown in figure 1. You will then see the difference between the environment and the previous environment it was created from.

Python environment history @@ -14,7 +23,7 @@ The Python environment evolves over time as libraries are installed, uninstalled

-If you had built the environment using custom commands you can go back to see what commands were run during the build as shown in figure 2. +If you built the environment using custom commands, you can go back and see which commands were run during the build, as shown in figure 2.

Python environment history with custom commands diff --git a/docs/user_guides/projects/python/python_env_clone.md b/docs/user_guides/projects/python/python_env_clone.md index ae1bb07a6..3ad5b1965 100644 --- a/docs/user_guides/projects/python/python_env_clone.md +++ b/docs/user_guides/projects/python/python_env_clone.md @@ -1,8 +1,10 @@ # How To Clone Python Environment -### Introduction +## Introduction -Cloning an environment in Hopsworks means creating a copy of one of the base environments. The base environments are immutable, meaning that it is required to clone an environment before you can make any change to it, such as installing your own libraries. This ensures that the project maintains a set of stable environments that are tested with the capabilities of the platform, meanwhile through cloning, allowing users to further customize an environment without affecting the base environments. +Cloning an environment in Hopsworks means creating a copy of one of the base environments. +The base environments are immutable, meaning that it is required to clone an environment before you can make any change to it, such as installing your own libraries. +This ensures that the project maintains a set of stable environments that are tested with the capabilities of the platform, meanwhile through cloning, allowing users to further customize an environment without affecting the base environments. In this guide, you will learn how to clone an environment. @@ -21,7 +23,8 @@ First select an environment, for example the `python-feature-pipeline`. ## Step 2: Clone environment -The environment can now be cloned by clicking `Clone env` and entering a name and description. The interface will show `Syncing packages` while creating the environment. +The environment can now be cloned by clicking `Clone env` and entering a name and description. +The interface will show `Syncing packages` while creating the environment.

@@ -48,8 +51,10 @@ The environment can now be cloned by clicking `Clone env` and entering a name an ## Concerning upgrades !!! warning "Please note" - The base environments are automatically upgraded when Hopsworks is upgraded and application code should keep functioning provided that no breaking changes were made in the upgraded version of the environment. A `CUSTOM` environment is not automatically upgraded and the users is recommended to reapply the modifications to a base environment if they encounter issues after an upgrade. + The base environments are automatically upgraded when Hopsworks is upgraded and application code should keep functioning provided that no breaking changes were made in the upgraded version of the environment. + A `CUSTOM` environment is not automatically upgraded and the users is recommended to reapply the modifications to a base environment if they encounter issues after an upgrade. ## Next steps -In this guide you learned how to clone a new environment. The next step is to [install](python_install.md) a library in the environment. \ No newline at end of file +In this guide you learned how to clone a new environment. +The next step is to [install](python_install.md) a library in the environment. diff --git a/docs/user_guides/projects/python/python_env_export.md b/docs/user_guides/projects/python/python_env_export.md index 177805c0b..160a7abda 100644 --- a/docs/user_guides/projects/python/python_env_export.md +++ b/docs/user_guides/projects/python/python_env_export.md @@ -1,10 +1,11 @@ # How To Export Python Environment -### Introduction +## Introduction -Each of the python environments in a project can be exported to an `environment.yml` file. It can be useful to export it to keep a snapshot of all the installed libraries and their versions. +Each of the python environments in a project can be exported to an `environment.yml` file. +It can be useful to export it to keep a snapshot of all the installed libraries and their versions. -In this guide, you will learn how to export a python environment. +In this guide, you will learn how to export a python environment. ## Step 1: Go to environment @@ -12,7 +13,8 @@ Under the `Project settings` section you can find the `Python environment` setti ## Step 2: Select a CUSTOM environment -Select the environment that you have previously cloned and want to export. Only a `CUSTOM` environment can be exported. +Select the environment that you have previously cloned and want to export. +Only a `CUSTOM` environment can be exported. ## Step 3: Click Export env diff --git a/docs/user_guides/projects/python/python_env_overview.md b/docs/user_guides/projects/python/python_env_overview.md index 6d59e6afc..c9f190d2a 100644 --- a/docs/user_guides/projects/python/python_env_overview.md +++ b/docs/user_guides/projects/python/python_env_overview.md @@ -1,14 +1,15 @@ # Python Environments -### Introduction +## Introduction -Hopsworks postulates that building ML systems following the FTI pipeline architecture is best practice. This architecture consists of three independently developed and operated ML pipelines: +Hopsworks postulates that building ML systems following the FTI pipeline architecture is best practice. 
+This architecture consists of three independently developed and operated ML pipelines: - Feature Pipeline: takes as input raw data that it transforms into features (and labels) - Training Pipeline: takes as input features (and labels) and outputs a trained model - Inference Pipeline: takes new feature data and a trained model and makes predictions. -In order to facilitate the development of these pipelines Hopsworks bundles several python environments containing necessary dependencies. +In order to facilitate the development of these pipelines, Hopsworks bundles several Python environments containing the necessary dependencies. Each environment can also be customized further by installing additional dependencies from PyPi, Conda, Wheel files, GitHub repos or applying custom Dockerfiles on top. ### Step 1: Go to environments page @@ -17,7 +18,7 @@ Under the `Project settings` section you can find the `Python environment` setti ### Step 2: List available environments -Environments listed under `FEATURE ENGINEERING` corresponds to environments you would use in a feature pipeline, `MODEL TRAINING` maps to environments used in a training pipeline and `MODEL INFERENCE` are what you would use in inference pipelines. +Environments listed under `FEATURE ENGINEERING` correspond to environments you would use in a feature pipeline, `MODEL TRAINING` maps to environments used in a training pipeline, and `MODEL INFERENCE` are what you would use in inference pipelines.

@@ -33,32 +34,33 @@ Environments listed under `FEATURE ENGINEERING` corresponds to environments you The `FEATURE ENGINEERING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md), a [Python job](../jobs/python_job.md) or a [PySpark job](../jobs/pyspark_job.md). -* `python-feature-pipeline` for writing feature pipelines using Python -* `spark-feature-pipeline` for writing feature pipelines using PySpark +- `python-feature-pipeline` for writing feature pipelines using Python +- `spark-feature-pipeline` for writing feature pipelines using PySpark ### Model training -The `MODEL TRAINING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md) or a [Python job](../jobs/python_job.md) or in a [Ray job](../jobs/ray_job.md). +The `MODEL TRAINING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md) or a [Python job](../jobs/python_job.md) or in a [Ray job](../jobs/ray_job.md). -* `tensorflow-training-pipeline` to train TensorFlow models -* `torch-training-pipeline` to train PyTorch models -* `pandas-training-pipeline` to train XGBoost, Catboost and Sklearn models -* `ray_training_pipeline` a general purpose environment for distributed training using Ray framework to train - XGBoost and Sklearn models. Should be used in [Ray job](../jobs/ray_job.md). It can be customized to install - additional dependencies of your choice. -* `ray_torch_training_pipeline` for distributed training of PyTorch models using Ray framework in a [Ray job](../jobs/ray_job.md) -* `ray_tensorflow_training_pipeline` for distributed training of TensorFlow models using Ray framework in a [Ray job](../jobs/ray_job.md) +- `tensorflow-training-pipeline` to train TensorFlow models +- `torch-training-pipeline` to train PyTorch models +- `pandas-training-pipeline` to train XGBoost, Catboost and Sklearn models +- `ray_training_pipeline` a general purpose environment for distributed training using Ray framework to train XGBoost and Sklearn models. + Should be used in [Ray job](../jobs/ray_job.md). + It can be customized to install additional dependencies of your choice. +- `ray_torch_training_pipeline` for distributed training of PyTorch models using Ray framework in a [Ray job](../jobs/ray_job.md) +- `ray_tensorflow_training_pipeline` for distributed training of TensorFlow models using Ray framework in a [Ray job](../jobs/ray_job.md) ### Model inference The `MODEL INFERENCE` environments can be used in a deployment using a custom predictor script. -* `tensorflow-inference-pipeline` to load and serve TensorFlow models -* `torch-inference-pipeline` to load and serve PyTorch models -* `pandas-inference-pipeline` to load and serve XGBoost, Catboost and Sklearn models -* `vllm-inference-pipeline` to load and serve LLMs with vLLM inference engine -* `minimal-inference-pipeline` to install your own custom framework, contains a minimal set of dependencies +- `tensorflow-inference-pipeline` to load and serve TensorFlow models +- `torch-inference-pipeline` to load and serve PyTorch models +- `pandas-inference-pipeline` to load and serve XGBoost, Catboost and Sklearn models +- `vllm-inference-pipeline` to load and serve LLMs with vLLM inference engine +- `minimal-inference-pipeline` to install your own custom framework, contains a minimal set of dependencies ## Next steps -In this guide you learned how to find the bundled python environments and where they can be used. Now you can test out the environment in a [Jupyter notebook](../jupyter/python_notebook.md). 
+In this guide you learned how to find the bundled python environments and where they can be used. +Now you can test out the environment in a [Jupyter notebook](../jupyter/python_notebook.md). diff --git a/docs/user_guides/projects/python/python_install.md b/docs/user_guides/projects/python/python_install.md index d7510133a..d9403ef4e 100644 --- a/docs/user_guides/projects/python/python_install.md +++ b/docs/user_guides/projects/python/python_install.md @@ -2,20 +2,20 @@ ## Introduction -Hopsworks comes with several prepackaged Python environments that contain libraries for data engineering, machine learning, and more general data science use-cases. Hopsworks also offers the ability to install additional packages from various sources, such as using the pip or conda package managers and public or private git repository. +Hopsworks comes with several prepackaged Python environments that contain libraries for data engineering, machine learning, and more general data science use-cases. +Hopsworks also offers the ability to install additional packages from various sources, such as using the pip or conda package managers and public or private git repository. In this guide, you will learn how to install Python packages using these different options. -* PyPi, using pip package manager -* A conda channel, using conda package manager -* Packages contained in .whl format -* A public or private git repository -* A requirements.txt file to install multiple libraries at the same time using pip +- PyPi, using pip package manager +- A conda channel, using conda package manager +- Packages contained in .whl format +- A public or private git repository +- A requirements.txt file to install multiple libraries at the same time using pip !!! notice "Notice" If your libraries require installing some extra OS-Level packages, refer to the guide custom commands guide on how to install OS-Level packages. - ## Prerequisites In order to install a custom dependency one of the base environments must first be cloned, follow [this guide](python_env_clone.md) for that. @@ -76,7 +76,8 @@ For example to install matplotlib 3.7.2, the following are correct inputs: In the case of a private git repository, also select whether it is a GitHub or GitLab repository and the preconfigured access token for the repository. !!! notice "Keep your secrets safe" - If you are installing from a git repository which is not GitHub or GitLab simply supply the access token in the URL. Keep in mind that in this case the access token may be visible in logs for other users in the same project to see. + If you are installing from a git repository which is not GitHub or GitLab simply supply the access token in the URL. + Keep in mind that in this case the access token may be visible in logs for other users in the same project to see.
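After the environment finishes rebuilding with the new library, it can be worth confirming that the expected version resolves from code running on that environment. Below is a minimal, illustrative sketch based on the `matplotlib` 3.7.2 example above, intended to be run from a notebook or Python job that uses the customized environment.

```python
# Illustrative sanity check: run inside a notebook or Python job that uses the
# customized environment to confirm the newly installed library version resolves.
import importlib.metadata

version = importlib.metadata.version("matplotlib")  # the example library above
print(f"matplotlib resolved to {version}")
assert version == "3.7.2", f"unexpected version installed: {version}"
```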

@@ -85,7 +86,6 @@ In the case of a private git repository, also select whether it is a GitHub or G

- ## Going Further Now you can use the library in a [Jupyter notebook](../jupyter/python_notebook.md) or a [Job](../jobs/python_job.md). diff --git a/docs/user_guides/projects/scheduling/kube_scheduler.md b/docs/user_guides/projects/scheduling/kube_scheduler.md index 3c62fa033..d28be075b 100644 --- a/docs/user_guides/projects/scheduling/kube_scheduler.md +++ b/docs/user_guides/projects/scheduling/kube_scheduler.md @@ -6,15 +6,20 @@ description: Documentation on how to configure Kubernetes scheduling options for ## Introduction -Hopsworks allows users to configure some Kubernetes scheduler abstractions, such as [Affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/) and [Priority Classes](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass). Hopsworks also supports additional scheduling abstractions backed by Kueue. This includes [Queues](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/), [Cohorts](https://kueue.sigs.k8s.io/docs/concepts/cohort/) and [Topologies](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/). All these scheduling abstractions are supported in jobs, jupyter notebooks and model deployments. Kueue abstractions however, are currently not supported for Spark jobs. +Hopsworks allows users to configure some Kubernetes scheduler abstractions, such as [Affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/) and [Priority Classes](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass). +Hopsworks also supports additional scheduling abstractions backed by Kueue. +This includes [Queues](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/), [Cohorts](https://kueue.sigs.k8s.io/docs/concepts/cohort/) and [Topologies](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/). +All these scheduling abstractions are supported in jobs, jupyter notebooks and model deployments. +Kueue abstractions however, are currently not supported for Spark jobs. -Hopsworks Admins can control which labels and priority classes can be used the cluster (see [Cluster configuration](#cluster-configuration) section) and by which project (see [Default Project configuration](#default-project-configuration) section) +Hopsworks Admins can control which labels and priority classes can be used in the cluster (see [Admin configuration](#admin-configuration) section) and by which project (see [Project Configuration](#project-configuration) section) Within a project, data owners can set defaults for jobs and Jupyter notebooks running within that project (see: [Project defaults](#project-defaults) section). ### Node Labels, Node Affinity and Node Anti-Affinity -Labels in Kubernetes are key-value pairs used to organize and select resources. Hopsworks relies on labels applied to nodes for pod-node affinity to determine where the pod can (or cannot) run. +Labels in Kubernetes are key-value pairs used to organize and select resources. +Hopsworks relies on labels applied to nodes for pod-node affinity to determine where the pod can (or cannot) run. Some uses cases where labels and affinity can be used include: - Hardware constraints (GPU, SSD) @@ -53,7 +58,9 @@ For a more detailed view on how Hopsworks uses the Kueue abstractions you can ch ### Queues, Cohorts -Jobs, notebooks and model deployments are submitted to these queues. 
Hopsworks administrator can define quotas on how many resources a queue can use. Queues can be grouped together in cohorts in order to add the ability to borrow resources from each other when the other queue does not use its resources. +Jobs, notebooks and model deployments are submitted to these queues. +The Hopsworks administrator can define quotas on how many resources a queue can use. +Queues can be grouped together in cohorts in order to add the ability to borrow resources from each other when the other queue does not use its resources. When creating a new job, the user can select a queue for the job in the `Advance configuration -> Scheduler section`. @@ -61,7 +68,9 @@ When creating a new job, the user can select a queue for the job in the `Advance ### Topologies -The integration of Hopsworks with Kueue, also provides access to the topology abstraction. Topologies can be defined, so that the user can decide for the pods of jobs or model deployments to run somehow grouped together. The user could decide for example, that all pods of a job should run on the same host, because the pods need to transfer a lot of data between each other, and we want to avoid network traffic to lower the latency. +The integration of Hopsworks with Kueue also provides access to the topology abstraction. +Topologies can be defined so that the user can decide to run the pods of jobs or model deployments grouped together. +The user could decide, for example, that all pods of a job should run on the same host, because the pods need to transfer a lot of data between each other and avoiding network traffic lowers the latency. The user can select the topology unit for jobs, notebooks and model deployments in the `Advance configuration -> Scheduler section`. @@ -75,7 +84,9 @@ Hopsworks admins can control the affinity labels and priority classes available ![Cluster Configuration - Node Labels and Priority Classes](../../../assets/images/guides/project/scheduler/admin_cluster_scheduler.png) -Hopsworks Cluster can run within a shared Kubernets Cluster. The first configuration level is to limit the subset of labels and priority classes that can be used within the Hopsworks Cluster. This can be done from the `Available in Hopsworks` sub-section. +A Hopsworks cluster can run within a shared Kubernetes cluster. +The first configuration level is to limit the subset of labels and priority classes that can be used within the Hopsworks Cluster. +This can be done from the `Available in Hopsworks` sub-section. !!! note "Permissions" @@ -95,20 +106,26 @@ Hopsworks Cluster can run within a shared Kubernets Cluster. The first configura verbs: ["get", "list"] ``` - If the roles above are configured properly (default behaviour), admins can only select values from the drop down menu. If the roles are missing, admins would be required to enter them as free text and should be careful about typos. Any typos here will be propagated in the other configuration and use levels leading to errors or missbehaviour when running computation. + If the roles above are configured properly (default behaviour), admins can only select values from the drop-down menu. + If the roles are missing, admins would be required to enter them as free text and should be careful about typos. + Any typos here will be propagated to the other configuration and use levels, leading to errors or misbehaviour when running computations. ### Queues -Every new project gets automatic access to the default Hopsworks queue. An administrator can define the default queue for projects user jobs and system jobs. +Every new project gets automatic access to the default Hopsworks queue. +An administrator can define the default queue for a project's user jobs and system jobs. ![Default queue for user and system jobs](../../../assets/images/guides/project/scheduler/default_queue.png) ## Project Configuration -Hopsworks admins can configure the labels and priority classes that can be used by default within a project. This will be a subset of the ones configured for Hopsworks. +Hopsworks admins can configure the labels and priority classes that can be used by default within a project. +This will be a subset of the ones configured for Hopsworks. In the figure above, in the sub-section `Available in Project` Hopsworks admins can configure the labels and priority classes available by default in any Hopsworks Project. -Hopsworks admins can also override the default project configuration on a per-project basis. That is, Hopsworks admins can make certain labels and priority classes available only to certain projects. This can be achieved from the `Cluster Settings -> Project -> -> edit configuration` configuration page: +Hopsworks admins can also override the default project configuration on a per-project basis. +That is, Hopsworks admins can make certain labels and priority classes available only to certain projects. +This can be achieved from the `Cluster Settings -> Project -> -> edit configuration` configuration page: ![Custom Project Configuration - Node Labels and Priority Classes](../../../assets/images/guides/project/scheduler/admin_project_scheduler.png) @@ -121,11 +138,14 @@ The default Label will be used for the default Node Affinity for jobs, notebooks ## Configuration of Jobs, Notebooks, and Deployments -In the advanced configuration sections for job, notebook, and model deployments, users can set affinity, anti affinity and priority class. The Affinity and Anti Affinity can be selected from the list of allowed labels. +In the advanced configuration sections for job, notebook, and model deployments, users can set affinity, anti-affinity and priority class. +The Affinity and Anti Affinity can be selected from the list of allowed labels. -`Affinity` configures on which nodes this pod can run. If a node has any of the labels present in the Affinity option, the pod can be scheduler to run to run there. +`Affinity` configures on which nodes this pod can run. +If a node has any of the labels present in the Affinity option, the pod can be scheduled to run there. -`Anti Affinity` configures on which nodes this pod will not run on. If a node has any of the labels present in the Anti Affinity option, the pod will not be scheduler to run there. +`Anti Affinity` configures on which nodes this pod will not run. +If a node has any of the labels present in the Anti Affinity option, the pod will not be scheduled to run there. `Priority Class` specifies with which priority a pod will run. diff --git a/docs/user_guides/projects/scheduling/kueue_details.md b/docs/user_guides/projects/scheduling/kueue_details.md index 241967a47..c4965ee90 100644 --- a/docs/user_guides/projects/scheduling/kueue_details.md +++ b/docs/user_guides/projects/scheduling/kueue_details.md @@ -6,15 +6,21 @@ description: Kueue abstractions ## Introduction -Hopsworks provides the integration with Kueue to provide the additional scheduling abstractions. 
Hopsworks currently acts only as a "reader" to the Kueue abstractions and currently does not manage the lifecycle of Kueue abstraction with the exception of the default localqueue for each namespace. All the other abstractions are expected to be managed by the administrators of Hopsworks, directly on the Kubernetes cluster. +Hopsworks provides the integration with Kueue to provide the additional scheduling abstractions. +Hopsworks currently acts only as a "reader" to the Kueue abstractions and currently does not manage the lifecycle of Kueue abstraction with the exception of the default localqueue for each namespace. +All the other abstractions are expected to be managed by the administrators of Hopsworks, directly on the Kubernetes cluster. -However Hopsworks and Kueue integration currently only supports frameworks python and ray for jobs, notebooks and model deployments. The same queues are also used for Hopsworks internal jobs (zipping, git operations, python library installation). Spark is currently not supported, and thus will not be managed by Kueue for scheduling, and instead it will bypass the queues setup (important to note when thinking about queue quotas) and instead are managed directly by the Kubernetes Scheduler. +However Hopsworks and Kueue integration currently only supports frameworks python and ray for jobs, notebooks and model deployments. +The same queues are also used for Hopsworks internal jobs (zipping, git operations, python library installation). +Spark is currently not supported, and thus will not be managed by Kueue for scheduling, and instead it will bypass the queues setup (important to note when thinking about queue quotas) and instead are managed directly by the Kubernetes Scheduler. ### Resource flavors -When trying to define queues in Kueue, the first abstraction that needs to be defined is a [Resource Flavor](https://kueue.sigs.k8s.io/docs/concepts/resource_flavor/). The resource flavor defines the resources that a queue will later manage. Hopsworks helm chart installs and uses a default ResourceFlavor +When trying to define queues in Kueue, the first abstraction that needs to be defined is a [Resource Flavor](https://kueue.sigs.k8s.io/docs/concepts/resource_flavor/). +The resource flavor defines the resources that a queue will later manage. +Hopsworks helm chart installs and uses a default ResourceFlavor -``` +```yaml apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -25,13 +31,14 @@ spec: topologyName: default ``` -Node labels filter the available nodes to this resource flavor and is required for [topologies](#Topologies) +Node labels filter the available nodes to this resource flavor and is required for [topologies](#topologies) ### Cluster Queues -[Cluster Queues](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/) are the actual queues for submitting jobs and model deployments to. The default hopsworks queue looks like: +[Cluster Queues](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/) are the actual queues for submitting jobs and model deployments to. +The default hopsworks queue looks like: -``` +```yaml apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: @@ -64,21 +71,25 @@ spec: nominalQuota: "0" ``` -The [preemption](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/#preemption) and [nominal quotas](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/#flavors-and-resources) are set to the minimal as this queue is designed to have lowest priority in getting resources allocated. 
If a cluster is underutilized and there are resources available, it can still borrow up to the maximum resources present in the parent cohort, but by design this queue has no dedicated resources. The presumption is that other, more important queues, defined by the cluster administrator will have higher preference in getting resources. +The [preemption](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/#preemption) and [nominal quotas](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/#flavors-and-resources) are set to the minimal as this queue is designed to have lowest priority in getting resources allocated. +If a cluster is underutilized and there are resources available, it can still borrow up to the maximum resources present in the parent cohort, but by design this queue has no dedicated resources. +The presumption is that other, more important queues, defined by the cluster administrator will have higher preference in getting resources. ### Local Queues [Local Queues](https://kueue.sigs.k8s.io/docs/concepts/local_queue/) are the mechanism to provide access to a queue (cluster queue) to a specific project in Hopsworks (Kubernetes namespace). -Every new project gets automatic access to the default Hopsworks queue. An administrator can define the default queue for projects user jobs and system jobs. +Every new project gets automatic access to the default Hopsworks queue. +An administrator can define the default queue for projects user jobs and system jobs. ![Default queue for user and system jobs](../../../assets/images/guides/project/scheduler/default_queue.png) ### Cohorts -[Cohorts](https://kueue.sigs.k8s.io/docs/concepts/cohort/) are groupings of cluster queues that have some meaning together and can share resources. Hopsworks defines a default `cluster` cohort +[Cohorts](https://kueue.sigs.k8s.io/docs/concepts/cohort/) are groupings of cluster queues that have some meaning together and can share resources. +Hopsworks defines a default `cluster` cohort -``` +```yaml apiVersion: kueue.x-k8s.io/v1alpha1 kind: Cohort metadata: @@ -103,9 +114,10 @@ spec: nominalQuota: 50 ``` -Cohorts can contain other cohorts and thus you can create a hierarchy of cohorts. Cohorts can set [fair sharing weight](https://kueue.sigs.k8s.io/docs/concepts/admission_fair_sharing/) where using +Cohorts can contain other cohorts and thus you can create a hierarchy of cohorts. +Cohorts can set [fair sharing weight](https://kueue.sigs.k8s.io/docs/concepts/admission_fair_sharing/) where using -``` +```yaml fairSharing: weight ``` @@ -114,9 +126,10 @@ in the definition of a cohort, the user can control a priority towards borrowing ### Topologies -[Topologies](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/) defines a way of grouping together pods belonging to the same job/deployment so that they are colocated within the same topology unit. Hopsworks defines a default topology: +[Topologies](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/) defines a way of grouping together pods belonging to the same job/deployment so that they are colocated within the same topology unit. +Hopsworks defines a default topology: -``` +```yaml apiVersion: kueue.x-k8s.io/v1alpha1 kind: Topology metadata: @@ -130,6 +143,7 @@ spec: The topology is defined in the Resource Flavor used by a Cluster Queue. -When creating a new job, the user can select a topology unit for the job to run in and thus decide if all pods of a job should run on the same hostname, in the same zone or in the same region. 
The user can select the topology for jobs, notebooks and deployments in the `Advance configuration -> Scheduler section`. +When creating a new job, the user can select a topology unit for the job to run in and thus decide if all pods of a job should run on the same hostname, in the same zone or in the same region. +The user can select the topology for jobs, notebooks and deployments in the `Advance configuration -> Scheduler section`. ![Default queue for user and system jobs](../../../assets/images/guides/project/scheduler/job_topology_unit.png) diff --git a/docs/user_guides/projects/secrets/create_secret.md b/docs/user_guides/projects/secrets/create_secret.md index bfba15025..a9a56b03c 100644 --- a/docs/user_guides/projects/secrets/create_secret.md +++ b/docs/user_guides/projects/secrets/create_secret.md @@ -2,7 +2,7 @@ ## Introduction -A Secret is a key-value pair used to store encrypted information accessible only to the owner of the secret. +A Secret is a key-value pair used to store encrypted information accessible only to the owner of the secret. Also if you wish to, you can share the same secret API key with all the members of a Project. ## UI @@ -20,7 +20,8 @@ In the `Account Settings` page you can find the `Secrets` section showing a list ### Step 2: Create a Secret -Click `New Secret` to bring up the dialog for secret creation. Enter a name for the secret to be used for lookup, and the secret value. +Click `New Secret` to bring up the dialog for secret creation. +Enter a name for the secret to be used for lookup, and the secret value. If the secret should be private to this user, select `Private`, to share the secret with all members of a project select `Project` and enter the project name. @@ -33,7 +34,8 @@ If the secret should be private to this user, select `Private`, to share the sec ### Step 3: Secret created -Click `New Secret` to bring up the dialog for secret creation. Enter a name for the secret to be used for lookup, and the secret value. +Click `New Secret` to bring up the dialog for secret creation. +Enter a name for the secret to be used for lookup, and the secret value. If the secret should be private to this user, select `Private`, to share the secret with all members of a project select `Project` and enter the project name. 
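The same flow is also available from the Python client, as the API reference hunk below suggests with `secrets_api.create_secret`. Below is a minimal sketch; the way the `secrets_api` handle is obtained and the `get_secret` lookup are assumptions about the client API, and the secret name and value are placeholders — follow the Python section of this guide for the authoritative calls.

```python
# Illustrative sketch; login()/get_secrets_api()/get_secret() are assumptions
# about the Hopsworks Python client -- see the Python section of this guide.
# The secret name and value are placeholders.
import hopsworks

project = hopsworks.login()
secrets_api = hopsworks.get_secrets_api()

# Create a private secret, then read it back by name and use its value.
secret = secrets_api.create_secret("my_secret", "Fk3MoPlQXCQvPo")
retrieved = secrets_api.get_secret("my_secret")
print(retrieved.value)
```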
@@ -64,4 +66,4 @@ secret = secrets_api.create_secret("my_secret", "Fk3MoPlQXCQvPo") ### API Reference -[Secrets](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/secrets/) +[`SecretsApi`][hopsworks_common.core.secret_api.SecretsApi] diff --git a/mkdocs.yml b/mkdocs.yml index 32b3a9bca..a42cc8c04 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,7 +4,9 @@ site_author: "Hopsworks" site_url: "https://docs.hopsworks.ai/" # Repository -edit_uri: "" +edit_uri: "https://github.com/logicalclocks/logicalclocks.github.io" +repo_url: "https://github.com/logicalclocks/hopsworks-api" +repo_name: logicalclocks/hopsworks-api strict: false @@ -74,6 +76,7 @@ nav: - user_guides/fs/feature_group/index.md - Create: user_guides/fs/feature_group/create.md - Create External: user_guides/fs/feature_group/create_external.md + - Create Spine: user_guides/fs/feature_group/create_spine.md - Deprecate: user_guides/fs/feature_group/deprecation.md - Data Types and Schema management: user_guides/fs/feature_group/data_types.md - Statistics: user_guides/fs/feature_group/statistics.md @@ -107,10 +110,10 @@ nav: - Compute Engines: user_guides/fs/compute_engines.md - Client Integrations: - user_guides/integrations/index.md - - Python / SageMaker / Kubeflow : user_guides/integrations/python.md + - Python / SageMaker / Kubeflow: user_guides/integrations/python.md - AWS EMR: - - Networking: user_guides/integrations/emr/networking.md - - Configure EMR for Hopsworks: user_guides/integrations/emr/emr_configuration.md + - Networking: user_guides/integrations/emr/networking.md + - Configure EMR for Hopsworks: user_guides/integrations/emr/emr_configuration.md - Azure HDInsight: user_guides/integrations/hdinsight.md - Azure Machine Learning: - Designer: user_guides/integrations/mlstudio_designer.md @@ -118,6 +121,12 @@ nav: - Apache Spark: user_guides/integrations/spark.md - Apache Flink: user_guides/integrations/flink.md - Apache Beam: user_guides/integrations/beam.md + - Java: user_guides/integrations/java.md + # TODO: update the docs and uncomment: + # - Databricks: + # - Networking: user_guides/integrations/databricks/networking.md + # - Configure Databricks for Hopsworks: user_guides/integrations/databricks/configuration.md + # - API Key: user_guides/integrations/databricks/api_key.md - Sharing: user_guides/fs/sharing/sharing.md - Tags: user_guides/fs/tags/tags.md - Provenance: user_guides/fs/provenance/provenance.md @@ -158,7 +167,7 @@ nav: - Run Spark Job: user_guides/projects/jobs/spark_job.md - Run Ray Job: user_guides/projects/jobs/ray_job.md - Scheduling: user_guides/projects/jobs/schedule_job.md - - Kubernetes Scheduling: + - Kubernetes Scheduling: - Base: user_guides/projects/scheduling/kube_scheduler.md - Kueue: user_guides/projects/scheduling/kueue_details.md @@ -231,29 +240,31 @@ nav: - IAM Role Chaining: setup_installation/admin/roleChaining.md - Configure Project Mapping: setup_installation/admin/configure-project-mapping.md - Monitoring: - - Services Dashboards: setup_installation/admin/monitoring/grafana.md - - Export metrics: setup_installation/admin/monitoring/export-metrics.md - - Services Logs: setup_installation/admin/monitoring/services-logs.md + - Services Dashboards: setup_installation/admin/monitoring/grafana.md + - Export metrics: setup_installation/admin/monitoring/export-metrics.md + - Services Logs: setup_installation/admin/monitoring/services-logs.md - Authentication: - - Configure Authentication: setup_installation/admin/auth.md - - Configure OAuth2: - - Register an 
Identity Provider: setup_installation/admin/oauth2/create-client.md - - Create Okta Client: setup_installation/admin/oauth2/create-okta-client.md - - Create Azure Client: setup_installation/admin/oauth2/create-azure-client.md - - Configure Project Mapping: setup_installation/admin/oauth2/configure-project-mapping.md - - Configure LDAP/Kerberos: - - Configure LDAP: setup_installation/admin/ldap/configure-ldap.md - - Configure Kerberos: setup_installation/admin/ldap/configure-krb.md - - Configure server for LDAP and Kerberos: setup_installation/admin/ldap/configure-server.md - - Configure Project Mapping: setup_installation/admin/ldap/configure-project-mapping.md + - Configure Authentication: setup_installation/admin/auth.md + - Configure OAuth2: + - Register an Identity Provider: setup_installation/admin/oauth2/create-client.md + - Create Okta Client: setup_installation/admin/oauth2/create-okta-client.md + - Create Azure Client: setup_installation/admin/oauth2/create-azure-client.md + - Configure Project Mapping: setup_installation/admin/oauth2/configure-project-mapping.md + - Configure LDAP/Kerberos: + - Configure LDAP: setup_installation/admin/ldap/configure-ldap.md + - Configure Kerberos: setup_installation/admin/ldap/configure-krb.md + - Configure server for LDAP and Kerberos: setup_installation/admin/ldap/configure-server.md + - Configure Project Mapping: setup_installation/admin/ldap/configure-project-mapping.md - High availability / Disaster Recovery: - - Overview: setup_installation/admin/ha-dr/intro.md - - High Availability: setup_installation/admin/ha-dr/ha.md - - Disaster Recovery: setup_installation/admin/ha-dr/dr.md + - Overview: setup_installation/admin/ha-dr/intro.md + - High Availability: setup_installation/admin/ha-dr/ha.md + - Disaster Recovery: setup_installation/admin/ha-dr/dr.md - Audit: - - Access Audit Logs: setup_installation/admin/audit/audit-logs.md - - Export Audit Logs: setup_installation/admin/audit/export-audit-logs.md - - : https://docs.hopsworks.ai + - Access Audit Logs: setup_installation/admin/audit/audit-logs.md + - Export Audit Logs: setup_installation/admin/audit/export-audit-logs.md + - ArrowFlight Server with DuckDB: setup_installation/common/arrow_flight_duckdb.md + - Python API: "!import https://github.com/logicalclocks/hopsworks-api?branch=main" + - Java API: javadoc - Community ↗: https://community.hopsworks.ai/ theme: @@ -267,8 +278,19 @@ theme: text: "Roboto" code: "IBM Plex Mono" palette: - accent: teal - scheme: hopsworks + - scheme: hopsworks + media: "(prefers-color-scheme: light)" + accent: teal + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + media: "(prefers-color-scheme: dark)" + primary: hopsworks + accent: teal + toggle: + icon: material/brightness-4 + name: Switch to light mode features: - navigation.tabs - navigation.tabs.sticky @@ -276,7 +298,6 @@ theme: - navigation.indexes extra: - hopsworks_version: dev version: provider: mike default: latest @@ -285,7 +306,7 @@ extra: - icon: fontawesome/brands/twitter link: https://twitter.com/hopsworks - icon: fontawesome/brands/github - link: https://github.com/logicalclocks/hopsworks + link: https://github.com/logicalclocks/hopsworks-api - icon: fontawesome/brands/discourse link: https://community.hopsworks.ai/ - icon: fontawesome/brands/linkedin @@ -299,11 +320,8 @@ extra: extra_css: - css/custom.css - css/marctech.css - - css/dropdown.css extra_javascript: - - js/inject-api-links.js - - js/dropdown.js - js/quickstart-fullscreen.js plugins: @@ -312,15 
+330,34 @@ plugins: minify_html: true minify_css: true minify_js: true - - mkdocs-jupyter - - macros: - # have to use custom templating string otherwise it interferes with jupyter notebooks - j2_block_start_string: "{{{%" - j2_block_end_string: "%}}}" - j2_variable_start_string: "{{{" - j2_variable_end_string: "}}}" - mike: canonical_version: latest + - multirepo + - mkdocstrings: + custom_templates: docs/templates + handlers: + python: + options: + show_root_heading: true + show_root_full_path: false + show_signature_annotations: true + separate_signature: true + signature_crossrefs: true + show_symbol_type_heading: true + show_symbol_type_toc: true + show_source: false + link_source: true + docstring_section_style: spacy + annotations_path: source + inventories: + - https://docs.python.org/3/objects.inv + - https://pandas.pydata.org/docs/objects.inv + - https://numpy.org/doc/stable/objects.inv + - https://docs.python-requests.org/en/latest/objects.inv + - https://docs.pydantic.dev/latest/objects.inv + - https://fastapi.tiangolo.com/objects.inv + - https://scikit-learn.org/stable/objects.inv + - https://docs.pola.rs/api/python/stable/objects.inv markdown_extensions: - admonition @@ -342,8 +379,6 @@ markdown_extensions: permalink: "#" - pymdownx.tasklist: custom_checkbox: true - - markdown_include.include: - base_path: docs - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg diff --git a/requirements-docs.txt b/requirements-docs.txt index 560779ddb..e3e38fea1 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -1,12 +1,10 @@ -mkdocs==1.5.3 -mkdocs-material==9.5.17 -mike==2.0.0 -sphinx==7.2.6 -keras_autodoc @ git+https://git@github.com/logicalclocks/keras-autodoc -markdown-include==0.8.1 -mkdocs-jupyter==0.24.3 -markdown==3.6 -pymdown-extensions==10.7.1 -mkdocs-macros-plugin==1.0.4 +mkdocs==1.6.1 +mkdocs-material==9.7.0 +mike==2.1.3 +markdown==3.9 +pymdown-extensions==10.17.2 mkdocs-minify-plugin>=0.2.0 +mkdocs-multirepo-plugin==0.8.3 +mkdocstrings[python]==1.0.0 +mkdocstrings-python==2.0.1 linkchecker