1 change: 1 addition & 0 deletions .spellcheck-en-custom.txt
@@ -131,6 +131,7 @@ nb
oneMKL
orchestrator
ots
parallelized
png
pre
preceeds
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,9 @@

### Features

* `ilab data generate` now supports parallelized data generation across batches of the seed
data when running with the vLLM serving backend. The `--batch-size` argument can be used to
control this behavior.
* `ilab model download` now supports downloading models from OCI registries. Repositories
that are prefixed by "docker://" and specified against `--repository` are treated as OCI
registries.
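
For illustration, the two features above could be exercised as follows; the batch size value and the registry path are examples only, not taken from this PR:

```bash
# Tune SDG batching when serving the teacher model with vLLM (default is 8).
ilab data generate --batch-size 16

# Download a model from an OCI registry by prefixing the repository with "docker://".
# The registry path below is a placeholder.
ilab model download --repository docker://registry.example.com/example/model
```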
7 changes: 7 additions & 0 deletions TROUBLESHOOTING.md
@@ -4,6 +4,13 @@ This document is for commonly found problems and their solutions when using `ila

## `ilab` troubleshooting

### `ilab data generate --endpoint-url` with llama-cpp fails with `openai.InternalServerError: Service Unavailable`

llama-cpp does not support batching, which is enabled by default with remote
endpoints. To resolve this error, disable batching using `--batch-size=0`.

See [this issue](https://github.com/instructlab/instructlab/issues/1892).
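
For example, a remote llama-cpp endpoint can be used with batching turned off (the endpoint URL below is illustrative):

```bash
ilab data generate --endpoint-url http://localhost:8000/v1 --batch-size=0
```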

### `ilab data generate` command running slow on macOS

If you notice `ilab data generate` running for several hours or more on a Mac M-series, you should first check the available memory on your system (see [Activity Monitor](https://support.apple.com/en-ie/guide/activity-monitor/welcome/mac) for more details). If there is < 8GB of RAM available before serving a model, check whether you can free up some memory.
6 changes: 6 additions & 0 deletions scripts/basic-workflow-tests.sh
@@ -259,6 +259,12 @@ test_generate() {
if [ "$SDG_PIPELINE" = "full" ]; then
GENERATE_ARGS+=("--pipeline" "full")
fi

# Disable batching with llama-cpp. See https://github.com/instructlab/instructlab/issues/1892
if [ "$BACKEND" = "llama-cpp" ]; then
GENERATE_ARGS+=("--batch-size" "0")
fi

ilab data generate --num-instructions ${NUM_INSTRUCTIONS} "${GENERATE_ARGS[@]}"
}

28 changes: 24 additions & 4 deletions src/instructlab/data/generate.py
@@ -139,10 +139,13 @@
"--pipeline",
type=click.STRING,
default="simple",
# Hidden until instructlab-sdg releases a version with multiple pipelines
# For now only "simple" is supported in the latest release.
hidden=True,
help="Data generation pipeline to use. Available: simple, full, or a valid path to a directory of pipeline worlfow YAML files. Note that 'full' requires a larger teacher model, Mixtral-8x7b.",
help="Data generation pipeline to use. Available: simple, full, or a valid path to a directory of pipeline workflow YAML files. Note that 'full' requires a larger teacher model, Mixtral-8x7b.",
)
@click.option(
"--batch-size",
type=click.IntRange(min=0),
default=None,
help="Number of elements to process in each batch through the SDG pipeline. Enabled by default for the vLLM serving backend, with a batch size of 8 chosen based on experiments to optimize for throughput. Use 0 to disable.",
)
@click.option(
"--enable-serving-output",
@@ -174,6 +177,7 @@ def generate(
model_family,
pipeline,
enable_serving_output,
batch_size,
):
"""Generates synthetic data to enhance your example data"""
# pylint: disable=import-outside-toplevel
@@ -192,12 +196,19 @@
if ctx.obj is not None:
prompt_file_path = ctx.obj.config.generate.prompt_file

# If batch size is not set explicitly, default to 8
# Once https://github.com/instructlab/sdg/issues/224 is resolved we can
# pass batch_size=None to the library instead
if batch_size is None:
batch_size = 8

backend_instance = None
if endpoint_url:
api_base = endpoint_url
else:
# First Party
from instructlab.model.backends import backends
from instructlab.model.backends.llama_cpp import Server as llama_cpp_server

ctx.obj.config.serve.llama_cpp.llm_family = model_family
backend_instance = backends.select_backend(ctx.obj.config.generate.teacher)
@@ -210,6 +221,14 @@
except Exception as exc:
click.secho(f"Failed to start server: {exc}", fg="red")
raise click.exceptions.Exit(1)

# disable batching when running with the local llama.cpp server
if isinstance(backend_instance, llama_cpp_server):
if batch_size is not None:
logger.warning(
"Disabling SDG batching - unsupported with llama.cpp serving"
)
batch_size = 0
try:
click.echo(
f"Generating synthetic data using '{model}' model, taxonomy:'{taxonomy_path}' against {api_base} server"
@@ -236,6 +255,7 @@
tls_client_key=tls_client_key,
tls_client_passwd=tls_client_passwd,
pipeline=pipeline,
batch_size=batch_size,
)
except GenerateException as exc:
click.secho(
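
Taken together, the changes in `generate.py` resolve the effective batch size in three ways. The sketch below restates that behavior as illustrative CLI invocations; the endpoint URL and batch size values are placeholders, not taken from this PR:

```bash
# vLLM serving backend, no flag given: the CLI defaults batch_size to 8.
ilab data generate

# Remote endpoint backed by llama-cpp: batching must be disabled explicitly,
# as described in TROUBLESHOOTING.md.
ilab data generate --endpoint-url http://localhost:8000/v1 --batch-size=0

# Local llama-cpp serving: the CLI overrides any requested batch size to 0 and
# logs "Disabling SDG batching - unsupported with llama.cpp serving".
ilab data generate --batch-size 8
```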