Center-for-Applied-AI · Eric-Fithian · Nov 5, 2025 · Nov 5, 2025 · Nov 5, 2025 · Nov 5, 2025
diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -26,15 +26,40 @@ pip install -e .[dev]
 
 ## Configure Environment Variables
 
-Create an `.env` file (or export in your shell) with credentials for the LLM providers you use. A minimal configuration:
+DELM reads the API keys for your LLM providers from environment variables. It does not load `.env` files for you, so you are responsible for setting these variables in whatever way works best for your workflow.
 
-```env
-OPENAI_API_KEY=sk-...
-ANTHROPIC_API_KEY=...
-TOGETHER_API_KEY=...
+### Required Environment Variables by Provider
+
+- **OpenAI**: `OPENAI_API_KEY`
+- **Anthropic**: `ANTHROPIC_API_KEY`
+- **Google**: `GOOGLE_API_KEY`
+- **Groq**: `GROQ_API_KEY`
+- **Together AI**: `TOGETHER_API_KEY`
+- **Fireworks AI**: `FIREWORKS_API_KEY`
+
+### Option 1: Export in Your Shell
+
+```bash
+export OPENAI_API_KEY="sk-..."
+export ANTHROPIC_API_KEY="..."
+```
+
+### Option 2: Use python-dotenv (Optional)
+
+If you prefer using `.env` files, install and use `python-dotenv`:
+
+```bash
+pip install python-dotenv
+```
+
+Then in your script:
+
+```python
+from dotenv import load_dotenv
+load_dotenv()  # Loads a .env file found in the script's directory or a parent directory; call this before using DELM
 ```
 
-Replace the values with your credentials. DELM only loads providers that have available keys.
+**Note**: You only need to set the API key for the provider(s) you actually use. The keys are read from the environment by the LLM client libraries (OpenAI, Anthropic, etc.) that DELM uses.
 
 ## Create Your Pipeline Configuration
 

diff --git a/example.env b/example.env
@@ -1,6 +1,47 @@
+# Environment Variable Reference for DELM
+# ========================================
+#
+# DELM requires certain environment variables to be set depending on which LLM provider you use.
+# These variables should be loaded into your environment before running DELM.
+#
+# How to set environment variables:
+# ---------------------------------
+# 
+# Option 1: Export in your shell
+#   export OPENAI_API_KEY="your-openai-key"
+#
+# Option 2: Use a .env file with python-dotenv (optional; not installed by DELM)
+#   pip install python-dotenv
+#   Then in your script: from dotenv import load_dotenv; load_dotenv()
+#
+# Option 3: Set in Docker/container environment
+#   docker run -e OPENAI_API_KEY="your-key" ...
+#
+# Option 4: Use a cloud secrets manager (AWS Secrets Manager, GCP Secret Manager, etc.)
+#
+# Option 5: Set in your IDE/development environment
+#
+# Required Environment Variables by Provider:
+# -------------------------------------------
+
+# OpenAI (for gpt-4, gpt-3.5-turbo, etc.)
 OPENAI_API_KEY="your-openai-key"
+
+# Anthropic (for claude-3-*, claude-2, etc.)
 ANTHROPIC_API_KEY="your-anthropic-key"
+
+# Google (for gemini-*, palm-*, etc.)
 GOOGLE_API_KEY="your-google-key"
+
+# Groq (for llama-*, mixtral-*, etc.)
 GROQ_API_KEY="your-groq-key"
+
+# Together AI
 TOGETHER_API_KEY="your-together-key"
-FIREWORKS_API_KEY="your-fireworks-key"
+
+# Fireworks AI
+FIREWORKS_API_KEY="your-fireworks-key"
+
+# Note: You only need to set the API key for the provider you're using.
+# DELM no longer automatically loads .env files - you are responsible for 
+# ensuring the appropriate environment variables are set before running DELM.
diff --git a/examples/prompt_optimization/prompt_optimization.py b/examples/prompt_optimization/prompt_optimization.py
@@ -1,5 +1,4 @@
-"""Implements LLM-In-the-Loop PRompt Optimization (LILPRO) using DELM.
-"""
+"""Implements LLM-In-the-Loop PRompt Optimization (LILPRO) using DELM."""
 
 from __future__ import annotations
 
@@ -54,6 +53,7 @@
 # helpers
 # ----------------------------------------------------------------------------
 
+
 def build_expected_df(record_labeled_df: pd.DataFrame) -> pd.DataFrame:
     """Create nested expected JSON per id, aggregating duplicates.
 
@@ -86,24 +86,36 @@ def build_expected_df(record_labeled_df: pd.DataFrame) -> pd.DataFrame:
         .reset_index(name="items")
     )
 
-    grouped["expected_json"] = grouped["items"].apply(lambda items: {CONTAINER_NAME: items})
+    grouped["expected_json"] = grouped["items"].apply(
+        lambda items: {CONTAINER_NAME: items}
+    )
     return grouped[["id", "expected_json"]]
 
 
 def _count_price_expectation(items: List[Dict[str, Any]] | None) -> Tuple[int, int]:
     """Return counts of True/False for price_expectation across items."""
     if not items:
         return 0, 0
-    true_count = sum(1 for it in items if isinstance(it, dict) and it.get("price_expectation") is True)
-    false_count = sum(1 for it in items if isinstance(it, dict) and it.get("price_expectation") is False)
+    true_count = sum(
+        1
+        for it in items
+        if isinstance(it, dict) and it.get("price_expectation") is True
+    )
+    false_count = sum(
+        1
+        for it in items
+        if isinstance(it, dict) and it.get("price_expectation") is False
+    )
     return true_count, false_count
 
 
 def _extract_items(d: Dict[str, Any] | None) -> List[Dict[str, Any]]:
     if not isinstance(d, dict):
         return []
     items = d.get(CONTAINER_NAME)
-    return [it for it in items if isinstance(it, dict)] if isinstance(items, list) else []
+    return (
+        [it for it in items if isinstance(it, dict)] if isinstance(items, list) else []
+    )
 
 
 def _normalize_good(value: Any) -> str:
@@ -212,14 +224,24 @@ def annotate_price_expectation_counts(record_pairs_df: pd.DataFrame) -> pd.DataF
         ),
         axis=1,
     )
-    counts_df = pd.DataFrame(list(counts), columns=["expected_counts", "predicted_counts"], index=df.index)
+    counts_df = pd.DataFrame(
+        list(counts), columns=["expected_counts", "predicted_counts"], index=df.index
+    )
     out = pd.DataFrame(
         {
             "id": df["id"].tolist(),
-            "exp_true": counts_df["expected_counts"].apply(lambda x: int(x[0]) if isinstance(x, tuple) else 0),
-            "exp_false": counts_df["expected_counts"].apply(lambda x: int(x[1]) if isinstance(x, tuple) else 0),
-            "pred_true": counts_df["predicted_counts"].apply(lambda x: int(x[0]) if isinstance(x, tuple) else 0),
-            "pred_false": counts_df["predicted_counts"].apply(lambda x: int(x[1]) if isinstance(x, tuple) else 0),
+            "exp_true": counts_df["expected_counts"].apply(
+                lambda x: int(x[0]) if isinstance(x, tuple) else 0
+            ),
+            "exp_false": counts_df["expected_counts"].apply(
+                lambda x: int(x[1]) if isinstance(x, tuple) else 0
+            ),
+            "pred_true": counts_df["predicted_counts"].apply(
+                lambda x: int(x[0]) if isinstance(x, tuple) else 0
+            ),
+            "pred_false": counts_df["predicted_counts"].apply(
+                lambda x: int(x[1]) if isinstance(x, tuple) else 0
+            ),
         }
     )
     return out
@@ -259,7 +281,12 @@ def compute_batch_stats(
 
     # Total extractions (total predicted items across all records)
     n_extractions = int(
-        sum(len(_extract_items(d)) for d in record_pairs_df.get("extracted_dict", pd.Series([{}] * len(record_pairs_df))) )
+        sum(
+            len(_extract_items(d))
+            for d in record_pairs_df.get(
+                "extracted_dict", pd.Series([{}] * len(record_pairs_df))
+            )
+        )
     )
 
     # Wrong price_expectation among matched (id+good) pairs (boolean inequality)
@@ -291,7 +318,9 @@ def append_metrics_row(csv_path: Path, row: Dict[str, Any]) -> None:
     df.to_csv(csv_path, mode="a", header=header, index=False)
 
 
-def save_precision_plot(csv_path: Path, out_path: Path, series: str = "presence") -> None:
+def save_precision_plot(
+    csv_path: Path, out_path: Path, series: str = "presence"
+) -> None:
     """Render precision-vs-batch plot from CSV with dynamic y-limits.
 
     series: "presence" to plot estimator precision; "matched" to plot matched_precision
@@ -305,18 +334,20 @@ def save_precision_plot(csv_path: Path, out_path: Path, series: str = "presence"
         return
     # ICLR-friendly style similar to cost_vs_coverage
     sns.set_theme(style="whitegrid", font_scale=1.2)
-    plt.rcParams.update({
-        "figure.figsize": (3.0, 2.0),
-        "font.size": 8,
-        "axes.labelsize": 8,
-        "axes.titlesize": 9,
-        "legend.fontsize": 7,
-        "xtick.labelsize": 7,
-        "ytick.labelsize": 7,
-        "savefig.bbox": "tight",
-        "savefig.pad_inches": 0.02,
-        "pdf.fonttype": 42,
-    })
+    plt.rcParams.update(
+        {
+            "figure.figsize": (3.0, 2.0),
+            "font.size": 8,
+            "axes.labelsize": 8,
+            "axes.titlesize": 9,
+            "legend.fontsize": 7,
+            "xtick.labelsize": 7,
+            "ytick.labelsize": 7,
+            "savefig.bbox": "tight",
+            "savefig.pad_inches": 0.02,
+            "pdf.fonttype": 42,
+        }
+    )
     plt.figure()
     if series == "presence":
         y = df["precision"]
@@ -395,10 +426,14 @@ def set_price_expectation_description(schema_path: Path, new_description: str) -
             changed = True
             break
     if changed:
-        schema_path.write_text(yaml.safe_dump(spec, sort_keys=False, allow_unicode=True))
+        schema_path.write_text(
+            yaml.safe_dump(spec, sort_keys=False, allow_unicode=True)
+        )
 
 
-def run_optimizer_and_get_guidance(current_definition: str, examples_text: str) -> Dict[str, Any]:
+def run_optimizer_and_get_guidance(
+    current_definition: str, examples_text: str
+) -> Dict[str, Any]:
     """Run optimizer to produce a refined definition from wrong examples."""
     cfg = DELMConfig.from_yaml(OPTIMIZER_CONFIG_PATH)
     cfg.schema.spec_path = OPTIMIZER_SCHEMA_PATH
@@ -441,6 +476,7 @@ def run_optimizer_and_get_guidance(current_definition: str, examples_text: str)
 # main flow
 # ----------------------------------------------------------------------------
 
+
 def main() -> None:
     """Run iterative optimization and plot precision across batches."""
     random.seed(RANDOM_SEED)
@@ -465,7 +501,9 @@ def main() -> None:
     metrics_csv_path = EXPERIMENT_ROOT_DIR / "precision_by_batch.csv"
 
     # Determine 10% evaluation sample size (at least 1 record)
-    eval_record_sample_size = max(1, int(np.ceil(EVAL_SAMPLE_RATIO * len(record_expected_df))))
+    eval_record_sample_size = max(
+        1, int(np.ceil(EVAL_SAMPLE_RATIO * len(record_expected_df)))
+    )
 
     for batch_idx in tqdm(range(NUM_BATCHES + 1), desc="batches", leave=True):
         cfg = DELMConfig.from_dict(base_cfg.to_serialized_config_dict())
@@ -501,12 +539,24 @@ def main() -> None:
         )
 
         # Append to in-memory list for reference
-        batch_records.append({"batch": batch_idx, "precision": precision, "matched_precision": matched_precision, **stats})
+        batch_records.append(
+            {
+                "batch": batch_idx,
+                "precision": precision,
+                "matched_precision": matched_precision,
+                **stats,
+            }
+        )
 
         # Persist/update the metrics CSV after each batch
         append_metrics_row(
             metrics_csv_path,
-            {"batch": batch_idx, "precision": precision, "matched_precision": matched_precision, **stats},
+            {
+                "batch": batch_idx,
+                "precision": precision,
+                "matched_precision": matched_precision,
+                **stats,
+            },
         )
 
         # Save the per-record trace for price_expectation counts
@@ -518,13 +568,31 @@ def main() -> None:
             json.dump(metrics_dict, fh, ensure_ascii=False, indent=2)
 
         record_pairs_out_df = record_pairs_df.copy()
-        record_pairs_out_df.to_json(exp_dir / "record_pairs.json", orient="records", force_ascii=False, indent=2)
+        record_pairs_out_df.to_json(
+            exp_dir / "record_pairs.json", orient="records", force_ascii=False, indent=2
+        )
 
         # Save or update the precision plots incrementally (PNG + PDF)
-        save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png", series="presence")
-        save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf", series="presence")
-        save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png", series="matched")
-        save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf", series="matched")
+        save_precision_plot(
+            metrics_csv_path,
+            EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png",
+            series="presence",
+        )
+        save_precision_plot(
+            metrics_csv_path,
+            EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf",
+            series="presence",
+        )
+        save_precision_plot(
+            metrics_csv_path,
+            EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png",
+            series="matched",
+        )
+        save_precision_plot(
+            metrics_csv_path,
+            EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf",
+            series="matched",
+        )
 
         if batch_idx < NUM_BATCHES:
             wrong_df = find_wrong_price_expectation_records(record_pairs_df)
@@ -551,10 +619,26 @@ def main() -> None:
                 set_price_expectation_description(BASE_SCHEMA_PATH, new_def)
 
     # Final plot refresh from accumulated CSV (PNG + PDF)
-    save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png", series="presence")
-    save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf", series="presence")
-    save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png", series="matched")
-    save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf", series="matched")
+    save_precision_plot(
+        metrics_csv_path,
+        EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png",
+        series="presence",
+    )
+    save_precision_plot(
+        metrics_csv_path,
+        EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf",
+        series="presence",
+    )
+    save_precision_plot(
+        metrics_csv_path,
+        EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png",
+        series="matched",
+    )
+    save_precision_plot(
+        metrics_csv_path,
+        EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf",
+        series="matched",
+    )
 
 
 if __name__ == "__main__":

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,6 @@ dependencies = [
     "instructor>=0.4.0",
     "pydantic>=2.0.0",
     "pyyaml>=6.0",
-    "python-dotenv>=1.0.0",
     "tqdm>=4.64.0",
     "rapidfuzz>=3.0.0",
     "beautifulsoup4>=4.11.0",