Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 31 additions & 6 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,40 @@ pip install -e .[dev]

## Configure Environment Variables

Create an `.env` file (or export in your shell) with credentials for the LLM providers you use. A minimal configuration:
DELM requires API keys for the LLM providers you use. You are responsible for loading these environment variables in whatever way works best for your workflow.

```env
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=...
TOGETHER_API_KEY=...
### Required Environment Variables by Provider

- **OpenAI**: `OPENAI_API_KEY`
- **Anthropic**: `ANTHROPIC_API_KEY`
- **Google**: `GOOGLE_API_KEY`
- **Groq**: `GROQ_API_KEY`
- **Together AI**: `TOGETHER_API_KEY`
- **Fireworks AI**: `FIREWORKS_API_KEY`

### Option 1: Export in Your Shell

```bash
export OPENAI_API_KEY="sk-..."
export ANTHROPIC_API_KEY="..."
```

### Option 2: Use python-dotenv (Optional)

If you prefer using `.env` files, install and use `python-dotenv`:

```bash
pip install python-dotenv
```

Then in your script:

```python
from dotenv import load_dotenv
load_dotenv()  # Locate a .env file (parent directories are searched too) and load it into os.environ
```

Replace the values with your credentials. DELM only loads providers that have available keys.
**Note**: You only need to set the API key for the provider you're using. DELM does not load `.env` files itself; the underlying LLM client libraries (OpenAI, Anthropic, etc.) read these environment variables directly.

## Create Your Pipeline Configuration

Expand Down
43 changes: 42 additions & 1 deletion example.env
Original file line number Diff line number Diff line change
@@ -1,6 +1,47 @@
# Environment Variable Reference for DELM
# ========================================
#
# DELM requires certain environment variables to be set depending on which LLM provider you use.
# These variables should be loaded into your environment before running DELM.
#
# How to set environment variables:
# ---------------------------------
#
# Option 1: Export in your shell
# export OPENAI_API_KEY="your-openai-key"
#
# Option 2: Use a .env file with python-dotenv (user's choice)
# pip install python-dotenv
# Then in your script: from dotenv import load_dotenv; load_dotenv()
#
# Option 3: Set in Docker/container environment
# docker run -e OPENAI_API_KEY="your-key" ...
#
# Option 4: Use a cloud secrets manager (AWS Secrets Manager, GCP Secret Manager, etc.)
#
# Option 5: Set in your IDE/development environment
#
# Required Environment Variables by Provider:
# -------------------------------------------

# OpenAI (for gpt-4, gpt-3.5-turbo, etc.)
OPENAI_API_KEY="your-openai-key"

# Anthropic (for claude-3-*, claude-2, etc.)
ANTHROPIC_API_KEY="your-anthropic-key"

# Google (for gemini-*, palm-*, etc.)
GOOGLE_API_KEY="your-google-key"

# Groq (for llama-*, mixtral-*, etc.)
GROQ_API_KEY="your-groq-key"

# Together AI
TOGETHER_API_KEY="your-together-key"

# Fireworks AI
FIREWORKS_API_KEY="your-fireworks-key"

# Note: You only need to set the API key for the provider you're using.
# DELM no longer automatically loads .env files - you are responsible for
# ensuring the appropriate environment variables are set before running DELM.
162 changes: 123 additions & 39 deletions examples/prompt_optimization/prompt_optimization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Implements LLM-In-the-Loop PRompt Optimization (LILPRO) using DELM.
"""
"""Implements LLM-In-the-Loop PRompt Optimization (LILPRO) using DELM."""

from __future__ import annotations

Expand Down Expand Up @@ -54,6 +53,7 @@
# helpers
# ----------------------------------------------------------------------------


def build_expected_df(record_labeled_df: pd.DataFrame) -> pd.DataFrame:
"""Create nested expected JSON per id, aggregating duplicates.

Expand Down Expand Up @@ -86,24 +86,36 @@ def build_expected_df(record_labeled_df: pd.DataFrame) -> pd.DataFrame:
.reset_index(name="items")
)

grouped["expected_json"] = grouped["items"].apply(lambda items: {CONTAINER_NAME: items})
grouped["expected_json"] = grouped["items"].apply(
lambda items: {CONTAINER_NAME: items}
)
return grouped[["id", "expected_json"]]


def _count_price_expectation(items: List[Dict[str, Any]] | None) -> Tuple[int, int]:
"""Return counts of True/False for price_expectation across items."""
if not items:
return 0, 0
true_count = sum(1 for it in items if isinstance(it, dict) and it.get("price_expectation") is True)
false_count = sum(1 for it in items if isinstance(it, dict) and it.get("price_expectation") is False)
true_count = sum(
1
for it in items
if isinstance(it, dict) and it.get("price_expectation") is True
)
false_count = sum(
1
for it in items
if isinstance(it, dict) and it.get("price_expectation") is False
)
return true_count, false_count


def _extract_items(d: Dict[str, Any] | None) -> List[Dict[str, Any]]:
    """Return the dict entries stored under ``CONTAINER_NAME`` in ``d``.

    Args:
        d: A mapping expected to hold a list under the ``CONTAINER_NAME`` key,
            or ``None``.

    Returns:
        The elements of ``d[CONTAINER_NAME]`` that are dicts, in their original
        order. An empty list is returned when ``d`` is not a dict, or when the
        value under ``CONTAINER_NAME`` is missing or is not a list.
    """
    if not isinstance(d, dict):
        return []
    items = d.get(CONTAINER_NAME)
    if not isinstance(items, list):
        return []
    # Drop malformed (non-dict) entries instead of failing on them.
    return [it for it in items if isinstance(it, dict)]


def _normalize_good(value: Any) -> str:
Expand Down Expand Up @@ -212,14 +224,24 @@ def annotate_price_expectation_counts(record_pairs_df: pd.DataFrame) -> pd.DataF
),
axis=1,
)
counts_df = pd.DataFrame(list(counts), columns=["expected_counts", "predicted_counts"], index=df.index)
counts_df = pd.DataFrame(
list(counts), columns=["expected_counts", "predicted_counts"], index=df.index
)
out = pd.DataFrame(
{
"id": df["id"].tolist(),
"exp_true": counts_df["expected_counts"].apply(lambda x: int(x[0]) if isinstance(x, tuple) else 0),
"exp_false": counts_df["expected_counts"].apply(lambda x: int(x[1]) if isinstance(x, tuple) else 0),
"pred_true": counts_df["predicted_counts"].apply(lambda x: int(x[0]) if isinstance(x, tuple) else 0),
"pred_false": counts_df["predicted_counts"].apply(lambda x: int(x[1]) if isinstance(x, tuple) else 0),
"exp_true": counts_df["expected_counts"].apply(
lambda x: int(x[0]) if isinstance(x, tuple) else 0
),
"exp_false": counts_df["expected_counts"].apply(
lambda x: int(x[1]) if isinstance(x, tuple) else 0
),
"pred_true": counts_df["predicted_counts"].apply(
lambda x: int(x[0]) if isinstance(x, tuple) else 0
),
"pred_false": counts_df["predicted_counts"].apply(
lambda x: int(x[1]) if isinstance(x, tuple) else 0
),
}
)
return out
Expand Down Expand Up @@ -259,7 +281,12 @@ def compute_batch_stats(

# Total extractions (total predicted items across all records)
n_extractions = int(
sum(len(_extract_items(d)) for d in record_pairs_df.get("extracted_dict", pd.Series([{}] * len(record_pairs_df))) )
sum(
len(_extract_items(d))
for d in record_pairs_df.get(
"extracted_dict", pd.Series([{}] * len(record_pairs_df))
)
)
)

# Wrong price_expectation among matched (id+good) pairs (boolean inequality)
Expand Down Expand Up @@ -291,7 +318,9 @@ def append_metrics_row(csv_path: Path, row: Dict[str, Any]) -> None:
df.to_csv(csv_path, mode="a", header=header, index=False)


def save_precision_plot(csv_path: Path, out_path: Path, series: str = "presence") -> None:
def save_precision_plot(
csv_path: Path, out_path: Path, series: str = "presence"
) -> None:
"""Render precision-vs-batch plot from CSV with dynamic y-limits.

series: "presence" to plot estimator precision; "matched" to plot matched_precision
Expand All @@ -305,18 +334,20 @@ def save_precision_plot(csv_path: Path, out_path: Path, series: str = "presence"
return
# ICLR-friendly style similar to cost_vs_coverage
sns.set_theme(style="whitegrid", font_scale=1.2)
plt.rcParams.update({
"figure.figsize": (3.0, 2.0),
"font.size": 8,
"axes.labelsize": 8,
"axes.titlesize": 9,
"legend.fontsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"savefig.bbox": "tight",
"savefig.pad_inches": 0.02,
"pdf.fonttype": 42,
})
plt.rcParams.update(
{
"figure.figsize": (3.0, 2.0),
"font.size": 8,
"axes.labelsize": 8,
"axes.titlesize": 9,
"legend.fontsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"savefig.bbox": "tight",
"savefig.pad_inches": 0.02,
"pdf.fonttype": 42,
}
)
plt.figure()
if series == "presence":
y = df["precision"]
Expand Down Expand Up @@ -395,10 +426,14 @@ def set_price_expectation_description(schema_path: Path, new_description: str) -
changed = True
break
if changed:
schema_path.write_text(yaml.safe_dump(spec, sort_keys=False, allow_unicode=True))
schema_path.write_text(
yaml.safe_dump(spec, sort_keys=False, allow_unicode=True)
)


def run_optimizer_and_get_guidance(current_definition: str, examples_text: str) -> Dict[str, Any]:
def run_optimizer_and_get_guidance(
current_definition: str, examples_text: str
) -> Dict[str, Any]:
"""Run optimizer to produce a refined definition from wrong examples."""
cfg = DELMConfig.from_yaml(OPTIMIZER_CONFIG_PATH)
cfg.schema.spec_path = OPTIMIZER_SCHEMA_PATH
Expand Down Expand Up @@ -441,6 +476,7 @@ def run_optimizer_and_get_guidance(current_definition: str, examples_text: str)
# main flow
# ----------------------------------------------------------------------------


def main() -> None:
"""Run iterative optimization and plot precision across batches."""
random.seed(RANDOM_SEED)
Expand All @@ -465,7 +501,9 @@ def main() -> None:
metrics_csv_path = EXPERIMENT_ROOT_DIR / "precision_by_batch.csv"

# Determine 10% evaluation sample size (at least 1 record)
eval_record_sample_size = max(1, int(np.ceil(EVAL_SAMPLE_RATIO * len(record_expected_df))))
eval_record_sample_size = max(
1, int(np.ceil(EVAL_SAMPLE_RATIO * len(record_expected_df)))
)

for batch_idx in tqdm(range(NUM_BATCHES + 1), desc="batches", leave=True):
cfg = DELMConfig.from_dict(base_cfg.to_serialized_config_dict())
Expand Down Expand Up @@ -501,12 +539,24 @@ def main() -> None:
)

# Append to in-memory list for reference
batch_records.append({"batch": batch_idx, "precision": precision, "matched_precision": matched_precision, **stats})
batch_records.append(
{
"batch": batch_idx,
"precision": precision,
"matched_precision": matched_precision,
**stats,
}
)

# Persist/update the metrics CSV after each batch
append_metrics_row(
metrics_csv_path,
{"batch": batch_idx, "precision": precision, "matched_precision": matched_precision, **stats},
{
"batch": batch_idx,
"precision": precision,
"matched_precision": matched_precision,
**stats,
},
)

# Save the per-record trace for price_expectation counts
Expand All @@ -518,13 +568,31 @@ def main() -> None:
json.dump(metrics_dict, fh, ensure_ascii=False, indent=2)

record_pairs_out_df = record_pairs_df.copy()
record_pairs_out_df.to_json(exp_dir / "record_pairs.json", orient="records", force_ascii=False, indent=2)
record_pairs_out_df.to_json(
exp_dir / "record_pairs.json", orient="records", force_ascii=False, indent=2
)

# Save or update the precision plots incrementally (PNG + PDF)
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png", series="presence")
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf", series="presence")
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png", series="matched")
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf", series="matched")
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png",
series="presence",
)
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf",
series="presence",
)
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png",
series="matched",
)
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf",
series="matched",
)

if batch_idx < NUM_BATCHES:
wrong_df = find_wrong_price_expectation_records(record_pairs_df)
Expand All @@ -551,10 +619,26 @@ def main() -> None:
set_price_expectation_description(BASE_SCHEMA_PATH, new_def)

# Final plot refresh from accumulated CSV (PNG + PDF)
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png", series="presence")
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf", series="presence")
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png", series="matched")
save_precision_plot(metrics_csv_path, EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf", series="matched")
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.png",
series="presence",
)
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_presence.pdf",
series="presence",
)
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.png",
series="matched",
)
save_precision_plot(
metrics_csv_path,
EXPERIMENT_ROOT_DIR / "precision_vs_batch_matched.pdf",
series="matched",
)


if __name__ == "__main__":
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ dependencies = [
"instructor>=0.4.0",
"pydantic>=2.0.0",
"pyyaml>=6.0",
"python-dotenv>=1.0.0",
"tqdm>=4.64.0",
"rapidfuzz>=3.0.0",
"beautifulsoup4>=4.11.0",
Expand Down
Loading