Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
552 changes: 552 additions & 0 deletions blackbox_test_agent_memory_round2.py

Large diffs are not rendered by default.

405 changes: 405 additions & 0 deletions blackbox_test_agent_memory_round3.py

Large diffs are not rendered by default.

220 changes: 220 additions & 0 deletions blackbox_test_agent_memory_system.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env python3
"""Black-box test runner for the uploaded AgentMemorySystem implementation.

This runner intentionally treats the uploaded code as an opaque component and
only interacts with its public runtime behavior:

- MemLLM.load()
- MemLLM.write()
- MemLLM.generate()
- MemLLM.save_memory()
- MemLLM.load_memory()

It does not call private helpers, does not inspect internal memory state, and
does not use mocks.
"""

from __future__ import annotations

import importlib.util
import math
import os
import platform
import tempfile
import time
from dataclasses import dataclass
from importlib.machinery import SourceFileLoader

import torch
import transformers


# Path of the implementation under test. The default is the location used on
# the machine the suite was written on; set AGENT_MEMORY_TARGET_PATH to point
# the runner at a copy stored elsewhere. An empty value falls back to the
# default so that an accidentally blank variable does not become path "".
TARGET_PATH = (
    os.environ.get("AGENT_MEMORY_TARGET_PATH")
    or "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"
)
# Model identifier handed to MemLLM.load().
MODEL_NAME = "gpt2"
# Prompt used for every music-domain generation (baseline, post-write, reload).
MUSIC_PROMPT = "The piano performance"
# Texts written into memory before the grounded generation is checked.
MUSIC_MEMORIES = [
    "He practiced piano for hours perfecting a difficult Chopin nocturne.",
    "She studied music theory and harmonic progression at the conservatory.",
    "The orchestra rehearsed the symphony before the evening concert.",
]
# Lowercase substrings whose presence in a continuation counts as a
# music-domain signal (see keyword_hits).
MUSIC_KEYWORDS = {
    "music",
    "musical",
    "violin",
    "concert",
    "symphony",
    "guitar",
    "practice",
    "practicing",
}


@dataclass
class CaseResult:
    """Outcome of a single test case, as produced by ``run_case``."""

    case_id: str  # short identifier printed in the report, e.g. "TC-01"
    title: str  # human-readable case name printed next to the identifier
    passed: bool  # True when the case function returned without raising
    duration_s: float  # elapsed time.perf_counter() seconds for the case
    details: str  # the case's return value, or "ExcType: message" on failure


def load_target_module(path: str):
    """Execute the file at *path* as Python source and return the module.

    ``SourceFileLoader`` is used explicitly so the file is treated as Python
    source whatever its extension is (the target is shipped with a ``.md``
    suffix, which the normal import machinery would not recognise).

    The module is registered in ``sys.modules`` *before* it is executed, as
    the importlib "importing a source file directly" recipe does. Without the
    registration, code in the target that looks its own module up by name
    fails; the common case is ``@dataclass`` on a class whose annotations are
    strings (``from __future__ import annotations``), which raises
    ``AttributeError`` while the class is being created.

    Args:
        path: Filesystem path of the source file to load.

    Returns:
        The executed module object.

    Raises:
        RuntimeError: If no import spec can be created for *path*.
        Exception: Anything raised while the target's top-level code runs is
            propagated; the partially initialised module is removed from
            ``sys.modules`` first.
    """
    import sys  # local import: keeps this block self-contained

    loader = SourceFileLoader("agent_memory_system_under_test", path)
    spec = importlib.util.spec_from_loader(loader.name, loader)
    if spec is None:
        raise RuntimeError(f"Unable to create import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[spec.name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # Mirror the regular import system: never leave a half-executed
        # module visible under its name.
        sys.modules.pop(spec.name, None)
        raise
    return module


def keyword_hits(text: str, keywords: set[str]) -> list[str]:
    """Return the keywords that occur in *text*, sorted alphabetically.

    *text* is lowercased before matching; the keywords are used exactly as
    given. Matching is a plain substring test, so a keyword also counts when
    it appears inside a longer word.
    """
    haystack = text.lower()
    found = [word for word in keywords if word in haystack]
    found.sort()
    return found


def check(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is truthy."""
    if condition:
        return
    raise AssertionError(message)


def build_model(module):
    """Create a ``MemLLM`` from *module* and load ``MODEL_NAME`` into it.

    The instance is built from a default-constructed ``module.Cfg()`` and is
    returned after its ``load()`` method has been called.
    """
    config = module.Cfg()
    instance = module.MemLLM(config)
    instance.load(MODEL_NAME)
    return instance


def run_case(case_id: str, title: str, fn) -> CaseResult:
    """Call *fn* with no arguments, time it, and wrap the outcome.

    A normal return marks the case as passed and the return value becomes the
    result's details. Any ``Exception`` marks it as failed and the details
    become ``"<ExceptionType>: <message>"``; the exception is not re-raised.
    The measured duration covers the call to *fn* in both cases.
    """
    started = time.perf_counter()
    try:
        outcome = (True, fn())
    except Exception as exc:  # pragma: no cover - failure path is test output
        outcome = (False, f"{type(exc).__name__}: {exc}")
    elapsed = time.perf_counter() - started
    ok, details = outcome
    return CaseResult(
        case_id=case_id,
        title=title,
        passed=ok,
        duration_s=elapsed,
        details=details,
    )


def main() -> int:
    """Run the six black-box cases in order and print a report.

    The target module is loaded and one model is built up front. The nested
    case functions share that model and the ``state`` dict, so the cases are
    order-dependent: TC-05 reads what TC-03 and TC-04 recorded, and TC-06
    saves the memory of the same model TC-04 wrote to.

    Returns:
        0 when every case passed, 1 otherwise.

    Raises:
        AssertionError: If ``TARGET_PATH`` does not exist. That check, the
            module load and the model build all run outside ``run_case``, so
            a failure there propagates instead of being reported as a failed
            case.
    """
    overall_start = time.perf_counter()
    print("Black-box test runner: AgentMemorySystem")
    print(f"Target file: {TARGET_PATH}")
    print(f"Python: {platform.python_version()}")
    print(f"Torch: {torch.__version__}")
    print(f"Transformers: {transformers.__version__}")
    print("")

    check(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")
    module = load_target_module(TARGET_PATH)
    # Seed torch's RNG before the model is constructed; TC-06 repeats this
    # with the same seed before building the second model.
    torch.manual_seed(42)
    model = build_model(module)
    results: list[CaseResult] = []
    # Scratch space used to pass outputs from one case to a later one.
    state: dict[str, object] = {}

    def tc01_load_public_api() -> str:
        """Confirm the model built above exists."""
        check(model is not None, "MemLLM.load() did not produce a model instance")
        return f"Loaded public model API successfully with {MODEL_NAME}"

    def tc02_generate_without_memory() -> str:
        """Generate from "Hello" and check the result extends the prompt."""
        # NOTE(review): mt presumably caps the number of generated tokens -
        # confirm against MemLLM.generate.
        output = model.generate("Hello", mt=15, greedy=True)
        check(isinstance(output, str), "generate() did not return a string")
        check(output.startswith("Hello"), "Generated text does not preserve the prompt prefix")
        check(len(output) > len("Hello"), "Generated text did not extend the prompt")
        return f"Output: {output!r}"

    def tc03_baseline_music_prompt_before_memory() -> str:
        """Record the music-prompt output and keyword hits before any write."""
        output = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
        # Only the text after the prompt is scanned for keywords.
        continuation = output[len(MUSIC_PROMPT) :]
        hits = keyword_hits(continuation, MUSIC_KEYWORDS)
        # Stored before the prefix check, so TC-05 can still read the baseline
        # if that check fails.
        state["baseline_music_output"] = output
        state["baseline_music_hits"] = hits
        check(output.startswith(MUSIC_PROMPT), "Prompt prefix was not preserved in the baseline run")
        return f"Baseline output: {output!r}\nBaseline keyword hits: {hits}"

    def tc04_write_and_ground_music_domain() -> str:
        """Write the music memories, then require keyword hits in the output."""
        write_lines = []
        for text in MUSIC_MEMORIES:
            # write() is expected to return a (stored count, gate values) pair.
            stored, gates = model.write(text, training_mode=True)
            check(stored == 1, f"training_mode=True should store the input, got stored={stored}")
            check(len(gates) == 1, f"Expected exactly one gate value, got {gates}")
            check(math.isfinite(gates[0]), f"Gate value is not finite: {gates[0]}")
            write_lines.append(f"stored={stored}, gate={gates[0]:.6f}, text={text!r}")

        output = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
        continuation = output[len(MUSIC_PROMPT) :]
        hits = keyword_hits(continuation, MUSIC_KEYWORDS)
        # Stored before the checks below, so TC-05 can compare even when this
        # case fails.
        state["post_memory_music_output"] = output
        state["post_memory_music_hits"] = hits
        check(output.startswith(MUSIC_PROMPT), "Prompt prefix was not preserved after writing memory")
        check(hits, f"No music-domain grounding detected in continuation: {continuation!r}")
        return "\n".join(write_lines + [f"Output: {output!r}", f"Keyword hits: {hits}"])

    def tc05_memory_improves_domain_signal() -> str:
        """Require strictly more keyword hits after the writes than before."""
        baseline_hits = state.get("baseline_music_hits")
        post_hits = state.get("post_memory_music_hits")
        # Either value is None when the case that records it raised before
        # reaching its state assignment.
        check(isinstance(baseline_hits, list), "Baseline music output was not recorded")
        check(isinstance(post_hits, list), "Post-memory music output was not recorded")
        check(
            len(post_hits) > len(baseline_hits),
            f"Music-domain signal did not improve: baseline={baseline_hits}, post={post_hits}",
        )
        return (
            f"Baseline hits: {baseline_hits}\n"
            f"Post-memory hits: {post_hits}\n"
            f"Baseline output: {state['baseline_music_output']!r}\n"
            f"Post-memory output: {state['post_memory_music_output']!r}"
        )

    def tc06_save_load_roundtrip() -> str:
        """Save memory to a temp file, load it into a new model, and re-check."""
        fd, memory_path = tempfile.mkstemp(prefix="agent-memory-", suffix=".pt")
        # Only the path is needed; close the descriptor mkstemp opened.
        os.close(fd)
        try:
            model.save_memory(memory_path)
            check(os.path.exists(memory_path), "save_memory() did not create a file")
            file_size = os.path.getsize(memory_path)
            check(file_size > 0, "save_memory() created an empty file")

            # Same seed as the first build, then a second model instance.
            torch.manual_seed(42)
            reloaded = build_model(module)
            reloaded.load_memory(memory_path)
            output = reloaded.generate(MUSIC_PROMPT, mt=20, greedy=True)
            continuation = output[len(MUSIC_PROMPT) :]
            hits = keyword_hits(continuation, MUSIC_KEYWORDS)
            check(output.startswith(MUSIC_PROMPT), "Reloaded model did not preserve the prompt prefix")
            check(hits, f"No music-domain grounding after reload: {continuation!r}")
            return (
                f"Saved file: {memory_path} ({file_size} bytes)\n"
                f"Output after reload: {output!r}\n"
                f"Keyword hits after reload: {hits}"
            )
        finally:
            # Remove the temp file whether or not the case passed.
            if os.path.exists(memory_path):
                os.remove(memory_path)

    # Execution order matters: see the dependencies noted in the docstring.
    results.append(run_case("TC-01", "load public API", tc01_load_public_api))
    results.append(run_case("TC-02", "generate without memory", tc02_generate_without_memory))
    results.append(run_case("TC-03", "baseline music prompt before memory", tc03_baseline_music_prompt_before_memory))
    results.append(run_case("TC-04", "write memory and observe domain grounding", tc04_write_and_ground_music_domain))
    results.append(run_case("TC-05", "memory improves domain signal", tc05_memory_improves_domain_signal))
    results.append(run_case("TC-06", "save/load memory roundtrip", tc06_save_load_roundtrip))

    passed = sum(1 for result in results if result.passed)
    failed = len(results) - passed
    total_duration = time.perf_counter() - overall_start

    print("=" * 72)
    for result in results:
        status = "PASS" if result.passed else "FAIL"
        print(f"[{status}] {result.case_id} - {result.title} ({result.duration_s:.2f}s)")
        print(result.details)
        print("-" * 72)
    print(f"Summary: {passed}/{len(results)} passed, {failed} failed")
    print(f"Total duration: {total_duration:.2f}s")

    return 0 if failed == 0 else 1


# Script entry point: the process exit status is main()'s return value
# (0 when every case passed, 1 otherwise).
if __name__ == "__main__":
    raise SystemExit(main())
33 changes: 33 additions & 0 deletions reports/agent_memory_blackbox_extended_summary.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# AgentMemorySystem 扩展黑盒测试总览

## 1. 本轮完成内容

- 第二轮 full 覆盖
- 放大版跨域污染热图
- 第三轮 full 覆盖(边界输入、异常输入、性能/时延)

## 2. 第二轮 full 结果

- PASS: 7
- WARN: 2
- FAIL: 0

## 3. 第三轮 full 结果

- PASS: 10
- WARN: 1
- FAIL: 1

## 4. 放大版污染热图结论

说明:`ratio = foreign / own`(例如 cooking:5 / 2 = 2.50)。

- cooking: own=2, foreign=5, ratio=2.50, verdict=high-contamination
- finance: own=2, foreign=3, ratio=1.50, verdict=high-contamination
- music: own=8, foreign=2, ratio=0.25, verdict=mixed
- space: own=7, foreign=4, ratio=0.57, verdict=mixed

## 5. 最高优先级发现

- P1: 空 prompt `generate("")` 会直接崩溃。
- P1: `transformers 5.x` 兼容性失败仍然成立。
- P2: 跨域污染在 dual-domain、four-way 以及放大版热图中均被稳定复现。
- P3: `mt < 0` 缺少显式参数校验,当前表现为直接返回原 prompt。
Loading