#!/usr/bin/env python3
# Recovered from a newline-collapsed git patch:
#   diff --git a/blackbox_test_agent_memory_round2.py b/blackbox_test_agent_memory_round2.py
#   new file mode 100644 / index 0000000..902dd92 / --- /dev/null / @@ -0,0 +1,552 @@
"""Second-round black-box matrix runner for AgentMemorySystem.

This runner extends the first-round smoke tests with broader black-box
coverage:

- stress scenarios
- long-text scenarios
- cross-domain contamination diagnostics
- stability scenarios

The runner still treats the uploaded implementation as an opaque component and
only uses its public runtime behavior:

- MemLLM.load()
- MemLLM.write()
- MemLLM.generate()
- MemLLM.save_memory()
- MemLLM.load_memory()

Status semantics:
- PASS: scenario met its acceptance rule
- WARN: scenario finished, but a diagnostic risk was observed
- FAIL: scenario violated a gating requirement or raised an unexpected error
"""

from __future__ import annotations

import argparse
import importlib.util
import json
import math
import os
import platform
import sys
import tempfile
import time
from dataclasses import asdict, dataclass
from importlib.machinery import SourceFileLoader
from typing import Callable

import torch
import transformers


TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"
MODEL_NAME = "gpt2"
DEFAULT_SEED = 42

# One generation prompt per domain; keys match CORPORA and KEYWORDS.
PROMPTS = {
    "music": "The piano performance",
    "space": "The space telescope",
    "finance": "The market outlook",
    "cooking": "The chef prepared",
}

# Sentences written into memory for each domain.
CORPORA = {
    "music": [
        "He practiced piano for hours perfecting a difficult Chopin nocturne.",
        "She studied music theory and harmonic progression at the conservatory.",
        "The orchestra rehearsed the symphony before the evening concert.",
    ],
    "space": [
        "Astronauts trained for the Mars mission in simulated zero gravity.",
        "The telescope revealed distant galaxies beyond the Milky Way.",
        "Mission control tracked the spacecraft during orbital insertion.",
    ],
    "finance": [
        "Investors monitored inflation data before the central bank meeting.",
        "The portfolio manager reduced exposure to volatile growth stocks.",
        "Quarterly earnings guidance shifted sentiment across the market.",
    ],
    "cooking": [
        "The chef reduced the sauce slowly before plating the duck.",
        "Fresh basil and olive oil brightened the pasta at the finish.",
        "The pastry team tempered chocolate for the dessert service.",
    ],
}

# Lower-case substrings (some are deliberate stems, e.g. "galax") that count
# as evidence of a domain in generated text.
KEYWORDS = {
    "music": {
        "music",
        "musical",
        "violin",
        "concert",
        "symphony",
        "guitar",
        "practice",
        "practicing",
        "piano",
        "theory",
    },
    "space": {
        "space",
        "telescope",
        "galax",
        "orbit",
        "orbital",
        "mars",
        "mission",
        "astronaut",
        "spacecraft",
        "planet",
    },
    "finance": {
        "market",
        "stocks",
        "portfolio",
        "inflation",
        "bank",
        "earnings",
        "investor",
        "trading",
        "equity",
        "sentiment",
    },
    "cooking": {
        "chef",
        "sauce",
        "pasta",
        "dessert",
        "olive",
        "basil",
        "plating",
        "chocolate",
        "kitchen",
        "roasted",
    },
}


@dataclass
class ScenarioDef:
    """Static description of one scenario in the test matrix."""

    scenario_id: str  # stable identifier, e.g. "R2-STRESS-01"
    category: str  # reporting group (stress, long-text, cross-domain, stability)
    title: str  # short human-readable label
    suite: str  # "representative" or "full"
    runner: Callable[[object], tuple[str, str, dict]]  # module -> (status, summary, metrics)


@dataclass
class ScenarioResult:
    """Outcome of one executed scenario, serialised with ``asdict`` for JSON output."""

    scenario_id: str
    category: str
    title: str
    suite: str
    status: str  # "PASS", "WARN" or "FAIL"
    duration_s: float  # wall-clock seconds measured by run_scenario
    summary: str
    metrics: dict


def load_target_module(path: str):
    """Import the implementation under test from an arbitrary file path.

    An explicit ``SourceFileLoader`` is used so the file extension does not
    matter (the default target ends in ``.md``).

    The module is registered in ``sys.modules`` before it is executed, as the
    importlib "importing a source file directly" recipe does: code that looks
    its own module up by name while the body runs (``dataclasses`` with string
    annotations, pickling) fails otherwise.  The entry is removed again if
    execution raises, so a half-initialised module is never left behind.

    Raises:
        RuntimeError: if no import spec can be created for ``path``.
    """
    loader = SourceFileLoader("agent_memory_system_round2", path)
    spec = importlib.util.spec_from_loader(loader.name, loader)
    if spec is None:
        raise RuntimeError(f"Unable to create import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[loader.name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        sys.modules.pop(loader.name, None)
        raise
    return module


def build_model(module, seed: int = DEFAULT_SEED):
    """Seed torch, then construct ``module.MemLLM(module.Cfg())`` and load ``MODEL_NAME``."""
    torch.manual_seed(seed)
    cfg = module.Cfg()
    model = module.MemLLM(cfg)
    model.load(MODEL_NAME)
    return model


def ensure(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` when ``condition`` is falsy.

    Used instead of the ``assert`` statement so checks survive ``python -O``.
    """
    if not condition:
        raise AssertionError(message)


def keyword_hits(text: str, keywords: set[str]) -> list[str]:
    """Return the sorted keywords that occur in ``text``.

    Matching is a case-insensitive *substring* test, not a whole-word test.
    """
    lowered = text.lower()
    return sorted(keyword for keyword in keywords if keyword in lowered)


def continuation(prompt: str, output: str) -> str:
    """Return the part of ``output`` after ``prompt``.

    Raises:
        AssertionError: if ``output`` does not start with ``prompt``.
    """
    ensure(output.startswith(prompt), f"Output does not preserve prompt prefix: {output!r}")
    return output[len(prompt) :]


def mean_gate(values: list[float]) -> float:
    """Return the arithmetic mean of ``values``; an empty list is an assertion failure."""
    ensure(bool(values), "No gate values were collected")
    return float(sum(values) / len(values))


def stable_write(model, texts: list[str]) -> list[float]:
    """Write each text with ``training_mode=True`` and return the gate values.

    Every write must report exactly one stored memory and exactly one finite
    gate value; anything else raises ``AssertionError``.
    """
    gates: list[float] = []
    for text in texts:
        stored, gate_vals = model.write(text, training_mode=True)
        ensure(stored == 1, f"Expected one stored memory for training_mode=True, got {stored}")
        ensure(len(gate_vals) == 1, f"Expected one gate value, got {gate_vals}")
        ensure(math.isfinite(gate_vals[0]), f"Gate value is not finite: {gate_vals[0]}")
        gates.extend(gate_vals)
    return gates


def run_scenario(module, scenario: ScenarioDef) -> ScenarioResult:
    """Run one scenario, time it, and convert any exception into a FAIL result.

    An ``AssertionError`` contributes only its message as the summary; any
    other exception is summarised as ``"<Type>: <message>"``.  Metrics are
    empty for failed scenarios.
    """
    start = time.perf_counter()
    try:
        status, summary, metrics = scenario.runner(module)
    except AssertionError as exc:
        status = "FAIL"
        summary = str(exc)
        metrics = {}
    except Exception as exc:  # pragma: no cover - surfaced in terminal output
        status = "FAIL"
        summary = f"{type(exc).__name__}: {exc}"
        metrics = {}
    duration_s = time.perf_counter() - start
    return ScenarioResult(
        scenario_id=scenario.scenario_id,
        category=scenario.category,
        title=scenario.title,
        suite=scenario.suite,
        status=status,
        duration_s=duration_s,
        summary=summary,
        metrics=metrics,
    )


def scenario_stress_write_generate(module) -> tuple[str, str, dict]:
    """Alternate full-corpus writes and per-domain generations for two rounds.

    Passes when every write is accepted and every generation returns a string
    that extends, and starts with, its prompt.  ``sample_outputs`` holds the
    outputs of the last round only.
    """
    model = build_model(module)
    gate_values: list[float] = []
    prompt_outputs: dict[str, str] = {}
    rounds = 2
    for _ in range(rounds):
        # Iterate the corpus table itself so a new domain is picked up automatically.
        for texts in CORPORA.values():
            gate_values.extend(stable_write(model, texts))
        for domain, prompt in PROMPTS.items():
            # NOTE(review): `mt` looks like a max-new-tokens limit - confirm against MemLLM.
            output = model.generate(prompt, mt=20, greedy=True)
            ensure(isinstance(output, str), f"generate() did not return a string for {domain}")
            ensure(len(output) > len(prompt), f"generate() did not extend prompt for {domain}")
            continuation(prompt, output)
            prompt_outputs[domain] = output
    return (
        "PASS",
        "Completed repeated write/generate pressure loop without crash.",
        {
            "rounds": rounds,
            "total_writes": rounds * sum(len(v) for v in CORPORA.values()),
            "total_generations": rounds * len(PROMPTS),
            "avg_gate": round(mean_gate(gate_values), 6),
            "sample_outputs": prompt_outputs,
        },
    )


def scenario_stress_save_load_cycles(module) -> tuple[str, str, dict]:
    """Save memory and reload it into a fresh model, twice in a row.

    Each cycle saves from the model produced by the previous cycle, so the
    second save exercises memory that has itself been through a reload.
    """
    model = build_model(module)
    gate_values: list[float] = []
    for texts in CORPORA.values():
        gate_values.extend(stable_write(model, texts))

    outputs_by_cycle: list[dict[str, str]] = []
    cycles = 2
    fd, memory_path = tempfile.mkstemp(prefix="agent-memory-round2-", suffix=".pt")
    os.close(fd)  # only the path is needed; save_memory() opens the file itself
    current = model
    try:
        for _ in range(cycles):
            current.save_memory(memory_path)
            ensure(os.path.getsize(memory_path) > 0, "save_memory() produced an empty file")
            reloaded = build_model(module)
            reloaded.load_memory(memory_path)
            cycle_outputs = {}
            for domain, prompt in PROMPTS.items():
                output = reloaded.generate(prompt, mt=20, greedy=True)
                continuation(prompt, output)
                cycle_outputs[domain] = output
            outputs_by_cycle.append(cycle_outputs)
            current = reloaded
    finally:
        if os.path.exists(memory_path):
            os.remove(memory_path)

    return (
        "PASS",
        "Repeated save/load cycles preserved externally valid generation behavior.",
        {
            "cycles": cycles,
            "avg_gate": round(mean_gate(gate_values), 6),
            "cycle_outputs": outputs_by_cycle,
        },
    )


def scenario_long_memory_write(module) -> tuple[str, str, dict]:
    """Write one very long memory (the music corpus repeated 20 times) and generate.

    Passes when the write is accepted as a single memory and the continuation
    of the music prompt contains at least one music keyword.
    """
    model = build_model(module)
    long_text = " ".join(CORPORA["music"] * 20)
    stored, gates = model.write(long_text, training_mode=True)
    ensure(stored == 1, f"Expected one stored memory for long text, got {stored}")
    ensure(len(gates) == 1 and math.isfinite(gates[0]), f"Unexpected gate values: {gates}")

    prompt = PROMPTS["music"]
    output = model.generate(prompt, mt=25, greedy=True)
    cont = continuation(prompt, output)
    hits = keyword_hits(cont, KEYWORDS["music"])
    ensure(bool(hits), f"Long-memory write did not yield music-domain hits: {cont!r}")
    return (
        "PASS",
        "Long memory write remained usable for downstream generation.",
        {
            "long_text_chars": len(long_text),
            "gate": round(gates[0], 6),
            "output": output,
            "keyword_hits": hits,
        },
    )


def scenario_long_prompt_resilience(module) -> tuple[str, str, dict]:
    """Generate from a prompt of 60 repeated sentences; problems are diagnostic only.

    Returns WARN, never FAIL, both when ``generate()`` raises and when the
    output does not start with the prompt.  The two cases get different
    summaries so a dropped prefix is not reported as a raised error.
    """
    model = build_model(module)
    prompt = ("The piano performance review discussed harmony and rhythm in detail. " * 60).strip()
    try:
        output = model.generate(prompt, mt=10, greedy=True)
        prefix_preserved = output.startswith(prompt)
    except Exception as exc:
        return (
            "WARN",
            "Long prompt generation raised an externally visible error.",
            {
                "prompt_chars": len(prompt),
                "error_type": type(exc).__name__,
                "error_message": str(exc),
            },
        )
    if not prefix_preserved:
        return (
            "WARN",
            "Long prompt generation completed but did not preserve the prompt prefix.",
            {
                "prompt_chars": len(prompt),
                "output_chars": len(output),
                "error_type": "AssertionError",
                "error_message": f"Output does not preserve prompt prefix: {output!r}",
            },
        )
    return (
        "PASS",
        "Long prompt generation completed without crashing.",
        {
            "prompt_chars": len(prompt),
            "output_chars": len(output),
        },
    )


def scenario_cross_domain_dual(module) -> tuple[str, str, dict]:
    """Write music and space memories, then look for keyword bleed between them.

    WARN when either continuation contains the other domain's keywords or
    lacks its own; PASS otherwise.
    """
    model = build_model(module)
    stable_write(model, CORPORA["music"] + CORPORA["space"])

    music_output = model.generate(PROMPTS["music"], mt=20, greedy=True)
    space_output = model.generate(PROMPTS["space"], mt=20, greedy=True)
    music_cont = continuation(PROMPTS["music"], music_output)
    space_cont = continuation(PROMPTS["space"], space_output)

    music_own = keyword_hits(music_cont, KEYWORDS["music"])
    music_foreign = keyword_hits(music_cont, KEYWORDS["space"])
    space_own = keyword_hits(space_cont, KEYWORDS["space"])
    space_foreign = keyword_hits(space_cont, KEYWORDS["music"])

    contamination_detected = bool(music_foreign or space_foreign)
    missing_own_signal = not music_own or not space_own
    status = "PASS"
    summary = "Dual-domain prompts preserved own-domain signal without obvious contamination."
    if contamination_detected or missing_own_signal:
        status = "WARN"
        summary = "Dual-domain run showed contamination or weak own-domain separation."

    return (
        status,
        summary,
        {
            "music_output": music_output,
            "space_output": space_output,
            "music_own_hits": music_own,
            "music_foreign_hits": music_foreign,
            "space_own_hits": space_own,
            "space_foreign_hits": space_foreign,
        },
    )


def scenario_cross_domain_fourway(module) -> tuple[str, str, dict]:
    """Write all four corpora and build a prompt-domain x keyword-domain hit matrix.

    ``matrix[prompt_domain][keyword_domain]`` is the list of keyword hits.
    WARN when any prompt lacks own-domain hits or has any foreign-domain hit.
    """
    model = build_model(module)
    for texts in CORPORA.values():
        stable_write(model, texts)

    matrix: dict[str, dict[str, list[str]]] = {}
    warning = False
    for domain, prompt in PROMPTS.items():
        output = model.generate(prompt, mt=20, greedy=True)
        cont = continuation(prompt, output)
        row = {
            keyword_domain: keyword_hits(cont, keywords)
            for keyword_domain, keywords in KEYWORDS.items()
        }
        matrix[domain] = row
        own = row[domain]
        foreign = {k: v for k, v in row.items() if k != domain and v}
        if not own or foreign:
            warning = True

    status = "WARN" if warning else "PASS"
    summary = (
        "Four-way contamination matrix detected cross-domain bleed or weak own-domain signal."
        if warning
        else "Four-way contamination matrix looked clean."
    )
    return status, summary, {"matrix": matrix}


def scenario_stability_fresh_instance(module) -> tuple[str, str, dict]:
    """Two identically seeded fresh models must give identical greedy output."""
    model_a = build_model(module, seed=DEFAULT_SEED)
    stable_write(model_a, CORPORA["music"])
    output_a = model_a.generate(PROMPTS["music"], mt=20, greedy=True)

    model_b = build_model(module, seed=DEFAULT_SEED)
    stable_write(model_b, CORPORA["music"])
    output_b = model_b.generate(PROMPTS["music"], mt=20, greedy=True)

    ensure(output_a == output_b, "Fresh seeded instances produced different greedy outputs")
    return (
        "PASS",
        "Fresh seeded instances were exactly deterministic under greedy generation.",
        {
            "prompt": PROMPTS["music"],
            "output": output_a,
        },
    )


def scenario_stability_same_instance(module) -> tuple[str, str, dict]:
    """Three greedy calls on one model should agree; drift is WARN, not FAIL."""
    model = build_model(module)
    stable_write(model, CORPORA["music"])
    outputs = [model.generate(PROMPTS["music"], mt=20, greedy=True) for _ in range(3)]
    identical = outputs[0] == outputs[1] == outputs[2]
    return (
        "PASS" if identical else "WARN",
        "Repeated greedy calls on the same instance were identical."
        if identical
        else "Repeated greedy calls drifted on the same instance.",
        {
            "outputs": outputs,
        },
    )


def scenario_stability_roundtrip_exactness(module) -> tuple[str, str, dict]:
    """Greedy output must be identical before and after a save/load roundtrip."""
    model = build_model(module)
    stable_write(model, CORPORA["music"])
    baseline = model.generate(PROMPTS["music"], mt=20, greedy=True)

    fd, memory_path = tempfile.mkstemp(prefix="agent-memory-round2-exact-", suffix=".pt")
    os.close(fd)  # only the path is needed; save_memory() opens the file itself
    try:
        model.save_memory(memory_path)
        reloaded = build_model(module)
        reloaded.load_memory(memory_path)
        after_reload = reloaded.generate(PROMPTS["music"], mt=20, greedy=True)
    finally:
        if os.path.exists(memory_path):
            os.remove(memory_path)

    ensure(baseline == after_reload, "Greedy output changed after save/load roundtrip")
    return (
        "PASS",
        "Greedy output stayed exactly stable across save/load roundtrip.",
        {
            "output": baseline,
        },
    )


# Scenario registry. "representative" scenarios run in both suites; "full"
# scenarios run only when the full suite is selected.
SCENARIOS = [
    ScenarioDef("R2-STRESS-01", "stress", "repeated write/generate pressure", "representative", scenario_stress_write_generate),
    ScenarioDef("R2-STRESS-02", "stress", "repeated save/load pressure", "full", scenario_stress_save_load_cycles),
    ScenarioDef("R2-LONG-01", "long-text", "long memory write grounding", "representative", scenario_long_memory_write),
    ScenarioDef("R2-LONG-02", "long-text", "long prompt resilience", "full", scenario_long_prompt_resilience),
    ScenarioDef("R2-CROSS-01", "cross-domain", "dual-domain contamination diagnostic", "representative", scenario_cross_domain_dual),
    ScenarioDef("R2-CROSS-02", "cross-domain", "four-domain contamination matrix", "full", scenario_cross_domain_fourway),
    ScenarioDef("R2-STABLE-01", "stability", "fresh instance determinism", "representative", scenario_stability_fresh_instance),
    ScenarioDef("R2-STABLE-02", "stability", "same instance repeatability", "full", scenario_stability_same_instance),
    ScenarioDef("R2-STABLE-03", "stability", "save/load exactness", "full", scenario_stability_roundtrip_exactness),
]


# ---------------------------------------------------------------------------
# Tail of blackbox_test_agent_memory_round2.py (scenario selection and CLI).
# ---------------------------------------------------------------------------


def select_scenarios(suite: str, scenario_ids: list[str] | None) -> list[ScenarioDef]:
    """Return the registered scenarios matching ``suite`` and the optional ID filter.

    ``"representative"`` selects only representative scenarios; any other
    suite value selects representative and full ones.  When ``scenario_ids``
    is non-empty, only scenarios with those IDs are kept; IDs that match
    nothing are ignored.  Registry order is preserved.
    """
    allowed_suites = {"representative"} if suite == "representative" else {"representative", "full"}
    id_filter = set(scenario_ids or [])
    return [
        scenario
        for scenario in SCENARIOS
        if scenario.suite in allowed_suites
        and (not id_filter or scenario.scenario_id in id_filter)
    ]


def main() -> int:
    """Run the selected second-round scenarios and report the results.

    Prints a per-scenario report, optionally writes the same data as JSON, and
    returns 1 if any scenario failed, otherwise 0.  ``--target`` overrides the
    implementation file and defaults to ``TARGET_PATH``.

    Raises:
        AssertionError: if the target file is missing or no scenario matches.
    """
    parser = argparse.ArgumentParser(description="Second-round AgentMemorySystem black-box test matrix runner")
    parser.add_argument("--suite", choices=("representative", "full"), default="representative")
    parser.add_argument("--scenario", action="append", help="Optional scenario ID filter; may be supplied multiple times")
    parser.add_argument("--json-out", help="Optional path to write JSON results")
    parser.add_argument("--target", default=TARGET_PATH, help="Path of the implementation under test (default: %(default)s)")
    args = parser.parse_args()
    target_path = args.target

    ensure(os.path.exists(target_path), f"Target file does not exist: {target_path}")
    module = load_target_module(target_path)
    scenarios = select_scenarios(args.suite, args.scenario)
    ensure(bool(scenarios), "No scenarios selected")

    print("Second-round black-box matrix runner: AgentMemorySystem")
    print(f"Target file: {target_path}")
    print(f"Suite: {args.suite}")
    print(f"Python: {platform.python_version()}")
    print(f"Torch: {torch.__version__}")
    print(f"Transformers: {transformers.__version__}")
    print("")

    started = time.perf_counter()
    results = [run_scenario(module, scenario) for scenario in scenarios]
    total_duration = time.perf_counter() - started

    pass_count = sum(1 for result in results if result.status == "PASS")
    warn_count = sum(1 for result in results if result.status == "WARN")
    fail_count = sum(1 for result in results if result.status == "FAIL")

    print("=" * 80)
    for result in results:
        print(
            f"[{result.status}] {result.scenario_id} | {result.category} | "
            f"{result.title} ({result.duration_s:.2f}s)"
        )
        print(result.summary)
        if result.metrics:
            print(json.dumps(result.metrics, indent=2, ensure_ascii=False, sort_keys=True))
        print("-" * 80)
    print(f"Summary: PASS={pass_count}, WARN={warn_count}, FAIL={fail_count}")
    print(f"Total duration: {total_duration:.2f}s")

    if args.json_out:
        payload = {
            "target_path": target_path,
            "suite": args.suite,
            "python": platform.python_version(),
            "torch": torch.__version__,
            "transformers": transformers.__version__,
            "total_duration_s": total_duration,
            "results": [asdict(result) for result in results],
        }
        with open(args.json_out, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False, sort_keys=True)

    # WARN results are diagnostic only; only FAIL affects the exit status.
    return 1 if fail_count else 0


if __name__ == "__main__":
    raise SystemExit(main())


# ---------------------------------------------------------------------------
# diff --git a/blackbox_test_agent_memory_round3.py b/blackbox_test_agent_memory_round3.py
# new file mode 100644 / index 0000000..6192ee1 / --- /dev/null / @@ -0,0 +1,405 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
"""Third-round black-box runner for AgentMemorySystem.

Focus areas:
- boundary inputs
- abnormal/exception-facing inputs
- performance and latency baselines

The target implementation is still treated as an opaque component. The runner
only uses the public runtime behavior exposed by:

- MemLLM.load()
- MemLLM.write()
- MemLLM.generate()
- MemLLM.save_memory()
- MemLLM.load_memory()
"""

# from __future__ import annotations
# ^ Kept as a comment: a __future__ import is only legal as the first statement
#   of a file. Restore it when this section is split back out into
#   blackbox_test_agent_memory_round3.py.

import argparse
import importlib.util
import json
import math
import os
import platform
import statistics
import sys
import tempfile
import time
from dataclasses import asdict, dataclass
from importlib.machinery import SourceFileLoader
from typing import Callable

import torch
import transformers


TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"
MODEL_NAME = "gpt2"
DEFAULT_SEED = 42

MUSIC_PROMPT = "The piano performance"
MUSIC_TEXTS = [
    "He practiced piano for hours perfecting a difficult Chopin nocturne.",
    "She studied music theory and harmonic progression at the conservatory.",
    "The orchestra rehearsed the symphony before the evening concert.",
]


@dataclass
class ScenarioDef:
    """Static description of one scenario in the test matrix."""

    scenario_id: str  # stable identifier, e.g. "R3-BOUND-01"
    category: str  # reporting group (boundary-input, abnormal-input, performance)
    title: str  # short human-readable label
    suite: str  # "representative" or "full"
    runner: Callable[[object], tuple[str, str, dict]]  # module -> (status, summary, metrics)


@dataclass
class ScenarioResult:
    """Outcome of one executed scenario, serialised with ``asdict`` for JSON output."""

    scenario_id: str
    category: str
    title: str
    suite: str
    status: str  # "PASS", "WARN" or "FAIL"
    duration_s: float  # wall-clock seconds measured by run_scenario
    summary: str
    metrics: dict


def load_target_module(path: str):
    """Import the implementation under test from an arbitrary file path.

    The module is registered in ``sys.modules`` before it is executed, as the
    importlib "importing a source file directly" recipe does, because code
    that resolves its own module by name during import (``dataclasses`` with
    string annotations, pickling) fails otherwise.  The entry is removed again
    if execution raises.

    Raises:
        RuntimeError: if no import spec can be created for ``path``.
    """
    loader = SourceFileLoader("agent_memory_system_round3", path)
    spec = importlib.util.spec_from_loader(loader.name, loader)
    if spec is None:
        raise RuntimeError(f"Unable to create import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[loader.name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        sys.modules.pop(loader.name, None)
        raise
    return module


def ensure(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` when ``condition`` is falsy."""
    if not condition:
        raise AssertionError(message)


def build_model(module, seed: int = DEFAULT_SEED):
    """Seed torch, then construct ``module.MemLLM(module.Cfg())`` and load ``MODEL_NAME``."""
    torch.manual_seed(seed)
    cfg = module.Cfg()
    model = module.MemLLM(cfg)
    model.load(MODEL_NAME)
    return model


def continuation(prompt: str, output: str) -> str:
    """Return the part of ``output`` after ``prompt``.

    Raises:
        AssertionError: if ``output`` does not start with ``prompt``.
    """
    ensure(output.startswith(prompt), f"Output does not preserve prompt prefix: {output!r}")
    return output[len(prompt) :]


def stable_music_seed(model) -> list[float]:
    """Write every ``MUSIC_TEXTS`` entry with ``training_mode=True``; return the gates.

    Each write must report one stored memory and one finite gate value.
    """
    gates: list[float] = []
    for text in MUSIC_TEXTS:
        stored, gate_vals = model.write(text, training_mode=True)
        ensure(stored == 1, f"Expected one stored memory, got {stored}")
        ensure(len(gate_vals) == 1, f"Expected one gate value, got {gate_vals}")
        ensure(math.isfinite(gate_vals[0]), f"Gate is not finite: {gate_vals[0]}")
        gates.extend(gate_vals)
    return gates


def run_scenario(module, scenario: ScenarioDef) -> ScenarioResult:
    """Run one scenario, time it, and convert any exception into a FAIL result."""
    started = time.perf_counter()
    try:
        status, summary, metrics = scenario.runner(module)
    except AssertionError as exc:
        status = "FAIL"
        summary = str(exc)
        metrics = {}
    except Exception as exc:  # pragma: no cover
        status = "FAIL"
        summary = f"{type(exc).__name__}: {exc}"
        metrics = {}
    duration_s = time.perf_counter() - started
    return ScenarioResult(
        scenario_id=scenario.scenario_id,
        category=scenario.category,
        title=scenario.title,
        suite=scenario.suite,
        status=status,
        duration_s=duration_s,
        summary=summary,
        metrics=metrics,
    )


def scenario_boundary_empty_prompt(module) -> tuple[str, str, dict]:
    """An empty prompt must still yield a non-empty string."""
    model = build_model(module)
    # NOTE(review): `mt` looks like a max-new-tokens limit - confirm against MemLLM.
    output = model.generate("", mt=10, greedy=True)
    ensure(isinstance(output, str), "generate() did not return a string")
    ensure(len(output) > 0, "Empty prompt generation returned an empty string")
    return "PASS", "Empty prompt generation returned a non-empty string.", {"output": output}


def scenario_boundary_single_char_prompt(module) -> tuple[str, str, dict]:
    """A one-character prompt must be preserved as the output prefix."""
    model = build_model(module)
    prompt = "A"
    output = model.generate(prompt, mt=12, greedy=True)
    continuation(prompt, output)
    return "PASS", "Single-character prompt generation remained valid.", {"output": output}


def scenario_boundary_whitespace_prompt(module) -> tuple[str, str, dict]:
    """A whitespace-only prompt must return a string at least as long as the prompt.

    The prefix is deliberately not checked here, only type and length.
    """
    model = build_model(module)
    prompt = " "
    output = model.generate(prompt, mt=10, greedy=True)
    ensure(isinstance(output, str), "generate() did not return a string")
    ensure(len(output) >= len(prompt), "Whitespace prompt output was shorter than prompt")
    return "PASS", "Whitespace prompt generation completed.", {"output": output}


def scenario_boundary_newline_prompt(module) -> tuple[str, str, dict]:
    """A prompt containing a newline must be preserved as the output prefix."""
    model = build_model(module)
    prompt = "Line one.\nLine two."
    output = model.generate(prompt, mt=12, greedy=True)
    continuation(prompt, output)
    return "PASS", "Multi-line prompt generation completed.", {"output": output}


def scenario_abnormal_none_write(module) -> tuple[str, str, dict]:
    """``write(None)`` is expected to raise; silent success is reported as WARN."""
    model = build_model(module)
    try:
        model.write(None, training_mode=True)  # type: ignore[arg-type]
    except Exception as exc:
        return (
            "PASS",
            "write(None, ...) raised an externally visible exception as expected.",
            {"error_type": type(exc).__name__, "error_message": str(exc)},
        )
    return "WARN", "write(None, ...) unexpectedly succeeded.", {}


def scenario_abnormal_none_generate(module) -> tuple[str, str, dict]:
    """``generate(None)`` is expected to raise; silent success is reported as WARN."""
    model = build_model(module)
    try:
        model.generate(None, mt=10, greedy=True)  # type: ignore[arg-type]
    except Exception as exc:
        return (
            "PASS",
            "generate(None, ...) raised an externally visible exception as expected.",
            {"error_type": type(exc).__name__, "error_message": str(exc)},
        )
    return "WARN", "generate(None, ...) unexpectedly succeeded.", {}


def scenario_abnormal_negative_mt(module) -> tuple[str, str, dict]:
    """Probe how ``generate()`` reacts to a negative ``mt``.

    Rejecting the value with an exception is the validated behaviour and is
    reported as PASS, matching the other abnormal-input scenarios.  Without
    this handling the exception would reach ``run_scenario`` and be scored as
    FAIL.  Accepting the value silently is WARN in either form.
    """
    model = build_model(module)
    prompt = "Hello"
    try:
        output = model.generate(prompt, mt=-5, greedy=True)
    except Exception as exc:
        return (
            "PASS",
            "Negative mt raised an externally visible exception as expected.",
            {"error_type": type(exc).__name__, "error_message": str(exc)},
        )
    if output == prompt:
        return (
            "WARN",
            "Negative mt returned the original prompt without explicit validation.",
            {"output": output},
        )
    return (
        "WARN",
        "Negative mt did not raise and produced a nonstandard output.",
        {"output": output},
    )


def scenario_abnormal_invalid_load_memory(module) -> tuple[str, str, dict]:
    """``load_memory()`` on a path that does not exist is expected to raise.

    The path is built inside a freshly created temporary directory, so it is
    guaranteed to be missing and works on platforms without ``/tmp``.
    """
    model = build_model(module)
    with tempfile.TemporaryDirectory(prefix="agent-memory-round3-missing-") as scratch_dir:
        missing_path = os.path.join(scratch_dir, "agent-memory-nonexistent-file.pt")
        try:
            model.load_memory(missing_path)
        except Exception as exc:
            return (
                "PASS",
                "load_memory() on a missing path raised an externally visible exception.",
                {"error_type": type(exc).__name__, "error_message": str(exc)},
            )
    return "WARN", "load_memory() on a missing path unexpectedly succeeded.", {}


def scenario_perf_cold_load_baseline(module) -> tuple[str, str, dict]:
    """Record how long ``build_model`` takes, in seconds rounded to milliseconds."""
    started = time.perf_counter()
    model = build_model(module)
    elapsed = time.perf_counter() - started
    ensure(model is not None, "Model failed to initialize")
    return (
        "PASS",
        "Cold load latency baseline recorded.",
        {"cold_load_s": round(elapsed, 3)},
    )


def scenario_perf_write_latency(module) -> tuple[str, str, dict]:
    """Record per-write latency over ``MUSIC_TEXTS``; the checks run outside the timed span."""
    model = build_model(module)
    timings = []
    for text in MUSIC_TEXTS:
        started = time.perf_counter()
        stored, gates = model.write(text, training_mode=True)
        elapsed = time.perf_counter() - started
        ensure(stored == 1, f"Expected one stored memory, got {stored}")
        ensure(len(gates) == 1 and math.isfinite(gates[0]), f"Unexpected gate values: {gates}")
        timings.append(elapsed)
    return (
        "PASS",
        "Write latency baseline recorded.",
        {
            "write_count": len(timings),
            "avg_write_s": round(statistics.mean(timings), 3),
            "max_write_s": round(max(timings), 3),
            "min_write_s": round(min(timings), 3),
        },
    )


def scenario_perf_generate_latency(module) -> tuple[str, str, dict]:
    """Record latency of three greedy generations after seeding the music memories."""
    model = build_model(module)
    stable_music_seed(model)
    timings = []
    outputs = []
    for _ in range(3):
        started = time.perf_counter()
        output = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
        elapsed = time.perf_counter() - started
        continuation(MUSIC_PROMPT, output)
        timings.append(elapsed)
        outputs.append(output)
    return (
        "PASS",
        "Generate latency baseline recorded.",
        {
            "generate_count": len(timings),
            "avg_generate_s": round(statistics.mean(timings), 3),
            "max_generate_s": round(max(timings), 3),
            "min_generate_s": round(min(timings), 3),
            "sample_output": outputs[0],
        },
    )


def scenario_perf_save_load_latency(module) -> tuple[str, str, dict]:
    """Record ``save_memory()`` and ``load_memory()`` latency for the music memories.

    Building the second model is excluded from the load timing.  The reloaded
    model must still produce prompt-preserving output.
    """
    model = build_model(module)
    stable_music_seed(model)
    fd, memory_path = tempfile.mkstemp(prefix="agent-memory-round3-", suffix=".pt")
    os.close(fd)  # only the path is needed; save_memory() opens the file itself
    try:
        save_started = time.perf_counter()
        model.save_memory(memory_path)
        save_elapsed = time.perf_counter() - save_started
        ensure(os.path.getsize(memory_path) > 0, "save_memory() produced an empty file")

        reload_model = build_model(module)
        load_started = time.perf_counter()
        reload_model.load_memory(memory_path)
        load_elapsed = time.perf_counter() - load_started

        output = reload_model.generate(MUSIC_PROMPT, mt=20, greedy=True)
        continuation(MUSIC_PROMPT, output)
    finally:
        if os.path.exists(memory_path):
            os.remove(memory_path)

    return (
        "PASS",
        "Save/load latency baseline recorded.",
        {
            "save_s": round(save_elapsed, 3),
            "load_s": round(load_elapsed, 3),
            "sample_output": output,
        },
    )


# Scenario registry. "representative" scenarios run in both suites; "full"
# scenarios run only when the full suite is selected.
SCENARIOS = [
    ScenarioDef("R3-BOUND-01", "boundary-input", "empty prompt generation", "representative", scenario_boundary_empty_prompt),
    ScenarioDef("R3-BOUND-02", "boundary-input", "single-character prompt generation", "full", scenario_boundary_single_char_prompt),
    ScenarioDef("R3-BOUND-03", "boundary-input", "whitespace prompt generation", "full", scenario_boundary_whitespace_prompt),
    ScenarioDef("R3-BOUND-04", "boundary-input", "multiline prompt generation", "full", scenario_boundary_newline_prompt),
    ScenarioDef("R3-ABN-01", "abnormal-input", "write None input", "representative", scenario_abnormal_none_write),
    ScenarioDef("R3-ABN-02", "abnormal-input", "generate None input", "full", scenario_abnormal_none_generate),
    ScenarioDef("R3-ABN-03", "abnormal-input", "negative max tokens handling", "full", scenario_abnormal_negative_mt),
    ScenarioDef("R3-ABN-04", "abnormal-input", "load missing memory file", "full", scenario_abnormal_invalid_load_memory),
    ScenarioDef("R3-PERF-01", "performance", "cold load latency baseline", "representative", scenario_perf_cold_load_baseline),
    ScenarioDef("R3-PERF-02", "performance", "write latency baseline", "full", scenario_perf_write_latency),
    ScenarioDef("R3-PERF-03", "performance", "generate latency baseline", "representative", scenario_perf_generate_latency),
    ScenarioDef("R3-PERF-04", "performance", "save/load latency baseline", "full", scenario_perf_save_load_latency),
]


def select_scenarios(suite: str, scenario_ids: list[str] | None) -> list[ScenarioDef]:
    """Return the registered scenarios matching ``suite`` and the optional ID filter.

    ``"representative"`` selects only representative scenarios; any other
    suite value selects representative and full ones.  IDs that match nothing
    are ignored.  Registry order is preserved.
    """
    allowed_suites = {"representative"} if suite == "representative" else {"representative", "full"}
    id_filter = set(scenario_ids or [])
    return [
        scenario
        for scenario in SCENARIOS
        if scenario.suite in allowed_suites
        and (not id_filter or scenario.scenario_id in id_filter)
    ]


def main() -> int:
    """Run the selected third-round scenarios and report the results.

    Prints a per-scenario report, optionally writes the same data as JSON, and
    returns 1 if any scenario failed, otherwise 0.  ``--target`` overrides the
    implementation file and defaults to ``TARGET_PATH``.

    Raises:
        AssertionError: if the target file is missing or no scenario matches.
    """
    parser = argparse.ArgumentParser(description="Third-round AgentMemorySystem black-box runner")
    parser.add_argument("--suite", choices=("representative", "full"), default="representative")
    parser.add_argument("--scenario", action="append", help="Optional scenario ID filter; may be supplied multiple times")
    parser.add_argument("--json-out", help="Optional path to write JSON results")
    parser.add_argument("--target", default=TARGET_PATH, help="Path of the implementation under test (default: %(default)s)")
    args = parser.parse_args()
    target_path = args.target

    ensure(os.path.exists(target_path), f"Target file does not exist: {target_path}")
    module = load_target_module(target_path)
    scenarios = select_scenarios(args.suite, args.scenario)
    ensure(bool(scenarios), "No scenarios selected")

    print("Third-round black-box runner: AgentMemorySystem")
    print(f"Target file: {target_path}")
    print(f"Suite: {args.suite}")
    print(f"Python: {platform.python_version()}")
    print(f"Torch: {torch.__version__}")
    print(f"Transformers: {transformers.__version__}")
    print("")

    started = time.perf_counter()
    results = [run_scenario(module, scenario) for scenario in scenarios]
    total_duration = time.perf_counter() - started

    pass_count = sum(1 for result in results if result.status == "PASS")
    warn_count = sum(1 for result in results if result.status == "WARN")
    fail_count = sum(1 for result in results if result.status == "FAIL")

    print("=" * 80)
    for result in results:
        print(
            f"[{result.status}] {result.scenario_id} | {result.category} | "
            f"{result.title} ({result.duration_s:.2f}s)"
        )
        print(result.summary)
        if result.metrics:
            print(json.dumps(result.metrics, indent=2, ensure_ascii=False, sort_keys=True))
        print("-" * 80)
    print(f"Summary: PASS={pass_count}, WARN={warn_count}, FAIL={fail_count}")
    print(f"Total duration: {total_duration:.2f}s")

    if args.json_out:
        payload = {
            "target_path": target_path,
            "suite": args.suite,
            "python": platform.python_version(),
            "torch": torch.__version__,
            "transformers": transformers.__version__,
            "total_duration_s": total_duration,
            "results": [asdict(result) for result in results],
        }
        with open(args.json_out, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False, sort_keys=True)

    # WARN results are diagnostic only; only FAIL affects the exit status.
    return 1 if fail_count else 0


if __name__ == "__main__":
    raise SystemExit(main())


# ---------------------------------------------------------------------------
# diff --git a/blackbox_test_agent_memory_system.py b/blackbox_test_agent_memory_system.py
# new file mode 100644 / index (the patch header continues on the following line)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
"""Black-box test runner for the uploaded AgentMemorySystem implementation.

This runner intentionally treats the uploaded code as an opaque component and
only interacts with its public runtime behavior:

- MemLLM.load()
- MemLLM.write()
- MemLLM.generate()
- MemLLM.save_memory()
- MemLLM.load_memory()

It does not call private helpers, does not inspect internal memory state, and
does not use mocks.
"""

from __future__ import annotations

import importlib.util
import math
import os
import platform
import sys
import tempfile
import time
from dataclasses import dataclass
from importlib.machinery import SourceFileLoader
from typing import Callable

import torch
import transformers


TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"
MODEL_NAME = "gpt2"
# Single source of truth for the RNG seed used before every model build.
DEFAULT_SEED = 42
# Public method names the runner relies on; TC-01 verifies they are exposed.
PUBLIC_API = ("write", "generate", "save_memory", "load_memory")
MUSIC_PROMPT = "The piano performance"
MUSIC_MEMORIES = [
    "He practiced piano for hours perfecting a difficult Chopin nocturne.",
    "She studied music theory and harmonic progression at the conservatory.",
    "The orchestra rehearsed the symphony before the evening concert.",
]
MUSIC_KEYWORDS = {
    "music",
    "musical",
    "violin",
    "concert",
    "symphony",
    "guitar",
    "practice",
    "practicing",
}


@dataclass
class CaseResult:
    """Outcome of one test case as printed in the final report."""

    case_id: str  # short identifier such as "TC-01"
    title: str  # human-readable case name
    passed: bool  # True when the case body returned without raising
    duration_s: float  # wall-clock time spent inside the case body
    details: str  # case return value on success, "ExcType: message" on failure


def load_target_module(path: str):
    """Import the file at *path* as a module and return it.

    A ``SourceFileLoader`` is used explicitly because the target does not have
    a ``.py`` suffix. The module is registered in ``sys.modules`` before it is
    executed, as the importlib "import a source file directly" recipe does, so
    that lookups keyed on ``__module__`` (for example pickling of classes
    defined in the target) can resolve it.

    Raises:
        RuntimeError: if no import spec can be created for *path*.
        Exception: whatever executing the target module raises; the partially
            initialised module is removed from ``sys.modules`` first.
    """
    loader = SourceFileLoader("agent_memory_system_under_test", path)
    spec = importlib.util.spec_from_loader(loader.name, loader)
    if spec is None:
        raise RuntimeError(f"Unable to create import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[spec.name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module visible to later imports.
        sys.modules.pop(spec.name, None)
        raise
    return module


def keyword_hits(text: str, keywords: set[str]) -> list[str]:
    """Return the sorted keywords that occur in *text*.

    Matching is a case-insensitive substring test on the text side (keywords
    are compared as given), so "concert" also matches "concerts".
    """
    lowered = text.lower()
    return sorted(keyword for keyword in keywords if keyword in lowered)


def check(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is true.

    An explicit raise is used instead of ``assert`` so the checks still run
    under ``python -O``.
    """
    if not condition:
        raise AssertionError(message)


def build_model(module):
    """Construct ``module.MemLLM(module.Cfg())`` and load ``MODEL_NAME`` into it."""
    cfg = module.Cfg()
    model = module.MemLLM(cfg)
    model.load(MODEL_NAME)
    return model


def run_case(case_id: str, title: str, fn: Callable[[], str]) -> CaseResult:
    """Run one case body, time it, and convert any exception into a failure.

    ``Exception`` is caught broadly on purpose: this is the reporting boundary,
    and one failing case must not abort the remaining cases.
    """
    start = time.perf_counter()
    try:
        details = fn()
        passed = True
    except Exception as exc:  # pragma: no cover - failure path is test output
        details = f"{type(exc).__name__}: {exc}"
        passed = False
    duration_s = time.perf_counter() - start
    return CaseResult(case_id, title, passed, duration_s, details)


def _probe_music(model) -> tuple[str, str, list[str]]:
    """Greedy-generate from ``MUSIC_PROMPT`` and score the continuation.

    Returns ``(output, continuation, hits)`` where *continuation* is the output
    with the first ``len(MUSIC_PROMPT)`` characters sliced off and *hits* are
    the ``MUSIC_KEYWORDS`` found in it.
    """
    output = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
    continuation = output[len(MUSIC_PROMPT):]
    return output, continuation, keyword_hits(continuation, MUSIC_KEYWORDS)


def main() -> int:
    """Run all cases against the target and print a report.

    Returns:
        0 when every case passed, 1 otherwise.
    """
    overall_start = time.perf_counter()
    print("Black-box test runner: AgentMemorySystem")
    print(f"Target file: {TARGET_PATH}")
    print(f"Python: {platform.python_version()}")
    print(f"Torch: {torch.__version__}")
    print(f"Transformers: {transformers.__version__}")
    print("")

    check(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")
    module = load_target_module(TARGET_PATH)
    torch.manual_seed(DEFAULT_SEED)
    model = build_model(module)
    results: list[CaseResult] = []
    # Values handed from earlier cases to later ones (TC-03/TC-04 -> TC-05).
    state: dict[str, object] = {}

    def tc01_load_public_api() -> str:
        check(model is not None, "MemLLM.load() did not produce a model instance")
        # Verify the public surface the remaining cases depend on is present.
        missing = [name for name in PUBLIC_API if not callable(getattr(model, name, None))]
        check(not missing, f"Loaded model is missing public methods: {missing}")
        return f"Loaded public model API successfully with {MODEL_NAME}"

    def tc02_generate_without_memory() -> str:
        output = model.generate("Hello", mt=15, greedy=True)
        check(isinstance(output, str), "generate() did not return a string")
        check(output.startswith("Hello"), "Generated text does not preserve the prompt prefix")
        check(len(output) > len("Hello"), "Generated text did not extend the prompt")
        return f"Output: {output!r}"

    def tc03_baseline_music_prompt_before_memory() -> str:
        output, _continuation, hits = _probe_music(model)
        # Record before checking so TC-05 can still compare if the check fails.
        state["baseline_music_output"] = output
        state["baseline_music_hits"] = hits
        check(output.startswith(MUSIC_PROMPT), "Prompt prefix was not preserved in the baseline run")
        return f"Baseline output: {output!r}\nBaseline keyword hits: {hits}"

    def tc04_write_and_ground_music_domain() -> str:
        write_lines = []
        for text in MUSIC_MEMORIES:
            stored, gates = model.write(text, training_mode=True)
            check(stored == 1, f"training_mode=True should store the input, got stored={stored}")
            check(len(gates) == 1, f"Expected exactly one gate value, got {gates}")
            check(math.isfinite(gates[0]), f"Gate value is not finite: {gates[0]}")
            write_lines.append(f"stored={stored}, gate={gates[0]:.6f}, text={text!r}")

        output, continuation, hits = _probe_music(model)
        state["post_memory_music_output"] = output
        state["post_memory_music_hits"] = hits
        check(output.startswith(MUSIC_PROMPT), "Prompt prefix was not preserved after writing memory")
        check(bool(hits), f"No music-domain grounding detected in continuation: {continuation!r}")
        return "\n".join(write_lines + [f"Output: {output!r}", f"Keyword hits: {hits}"])

    def tc05_memory_improves_domain_signal() -> str:
        baseline_hits = state.get("baseline_music_hits")
        post_hits = state.get("post_memory_music_hits")
        check(isinstance(baseline_hits, list), "Baseline music output was not recorded")
        check(isinstance(post_hits, list), "Post-memory music output was not recorded")
        check(
            len(post_hits) > len(baseline_hits),
            f"Music-domain signal did not improve: baseline={baseline_hits}, post={post_hits}",
        )
        return (
            f"Baseline hits: {baseline_hits}\n"
            f"Post-memory hits: {post_hits}\n"
            f"Baseline output: {state['baseline_music_output']!r}\n"
            f"Post-memory output: {state['post_memory_music_output']!r}"
        )

    def tc06_save_load_roundtrip() -> str:
        # Save into a fresh directory so the target path does not exist yet;
        # otherwise the "did not create a file" check could never fail.
        with tempfile.TemporaryDirectory(prefix="agent-memory-") as tmp_dir:
            memory_path = os.path.join(tmp_dir, "memory.pt")
            model.save_memory(memory_path)
            check(os.path.exists(memory_path), "save_memory() did not create a file")
            file_size = os.path.getsize(memory_path)
            check(file_size > 0, "save_memory() created an empty file")

            # Re-seed so the reloaded instance is built under the same RNG
            # state as the original one.
            torch.manual_seed(DEFAULT_SEED)
            reloaded = build_model(module)
            reloaded.load_memory(memory_path)
            output, continuation, hits = _probe_music(reloaded)
            check(output.startswith(MUSIC_PROMPT), "Reloaded model did not preserve the prompt prefix")
            check(bool(hits), f"No music-domain grounding after reload: {continuation!r}")
            return (
                f"Saved file: {memory_path} ({file_size} bytes)\n"
                f"Output after reload: {output!r}\n"
                f"Keyword hits after reload: {hits}"
            )

    # Order matters: TC-03 must run before memories are written in TC-04, and
    # TC-05/TC-06 consume what those two produced.
    cases: list[tuple[str, str, Callable[[], str]]] = [
        ("TC-01", "load public API", tc01_load_public_api),
        ("TC-02", "generate without memory", tc02_generate_without_memory),
        ("TC-03", "baseline music prompt before memory", tc03_baseline_music_prompt_before_memory),
        ("TC-04", "write memory and observe domain grounding", tc04_write_and_ground_music_domain),
        ("TC-05", "memory improves domain signal", tc05_memory_improves_domain_signal),
        ("TC-06", "save/load memory roundtrip", tc06_save_load_roundtrip),
    ]
    for case_id, title, fn in cases:
        results.append(run_case(case_id, title, fn))

    passed = sum(1 for result in results if result.passed)
    failed = len(results) - passed
    total_duration = time.perf_counter() - overall_start

    print("=" * 72)
    for result in results:
        status = "PASS" if result.passed else "FAIL"
        print(f"[{status}] {result.case_id} - {result.title} ({result.duration_s:.2f}s)")
        print(result.details)
        print("-" * 72)
    print(f"Summary: {passed}/{len(results)} passed, {failed} failed")
    print(f"Total duration: {total_duration:.2f}s")

    return 0 if failed == 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())
a/reports/agent_memory_blackbox_extended_summary.md b/reports/agent_memory_blackbox_extended_summary.md new file mode 100644 index 0000000..8e56005 --- /dev/null +++ b/reports/agent_memory_blackbox_extended_summary.md @@ -0,0 +1,33 @@ +# AgentMemorySystem 扩展黑盒测试总览 + +## 1. 本轮完成内容 + +- 第二轮 full 覆盖 +- 放大版跨域污染热图 +- 第三轮 full 覆盖(边界输入、异常输入、性能/时延) + +## 2. 第二轮 full 结果 + +- PASS: 7 +- WARN: 2 +- FAIL: 0 + +## 3. 第三轮 full 结果 + +- PASS: 10 +- WARN: 1 +- FAIL: 1 + +## 4. 放大版污染热图结论 + +- cooking: own=2, foreign=5, ratio=2.50, verdict=high-contamination +- finance: own=2, foreign=3, ratio=1.50, verdict=high-contamination +- music: own=8, foreign=2, ratio=0.25, verdict=mixed +- space: own=7, foreign=4, ratio=0.57, verdict=mixed + +## 5. 最高优先级发现 + +- P1: 空 prompt `generate("")` 会直接崩溃。 +- P1: `transformers 5.x` 兼容性失败仍然成立。 +- P2: 跨域污染在 dual-domain、four-way 以及放大版热图中均被稳定复现。 +- P3: `mt < 0` 缺少显式参数校验,当前表现为直接返回原 prompt。 diff --git a/reports/agent_memory_blackbox_round2_execution_report.md b/reports/agent_memory_blackbox_round2_execution_report.md new file mode 100644 index 0000000..4fc9e49 --- /dev/null +++ b/reports/agent_memory_blackbox_round2_execution_report.md @@ -0,0 +1,214 @@ +# AgentMemorySystem 第二轮黑盒测试执行报告 + +## 1. 执行范围 + +本次执行基于第二轮矩阵 runner: + +- `/workspace/blackbox_test_agent_memory_round2.py` + +执行套件: + +- `representative` + +代表集覆盖四类风险面各 1 个场景: + +1. 压力测试 +2. 长文本测试 +3. 跨域污染测试 +4. 稳定性测试 + +## 2. 执行环境 + +- OS: Linux 6.1.147 +- Python: 3.12.3 +- Torch: 2.11.0+cu130 +- Transformers: 4.57.6 +- Base model: `gpt2` + +## 3. 执行命令 + +```bash +python3 /workspace/blackbox_test_agent_memory_round2.py \ + --suite representative \ + --json-out /workspace/reports/agent_memory_blackbox_round2_results.json +``` + +## 4. 总体结果 + +- PASS: 3 +- WARN: 1 +- FAIL: 0 +- 总耗时: `321.42s` + +结论: + +- 第二轮代表集在兼容环境下整体可执行 +- 没有出现新的阻断性故障 +- 跨域污染问题被稳定复现并分类为 `WARN` + +## 5. 
分场景结果 + +### R2-STRESS-01 repeated write/generate pressure + +**类型**:压力测试 +**结果**:PASS + +**结论** + +- 在两轮连续写入四个领域语料、并对四个 prompt 连续生成的压力下,没有出现崩溃 +- `write()` 返回 gate 值有限 +- `generate()` 能持续返回有效字符串,并保留 prompt 前缀 + +**关键指标** + +- rounds: `2` +- total_writes: `24` +- total_generations: `8` +- avg_gate: `0.564429` + +**样例输出** + +- music: + `The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is` +- space: + `The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this` + +**观察** + +功能层面通过,但混合域压力下输出中已能看到跨域词汇渗透,这与后续污染场景结果一致。 + +--- + +### R2-LONG-01 long memory write grounding + +**类型**:长文本测试 +**结果**:PASS + +**结论** + +- 单条超长记忆文本写入后,系统仍能对目标 prompt 产生可见的音乐领域接地 + +**关键指标** + +- long_text_chars: `4099` +- gate: `0.293301` +- keyword_hits: + - `concert` + - `music` + - `musical` + - `piano` + - `practice` + - `theory` + +**输出样例** + +```text +The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour + a- in this evening The +``` + +**观察** + +长文本写入并未导致接口失效,说明至少在这一级别的长输入下,系统仍保持外部可用性。 + +--- + +### R2-CROSS-01 dual-domain contamination diagnostic + +**类型**:跨域污染测试 +**结果**:WARN + +**结论** + +- 双域混合写入后,音乐 prompt 与太空 prompt 都出现明显的跨域串扰 +- 该问题不是崩溃类问题,但会影响语义隔离质量 + +**music prompt 结果** + +- own hits: + - `music` + - `musical` +- foreign hits: + - `mission` + - `planet` + +**space prompt 结果** + +- own hits: + - `mission` + - `planet` +- foreign hits: + - `music` + - `musical` + - `theory` + +**样例输出** + +- music: + `The piano performance musical music the mission of a, and planets-\n. in that is an all other's to` +- space: + `The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. 
to` + +**判定原因** + +该场景是 diagnostic 场景,不把污染直接判成 FAIL;但由于 own-domain 与 foreign-domain 词同时明显出现,因此记为 `WARN`。 + +--- + +### R2-STABLE-01 fresh instance determinism + +**类型**:稳定性测试 +**结果**:PASS + +**结论** + +- 在相同 seed、相同写入顺序、相同 greedy prompt 下,全新实例之间的输出完全一致 + +**输出** + +```text +The piano performance musical music the and violin, a- is an in that's. + The other " it has +``` + +**意义** + +这说明在当前兼容环境中,初始化路径和 greedy 解码路径具备可重复性。 + +## 6. 结果解读 + +### 6.1 压力层面 + +第二轮代表集没有发现新的崩溃型问题。 +在较高调用密度下,公开接口仍然可用。 + +### 6.2 长文本层面 + +超长记忆写入场景通过,说明记忆写入链路对较长输入具备一定耐受性。 + +### 6.3 语义隔离层面 + +跨域污染问题被再次稳定复现,是本轮最重要的质量风险。 +它不会阻止系统“运行”,但会削弱“提示词所属领域 -> 目标领域输出”的纯度。 + +### 6.4 稳定性层面 + +新实例确定性通过,说明在兼容环境和 greedy 模式下,结果具备良好的可复现性。 + +## 7. 后续建议 + +如果继续做第三轮,可以优先补这几项: + +1. `R2-STRESS-02 repeated save/load pressure` 的全量执行 +2. `R2-LONG-02 long prompt resilience` 的边界行为验证 +3. `R2-CROSS-02 four-domain contamination matrix` 的全量污染热图 +4. `R2-STABLE-02` 与 `R2-STABLE-03` 的完整执行 + +## 8. 关联文件 + +- 第二轮矩阵设计: + `/workspace/reports/agent_memory_blackbox_round2_matrix.md` +- 第二轮代表集结果 JSON: + `/workspace/reports/agent_memory_blackbox_round2_results.json` +- 第二轮执行报告: + `/workspace/reports/agent_memory_blackbox_round2_execution_report.md` diff --git a/reports/agent_memory_blackbox_round2_full_execution_report.md b/reports/agent_memory_blackbox_round2_full_execution_report.md new file mode 100644 index 0000000..35a210b --- /dev/null +++ b/reports/agent_memory_blackbox_round2_full_execution_report.md @@ -0,0 +1,259 @@ +# AgentMemorySystem 第二轮 full 黑盒测试执行报告 + +## 1. 执行说明 + +第二轮 full 结果由以下两部分聚合而成: + +- 已执行的 representative 场景 4 个 +- 本轮补跑的 full-only 场景 5 个 + +这样覆盖了第二轮矩阵中的全部 9 个场景。 + +## 2. 环境 + +- Python: 3.12.3 +- Torch: 2.11.0+cu130 +- Transformers: 4.57.6 +- Model: gpt2 + +## 3. 汇总结果 + +- PASS: 7 +- WARN: 2 +- FAIL: 0 +- 聚合总耗时: `851.07s` + +## 4. 分场景结果 + +### R2-STRESS-01 repeated write/generate pressure + +- 类型: stress +- 状态: PASS +- 耗时: `70.97s` +- 结论: Completed repeated write/generate pressure loop without crash. 
+ +```json +{ + "avg_gate": 0.564429, + "rounds": 2, + "sample_outputs": { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this" + }, + "total_generations": 8, + "total_writes": 24 +} +``` + +### R2-STRESS-02 repeated save/load pressure + +- 类型: stress +- 状态: PASS +- 耗时: `194.67s` +- 结论: Repeated save/load cycles preserved externally valid generation behavior. + +```json +{ + "avg_gate": 0.564429, + "cycle_outputs": [ + { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that's reduced to be more\n- or", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's\n" + }, + { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets around musicals mission- the and increased. 
reduce team of a, that's\n in this" + } + ], + "cycles": 2 +} +``` + +### R2-LONG-01 long memory write grounding + +- 类型: long-text +- 状态: PASS +- 耗时: `63.66s` +- 结论: Long memory write remained usable for downstream generation. + +```json +{ + "gate": 0.293301, + "keyword_hits": [ + "concert", + "music", + "musical", + "piano", + "practice", + "theory" + ], + "long_text_chars": 4099, + "output": "The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour\n a- in this evening The" +} +``` + +### R2-LONG-02 long prompt resilience + +- 类型: long-text +- 状态: PASS +- 耗时: `68.72s` +- 结论: Long prompt generation completed without crashing. + +```json +{ + "output_chars": 4186, + "prompt_chars": 4139 +} +``` + +### R2-CROSS-01 dual-domain contamination diagnostic + +- 类型: cross-domain +- 状态: WARN +- 耗时: `65.59s` +- 结论: Dual-domain run showed contamination or weak own-domain separation. + +```json +{ + "music_foreign_hits": [ + "mission", + "planet" + ], + "music_output": "The piano performance musical music the mission of a, and planets-\n. in that is an all other's to", + "music_own_hits": [ + "music", + "musical" + ], + "space_foreign_hits": [ + "music", + "musical", + "theory" + ], + "space_output": "The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. to", + "space_own_hits": [ + "mission", + "planet" + ] +} +``` + +### R2-CROSS-02 four-domain contamination matrix + +- 类型: cross-domain +- 状态: WARN +- 耗时: `68.97s` +- 结论: Four-way contamination matrix detected cross-domain bleed or weak own-domain signal. 
+ +```json +{ + "matrix": { + "cooking": { + "cooking": [ + "chef", + "pasta" + ], + "finance": [], + "music": [ + "music", + "musical" + ], + "space": [ + "mission" + ] + }, + "finance": { + "cooking": [], + "finance": [ + "market" + ], + "music": [ + "music", + "musical" + ], + "space": [ + "mission" + ] + }, + "music": { + "cooking": [], + "finance": [], + "music": [ + "music", + "musical" + ], + "space": [ + "mission" + ] + }, + "space": { + "cooking": [], + "finance": [], + "music": [ + "music", + "musical" + ], + "space": [ + "mission", + "orbit", + "planet" + ] + } + } +} +``` + +### R2-STABLE-01 fresh instance determinism + +- 类型: stability +- 状态: PASS +- 耗时: `121.20s` +- 结论: Fresh seeded instances were exactly deterministic under greedy generation. + +```json +{ + "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "prompt": "The piano performance" +} +``` + +### R2-STABLE-02 same instance repeatability + +- 类型: stability +- 状态: PASS +- 耗时: `65.30s` +- 结论: Repeated greedy calls on the same instance were identical. + +```json +{ + "outputs": [ + "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has" + ] +} +``` + +### R2-STABLE-03 save/load exactness + +- 类型: stability +- 状态: PASS +- 耗时: `131.99s` +- 结论: Greedy output stayed exactly stable across save/load roundtrip. + +```json +{ + "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has" +} +``` + +## 5. 
关键发现 + +- 第二轮 full 没有阻断性 FAIL。 +- WARN 全部集中在跨域污染场景。 +- 四域污染矩阵显示 music/space 仍可保留一定 own-domain 信号,但 finance/cooking 更容易被其它域压制。 diff --git a/reports/agent_memory_blackbox_round2_full_results.json b/reports/agent_memory_blackbox_round2_full_results.json new file mode 100644 index 0000000..ddc0a74 --- /dev/null +++ b/reports/agent_memory_blackbox_round2_full_results.json @@ -0,0 +1,229 @@ +{ + "aggregation_mode": "representative-plus-full-only-scenarios", + "python": "3.12.3", + "results": [ + { + "category": "stress", + "duration_s": 70.97084128700044, + "metrics": { + "avg_gate": 0.564429, + "rounds": 2, + "sample_outputs": { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets around musicals mission- the and increased. 
reduce team of a, that's\n in this" + }, + "total_generations": 8, + "total_writes": 24 + }, + "scenario_id": "R2-STRESS-01", + "status": "PASS", + "suite": "representative", + "summary": "Completed repeated write/generate pressure loop without crash.", + "title": "repeated write/generate pressure" + }, + { + "category": "stress", + "duration_s": 194.665789562001, + "metrics": { + "avg_gate": 0.564429, + "cycle_outputs": [ + { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that's reduced to be more\n- or", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's\n" + }, + { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets around musicals mission- the and increased. 
reduce team of a, that's\n in this" + } + ], + "cycles": 2 + }, + "scenario_id": "R2-STRESS-02", + "status": "PASS", + "suite": "full", + "summary": "Repeated save/load cycles preserved externally valid generation behavior.", + "title": "repeated save/load pressure" + }, + { + "category": "long-text", + "duration_s": 63.66024329799984, + "metrics": { + "gate": 0.293301, + "keyword_hits": [ + "concert", + "music", + "musical", + "piano", + "practice", + "theory" + ], + "long_text_chars": 4099, + "output": "The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour\n a- in this evening The" + }, + "scenario_id": "R2-LONG-01", + "status": "PASS", + "suite": "representative", + "summary": "Long memory write remained usable for downstream generation.", + "title": "long memory write grounding" + }, + { + "category": "long-text", + "duration_s": 68.71787648099962, + "metrics": { + "output_chars": 4186, + "prompt_chars": 4139 + }, + "scenario_id": "R2-LONG-02", + "status": "PASS", + "suite": "full", + "summary": "Long prompt generation completed without crashing.", + "title": "long prompt resilience" + }, + { + "category": "cross-domain", + "duration_s": 65.58546234499954, + "metrics": { + "music_foreign_hits": [ + "mission", + "planet" + ], + "music_output": "The piano performance musical music the mission of a, and planets-\n. in that is an all other's to", + "music_own_hits": [ + "music", + "musical" + ], + "space_foreign_hits": [ + "music", + "musical", + "theory" + ], + "space_output": "The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. 
to", + "space_own_hits": [ + "mission", + "planet" + ] + }, + "scenario_id": "R2-CROSS-01", + "status": "WARN", + "suite": "representative", + "summary": "Dual-domain run showed contamination or weak own-domain separation.", + "title": "dual-domain contamination diagnostic" + }, + { + "category": "cross-domain", + "duration_s": 68.97480458300015, + "metrics": { + "matrix": { + "cooking": { + "cooking": [ + "chef", + "pasta" + ], + "finance": [], + "music": [ + "music", + "musical" + ], + "space": [ + "mission" + ] + }, + "finance": { + "cooking": [], + "finance": [ + "market" + ], + "music": [ + "music", + "musical" + ], + "space": [ + "mission" + ] + }, + "music": { + "cooking": [], + "finance": [], + "music": [ + "music", + "musical" + ], + "space": [ + "mission" + ] + }, + "space": { + "cooking": [], + "finance": [], + "music": [ + "music", + "musical" + ], + "space": [ + "mission", + "orbit", + "planet" + ] + } + } + }, + "scenario_id": "R2-CROSS-02", + "status": "WARN", + "suite": "full", + "summary": "Four-way contamination matrix detected cross-domain bleed or weak own-domain signal.", + "title": "four-domain contamination matrix" + }, + { + "category": "stability", + "duration_s": 121.19981672600079, + "metrics": { + "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "prompt": "The piano performance" + }, + "scenario_id": "R2-STABLE-01", + "status": "PASS", + "suite": "representative", + "summary": "Fresh seeded instances were exactly deterministic under greedy generation.", + "title": "fresh instance determinism" + }, + { + "category": "stability", + "duration_s": 65.30302968500109, + "metrics": { + "outputs": [ + "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "The piano performance musical music the and violin, a- is an in that's.\n The other \" it 
has" + ] + }, + "scenario_id": "R2-STABLE-02", + "status": "PASS", + "suite": "full", + "summary": "Repeated greedy calls on the same instance were identical.", + "title": "same instance repeatability" + }, + { + "category": "stability", + "duration_s": 131.99397509499977, + "metrics": { + "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has" + }, + "scenario_id": "R2-STABLE-03", + "status": "PASS", + "suite": "full", + "summary": "Greedy output stayed exactly stable across save/load roundtrip.", + "title": "save/load exactness" + } + ], + "suite": "full", + "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md", + "torch": "2.11.0+cu130", + "total_duration_s": 851.0718390620023, + "transformers": "4.57.6" +} \ No newline at end of file diff --git a/reports/agent_memory_blackbox_round2_matrix.md b/reports/agent_memory_blackbox_round2_matrix.md new file mode 100644 index 0000000..172cd7d --- /dev/null +++ b/reports/agent_memory_blackbox_round2_matrix.md @@ -0,0 +1,183 @@ +# AgentMemorySystem 第二轮黑盒测试矩阵设计 + +## 1. 目标 + +在第一轮黑盒测试已经覆盖“加载、基础生成、记忆写入、持久化回环”的基础上,第二轮继续扩展以下四类风险面: + +1. 压力测试 +2. 长文本测试 +3. 跨域污染测试 +4. 稳定性测试 + +本轮仍坚持以下边界: + +- 不修改被测实现 +- 不使用 mock +- 不调用内部 `test()` / `test_*()` 自测函数 +- 不读取内部 memory tree、缓存、私有状态 +- 不把测试绑定到某个固定完整输出句子 + +## 2. 黑盒准则 + +### 2.1 允许观察的对象 + +只观察公开调用及其外部结果: + +- `MemLLM.load()` +- `MemLLM.write()` +- `MemLLM.generate()` +- `MemLLM.save_memory()` +- `MemLLM.load_memory()` + +### 2.2 判定方式 + +第二轮测试矩阵使用三档状态: + +- `PASS`:满足场景既定验收规则 +- `WARN`:场景可执行,但暴露出行为风险或质量问题 +- `FAIL`:场景违反硬性要求,或发生未接受的外部异常 + +其中: + +- **gating 场景**:以 `FAIL` 作为阻断结论 +- **diagnostic 场景**:允许 `WARN`,用于暴露质量风险而非直接阻断 + +## 3. 领域测试数据 + +为了做跨域和污染观察,第二轮矩阵使用四组真实文本域: + +- `music` +- `space` +- `finance` +- `cooking` + +每个域使用 3 条真实自然语言样本文本,以及 1 条对应 prompt。 + +## 4. 
测试矩阵 + +| ID | 类型 | 场景 | 性质 | 核心刺激 | 主要观察指标 | 验收规则 | +|---|---|---|---|---|---|---| +| R2-STRESS-01 | 压力 | repeated write/generate pressure | gating | 多轮写入 + 多 prompt 连续生成 | 是否崩溃、是否保留 prompt 前缀、输出是否扩展、gate 是否有限 | 全流程无崩溃且输出有效 | +| R2-STRESS-02 | 压力 | repeated save/load pressure | gating | 混合域写入后反复 save/load | 每轮回载后生成是否仍有效 | 每轮均可生成有效输出 | +| R2-LONG-01 | 长文本 | long memory write grounding | gating | 写入超长单条记忆文本 | 长文本写入后是否仍能对目标 prompt 产生领域词 | 输出有效且命中目标域关键词 | +| R2-LONG-02 | 长文本 | long prompt resilience | diagnostic | 超长 prompt 直接生成 | 是否能完成生成;若失败,失败类型是什么 | 不崩溃为 PASS;崩溃记 WARN | +| R2-CROSS-01 | 跨域污染 | dual-domain contamination diagnostic | diagnostic | 同时写入 music + space | own-domain hits、foreign hits | 无污染为 PASS;有污染或 own signal 弱则 WARN | +| R2-CROSS-02 | 跨域污染 | four-domain contamination matrix | diagnostic | 同时写入四域并逐 prompt 生成 | 四域命中矩阵、foreign hit 分布 | own hits 足且 foreign 低为 PASS,否则 WARN | +| R2-STABLE-01 | 稳定性 | fresh instance determinism | gating | 相同 seed、相同输入、不同新实例 | greedy 输出是否完全一致 | 必须完全一致 | +| R2-STABLE-02 | 稳定性 | same instance repeatability | diagnostic | 同一实例重复 greedy 生成 | 重复调用是否漂移 | 完全一致为 PASS;漂移则 WARN | +| R2-STABLE-03 | 稳定性 | save/load exactness | gating | 生成前后做 save/load 回环 | roundtrip 前后 greedy 输出是否完全一致 | 必须完全一致 | + +## 5. 每类测试的设计意图 + +### 5.1 压力测试 + +第一轮只验证了小规模调用链路。第二轮压力测试关注: + +- 连续写入后是否出现崩溃 +- 连续生成后是否出现无输出或 prompt 破坏 +- 多次 save/load 后是否出现状态损坏 + +这类测试更接近真实系统运行中的“记忆不断被写入和读取”的场景。 + +### 5.2 长文本测试 + +第一轮没有覆盖长样本。第二轮要验证: + +- 单条超长 memory text 写入时是否还能正常使用 +- 超长 prompt 输入是否会触发位置编码、上下文窗口或形状错误 + +其中长 prompt 场景被定义为 diagnostic,因为模型上下文本来可能存在边界限制;这类用例的重点是暴露边界行为,而不是强行把所有边界都定义为功能缺陷。 + +### 5.3 跨域污染测试 + +第一轮已经观察到混合域输入存在串扰风险。第二轮将其系统化,形成矩阵: + +- 双域污染:快速定位最明显的串扰 +- 四域污染:观察污染是否随域数量上升而恶化 + +注意:这类用例不要求“生成完全纯净”,而是通过 own-hit / foreign-hit 的黑盒指标判断隔离质量。 + +### 5.4 稳定性测试 + +稳定性分三层: + +1. **新实例确定性**:相同 seed、相同写入顺序、相同 greedy prompt,是否给出完全相同结果 +2. **同实例重复性**:同一实例上重复调用是否会自发漂移 +3. **持久化精确性**:save/load 之后 greedy 输出是否保持完全一致 + +这三类一起能区分: + +- 初始化不稳定 +- 运行时状态漂移 +- 持久化恢复偏差 + +## 6. 
非 overfit 说明 + +第二轮仍然不要求“输出精确等于某句话”,因为那会把测试绑定到模型偶然文案。 + +本轮采用的稳定观测指标包括: + +- prompt 前缀是否保留 +- 是否成功扩展输出 +- 是否出现目标领域关键词 +- 是否出现非目标领域关键词 +- greedy 输出是否在重复条件下完全一致 + +这类指标更稳健,也更符合黑盒验收目标。 + +## 7. 可执行资产 + +第二轮矩阵的可执行 runner: + +- `/workspace/blackbox_test_agent_memory_round2.py` + +支持两种运行模式: + +### representative + +执行每类 1 个代表场景: + +- `R2-STRESS-01` +- `R2-LONG-01` +- `R2-CROSS-01` +- `R2-STABLE-01` + +适合日常回归。 + +### full + +执行全部矩阵场景: + +- `R2-STRESS-01` +- `R2-STRESS-02` +- `R2-LONG-01` +- `R2-LONG-02` +- `R2-CROSS-01` +- `R2-CROSS-02` +- `R2-STABLE-01` +- `R2-STABLE-02` +- `R2-STABLE-03` + +适合完整验证或发布前检查。 + +## 8. 推荐执行命令 + +代表集: + +```bash +python3 /workspace/blackbox_test_agent_memory_round2.py --suite representative +``` + +全量集: + +```bash +python3 /workspace/blackbox_test_agent_memory_round2.py --suite full +``` + +输出 JSON: + +```bash +python3 /workspace/blackbox_test_agent_memory_round2.py \ + --suite representative \ + --json-out /workspace/reports/agent_memory_blackbox_round2_results.json +``` diff --git a/reports/agent_memory_blackbox_round2_results.json b/reports/agent_memory_blackbox_round2_results.json new file mode 100644 index 0000000..79f19a2 --- /dev/null +++ b/reports/agent_memory_blackbox_round2_results.json @@ -0,0 +1,96 @@ +{ + "python": "3.12.3", + "results": [ + { + "category": "stress", + "duration_s": 70.97084128700044, + "metrics": { + "avg_gate": 0.564429, + "rounds": 2, + "sample_outputs": { + "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a", + "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all", + "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "space": "The space telescope planets around musicals mission- the and increased. 
reduce team of a, that's\n in this" + }, + "total_generations": 8, + "total_writes": 24 + }, + "scenario_id": "R2-STRESS-01", + "status": "PASS", + "suite": "representative", + "summary": "Completed repeated write/generate pressure loop without crash.", + "title": "repeated write/generate pressure" + }, + { + "category": "long-text", + "duration_s": 63.66024329799984, + "metrics": { + "gate": 0.293301, + "keyword_hits": [ + "concert", + "music", + "musical", + "piano", + "practice", + "theory" + ], + "long_text_chars": 4099, + "output": "The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour\n a- in this evening The" + }, + "scenario_id": "R2-LONG-01", + "status": "PASS", + "suite": "representative", + "summary": "Long memory write remained usable for downstream generation.", + "title": "long memory write grounding" + }, + { + "category": "cross-domain", + "duration_s": 65.58546234499954, + "metrics": { + "music_foreign_hits": [ + "mission", + "planet" + ], + "music_output": "The piano performance musical music the mission of a, and planets-\n. in that is an all other's to", + "music_own_hits": [ + "music", + "musical" + ], + "space_foreign_hits": [ + "music", + "musical", + "theory" + ], + "space_output": "The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. 
to", + "space_own_hits": [ + "mission", + "planet" + ] + }, + "scenario_id": "R2-CROSS-01", + "status": "WARN", + "suite": "representative", + "summary": "Dual-domain run showed contamination or weak own-domain separation.", + "title": "dual-domain contamination diagnostic" + }, + { + "category": "stability", + "duration_s": 121.19981672600079, + "metrics": { + "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "prompt": "The piano performance" + }, + "scenario_id": "R2-STABLE-01", + "status": "PASS", + "suite": "representative", + "summary": "Fresh seeded instances were exactly deterministic under greedy generation.", + "title": "fresh instance determinism" + } + ], + "suite": "representative", + "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md", + "torch": "2.11.0+cu130", + "total_duration_s": 321.41653306199987, + "transformers": "4.57.6" +} \ No newline at end of file diff --git a/reports/agent_memory_blackbox_round3_execution_report.md b/reports/agent_memory_blackbox_round3_execution_report.md new file mode 100644 index 0000000..2bcb877 --- /dev/null +++ b/reports/agent_memory_blackbox_round3_execution_report.md @@ -0,0 +1,191 @@ +# AgentMemorySystem 第三轮 full 黑盒测试执行报告 + +## 1. 执行说明 + +第三轮 full 采用逐场景执行并聚合的方式完成,覆盖边界输入、异常输入、性能/时延基线共 12 个场景。 + +## 2. 环境 + +- Python: 3.12.3 +- Torch: 2.11.0+cu130 +- Transformers: 4.57.6 +- Model: gpt2 + +## 3. 汇总结果 + +- PASS: 10 +- WARN: 1 +- FAIL: 1 +- 聚合总耗时: `798.45s` + +## 4. 分场景结果 + +### R3-BOUND-01 empty prompt generation + +- 类型: boundary-input +- 状态: FAIL +- 耗时: `61.69s` +- 结论: RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous + +### R3-BOUND-02 single-character prompt generation + +- 类型: boundary-input +- 状态: PASS +- 耗时: `60.23s` +- 结论: Single-character prompt generation remained valid. 
+ +```json +{ + "output": "A the I ami- \"I-k, ands" +} +``` + +### R3-BOUND-03 whitespace prompt generation + +- 类型: boundary-input +- 状态: PASS +- 耗时: `57.95s` +- 结论: Whitespace prompt generation completed. + +```json +{ + "output": " ia the world, I- The other\n (" +} +``` + +### R3-BOUND-04 multiline prompt generation + +- 类型: boundary-input +- 状态: PASS +- 耗时: `61.25s` +- 结论: Multi-line prompt generation completed. + +```json +{ + "output": "Line one.\nLine two. the I have a,\n" +} +``` + +### R3-ABN-01 write None input + +- 类型: abnormal-input +- 状态: PASS +- 耗时: `59.30s` +- 结论: write(None, ...) raised an externally visible exception as expected. + +```json +{ + "error_message": "You need to specify either `text` or `text_target`.", + "error_type": "ValueError" +} +``` + +### R3-ABN-02 generate None input + +- 类型: abnormal-input +- 状态: PASS +- 耗时: `62.67s` +- 结论: generate(None, ...) raised an externally visible exception as expected. + +```json +{ + "error_message": "You need to specify either `text` or `text_target`.", + "error_type": "ValueError" +} +``` + +### R3-ABN-03 negative max tokens handling + +- 类型: abnormal-input +- 状态: WARN +- 耗时: `66.16s` +- 结论: Negative mt returned the original prompt without explicit validation. + +```json +{ + "output": "Hello" +} +``` + +### R3-ABN-04 load missing memory file + +- 类型: abnormal-input +- 状态: PASS +- 耗时: `61.28s` +- 结论: load_memory() on a missing path raised an externally visible exception. + +```json +{ + "error_message": "[Errno 2] No such file or directory: '/tmp/agent-memory-nonexistent-file.pt'", + "error_type": "FileNotFoundError" +} +``` + +### R3-PERF-01 cold load latency baseline + +- 类型: performance +- 状态: PASS +- 耗时: `61.67s` +- 结论: Cold load latency baseline recorded. + +```json +{ + "cold_load_s": 61.623 +} +``` + +### R3-PERF-02 write latency baseline + +- 类型: performance +- 状态: PASS +- 耗时: `61.81s` +- 结论: Write latency baseline recorded. 
+ +```json +{ + "avg_write_s": 0.081, + "max_write_s": 0.181, + "min_write_s": 0.028, + "write_count": 3 +} +``` + +### R3-PERF-03 generate latency baseline + +- 类型: performance +- 状态: PASS +- 耗时: `64.74s` +- 结论: Generate latency baseline recorded. + +```json +{ + "avg_generate_s": 0.957, + "generate_count": 3, + "max_generate_s": 1.002, + "min_generate_s": 0.887, + "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has" +} +``` + +### R3-PERF-04 save/load latency baseline + +- 类型: performance +- 状态: PASS +- 耗时: `119.69s` +- 结论: Save/load latency baseline recorded. + +```json +{ + "load_s": 0.003, + "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "save_s": 0.002 +} +``` + +## 5. 关键发现 + +- `R3-BOUND-01` 暴露明确 FAIL:空字符串 prompt 会触发真实运行时崩溃。 +- `R3-ABN-03` 记为 WARN:`mt < 0` 不会报错,而是直接返回原 prompt,说明缺少显式参数校验。 +- 其余边界输入(单字符、空白、多行)可运行。 +- 异常输入(`None`、缺失文件)均能以外部可见异常形式返回。 +- 当前环境下的冷启动基线约为 61.6s;有记忆的 greedy 生成平均约 0.96s。 diff --git a/reports/agent_memory_blackbox_round3_matrix.md b/reports/agent_memory_blackbox_round3_matrix.md new file mode 100644 index 0000000..a8d3b6f --- /dev/null +++ b/reports/agent_memory_blackbox_round3_matrix.md @@ -0,0 +1,108 @@ +# AgentMemorySystem 第三轮黑盒测试矩阵设计 + +## 1. 目标 + +第三轮黑盒测试在前两轮基础上继续扩展三类能力面: + +1. 边界输入测试 +2. 异常输入测试 +3. 性能 / 时延基线 + +仍坚持黑盒边界: + +- 不修改被测实现 +- 不使用 mock +- 不读取内部 memory tree、缓存或私有状态 +- 不调用源码内置 `test()` / `test_*()` 自测函数 +- 不把断言写成固定完整文本匹配 + +## 2. 公开调用面 + +第三轮只使用以下公开接口: + +- `MemLLM.load()` +- `MemLLM.write()` +- `MemLLM.generate()` +- `MemLLM.save_memory()` +- `MemLLM.load_memory()` + +## 3. 状态语义 + +- `PASS`:满足场景验收规则 +- `WARN`:场景完成但暴露风险、边界不稳或性能超出经验阈值 +- `FAIL`:违反硬性要求或出现未接受的外部异常 + +## 4. 
第三轮测试矩阵 + +| ID | 类型 | 场景 | 性质 | 核心刺激 | 主要观察指标 | 验收规则 | +|---|---|---|---|---|---|---| +| R3-BOUNDARY-01 | 边界输入 | empty prompt generate | diagnostic | `generate("")` | 是否报错、是否产生输出 | 不崩溃为 PASS;异常为 WARN | +| R3-BOUNDARY-02 | 边界输入 | punctuation-only prompt | diagnostic | 只含标点的 prompt | 前缀保持、输出扩展 | 完成生成为 PASS | +| R3-BOUNDARY-03 | 边界输入 | whitespace-heavy prompt | diagnostic | 空格/换行密集 prompt | 是否报错、输出长度 | 完成生成为 PASS | +| R3-BOUNDARY-04 | 边界输入 | minimal memory write | gating | 极短文本写入 | `write()` 是否返回有限 gate | 无异常且 gate 有限 | +| R3-EXC-01 | 异常输入 | non-string write input | gating | `write(None)` / `write(123)` | 是否抛出外部异常 | 必须抛出异常且进程不挂死 | +| R3-EXC-02 | 异常输入 | non-string generate input | gating | `generate(None)` / `generate(123)` | 是否抛出外部异常 | 必须抛出异常且进程不挂死 | +| R3-EXC-03 | 异常输入 | missing memory load path | gating | `load_memory("/tmp/not-found.pt")` | 是否抛出文件错误 | 必须抛出异常 | +| R3-EXC-04 | 异常输入 | invalid memory file load | gating | 对随机文本文件执行 `load_memory()` | 是否抛出解析异常 | 必须抛出异常 | +| R3-PERF-01 | 性能基线 | cold load baseline | diagnostic | 全新实例 `load("gpt2")` | 加载耗时 | 记录基线;超经验阈值记 WARN | +| R3-PERF-02 | 性能基线 | write latency baseline | diagnostic | 连续写入 3 条记忆 | 单次与平均耗时 | 记录基线;异常慢记 WARN | +| R3-PERF-03 | 性能基线 | generate latency baseline | diagnostic | 空记忆和有记忆两种生成 | 生成耗时 | 记录基线;异常慢记 WARN | +| R3-PERF-04 | 性能基线 | save/load roundtrip latency | diagnostic | 保存+读取记忆 | save/load 各自耗时 | 记录基线;异常慢记 WARN | + +## 5. 设计意图 + +### 5.1 边界输入 + +边界输入不是为了证明模型“语义上合理”,而是为了观察: + +- 接口是否在非常规输入下崩溃 +- 是否破坏 prompt 前缀约定 +- 是否出现空输出、死循环或明显外部异常 + +### 5.2 异常输入 + +第三轮把“错误输入是否被外部稳定处理”纳入黑盒验证范围。 + +这里不要求实现必须给出优雅错误文案,但要求: + +- 异常是**可观察、可终止、可定位**的 +- 不应导致进程卡死 +- 不应悄悄吞错并给出伪正常结果 + +### 5.3 性能 / 时延基线 + +性能场景重点不是追求绝对快,而是建立一条黑盒基线,回答: + +- 冷启动大约多慢 +- 单次写入大约多慢 +- 生成大约多慢 +- save/load 回环大约多慢 + +这些数据可为后续版本做对比回归。 + +## 6. 可执行资产 + +- 第三轮 runner: + `/workspace/blackbox_test_agent_memory_round3.py` + +## 7. 
推荐执行命令 + +代表集: + +```bash +python3 /workspace/blackbox_test_agent_memory_round3.py --suite representative +``` + +全量集: + +```bash +python3 /workspace/blackbox_test_agent_memory_round3.py --suite full +``` + +导出 JSON: + +```bash +python3 /workspace/blackbox_test_agent_memory_round3.py \ + --suite full \ + --json-out /workspace/reports/agent_memory_blackbox_round3_results.json +``` diff --git a/reports/agent_memory_blackbox_round3_results.json b/reports/agent_memory_blackbox_round3_results.json new file mode 100644 index 0000000..410a2a1 --- /dev/null +++ b/reports/agent_memory_blackbox_round3_results.json @@ -0,0 +1,165 @@ +{ + "aggregation_mode": "per-scenario-execution", + "python": "3.12.3", + "results": [ + { + "category": "boundary-input", + "duration_s": 61.6888816799983, + "metrics": {}, + "scenario_id": "R3-BOUND-01", + "status": "FAIL", + "suite": "representative", + "summary": "RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous", + "title": "empty prompt generation" + }, + { + "category": "boundary-input", + "duration_s": 60.232560045002174, + "metrics": { + "output": "A the I ami- \"I-k, ands" + }, + "scenario_id": "R3-BOUND-02", + "status": "PASS", + "suite": "full", + "summary": "Single-character prompt generation remained valid.", + "title": "single-character prompt generation" + }, + { + "category": "boundary-input", + "duration_s": 57.94982666999931, + "metrics": { + "output": " ia the world, I- The other\n (" + }, + "scenario_id": "R3-BOUND-03", + "status": "PASS", + "suite": "full", + "summary": "Whitespace prompt generation completed.", + "title": "whitespace prompt generation" + }, + { + "category": "boundary-input", + "duration_s": 61.2545259260005, + "metrics": { + "output": "Line one.\nLine two. 
the I have a,\n" + }, + "scenario_id": "R3-BOUND-04", + "status": "PASS", + "suite": "full", + "summary": "Multi-line prompt generation completed.", + "title": "multiline prompt generation" + }, + { + "category": "abnormal-input", + "duration_s": 59.29971606500112, + "metrics": { + "error_message": "You need to specify either `text` or `text_target`.", + "error_type": "ValueError" + }, + "scenario_id": "R3-ABN-01", + "status": "PASS", + "suite": "representative", + "summary": "write(None, ...) raised an externally visible exception as expected.", + "title": "write None input" + }, + { + "category": "abnormal-input", + "duration_s": 62.671864920997905, + "metrics": { + "error_message": "You need to specify either `text` or `text_target`.", + "error_type": "ValueError" + }, + "scenario_id": "R3-ABN-02", + "status": "PASS", + "suite": "full", + "summary": "generate(None, ...) raised an externally visible exception as expected.", + "title": "generate None input" + }, + { + "category": "abnormal-input", + "duration_s": 66.16167209299965, + "metrics": { + "output": "Hello" + }, + "scenario_id": "R3-ABN-03", + "status": "WARN", + "suite": "full", + "summary": "Negative mt returned the original prompt without explicit validation.", + "title": "negative max tokens handling" + }, + { + "category": "abnormal-input", + "duration_s": 61.27568913600044, + "metrics": { + "error_message": "[Errno 2] No such file or directory: '/tmp/agent-memory-nonexistent-file.pt'", + "error_type": "FileNotFoundError" + }, + "scenario_id": "R3-ABN-04", + "status": "PASS", + "suite": "full", + "summary": "load_memory() on a missing path raised an externally visible exception.", + "title": "load missing memory file" + }, + { + "category": "performance", + "duration_s": 61.66670775899911, + "metrics": { + "cold_load_s": 61.623 + }, + "scenario_id": "R3-PERF-01", + "status": "PASS", + "suite": "representative", + "summary": "Cold load latency baseline recorded.", + "title": "cold load latency 
baseline" + }, + { + "category": "performance", + "duration_s": 61.81413885800066, + "metrics": { + "avg_write_s": 0.081, + "max_write_s": 0.181, + "min_write_s": 0.028, + "write_count": 3 + }, + "scenario_id": "R3-PERF-02", + "status": "PASS", + "suite": "full", + "summary": "Write latency baseline recorded.", + "title": "write latency baseline" + }, + { + "category": "performance", + "duration_s": 64.73673818600219, + "metrics": { + "avg_generate_s": 0.957, + "generate_count": 3, + "max_generate_s": 1.002, + "min_generate_s": 0.887, + "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has" + }, + "scenario_id": "R3-PERF-03", + "status": "PASS", + "suite": "representative", + "summary": "Generate latency baseline recorded.", + "title": "generate latency baseline" + }, + { + "category": "performance", + "duration_s": 119.6933355800029, + "metrics": { + "load_s": 0.003, + "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has", + "save_s": 0.002 + }, + "scenario_id": "R3-PERF-04", + "status": "PASS", + "suite": "full", + "summary": "Save/load latency baseline recorded.", + "title": "save/load latency baseline" + } + ], + "suite": "full", + "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md", + "torch": "2.11.0+cu130", + "total_duration_s": 798.4456569190043, + "transformers": "4.57.6" +} \ No newline at end of file diff --git a/reports/agent_memory_blackbox_test_report.md b/reports/agent_memory_blackbox_test_report.md new file mode 100644 index 0000000..fb7a4bc --- /dev/null +++ b/reports/agent_memory_blackbox_test_report.md @@ -0,0 +1,316 @@ +# AgentMemorySystem 黑盒测试报告 + +## 1. 
测试目标 + +对上传文件 `/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md` 中的实现进行**黑盒测试**,要求满足: + +- 不使用 mock +- 不简化被测逻辑 +- 不修改被测代码 +- 不写依赖内部实现细节的 overfit 测试 +- 不依赖 fallback 路径掩盖真实问题 + +本次测试只通过其**公开可见运行行为**进行验证,不读取内部树结构、缓存、权重张量或私有辅助函数结果。 + +## 2. 被测对象 + +上传文档中包含完整可执行 Python 实现。黑盒测试仅使用以下公开调用方式: + +- `MemLLM.load()` +- `MemLLM.write()` +- `MemLLM.generate()` +- `MemLLM.save_memory()` +- `MemLLM.load_memory()` + +## 3. 测试环境 + +- OS: Linux 6.1.147 +- Python: 3.12.3 +- Torch: 2.11.0+cu130 +- 模型: `gpt2` + +本次测试先后验证了两套 `transformers` 环境: + +1. `transformers 5.5.4` +2. `transformers 4.57.6` + +## 4. 测试方法说明 + +### 4.1 黑盒边界 + +为了保持黑盒属性,测试中: + +- 不调用被测文件自带的 `test()`、`test_*()` 内部测试函数 +- 不读取 `amm.tree.store`、`_wte_neighbor_cache` 等内部状态 +- 不通过 monkey patch、stub、fake model、替身 tokenizer 等方式替换真实依赖 +- 不改源码、不降级功能、不删除逻辑 + +### 4.2 真实执行方式 + +测试采用真实依赖和真实模型执行: + +- 真实加载 `gpt2` +- 真实调用 `write()` 写入记忆 +- 真实调用 `generate()` 观察文本输出 +- 真实保存/加载记忆文件 + +### 4.3 非 overfit 原则 + +断言不绑定某个固定完整句子,而只检查稳定且对外有意义的行为,例如: + +- 是否成功加载 +- 是否保留 prompt 前缀 +- 是否在写入记忆后出现目标领域词 +- 保存/加载后是否保留该领域响应能力 + +这避免了把测试写成“必须生成某一字不差文本”的脆弱用例。 + +## 5. 测试过程 + +### 步骤 A:定位公开入口 + +确认上传文档是完整可执行实现,并识别公开调用面: + +- `load("gpt2")` +- `write(text, training_mode=True)` +- `generate(prompt, mt=..., greedy=True)` +- `save_memory(path)` +- `load_memory(path)` + +### 步骤 B:环境准备 + +初始环境缺少 `torch` 和 `transformers`,先安装真实运行所需依赖。 + +### 步骤 C:兼容性复现 + +在 `transformers 5.5.4` 下直接进行真实调用,结果 `generate()` 崩溃,报错为: + +`IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)` + +崩溃栈位于 GPT-2 block 前向过程中,说明当前实现与 `transformers 5.x` 存在兼容性问题。 + +### 步骤 D:在兼容环境下执行正式黑盒用例 + +将 `transformers` 切换到 `4.57.6` 后重新执行同样的真实调用,功能恢复正常,然后运行独立黑盒测试驱动: + +- 文件:`/workspace/blackbox_test_agent_memory_system.py` + +## 6. 
正式测试用例与结果 + +### TC-01 加载公开 API + +**目标** +验证 `MemLLM.load("gpt2")` 可在真实环境成功完成初始化。 + +**结果** +通过。 + +--- + +### TC-02 空记忆下生成 + +**目标** +验证未写入记忆时,`generate()` 可返回非空字符串,并保留输入 prompt 前缀。 + +**输入** +`prompt = "Hello"` + +**结果** +通过。 + +**输出样例** + +```text +'Hello the other a- I have in this, (the.\n "I' +``` + +--- + +### TC-03 写入前的音乐提示基线 + +**目标** +记录未写入音乐记忆前,对音乐 prompt 的基线输出。 + +**输入** +`prompt = "The piano performance"` + +**结果** +通过。 + +**输出样例** + +```text +'The piano performance of the and, "The world (the-theon the other people in a. The on' +``` + +**观察** +未出现命中的音乐领域关键词。 + +--- + +### TC-04 写入音乐记忆后观察领域接地 + +**目标** +在写入真实音乐语料后,验证 `generate()` 是否出现可观察的音乐领域信号。 + +**写入内容** + +1. `He practiced piano for hours perfecting a difficult Chopin nocturne.` +2. `She studied music theory and harmonic progression at the conservatory.` +3. `The orchestra rehearsed the symphony before the evening concert.` + +**结果** +通过。 + +**门控返回值** + +- `0.552463` +- `0.654567` +- `0.569074` + +**输出样例** + +```text +'The piano performance musical music the and violin, a- is an in that\'s.\n The other " it has' +``` + +**命中关键词** + +- `music` +- `musical` +- `violin` + +**结论** +从黑盒角度看,写入记忆后,生成结果出现了明确的音乐领域词,说明外部可观察的领域接地增强成立。 + +--- + +### TC-05 记忆前后领域信号增强 + +**目标** +比较 TC-03 与 TC-04,确认写入记忆后领域信号相对增强。 + +**结果** +通过。 + +**对比** + +- 写入前关键词命中:`[]` +- 写入后关键词命中:`['music', 'musical', 'violin']` + +**结论** +在黑盒观察层面,写入记忆前后确实产生了显著可见差异。 + +--- + +### TC-06 记忆保存/加载回环 + +**目标** +验证 `save_memory()` 与 `load_memory()` 后,模型仍保留可观察的音乐领域响应能力。 + +**结果** +通过。 + +**中间文件** + +- 大小:`25116 bytes` + +**重载后输出样例** + +```text +'The piano performance musical music the and violin, a- is an in that\'s.\n The other " it has' +``` + +**重载后关键词** + +- `music` +- `musical` +- `violin` + +**结论** +从外部行为看,记忆持久化与恢复功能成立。 + +## 7. 汇总结果 + +在 `transformers 4.57.6` 环境下,正式黑盒测试结果: + +- 通过:6 +- 失败:0 + +总耗时约: + +- `130.65s` + +## 8. 
发现的问题与风险 + +### P1:与 `transformers 5.x` 不兼容 + +**现象** +在 `transformers 5.5.4` 环境中,公开接口 `generate()` 真实执行时直接崩溃。 + +**外部影响** +这意味着如果用户在较新的 `transformers` 环境部署该实现,核心生成能力不可用。 + +**复现条件** + +1. 安装 `torch 2.11.0+cu130` +2. 安装 `transformers 5.5.4` +3. 加载上传实现 +4. 执行: + +```python +m = MemLLM(Cfg()) +m.load("gpt2") +m.generate("Hello", mt=15, greedy=True) +``` + +**结果** +抛出: + +```text +IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2) +``` + +### P2:跨域污染风险 + +在额外探测中,如果同时写入音乐和太空两组记忆,`"The piano performance"` 与 `"The space telescope"` 两个 prompt 都可能混入另一领域词汇。 + +这说明该系统从黑盒现象上存在一定的**领域边界串扰**。 +本问题不影响本次主测试的“功能是否可用”结论,但会影响更高要求的语义隔离质量。 + +## 9. 结论 + +### 9.1 功能结论 + +在**兼容环境 `transformers 4.57.6`** 下,被测实现从黑盒角度表现为: + +- 可以真实加载 +- 可以在空记忆下生成 +- 可以写入记忆并改变后续生成 +- 可以在目标 prompt 上体现领域接地增强 +- 可以保存和恢复记忆行为 + +### 9.2 质量结论 + +该实现具备可运行的外部功能闭环,但存在一个明确的工程风险: + +- 对 `transformers 5.x` 的兼容性失败 + +因此,如果用于真实交付或部署,建议至少将运行环境版本要求显式固定,或后续再做兼容性修复验证。 + +## 10. 交付物 + +本次新增的测试资产: + +- 黑盒测试驱动:`/workspace/blackbox_test_agent_memory_system.py` +- 测试报告:`/workspace/reports/agent_memory_blackbox_test_report.md` + +## 11. 
复现命令 + +在当前仓库根目录执行: + +```bash +python3 /workspace/blackbox_test_agent_memory_system.py +``` + +如果要复现兼容性问题,可在 `transformers 5.x` 环境下执行真实调用进行验证。 diff --git a/reports/agent_memory_cross_domain_heatmap.json b/reports/agent_memory_cross_domain_heatmap.json new file mode 100644 index 0000000..34e414b --- /dev/null +++ b/reports/agent_memory_cross_domain_heatmap.json @@ -0,0 +1,368 @@ +{ + "avg_gate": 0.564429484307766, + "duration_s": 75.81960409800013, + "matrix": { + "cooking::p1": { + "cooking": { + "count": 1, + "hits": [ + "chef" + ] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 2, + "hits": [ + "music", + "musical" + ] + }, + "space": { + "count": 0, + "hits": [] + } + }, + "cooking::p2": { + "cooking": { + "count": 1, + "hits": [ + "pasta" + ] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 3, + "hits": [ + "music", + "musical", + "practice" + ] + }, + "space": { + "count": 0, + "hits": [] + } + }, + "cooking::p3": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 0, + "hits": [] + }, + "space": { + "count": 0, + "hits": [] + } + }, + "finance::p1": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 1, + "hits": [ + "market" + ] + }, + "music": { + "count": 2, + "hits": [ + "music", + "musical" + ] + }, + "space": { + "count": 1, + "hits": [ + "mission" + ] + } + }, + "finance::p2": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 1, + "hits": [ + "portfolio" + ] + }, + "music": { + "count": 0, + "hits": [] + }, + "space": { + "count": 0, + "hits": [] + } + }, + "finance::p3": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 0, + "hits": [] + }, + "space": { + "count": 0, + "hits": [] + } + }, + "music::p1": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + 
"count": 2, + "hits": [ + "music", + "musical" + ] + }, + "space": { + "count": 1, + "hits": [ + "mission" + ] + } + }, + "music::p2": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 3, + "hits": [ + "music", + "musical", + "violin" + ] + }, + "space": { + "count": 0, + "hits": [] + } + }, + "music::p3": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 3, + "hits": [ + "music", + "musical", + "theory" + ] + }, + "space": { + "count": 1, + "hits": [ + "mission" + ] + } + }, + "space::p1": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 2, + "hits": [ + "music", + "musical" + ] + }, + "space": { + "count": 3, + "hits": [ + "mission", + "orbit", + "planet" + ] + } + }, + "space::p2": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 0, + "hits": [] + }, + "space": { + "count": 2, + "hits": [ + "mission", + "planet" + ] + } + }, + "space::p3": { + "cooking": { + "count": 0, + "hits": [] + }, + "finance": { + "count": 0, + "hits": [] + }, + "music": { + "count": 2, + "hits": [ + "music", + "musical" + ] + }, + "space": { + "count": 2, + "hits": [ + "mission", + "planet" + ] + } + } + }, + "model_name": "gpt2", + "outputs": { + "cooking::p1": { + "output": "The chef prepared culinary chef of increased pastry, musical the team and reduce a.\n- or that's in this", + "prompt": "The chef prepared" + }, + "cooking::p2": { + "output": "The pasta course pasta practice musical, the team of a and increased by an- or in that is to be all", + "prompt": "The pasta course" + }, + "cooking::p3": { + "output": "The dessert service service services team and increased the information, a new- is an in that's.\n of it", + "prompt": "The dessert service" + }, + "finance::p1": { + "output": "The market outlook 
musical market the mission of increased a, and reduce in that is an-\n. to be all", + "prompt": "The market outlook" + }, + "finance::p2": { + "output": "The portfolio manager increased portfolio management team the and reduced a, or that's reduce stock-\n. in this is", + "prompt": "The portfolio manager" + }, + "finance::p3": { + "output": "The quarterly earnings call increased reduce the team and reduced-, a decreased in this lowered its\n. of an all that", + "prompt": "The quarterly earnings call" + }, + "music::p1": { + "output": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is", + "prompt": "The piano performance" + }, + "music::p2": { + "output": "The violin recital violin musical music the team, a- and increased in this is an all of that's.\n", + "prompt": "The violin recital" + }, + "music::p3": { + "output": "The music theory lesson theory musical mission of the \" and increased., a-\n in this is an I am more", + "prompt": "The music theory lesson" + }, + "space::p1": { + "output": "The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's\n", + "prompt": "The space telescope" + }, + "space::p2": { + "output": "The Mars mission mission missions planets. Mission Missions mission\n reduce team of the world, and increased to a- or", + "prompt": "The Mars mission" + }, + "space::p3": { + "output": "The orbital research team mission team musical. 
increased the planets and reduce, a- or in this is an all of that", + "prompt": "The orbital research team" + } + }, + "own_vs_foreign": { + "cooking": { + "foreign_hits_count": 5, + "foreign_to_own_ratio": 2.5, + "own_hits_count": 2, + "verdict": "high-contamination" + }, + "finance": { + "foreign_hits_count": 3, + "foreign_to_own_ratio": 1.5, + "own_hits_count": 2, + "verdict": "high-contamination" + }, + "music": { + "foreign_hits_count": 2, + "foreign_to_own_ratio": 0.25, + "own_hits_count": 8, + "verdict": "mixed" + }, + "space": { + "foreign_hits_count": 4, + "foreign_to_own_ratio": 0.5714285714285714, + "own_hits_count": 7, + "verdict": "mixed" + } + }, + "prompt_rows": [ + "music::p1", + "music::p2", + "music::p3", + "space::p1", + "space::p2", + "space::p3", + "finance::p1", + "finance::p2", + "finance::p3", + "cooking::p1", + "cooking::p2", + "cooking::p3" + ], + "python": "3.12.3", + "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md", + "torch": "2.11.0+cu130", + "transformers": "4.57.6" +} \ No newline at end of file diff --git a/reports/agent_memory_cross_domain_heatmap.md b/reports/agent_memory_cross_domain_heatmap.md new file mode 100644 index 0000000..a0a26ac --- /dev/null +++ b/reports/agent_memory_cross_domain_heatmap.md @@ -0,0 +1,267 @@ +# AgentMemorySystem 跨域污染热图报告 + +## 1. 执行环境 + +- Python: 3.12.3 +- Torch: 2.11.0+cu130 +- Transformers: 4.57.6 +- Model: gpt2 +- 总耗时: `75.82s` + +## 2. 说明 + +该报告通过黑盒方式同时写入多个领域语料,再对各领域多个 prompt 变体进行生成, +统计 continuation 中命中的各领域关键词数量。 + +热图符号说明: + +- `0`: 无命中 +- `1`: 低污染/低命中 +- `2`: 中等 +- `4`: 高 + +## 3. 
放大版关键词命中计数矩阵 + +| prompt variant\\keyword | music | space | finance | cooking | +|---|---|---|---|---| +| music::p1 | 2 (2) | 1 (1) | 0 (0) | 0 (0) | +| music::p2 | 3 (2) | 0 (0) | 0 (0) | 0 (0) | +| music::p3 | 3 (2) | 1 (1) | 0 (0) | 0 (0) | +| space::p1 | 2 (2) | 3 (2) | 0 (0) | 0 (0) | +| space::p2 | 0 (0) | 2 (2) | 0 (0) | 0 (0) | +| space::p3 | 2 (2) | 2 (2) | 0 (0) | 0 (0) | +| finance::p1 | 2 (2) | 1 (1) | 1 (1) | 0 (0) | +| finance::p2 | 0 (0) | 0 (0) | 1 (1) | 0 (0) | +| finance::p3 | 0 (0) | 0 (0) | 0 (0) | 0 (0) | +| cooking::p1 | 2 (2) | 0 (0) | 0 (0) | 1 (1) | +| cooking::p2 | 3 (2) | 0 (0) | 0 (0) | 1 (1) | +| cooking::p3 | 0 (0) | 0 (0) | 0 (0) | 0 (0) | + +## 4. 按领域汇总的 own-domain vs foreign-domain + +| prompt domain | own hits | foreign hits | foreign/own ratio | verdict | +|---|---:|---:|---:|---| +| music | 8 | 2 | 0.25 | mixed | +| space | 7 | 4 | 0.57 | mixed | +| finance | 2 | 3 | 1.50 | high-contamination | +| cooking | 2 | 5 | 2.50 | high-contamination | + +## 5. 各 prompt 变体生成样例 + +### music::p1 + +**Prompt**: `The piano performance` + +**Output**: + +```text +The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is +``` + +**命中详情**: + +- music: ['music', 'musical'] +- space: ['mission'] +- finance: [] +- cooking: [] + +### music::p2 + +**Prompt**: `The violin recital` + +**Output**: + +```text +The violin recital violin musical music the team, a- and increased in this is an all of that's. 
+ +``` + +**命中详情**: + +- music: ['music', 'musical', 'violin'] +- space: [] +- finance: [] +- cooking: [] + +### music::p3 + +**Prompt**: `The music theory lesson` + +**Output**: + +```text +The music theory lesson theory musical mission of the " and increased., a- + in this is an I am more +``` + +**命中详情**: + +- music: ['music', 'musical', 'theory'] +- space: ['mission'] +- finance: [] +- cooking: [] + +### space::p1 + +**Prompt**: `The space telescope` + +**Output**: + +```text +The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's + +``` + +**命中详情**: + +- music: ['music', 'musical'] +- space: ['mission', 'orbit', 'planet'] +- finance: [] +- cooking: [] + +### space::p2 + +**Prompt**: `The Mars mission` + +**Output**: + +```text +The Mars mission mission missions planets. Mission Missions mission + reduce team of the world, and increased to a- or +``` + +**命中详情**: + +- music: [] +- space: ['mission', 'planet'] +- finance: [] +- cooking: [] + +### space::p3 + +**Prompt**: `The orbital research team` + +**Output**: + +```text +The orbital research team mission team musical. increased the planets and reduce, a- or in this is an all of that +``` + +**命中详情**: + +- music: ['music', 'musical'] +- space: ['mission', 'planet'] +- finance: [] +- cooking: [] + +### finance::p1 + +**Prompt**: `The market outlook` + +**Output**: + +```text +The market outlook musical market the mission of increased a, and reduce in that is an- +. to be all +``` + +**命中详情**: + +- music: ['music', 'musical'] +- space: ['mission'] +- finance: ['market'] +- cooking: [] + +### finance::p2 + +**Prompt**: `The portfolio manager` + +**Output**: + +```text +The portfolio manager increased portfolio management team the and reduced a, or that's reduce stock- +. 
in this is +``` + +**命中详情**: + +- music: [] +- space: [] +- finance: ['portfolio'] +- cooking: [] + +### finance::p3 + +**Prompt**: `The quarterly earnings call` + +**Output**: + +```text +The quarterly earnings call increased reduce the team and reduced-, a decreased in this lowered its +. of an all that +``` + +**命中详情**: + +- music: [] +- space: [] +- finance: [] +- cooking: [] + +### cooking::p1 + +**Prompt**: `The chef prepared` + +**Output**: + +```text +The chef prepared culinary chef of increased pastry, musical the team and reduce a. +- or that's in this +``` + +**命中详情**: + +- music: ['music', 'musical'] +- space: [] +- finance: [] +- cooking: ['chef'] + +### cooking::p2 + +**Prompt**: `The pasta course` + +**Output**: + +```text +The pasta course pasta practice musical, the team of a and increased by an- or in that is to be all +``` + +**命中详情**: + +- music: ['music', 'musical', 'practice'] +- space: [] +- finance: [] +- cooking: ['pasta'] + +### cooking::p3 + +**Prompt**: `The dessert service` + +**Output**: + +```text +The dessert service service services team and increased the information, a new- is an in that's. + of it +``` + +**命中详情**: + +- music: [] +- space: [] +- finance: [] +- cooking: [] + +## 6. 结论 + +如果 foreign hits 在多个 prompt 上持续显著非零,则说明系统存在跨域污染。 +如果 own hits 明显高于 foreign hits,则说明仍保留一定的领域接地能力。 diff --git a/reports/generate_cross_domain_contamination_heatmap.py b/reports/generate_cross_domain_contamination_heatmap.py new file mode 100644 index 0000000..8977711 --- /dev/null +++ b/reports/generate_cross_domain_contamination_heatmap.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +"""Generate a black-box cross-domain contamination heatmap report. 
+ +The script uses only the public runtime behavior of the uploaded +AgentMemorySystem implementation: + +- MemLLM.load() +- MemLLM.write() +- MemLLM.generate() + +It writes all domain corpora into a single model instance, probes each domain +prompt, counts keyword hits by domain, and emits both JSON and Markdown +artifacts for heatmap-style inspection. +""" + +from __future__ import annotations + +import importlib.util +import json +import os +import platform +import time +from importlib.machinery import SourceFileLoader + +import torch +import transformers + + +TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md" +MODEL_NAME = "gpt2" +DEFAULT_SEED = 42 +JSON_OUT = "/workspace/reports/agent_memory_cross_domain_heatmap.json" +MD_OUT = "/workspace/reports/agent_memory_cross_domain_heatmap.md" + +PROMPTS = { + "music": [ + "The piano performance", + "The violin recital", + "The music theory lesson", + ], + "space": [ + "The space telescope", + "The Mars mission", + "The orbital research team", + ], + "finance": [ + "The market outlook", + "The portfolio manager", + "The quarterly earnings call", + ], + "cooking": [ + "The chef prepared", + "The pasta course", + "The dessert service", + ], +} + +CORPORA = { + "music": [ + "He practiced piano for hours perfecting a difficult Chopin nocturne.", + "She studied music theory and harmonic progression at the conservatory.", + "The orchestra rehearsed the symphony before the evening concert.", + ], + "space": [ + "Astronauts trained for the Mars mission in simulated zero gravity.", + "The telescope revealed distant galaxies beyond the Milky Way.", + "Mission control tracked the spacecraft during orbital insertion.", + ], + "finance": [ + "Investors monitored inflation data before the central bank meeting.", + "The portfolio manager reduced exposure to volatile growth stocks.", + "Quarterly earnings guidance shifted sentiment across the market.", + ], + "cooking": [ + "The chef reduced the sauce 
# Per-domain keyword stems used to score generated continuations.  Entries
# are matched as word prefixes by keyword_hits(), so a stem such as "galax"
# covers "galaxy" and "galaxies".  Overlapping stems ("orbit"/"orbital",
# "space"/"spacecraft", "music"/"musical") each count as a separate hit.
KEYWORDS = {
    "music": {
        "music",
        "musical",
        "violin",
        "concert",
        "symphony",
        "guitar",
        "practice",
        "practicing",
        "piano",
        "theory",
    },
    "space": {
        "space",
        "telescope",
        "galax",
        "orbit",
        "orbital",
        "mars",
        "mission",
        "astronaut",
        "spacecraft",
        "planet",
    },
    "finance": {
        "market",
        "stocks",
        "portfolio",
        "inflation",
        "bank",
        "earnings",
        "investor",
        "trading",
        "equity",
        "sentiment",
    },
    "cooking": {
        "chef",
        "sauce",
        "pasta",
        "dessert",
        "olive",
        "basil",
        "plating",
        "chocolate",
        "kitchen",
        "roasted",
    },
}

# (count, symbol, label) legend rows.
# NOTE(review): nothing in this script reads this table; heat_symbol() below
# hard-codes its own buckets and never emits "3".
HEATMAP_SCALE = [
    (0, "0", "none"),
    (1, "1", "low"),
    (2, "2", "moderate"),
    (3, "3", "moderate"),
    (4, "4", "high"),
]


def ensure(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is truthy.

    Unlike an ``assert`` statement, this check is not stripped when Python
    runs with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def load_target_module(path: str):
    """Execute the Python source at *path* as module ``agent_memory_heatmap``.

    ``SourceFileLoader`` is given the path explicitly, so the file does not
    need a ``.py`` suffix.

    The module is registered in ``sys.modules`` *before* its body runs, as the
    importlib "import a source file directly" recipe does; code that looks up
    its own module during import fails otherwise.  If execution raises, the
    half-initialised entry is removed again and the exception propagates.

    Raises:
        RuntimeError: if no import spec can be created for *path*.
    """
    import sys  # local import: the file's import block does not provide ``sys``

    loader = SourceFileLoader("agent_memory_heatmap", path)
    spec = importlib.util.spec_from_loader(loader.name, loader)
    if spec is None:
        raise RuntimeError(f"Unable to create import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[loader.name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # Do not leave a partially executed module importable by name.
        sys.modules.pop(loader.name, None)
        raise
    return module


def build_model(module):
    """Build ``module.MemLLM(module.Cfg())`` and load ``MODEL_NAME`` into it.

    The global torch RNG is seeded with ``DEFAULT_SEED`` before the model is
    constructed.
    """
    torch.manual_seed(DEFAULT_SEED)
    model = module.MemLLM(module.Cfg())
    model.load(MODEL_NAME)
    return model


def _starts_a_word(text: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text* at the start of a word.

    An occurrence qualifies when it sits at index 0 or the preceding character
    is not alphanumeric.  The keyword may be followed by anything, so stems
    still match longer words.
    """
    start = text.find(keyword)
    while start != -1:
        if start == 0 or not text[start - 1].isalnum():
            return True
        start = text.find(keyword, start + 1)
    return False


def keyword_hits(text: str, keywords: set[str]) -> list[str]:
    """Return the sorted keywords that begin a word in *text*.

    Matching is case-insensitive on *text*; keywords are expected to be
    lowercase.  A keyword counts only when it starts a word, so "bank" matches
    "banking" but not "embankment", and "space" does not fire on "workspace".
    Each keyword is reported at most once regardless of how often it occurs.
    """
    lowered = text.lower()
    return sorted(keyword for keyword in keywords if _starts_a_word(lowered, keyword))


def continuation(prompt: str, output: str) -> str:
    """Return the part of *output* that follows the leading *prompt*.

    Raises:
        AssertionError: if *output* does not start with *prompt*.
    """
    ensure(output.startswith(prompt), f"Output does not preserve prompt prefix: {output!r}")
    return output[len(prompt) :]


def stable_write(model, texts: list[str]) -> list[float]:
    """Write each text into *model* and collect the returned gate values.

    Every ``model.write(text, training_mode=True)`` call must report exactly
    one stored memory and exactly one gate value.

    Raises:
        AssertionError: if a write reports a different count of either.
    """
    gates: list[float] = []
    for text in texts:
        stored, gate_vals = model.write(text, training_mode=True)
        ensure(stored == 1, f"Expected one stored memory for training_mode=True, got {stored}")
        ensure(len(gate_vals) == 1, f"Expected one gate value, got {gate_vals}")
        gates.extend(gate_vals)
    return gates


def heat_symbol(count: int) -> str:
    """Map a hit count to its heatmap bucket symbol.

    ``<= 0`` -> "0", ``1`` -> "1", ``2``-``3`` -> "2", ``>= 4`` -> "4".
    """
    if count <= 0:
        return "0"
    if count == 1:
        return "1"
    if count <= 3:
        return "2"
    return "4"
def build_markdown(payload: dict) -> str:
    """Render the heatmap *payload* as a Markdown report.

    Reads the payload keys ``python``, ``torch``, ``transformers``,
    ``model_name``, ``duration_s``, ``prompt_rows``, ``matrix``, ``outputs``
    and ``own_vs_foreign``.  Table columns follow the key order of ``PROMPTS``.

    Returns:
        The report as one newline-joined string.
    """
    domains = list(PROMPTS)
    matrix = payload["matrix"]
    outputs = payload["outputs"]
    own_foreign = payload["own_vs_foreign"]
    prompt_rows = payload["prompt_rows"]

    lines = [
        "# AgentMemorySystem 跨域污染热图报告",
        "",
        "## 1. 执行环境",
        "",
        f"- Python: {payload['python']}",
        f"- Torch: {payload['torch']}",
        f"- Transformers: {payload['transformers']}",
        f"- Model: {payload['model_name']}",
        f"- 总耗时: `{payload['duration_s']:.2f}s`",
        "",
        "## 2. 说明",
        "",
        "该报告通过黑盒方式同时写入多个领域语料,再对各领域多个 prompt 变体进行生成,",
        "统计 continuation 中命中的各领域关键词数量。",
        "",
        "热图符号说明:",
        "",
        "- `0`: 无命中",
        "- `1`: 低污染/低命中",
        "- `2`: 中等",
        "- `4`: 高",
        "",
        "## 3. 放大版关键词命中计数矩阵",
        "",
    ]

    # Count matrix: one row per prompt variant, one column per keyword domain.
    # The literal emits two backslashes, i.e. a Markdown-escaped backslash.
    header = "| prompt variant\\\\keyword | " + " | ".join(domains) + " |"
    sep = "|" + "---|" * (len(domains) + 1)
    lines.extend([header, sep])
    for row_id in prompt_rows:
        row_cells = [row_id]
        for keyword_domain in domains:
            count = matrix[row_id][keyword_domain]["count"]
            row_cells.append(f"{count} ({heat_symbol(count)})")
        lines.append("| " + " | ".join(row_cells) + " |")

    # Per-domain totals of own-domain versus foreign-domain hits.
    lines.extend(
        [
            "",
            "## 4. 按领域汇总的 own-domain vs foreign-domain",
            "",
            "| prompt domain | own hits | foreign hits | foreign/own ratio | verdict |",
            "|---|---:|---:|---:|---|",
        ]
    )
    for domain in domains:
        summary = own_foreign[domain]
        ratio = summary["foreign_to_own_ratio"]
        # A ratio of None means there were no own-domain hits to divide by.
        ratio_text = "inf" if ratio is None else f"{ratio:.2f}"
        lines.append(
            f"| {domain} | {summary['own_hits_count']} | {summary['foreign_hits_count']} | "
            f"{ratio_text} | {summary['verdict']} |"
        )

    # Raw generations and the keywords each one hit, per prompt variant.
    lines.extend(["", "## 5. 各 prompt 变体生成样例", ""])
    for row_id in prompt_rows:
        lines.extend(
            [
                f"### {row_id}",
                "",
                f"**Prompt**: `{outputs[row_id]['prompt']}`",
                "",
                "**Output**:",
                "",
                "```text",
                outputs[row_id]["output"],
                "```",
                "",
                "**命中详情**:",
                "",
            ]
        )
        for keyword_domain in domains:
            hits = matrix[row_id][keyword_domain]["hits"]
            lines.append(f"- {keyword_domain}: {hits}")
        lines.append("")

    lines.extend(
        [
            "## 6. 结论",
            "",
            "如果 foreign hits 在多个 prompt 上持续显著非零,则说明系统存在跨域污染。",
            "如果 own hits 明显高于 foreign hits,则说明仍保留一定的领域接地能力。",
            "",
        ]
    )
    return "\n".join(lines)


def main() -> int:
    """Run the cross-domain probe and write the JSON and Markdown reports.

    Steps: load the target module, build one model, write every corpus in
    ``CORPORA`` into it, generate a continuation for every prompt in
    ``PROMPTS``, count keyword hits per domain, derive a per-domain verdict,
    then write ``JSON_OUT`` and ``MD_OUT``.

    Returns:
        ``0`` once both report files have been written.

    Raises:
        AssertionError: if ``TARGET_PATH`` is missing, a write does not store
            exactly one memory, or a generation drops its prompt prefix.
        OSError: if a report directory cannot be created or a file written.
    """
    ensure(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")
    # Create the report directories up front so a missing or unwritable
    # location fails before the model run instead of discarding its results.
    for out_path in (JSON_OUT, MD_OUT):
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    started = time.perf_counter()
    module = load_target_module(TARGET_PATH)
    model = build_model(module)
    gates: list[float] = []
    for domain in PROMPTS:
        gates.extend(stable_write(model, CORPORA[domain]))

    matrix: dict[str, dict[str, dict[str, object]]] = {}
    outputs: dict[str, dict[str, str]] = {}
    own_vs_foreign: dict[str, dict[str, object]] = {}
    prompt_rows: list[str] = []
    domains = list(PROMPTS)
    aggregate: dict[str, dict[str, int]] = {
        domain: {"own": 0, "foreign": 0} for domain in domains
    }
    for prompt_domain, prompts in PROMPTS.items():
        for idx, prompt in enumerate(prompts, start=1):
            row_id = f"{prompt_domain}::p{idx}"
            prompt_rows.append(row_id)
            output = model.generate(prompt, mt=20, greedy=True)
            # Score only the generated continuation, so keywords already
            # present in the prompt do not count.
            cont = continuation(prompt, output)
            outputs[row_id] = {"prompt": prompt, "output": output}
            row: dict[str, dict[str, object]] = {}
            for keyword_domain in domains:
                hits = keyword_hits(cont, KEYWORDS[keyword_domain])
                row[keyword_domain] = {
                    "hits": hits,
                    "count": len(hits),
                }
                bucket = "own" if keyword_domain == prompt_domain else "foreign"
                aggregate[prompt_domain][bucket] += len(hits)
            matrix[row_id] = row

    for domain in domains:
        own_count = aggregate[domain]["own"]
        foreign_count = aggregate[domain]["foreign"]
        ratio = None if own_count == 0 else foreign_count / own_count
        # No own-domain hits, or at least as many foreign hits as own hits,
        # is classed as high contamination.
        if own_count == 0 or foreign_count >= own_count:
            verdict = "high-contamination"
        elif foreign_count > 0:
            verdict = "mixed"
        else:
            verdict = "clean"
        own_vs_foreign[domain] = {
            "own_hits_count": own_count,
            "foreign_hits_count": foreign_count,
            "foreign_to_own_ratio": ratio,
            "verdict": verdict,
        }

    duration = time.perf_counter() - started
    payload = {
        "target_path": TARGET_PATH,
        "model_name": MODEL_NAME,
        "python": platform.python_version(),
        "torch": torch.__version__,
        "transformers": transformers.__version__,
        "duration_s": duration,
        "avg_gate": sum(gates) / len(gates) if gates else None,
        "prompt_rows": prompt_rows,
        "matrix": matrix,
        "outputs": outputs,
        "own_vs_foreign": own_vs_foreign,
    }

    with open(JSON_OUT, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False, sort_keys=True)
    with open(MD_OUT, "w", encoding="utf-8") as handle:
        handle.write(build_markdown(payload))

    print("Cross-domain contamination heatmap generated.")
    print(f"JSON: {JSON_OUT}")
    print(f"Markdown: {MD_OUT}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
0000000..515b110 --- /dev/null +++ b/reports/repro_transformers5_generate_failure.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +"""Minimal black-box reproducer for the transformers 5.x generate failure.""" + +from __future__ import annotations + +import importlib.util +from importlib.machinery import SourceFileLoader + +import torch +import transformers + + +TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md" + + +def load_target_module(path: str): + loader = SourceFileLoader("agent_memory_system_repro", path) + spec = importlib.util.spec_from_loader(loader.name, loader) + if spec is None: + raise RuntimeError(f"Unable to create import spec for {path}") + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) + return module + + +def main() -> int: + print(f"torch={torch.__version__}") + print(f"transformers={transformers.__version__}") + + module = load_target_module(TARGET_PATH) + torch.manual_seed(42) + model = module.MemLLM(module.Cfg()) + model.load("gpt2") + print(model.generate("Hello", mt=15, greedy=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())