diff --git a/blackbox_test_agent_memory_round2.py b/blackbox_test_agent_memory_round2.py
new file mode 100644
index 0000000..902dd92
--- /dev/null
+++ b/blackbox_test_agent_memory_round2.py
@@ -0,0 +1,552 @@
+#!/usr/bin/env python3
+"""Second-round black-box matrix runner for AgentMemorySystem.
+
+This runner extends the first-round smoke tests with broader black-box
+coverage:
+
+- stress scenarios
+- long-text scenarios
+- cross-domain contamination diagnostics
+- stability scenarios
+
+The runner still treats the uploaded implementation as an opaque component and
+only uses its public runtime behavior:
+
+- MemLLM.load()
+- MemLLM.write()
+- MemLLM.generate()
+- MemLLM.save_memory()
+- MemLLM.load_memory()
+
+Status semantics:
+- PASS: scenario met its acceptance rule
+- WARN: scenario finished, but a diagnostic risk was observed
+- FAIL: scenario violated a gating requirement or raised an unexpected error
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import json
+import math
+import os
+import platform
+import tempfile
+import time
+from dataclasses import asdict, dataclass
+from importlib.machinery import SourceFileLoader
+from typing import Callable
+
+import torch
+import transformers
+
+
+TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"  # machine-specific absolute path; load_target_module() executes it as Python source despite the .md suffix
+MODEL_NAME = "gpt2"  # handed to MemLLM.load() in build_model()
+DEFAULT_SEED = 42  # default torch.manual_seed() value in build_model()
+
+PROMPTS = {  # one generation prompt per domain; same keys as CORPORA and KEYWORDS
+    "music": "The piano performance",
+    "space": "The space telescope",
+    "finance": "The market outlook",
+    "cooking": "The chef prepared",
+}
+
+CORPORA = {
+    "music": [
+        "He practiced piano for hours perfecting a difficult Chopin nocturne.",
+        "She studied music theory and harmonic progression at the conservatory.",
+        "The orchestra rehearsed the symphony before the evening concert.",
+    ],
+    "space": [
+        "Astronauts trained for the Mars mission in simulated zero gravity.",
+        "The telescope revealed distant galaxies beyond the Milky Way.",
+        "Mission control tracked the spacecraft during orbital insertion.",
+    ],
+    "finance": [
+        "Investors monitored inflation data before the central bank meeting.",
+        "The portfolio manager reduced exposure to volatile growth stocks.",
+        "Quarterly earnings guidance shifted sentiment across the market.",
+    ],
+    "cooking": [
+        "The chef reduced the sauce slowly before plating the duck.",
+        "Fresh basil and olive oil brightened the pasta at the finish.",
+        "The pastry team tempered chocolate for the dessert service.",
+    ],
+}
+
+KEYWORDS = {
+    "music": {
+        "music",
+        "musical",
+        "violin",
+        "concert",
+        "symphony",
+        "guitar",
+        "practice",
+        "practicing",
+        "piano",
+        "theory",
+    },
+    "space": {
+        "space",
+        "telescope",
+        "galax",
+        "orbit",
+        "orbital",
+        "mars",
+        "mission",
+        "astronaut",
+        "spacecraft",
+        "planet",
+    },
+    "finance": {
+        "market",
+        "stocks",
+        "portfolio",
+        "inflation",
+        "bank",
+        "earnings",
+        "investor",
+        "trading",
+        "equity",
+        "sentiment",
+    },
+    "cooking": {
+        "chef",
+        "sauce",
+        "pasta",
+        "dessert",
+        "olive",
+        "basil",
+        "plating",
+        "chocolate",
+        "kitchen",
+        "roasted",
+    },
+}
+
+
+@dataclass
+class ScenarioDef:
+    scenario_id: str  # stable identifier such as "R2-STRESS-01"; matched by the --scenario filter
+    category: str  # grouping label shown in the report, e.g. "stress"
+    title: str  # short human-readable description
+    suite: str  # "representative" or "full"; consulted by select_scenarios()
+    runner: Callable[[object], tuple[str, str, dict]]  # called with the loaded target module; returns (status, summary, metrics)
+
+
+@dataclass
+class ScenarioResult:
+    scenario_id: str  # copied from the ScenarioDef that produced this result
+    category: str  # copied from the ScenarioDef
+    title: str  # copied from the ScenarioDef
+    suite: str  # copied from the ScenarioDef
+    status: str  # "PASS", "WARN" or "FAIL" (the three values main() counts)
+    duration_s: float  # seconds measured with time.perf_counter() in run_scenario()
+    summary: str  # one-line outcome text; the exception text for results built from a raised error
+    metrics: dict  # scenario-specific details; {} when the runner raised
+
+
+def load_target_module(path: str):
+    import sys  # function-scope import: only the sys.modules registration below needs it
+    spec = importlib.util.spec_from_loader("agent_memory_system_round2", SourceFileLoader("agent_memory_system_round2", path))
+    if spec is None:
+        raise RuntimeError(f"Unable to create import spec for {path}")
+    module = sys.modules[spec.name] = importlib.util.module_from_spec(spec)  # register before executing: dataclasses and pickle resolve cls.__module__ through sys.modules
+    spec.loader.exec_module(module)  # spec.loader is the SourceFileLoader built above, so the file extension does not matter
+    return module
+
+
+def build_model(module, seed: int = DEFAULT_SEED):
+    """Seed torch with `seed`, then return module.MemLLM(module.Cfg()) after load(MODEL_NAME)."""
+    torch.manual_seed(seed)
+    model = module.MemLLM(module.Cfg())
+    model.load(MODEL_NAME)
+    return model
+
+
+def ensure(condition: bool, message: str) -> None:
+    if not condition:  # any falsy value (False, an empty list, ...) counts as a failed check
+        raise AssertionError(message)  # run_scenario() reports AssertionError as FAIL with this message
+
+
+def keyword_hits(text: str, keywords: set[str]) -> list[str]:
+    """Return, sorted, every keyword that is a substring of text.lower() (so a stem like "galax" matches "galaxy")."""
+    return sorted(filter(text.lower().__contains__, keywords))
+
+
+def continuation(prompt: str, output: str) -> str:
+    ensure(output.startswith(prompt), f"Output does not preserve prompt prefix: {output!r}")  # the output must echo the prompt verbatim first
+    return output[len(prompt) :]  # only the text that follows the prompt
+
+
+def mean_gate(values: list[float]) -> float:
+    ensure(values, "No gate values were collected")  # an empty list would otherwise divide by zero below
+    return float(sum(values) / len(values))  # arithmetic mean as a plain float
+
+
+def stable_write(model, texts: list[str]) -> list[float]:
+    gates: list[float] = []  # one gate value per written text, in input order
+    for text in texts:
+        stored, gate_vals = model.write(text, training_mode=True)  # unpacked as (stored_count, gate_values)
+        ensure(stored == 1, f"Expected one stored memory for training_mode=True, got {stored}")
+        ensure(len(gate_vals) == 1, f"Expected one gate value, got {gate_vals}")
+        ensure(math.isfinite(gate_vals[0]), f"Gate value is not finite: {gate_vals[0]}")  # rejects NaN and +/-inf
+        gates.extend(gate_vals)
+    return gates
+
+
+def run_scenario(module, scenario: ScenarioDef) -> ScenarioResult:
+    """Run one scenario, time it, and turn any exception into a FAIL result.
+
+    An AssertionError (failed ensure()) is summarised by its message alone; any
+    other Exception as "<TypeName>: <message>". FAIL results built here carry
+    empty metrics.
+    """
+    t0 = time.perf_counter()
+    try:
+        status, summary, metrics = scenario.runner(module)
+    except Exception as exc:  # pragma: no cover - surfaced in terminal output
+        status, metrics = "FAIL", {}
+        summary = str(exc) if isinstance(exc, AssertionError) else f"{type(exc).__name__}: {exc}"
+    return ScenarioResult(
+        scenario_id=scenario.scenario_id,
+        category=scenario.category,
+        title=scenario.title,
+        suite=scenario.suite,
+        status=status,
+        duration_s=time.perf_counter() - t0,
+        summary=summary,
+        metrics=metrics,
+    )
+
+
+def scenario_stress_write_generate(module) -> tuple[str, str, dict]:
+    """Alternate full-corpus writes with one generation per prompt for two rounds.
+
+    A failed check raises AssertionError; otherwise PASS with the mean gate value."""
+    rounds = 2
+    model = build_model(module)
+    gates: list[float] = []
+    latest_outputs: dict[str, str] = {}
+    for _round in range(rounds):
+        for name in ("music", "space", "finance", "cooking"):
+            gates += stable_write(model, CORPORA[name])
+        for name, prompt in PROMPTS.items():
+            text = model.generate(prompt, mt=20, greedy=True)
+            ensure(isinstance(text, str), f"generate() did not return a string for {name}")
+            ensure(len(text) > len(prompt), f"generate() did not extend prompt for {name}")
+            continuation(prompt, text)
+            latest_outputs[name] = text  # later rounds overwrite earlier samples
+    metrics = {
+        "rounds": rounds,
+        "total_writes": rounds * sum(map(len, CORPORA.values())),
+        "total_generations": rounds * len(PROMPTS),
+        "avg_gate": round(mean_gate(gates), 6),
+        "sample_outputs": latest_outputs,
+    }
+    return "PASS", "Completed repeated write/generate pressure loop without crash.", metrics
+
+
+def scenario_stress_save_load_cycles(module) -> tuple[str, str, dict]:
+    """Save memory, reload it into a fresh model and generate, twice in a row.
+
+    Each later cycle saves from the previously reloaded model; the temp file is always removed.
+    """
+    cycles = 2
+    source = build_model(module)
+    gates: list[float] = []
+    for name in ("music", "space", "finance", "cooking"):
+        gates += stable_write(source, CORPORA[name])
+
+    handle, memory_path = tempfile.mkstemp(prefix="agent-memory-round2-", suffix=".pt")
+    os.close(handle)
+    per_cycle: list[dict[str, str]] = []
+    try:
+        for _cycle in range(cycles):
+            source.save_memory(memory_path)
+            ensure(os.path.getsize(memory_path) > 0, "save_memory() produced an empty file")
+            fresh = build_model(module)
+            fresh.load_memory(memory_path)
+            generated = {}
+            for name, prompt in PROMPTS.items():
+                text = fresh.generate(prompt, mt=20, greedy=True)
+                continuation(prompt, text)
+                generated[name] = text
+            per_cycle.append(generated)
+            source = fresh  # the next cycle saves what was just reloaded
+    finally:
+        if os.path.exists(memory_path):
+            os.remove(memory_path)
+
+    metrics = {
+        "cycles": cycles,
+        "avg_gate": round(mean_gate(gates), 6),
+        "cycle_outputs": per_cycle,
+    }
+    return "PASS", "Repeated save/load cycles preserved externally valid generation behavior.", metrics
+
+
+def scenario_long_memory_write(module) -> tuple[str, str, dict]:
+    """Write one very long music text, then require music keywords in the generated continuation.
+
+    The text is CORPORA["music"] repeated 20 times, space-joined; failed checks raise AssertionError."""
+    model = build_model(module)
+    long_text = " ".join(CORPORA["music"] * 20)
+    stored, gates = model.write(long_text, training_mode=True)
+    ensure(stored == 1, f"Expected one stored memory for long text, got {stored}")
+    ensure(len(gates) == 1 and math.isfinite(gates[0]), f"Unexpected gate values: {gates}")
+
+    prompt = PROMPTS["music"]
+    output = model.generate(prompt, mt=25, greedy=True)
+    tail = continuation(prompt, output)
+    found = keyword_hits(tail, KEYWORDS["music"])
+    ensure(found, f"Long-memory write did not yield music-domain hits: {tail!r}")
+    metrics = {
+        "long_text_chars": len(long_text),
+        "gate": round(gates[0], 6),
+        "output": output,
+        "keyword_hits": found,
+    }
+    return "PASS", "Long memory write remained usable for downstream generation.", metrics
+
+
+def scenario_long_prompt_resilience(module) -> tuple[str, str, dict]:
+    """Generate from a very long prompt: a generate() error is WARN, a lost prompt prefix is FAIL."""
+    model = build_model(module)
+    prompt = ("The piano performance review discussed harmony and rhythm in detail. " * 60).strip()
+    try:
+        # Only generate() belongs in the try: a failed prefix check must not be reported as a generation error.
+        output = model.generate(prompt, mt=10, greedy=True)
+    except Exception as exc:
+        return (
+            "WARN",
+            "Long prompt generation raised an externally visible error.",
+            {
+                "prompt_chars": len(prompt),
+                "error_type": type(exc).__name__,
+                "error_message": str(exc),
+            },
+        )
+
+    continuation(prompt, output)  # an AssertionError here propagates to run_scenario() and is reported as FAIL
+    return (
+        "PASS",
+        "Long prompt generation completed without crashing.",
+        {"prompt_chars": len(prompt), "output_chars": len(output)},
+    )
+
+
+def scenario_cross_domain_dual(module) -> tuple[str, str, dict]:
+    """Write music and space memories together, then look for keyword bleed between them.
+
+    WARN when either continuation contains the other domain's keywords or lacks
+    its own; PASS otherwise.
+    """
+    model = build_model(module)
+    stable_write(model, CORPORA["music"] + CORPORA["space"])
+
+    outputs = {name: model.generate(PROMPTS[name], mt=20, greedy=True) for name in ("music", "space")}
+    tails = {name: continuation(PROMPTS[name], text) for name, text in outputs.items()}
+
+    # Score each continuation against its own and against the other domain's keyword set.
+    other = {"music": "space", "space": "music"}
+    own = {name: keyword_hits(tails[name], KEYWORDS[name]) for name in tails}
+    foreign = {name: keyword_hits(tails[name], KEYWORDS[other[name]]) for name in tails}
+
+    # Keyword findings are diagnostic: they downgrade the status to WARN, not FAIL.
+    if any(foreign.values()) or not all(own.values()):
+        status = "WARN"
+        summary = "Dual-domain run showed contamination or weak own-domain separation."
+    else:
+        status = "PASS"
+        summary = "Dual-domain prompts preserved own-domain signal without obvious contamination."
+
+    metrics = {
+        "music_output": outputs["music"],
+        "space_output": outputs["space"],
+        "music_own_hits": own["music"],
+        "music_foreign_hits": foreign["music"],
+        "space_own_hits": own["space"],
+        "space_foreign_hits": foreign["space"],
+    }
+    return status, summary, metrics
+
+
+def scenario_cross_domain_fourway(module) -> tuple[str, str, dict]:
+    """Write all four corpora, then score every prompt against every keyword set.
+
+    The matrix maps prompt domain -> keyword domain -> hits. WARN when a prompt
+    has no own-domain hits or any foreign-domain hits; PASS otherwise.
+    """
+    model = build_model(module)
+    for name in ("music", "space", "finance", "cooking"):
+        stable_write(model, CORPORA[name])
+
+    matrix: dict[str, dict[str, list[str]]] = {}
+    for name, prompt in PROMPTS.items():
+        output = model.generate(prompt, mt=20, greedy=True)
+        tail = continuation(prompt, output)
+        matrix[name] = {kw_domain: keyword_hits(tail, terms) for kw_domain, terms in KEYWORDS.items()}
+
+    # A row is suspicious when its own column is empty or any other column is not.
+    def is_suspicious(name: str) -> bool:
+        row = matrix[name]
+        return not row[name] or any(hits for kw_domain, hits in row.items() if kw_domain != name)
+
+    if any(is_suspicious(name) for name in matrix):
+        status = "WARN"
+        summary = "Four-way contamination matrix detected cross-domain bleed or weak own-domain signal."
+    else:
+        status = "PASS"
+        summary = "Four-way contamination matrix looked clean."
+    return status, summary, {"matrix": matrix}
+
+
+def scenario_stability_fresh_instance(module) -> tuple[str, str, dict]:
+    """Build two identically seeded models and require identical greedy output.
+
+    Each model receives the music corpus before generating; a mismatch raises
+    AssertionError, which run_scenario() reports as FAIL.
+    """
+    prompt = PROMPTS["music"]
+    outputs = []
+    for _attempt in range(2):
+        # Same seed, same writes, separate instance.
+        model = build_model(module, seed=DEFAULT_SEED)
+        stable_write(model, CORPORA["music"])
+        outputs.append(model.generate(prompt, mt=20, greedy=True))
+
+    first, second = outputs
+    ensure(first == second, "Fresh seeded instances produced different greedy outputs")
+    summary = "Fresh seeded instances were exactly deterministic under greedy generation."
+    return "PASS", summary, {"prompt": prompt, "output": first}
+
+
+def scenario_stability_same_instance(module) -> tuple[str, str, dict]:
+    """Run the same greedy generation three times on one model; WARN if the outputs differ."""
+    model = build_model(module)
+    stable_write(model, CORPORA["music"])
+    outputs = []
+    for _attempt in range(3):
+        outputs.append(model.generate(PROMPTS["music"], mt=20, greedy=True))
+
+    metrics = {"outputs": outputs}
+    if all(text == outputs[0] for text in outputs[1:]):
+        return "PASS", "Repeated greedy calls on the same instance were identical.", metrics
+    return "WARN", "Repeated greedy calls drifted on the same instance.", metrics
+
+
+def scenario_stability_roundtrip_exactness(module) -> tuple[str, str, dict]:
+    """Require identical greedy output before and after a save_memory()/load_memory() round-trip.
+
+    The memory is reloaded into a freshly built model; the temp file is removed
+    even when saving, loading or generating raises.
+    """
+    prompt = PROMPTS["music"]
+    original = build_model(module)
+    stable_write(original, CORPORA["music"])
+    before = original.generate(prompt, mt=20, greedy=True)
+
+    handle, memory_path = tempfile.mkstemp(prefix="agent-memory-round2-exact-", suffix=".pt")
+    os.close(handle)
+    try:
+        original.save_memory(memory_path)
+        restored = build_model(module)
+        restored.load_memory(memory_path)
+        after = restored.generate(prompt, mt=20, greedy=True)
+    finally:
+        if os.path.exists(memory_path):
+            os.remove(memory_path)
+
+    ensure(before == after, "Greedy output changed after save/load roundtrip")
+    return "PASS", "Greedy output stayed exactly stable across save/load roundtrip.", {"output": before}
+
+
+SCENARIOS = [  # scenario registry; select_scenarios() preserves this order, so it is also the run/report order
+    ScenarioDef("R2-STRESS-01", "stress", "repeated write/generate pressure", "representative", scenario_stress_write_generate),
+    ScenarioDef("R2-STRESS-02", "stress", "repeated save/load pressure", "full", scenario_stress_save_load_cycles),
+    ScenarioDef("R2-LONG-01", "long-text", "long memory write grounding", "representative", scenario_long_memory_write),
+    ScenarioDef("R2-LONG-02", "long-text", "long prompt resilience", "full", scenario_long_prompt_resilience),
+    ScenarioDef("R2-CROSS-01", "cross-domain", "dual-domain contamination diagnostic", "representative", scenario_cross_domain_dual),
+    ScenarioDef("R2-CROSS-02", "cross-domain", "four-domain contamination matrix", "full", scenario_cross_domain_fourway),
+    ScenarioDef("R2-STABLE-01", "stability", "fresh instance determinism", "representative", scenario_stability_fresh_instance),
+    ScenarioDef("R2-STABLE-02", "stability", "same instance repeatability", "full", scenario_stability_same_instance),
+    ScenarioDef("R2-STABLE-03", "stability", "save/load exactness", "full", scenario_stability_roundtrip_exactness),
+]
+
+
+def select_scenarios(suite: str, scenario_ids: list[str] | None) -> list[ScenarioDef]:
+    """Return SCENARIOS entries, in registry order, that match the suite and the optional ID filter.
+
+    "representative" keeps only representative entries; any other suite also keeps "full" ones; falsy scenario_ids disables the ID filter.
+    """
+    wanted_ids = frozenset(scenario_ids or ())
+    suites = ("representative",) if suite == "representative" else ("representative", "full")
+    return [
+        entry for entry in SCENARIOS
+        if entry.suite in suites and (not wanted_ids or entry.scenario_id in wanted_ids)
+    ]
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Second-round AgentMemorySystem black-box test matrix runner")
+    parser.add_argument("--suite", choices=("representative", "full"), default="representative")
+    parser.add_argument("--scenario", action="append", help="Optional scenario ID filter; may be supplied multiple times")
+    parser.add_argument("--json-out", help="Optional path to write JSON results")
+    args = parser.parse_args()
+
+    ensure(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")  # raised outside run_scenario(), so it surfaces as an uncaught AssertionError
+    module = load_target_module(TARGET_PATH)
+    scenarios = select_scenarios(args.suite, args.scenario)  # the suite filter still applies to explicitly requested IDs
+    ensure(scenarios, "No scenarios selected")
+
+    print("Second-round black-box matrix runner: AgentMemorySystem")
+    print(f"Target file: {TARGET_PATH}")
+    print(f"Suite: {args.suite}")
+    print(f"Python: {platform.python_version()}")
+    print(f"Torch: {torch.__version__}")
+    print(f"Transformers: {transformers.__version__}")
+    print("")
+
+    started = time.perf_counter()
+    results = [run_scenario(module, scenario) for scenario in scenarios]  # sequential; run_scenario() turns scenario exceptions into FAIL results
+    total_duration = time.perf_counter() - started
+
+    pass_count = sum(1 for result in results if result.status == "PASS")
+    warn_count = sum(1 for result in results if result.status == "WARN")
+    fail_count = sum(1 for result in results if result.status == "FAIL")
+
+    print("=" * 80)
+    for result in results:
+        print(
+            f"[{result.status}] {result.scenario_id} | {result.category} | "
+            f"{result.title} ({result.duration_s:.2f}s)"
+        )
+        print(result.summary)
+        if result.metrics:  # results with empty metrics print no JSON block
+            print(json.dumps(result.metrics, indent=2, ensure_ascii=False, sort_keys=True))
+        print("-" * 80)
+    print(f"Summary: PASS={pass_count}, WARN={warn_count}, FAIL={fail_count}")
+    print(f"Total duration: {total_duration:.2f}s")
+
+    if args.json_out:
+        payload = {
+            "target_path": TARGET_PATH,
+            "suite": args.suite,
+            "python": platform.python_version(),
+            "torch": torch.__version__,
+            "transformers": transformers.__version__,
+            "total_duration_s": total_duration,
+            "results": [asdict(result) for result in results],
+        }
+        with open(args.json_out, "w", encoding="utf-8") as handle:
+            json.dump(payload, handle, indent=2, ensure_ascii=False, sort_keys=True)
+
+    return 1 if fail_count else 0  # WARN results do not affect the exit status
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # exit status 1 when any scenario is FAIL, otherwise 0
diff --git a/blackbox_test_agent_memory_round3.py b/blackbox_test_agent_memory_round3.py
new file mode 100644
index 0000000..6192ee1
--- /dev/null
+++ b/blackbox_test_agent_memory_round3.py
@@ -0,0 +1,405 @@
+#!/usr/bin/env python3
+"""Third-round black-box runner for AgentMemorySystem.
+
+Focus areas:
+- boundary inputs
+- abnormal/exception-facing inputs
+- performance and latency baselines
+
+The target implementation is still treated as an opaque component. The runner
+only uses the public runtime behavior exposed by:
+
+- MemLLM.load()
+- MemLLM.write()
+- MemLLM.generate()
+- MemLLM.save_memory()
+- MemLLM.load_memory()
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import json
+import math
+import os
+import platform
+import statistics
+import time
+from dataclasses import asdict, dataclass
+from importlib.machinery import SourceFileLoader
+from typing import Callable
+
+import torch
+import transformers
+
+
+TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"  # machine-specific absolute path; load_target_module() executes it as Python source despite the .md suffix
+MODEL_NAME = "gpt2"  # handed to MemLLM.load() in build_model()
+DEFAULT_SEED = 42  # default torch.manual_seed() value in build_model()
+
+MUSIC_PROMPT = "The piano performance"  # prompt used by the generate and save/load latency scenarios
+MUSIC_TEXTS = [  # memories written by stable_music_seed() and timed in scenario_perf_write_latency()
+    "He practiced piano for hours perfecting a difficult Chopin nocturne.",
+    "She studied music theory and harmonic progression at the conservatory.",
+    "The orchestra rehearsed the symphony before the evening concert.",
+]
+
+
+@dataclass
+class ScenarioDef:
+    scenario_id: str  # stable identifier such as "R3-BOUND-01"; matched by the --scenario filter
+    category: str  # grouping label shown in the report, e.g. "boundary-input"
+    title: str  # short human-readable description
+    suite: str  # "representative" or "full"; consulted by select_scenarios()
+    runner: Callable[[object], tuple[str, str, dict]]  # called with the loaded target module; returns (status, summary, metrics)
+
+
+@dataclass
+class ScenarioResult:
+    scenario_id: str  # copied from the ScenarioDef that produced this result
+    category: str  # copied from the ScenarioDef
+    title: str  # copied from the ScenarioDef
+    suite: str  # copied from the ScenarioDef
+    status: str  # "PASS", "WARN" or "FAIL" (the three values main() counts)
+    duration_s: float  # seconds measured with time.perf_counter() in run_scenario()
+    summary: str  # one-line outcome text; the exception text for results built from a raised error
+    metrics: dict  # scenario-specific details; {} when the runner raised
+
+
+def load_target_module(path: str):
+    import sys  # function-scope import: only the sys.modules registration below needs it
+    spec = importlib.util.spec_from_loader("agent_memory_system_round3", SourceFileLoader("agent_memory_system_round3", path))
+    if spec is None:
+        raise RuntimeError(f"Unable to create import spec for {path}")
+    module = sys.modules[spec.name] = importlib.util.module_from_spec(spec)  # register before executing: dataclasses and pickle resolve cls.__module__ through sys.modules
+    spec.loader.exec_module(module)  # spec.loader is the SourceFileLoader built above, so the file extension does not matter
+    return module
+
+
+def ensure(condition: bool, message: str) -> None:
+    if not condition:  # any falsy value (False, an empty list, ...) counts as a failed check
+        raise AssertionError(message)  # run_scenario() reports AssertionError as FAIL with this message
+
+
+def build_model(module, seed: int = DEFAULT_SEED):
+    """Seed torch with `seed`, then return module.MemLLM(module.Cfg()) after load(MODEL_NAME)."""
+    torch.manual_seed(seed)
+    model = module.MemLLM(module.Cfg())
+    model.load(MODEL_NAME)
+    return model
+
+
+def continuation(prompt: str, output: str) -> str:
+    ensure(output.startswith(prompt), f"Output does not preserve prompt prefix: {output!r}")  # the output must echo the prompt verbatim first
+    return output[len(prompt) :]  # only the text that follows the prompt
+
+
+def stable_music_seed(model) -> list[float]:
+    gates: list[float] = []  # one gate value per MUSIC_TEXTS entry, in order
+    for text in MUSIC_TEXTS:
+        stored, gate_vals = model.write(text, training_mode=True)  # unpacked as (stored_count, gate_values)
+        ensure(stored == 1, f"Expected one stored memory, got {stored}")
+        ensure(len(gate_vals) == 1, f"Expected one gate value, got {gate_vals}")
+        ensure(math.isfinite(gate_vals[0]), f"Gate is not finite: {gate_vals[0]}")  # rejects NaN and +/-inf
+        gates.extend(gate_vals)
+    return gates
+
+
+def run_scenario(module, scenario: ScenarioDef) -> ScenarioResult:
+    """Run one scenario, time it, and turn any exception into a FAIL result.
+
+    An AssertionError (failed ensure()) is summarised by its message alone; any
+    other Exception as "<TypeName>: <message>". FAIL results built here carry
+    empty metrics.
+    """
+    t0 = time.perf_counter()
+    try:
+        status, summary, metrics = scenario.runner(module)
+    except Exception as exc:  # pragma: no cover
+        status, metrics = "FAIL", {}
+        summary = str(exc) if isinstance(exc, AssertionError) else f"{type(exc).__name__}: {exc}"
+    return ScenarioResult(
+        scenario_id=scenario.scenario_id,
+        category=scenario.category,
+        title=scenario.title,
+        suite=scenario.suite,
+        status=status,
+        duration_s=time.perf_counter() - t0,
+        summary=summary,
+        metrics=metrics,
+    )
+
+
+def scenario_boundary_empty_prompt(module) -> tuple[str, str, dict]:
+    """Generate from "" without writing any memory first and require a non-empty string back."""
+    output = build_model(module).generate("", mt=10, greedy=True)
+    ensure(isinstance(output, str), "generate() did not return a string")
+    ensure(len(output) > 0, "Empty prompt generation returned an empty string")
+    return "PASS", "Empty prompt generation returned a non-empty string.", {"output": output}
+
+
+def scenario_boundary_single_char_prompt(module) -> tuple[str, str, dict]:
+    """Generate from the one-character prompt "A" and require the output to keep it as a prefix."""
+    prompt = "A"
+    output = build_model(module).generate(prompt, mt=12, greedy=True)
+    continuation(prompt, output)
+    return "PASS", "Single-character prompt generation remained valid.", {"output": output}
+
+
+def scenario_boundary_whitespace_prompt(module) -> tuple[str, str, dict]:
+    """Generate from a three-space prompt; the output must be a string at least as long as the prompt."""
+    prompt = "   "
+    output = build_model(module).generate(prompt, mt=10, greedy=True)
+    ensure(isinstance(output, str), "generate() did not return a string")
+    ensure(len(output) >= len(prompt), "Whitespace prompt output was shorter than prompt")
+    return "PASS", "Whitespace prompt generation completed.", {"output": output}
+
+
+def scenario_boundary_newline_prompt(module) -> tuple[str, str, dict]:
+    """Generate from a two-line prompt and require the output to keep it, newline included, as a prefix."""
+    prompt = "Line one.\nLine two."
+    output = build_model(module).generate(prompt, mt=12, greedy=True)
+    continuation(prompt, output)
+    return "PASS", "Multi-line prompt generation completed.", {"output": output}
+
+
+def scenario_abnormal_none_write(module) -> tuple[str, str, dict]:
+    """Call write(None, training_mode=True): any raised Exception is PASS, silent success is WARN."""
+    model = build_model(module)
+    try:
+        model.write(None, training_mode=True)  # type: ignore[arg-type]
+    except Exception as exc:
+        # The error type and text become the scenario metrics.
+        failure = {"error_type": type(exc).__name__, "error_message": str(exc)}
+    else:
+        return "WARN", "write(None, ...) unexpectedly succeeded.", {}
+    return "PASS", "write(None, ...) raised an externally visible exception as expected.", failure
+
+
+def scenario_abnormal_none_generate(module) -> tuple[str, str, dict]:
+    """Call generate(None, mt=10, greedy=True): any raised Exception is PASS, silent success is WARN."""
+    model = build_model(module)
+    try:
+        model.generate(None, mt=10, greedy=True)  # type: ignore[arg-type]
+    except Exception as exc:
+        # The error type and text become the scenario metrics.
+        failure = {"error_type": type(exc).__name__, "error_message": str(exc)}
+    else:
+        return "WARN", "generate(None, ...) unexpectedly succeeded.", {}
+    return "PASS", "generate(None, ...) raised an externally visible exception as expected.", failure
+
+
+def scenario_abnormal_negative_mt(module) -> tuple[str, str, dict]:
+    """Probe generate() with mt=-5: rejecting it with an exception is PASS, accepting it silently is WARN."""
+    model = build_model(module)
+    prompt = "Hello"
+    try:
+        output = model.generate(prompt, mt=-5, greedy=True)
+    except Exception as exc:  # explicit validation is the desired outcome, as in the other abnormal-input scenarios
+        return (
+            "PASS",
+            "Negative mt raised an externally visible exception as expected.",
+            {"error_type": type(exc).__name__, "error_message": str(exc)},
+        )
+    if output == prompt:
+        return "WARN", "Negative mt returned the original prompt without explicit validation.", {"output": output}
+    return "WARN", "Negative mt did not raise and produced a nonstandard output.", {"output": output}
+
+
+def scenario_abnormal_invalid_load_memory(module) -> tuple[str, str, dict]:
+    """Check that load_memory() raises for a path that is guaranteed not to exist."""
+    import tempfile  # function-scope import, as in scenario_perf_save_load_latency
+    model = build_model(module)
+    with tempfile.TemporaryDirectory(prefix="agent-memory-round3-") as scratch:
+        missing_path = os.path.join(scratch, "agent-memory-nonexistent-file.pt")  # fresh directory, so the file cannot exist
+        try:
+            model.load_memory(missing_path)
+        except Exception as exc:
+            details = {"error_type": type(exc).__name__, "error_message": str(exc)}
+            return "PASS", "load_memory() on a missing path raised an externally visible exception.", details
+    return "WARN", "load_memory() on a missing path unexpectedly succeeded.", {}
+
+
+def scenario_perf_cold_load_baseline(module) -> tuple[str, str, dict]:
+    """Time a single build_model() call and report it, rounded to three decimals, as cold_load_s.
+
+    NOTE(review): nothing here clears any cache, so how "cold" the load is depends on host state.
+    """
+    t0 = time.perf_counter()
+    model = build_model(module)
+    cold_load_s = time.perf_counter() - t0
+    ensure(model is not None, "Model failed to initialize")
+    return "PASS", "Cold load latency baseline recorded.", {"cold_load_s": round(cold_load_s, 3)}
+
+
+def scenario_perf_write_latency(module) -> tuple[str, str, dict]:
+    """Time each MUSIC_TEXTS write individually and report count/avg/max/min in seconds.
+
+    Only the write() call is timed; the result checks run after the clock stops.
+    """
+    model = build_model(module)
+    durations: list[float] = []
+    for text in MUSIC_TEXTS:
+        t0 = time.perf_counter()
+        stored, gates = model.write(text, training_mode=True)
+        durations.append(time.perf_counter() - t0)
+        ensure(stored == 1, f"Expected one stored memory, got {stored}")
+        ensure(len(gates) == 1 and math.isfinite(gates[0]), f"Unexpected gate values: {gates}")
+    metrics = {
+        "write_count": len(durations),
+        "avg_write_s": round(statistics.mean(durations), 3),
+        "max_write_s": round(max(durations), 3),
+        "min_write_s": round(min(durations), 3),
+    }
+    return "PASS", "Write latency baseline recorded.", metrics
+
+
+def scenario_perf_generate_latency(module) -> tuple[str, str, dict]:
+    """Time three greedy generations on a music-seeded model and report avg/max/min seconds.
+
+    Each output must keep MUSIC_PROMPT as its prefix; the first output is returned as a sample.
+    """
+    model = build_model(module)
+    stable_music_seed(model)
+    durations: list[float] = []
+    samples: list[str] = []
+    for _attempt in range(3):
+        t0 = time.perf_counter()
+        text = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
+        durations.append(time.perf_counter() - t0)
+        continuation(MUSIC_PROMPT, text)
+        samples.append(text)
+    metrics = {
+        "generate_count": len(durations),
+        "avg_generate_s": round(statistics.mean(durations), 3),
+        "max_generate_s": round(max(durations), 3),
+        "min_generate_s": round(min(durations), 3),
+        "sample_output": samples[0],
+    }
+    return "PASS", "Generate latency baseline recorded.", metrics
+
+
+def scenario_perf_save_load_latency(module) -> tuple[str, str, dict]:
+    """Time save_memory() and load_memory() separately around a temp-file round-trip.
+
+    The reloaded model must still generate with MUSIC_PROMPT as prefix. The temp
+    file is removed even when a step raises.
+    """
+    import tempfile
+
+    writer = build_model(module)
+    stable_music_seed(writer)
+    handle, memory_path = tempfile.mkstemp(prefix="agent-memory-round3-", suffix=".pt")
+    os.close(handle)
+    try:
+        # Timed window 1: the save call only.
+        t0 = time.perf_counter()
+        writer.save_memory(memory_path)
+        save_s = time.perf_counter() - t0
+        ensure(os.path.getsize(memory_path) > 0, "save_memory() produced an empty file")
+
+        # Timed window 2: the load call only; building the second model is excluded.
+        reader = build_model(module)
+        t0 = time.perf_counter()
+        reader.load_memory(memory_path)
+        load_s = time.perf_counter() - t0
+
+        sample = reader.generate(MUSIC_PROMPT, mt=20, greedy=True)
+        continuation(MUSIC_PROMPT, sample)
+    finally:
+        if os.path.exists(memory_path):
+            os.remove(memory_path)
+
+    metrics = {"save_s": round(save_s, 3), "load_s": round(load_s, 3), "sample_output": sample}
+    return "PASS", "Save/load latency baseline recorded.", metrics
+
+
+SCENARIOS = [  # scenario registry; select_scenarios() preserves this order, so it is also the run/report order
+    ScenarioDef("R3-BOUND-01", "boundary-input", "empty prompt generation", "representative", scenario_boundary_empty_prompt),
+    ScenarioDef("R3-BOUND-02", "boundary-input", "single-character prompt generation", "full", scenario_boundary_single_char_prompt),
+    ScenarioDef("R3-BOUND-03", "boundary-input", "whitespace prompt generation", "full", scenario_boundary_whitespace_prompt),
+    ScenarioDef("R3-BOUND-04", "boundary-input", "multiline prompt generation", "full", scenario_boundary_newline_prompt),
+    ScenarioDef("R3-ABN-01", "abnormal-input", "write None input", "representative", scenario_abnormal_none_write),
+    ScenarioDef("R3-ABN-02", "abnormal-input", "generate None input", "full", scenario_abnormal_none_generate),
+    ScenarioDef("R3-ABN-03", "abnormal-input", "negative max tokens handling", "full", scenario_abnormal_negative_mt),
+    ScenarioDef("R3-ABN-04", "abnormal-input", "load missing memory file", "full", scenario_abnormal_invalid_load_memory),
+    ScenarioDef("R3-PERF-01", "performance", "cold load latency baseline", "representative", scenario_perf_cold_load_baseline),
+    ScenarioDef("R3-PERF-02", "performance", "write latency baseline", "full", scenario_perf_write_latency),
+    ScenarioDef("R3-PERF-03", "performance", "generate latency baseline", "representative", scenario_perf_generate_latency),
+    ScenarioDef("R3-PERF-04", "performance", "save/load latency baseline", "full", scenario_perf_save_load_latency),
+]
+
+
+def select_scenarios(suite: str, scenario_ids: list[str] | None) -> list[ScenarioDef]:
+    """Return SCENARIOS entries, in registry order, that match the suite and the optional ID filter.
+
+    "representative" keeps only representative entries; any other suite also keeps "full" ones; falsy scenario_ids disables the ID filter.
+    """
+    suites = ("representative",) if suite == "representative" else ("representative", "full")
+    wanted_ids = frozenset(scenario_ids or ())
+    return [
+        entry for entry in SCENARIOS
+        if entry.suite in suites and (not wanted_ids or entry.scenario_id in wanted_ids)
+    ]
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Third-round AgentMemorySystem black-box runner")
+    parser.add_argument("--suite", choices=("representative", "full"), default="representative")
+    parser.add_argument("--scenario", action="append", help="Optional scenario ID filter; may be supplied multiple times")
+    parser.add_argument("--json-out", help="Optional path to write JSON results")
+    args = parser.parse_args()
+
+    ensure(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")  # raised outside run_scenario(), so it surfaces as an uncaught AssertionError
+    module = load_target_module(TARGET_PATH)
+    scenarios = select_scenarios(args.suite, args.scenario)  # the suite filter still applies to explicitly requested IDs
+    ensure(scenarios, "No scenarios selected")
+
+    print("Third-round black-box runner: AgentMemorySystem")
+    print(f"Target file: {TARGET_PATH}")
+    print(f"Suite: {args.suite}")
+    print(f"Python: {platform.python_version()}")
+    print(f"Torch: {torch.__version__}")
+    print(f"Transformers: {transformers.__version__}")
+    print("")
+
+    started = time.perf_counter()
+    results = [run_scenario(module, scenario) for scenario in scenarios]  # sequential; run_scenario() turns scenario exceptions into FAIL results
+    total_duration = time.perf_counter() - started
+
+    pass_count = sum(1 for result in results if result.status == "PASS")
+    warn_count = sum(1 for result in results if result.status == "WARN")
+    fail_count = sum(1 for result in results if result.status == "FAIL")
+
+    print("=" * 80)
+    for result in results:
+        print(
+            f"[{result.status}] {result.scenario_id} | {result.category} | "
+            f"{result.title} ({result.duration_s:.2f}s)"
+        )
+        print(result.summary)
+        if result.metrics:  # results with empty metrics print no JSON block
+            print(json.dumps(result.metrics, indent=2, ensure_ascii=False, sort_keys=True))
+        print("-" * 80)
+    print(f"Summary: PASS={pass_count}, WARN={warn_count}, FAIL={fail_count}")
+    print(f"Total duration: {total_duration:.2f}s")
+
+    if args.json_out:
+        payload = {
+            "target_path": TARGET_PATH,
+            "suite": args.suite,
+            "python": platform.python_version(),
+            "torch": torch.__version__,
+            "transformers": transformers.__version__,
+            "total_duration_s": total_duration,
+            "results": [asdict(result) for result in results],
+        }
+        with open(args.json_out, "w", encoding="utf-8") as handle:
+            json.dump(payload, handle, indent=2, ensure_ascii=False, sort_keys=True)
+
+    return 1 if fail_count else 0  # WARN results do not affect the exit status
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # exit status 1 when any scenario is FAIL, otherwise 0
diff --git a/blackbox_test_agent_memory_system.py b/blackbox_test_agent_memory_system.py
new file mode 100644
index 0000000..d8671d9
--- /dev/null
+++ b/blackbox_test_agent_memory_system.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""Black-box test runner for the uploaded AgentMemorySystem implementation.
+
+This runner intentionally treats the uploaded code as an opaque component and
+only interacts with its public runtime behavior:
+
+- MemLLM.load()
+- MemLLM.write()
+- MemLLM.generate()
+- MemLLM.save_memory()
+- MemLLM.load_memory()
+
+It does not call private helpers, does not inspect internal memory state, and
+does not use mocks.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import math
+import os
+import platform
+import tempfile
+import time
+from dataclasses import dataclass
+from importlib.machinery import SourceFileLoader
+
+import torch
+import transformers
+
+
+TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"  # Implementation under test; executed as Python source by load_target_module().
+MODEL_NAME = "gpt2"  # Name handed to MemLLM.load() in build_model().
+MUSIC_PROMPT = "The piano performance"  # Prompt reused for the baseline, post-memory and post-reload generations.
+MUSIC_MEMORIES = [  # Texts written through model.write() in TC-04.
+    "He practiced piano for hours perfecting a difficult Chopin nocturne.",
+    "She studied music theory and harmonic progression at the conservatory.",
+    "The orchestra rehearsed the symphony before the evening concert.",
+]
+MUSIC_KEYWORDS = {  # Lowercase substrings counted as music-domain hits by keyword_hits().
+    "music",
+    "musical",
+    "violin",
+    "concert",
+    "symphony",
+    "guitar",
+    "practice",
+    "practicing",
+}
+
+
+@dataclass
+class CaseResult:  # Outcome record for a single test case, as produced by run_case().
+    case_id: str  # Case identifier, e.g. "TC-01".
+    title: str  # Short human-readable case name printed in the report.
+    passed: bool  # True when the case function returned without raising.
+    duration_s: float  # Elapsed seconds measured with time.perf_counter().
+    details: str  # Text returned by the case, or "ExceptionType: message" on failure.
+
+
+def load_target_module(path: str):  # Execute the file at `path` as Python source and return the resulting module object.
+    loader = SourceFileLoader("agent_memory_system_under_test", path)  # Loader is chosen explicitly rather than inferred from the path.
+    spec = importlib.util.spec_from_loader(loader.name, loader)
+    if spec is None:
+        raise RuntimeError(f"Unable to create import spec for {path}")
+    module = importlib.util.module_from_spec(spec)
+    loader.exec_module(module)  # NOTE(review): the module is never added to sys.modules; confirm the target does not rely on being registered.
+    return module
+
+
+def keyword_hits(text: str, keywords: set[str]) -> list[str]:  # Return, in sorted order, the keywords that occur in `text`.
+    lowered = text.lower()  # Only `text` is lowercased; keywords are compared exactly as given.
+    return sorted(keyword for keyword in keywords if keyword in lowered)  # Substring test, not whole-word: "concert" also matches "concerts".
+
+
+def check(condition: bool, message: str) -> None:  # Raise AssertionError(message) when `condition` is falsy; otherwise do nothing.
+    if not condition:  # Truthiness test: callers in main() also pass lists, where an empty list fails.
+        raise AssertionError(message)  # Explicit raise rather than an `assert` statement.
+
+
+def build_model(module):  # Build a MemLLM from the loaded target module and load MODEL_NAME into it.
+    cfg = module.Cfg()  # Configuration is constructed with no arguments.
+    model = module.MemLLM(cfg)
+    model.load(MODEL_NAME)  # Return value of load() is ignored; the model object itself is returned.
+    return model
+
+
+def run_case(case_id: str, title: str, fn) -> CaseResult:  # Time fn() and wrap its outcome in a CaseResult; any Exception is captured, not propagated.
+    start = time.perf_counter()
+    try:
+        details = fn()  # A normal return counts as a pass; the returned value becomes `details`.
+        passed = True
+    except Exception as exc:  # pragma: no cover - failure path is test output
+        details = f"{type(exc).__name__}: {exc}"
+        passed = False
+    duration_s = time.perf_counter() - start  # Measured on both the success and the failure path.
+    return CaseResult(case_id, title, passed, duration_s, details)
+
+
+def main() -> int:  # Run TC-01..TC-06 in order against one shared model, print a report, and return 0 if all passed, else 1.
+    overall_start = time.perf_counter()  # Total duration therefore includes module loading and model construction.
+    print("Black-box test runner: AgentMemorySystem")
+    print(f"Target file: {TARGET_PATH}")
+    print(f"Python: {platform.python_version()}")
+    print(f"Torch: {torch.__version__}")
+    print(f"Transformers: {transformers.__version__}")
+    print("")
+
+    check(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")  # Setup here runs outside run_case(), so a failure raises out of main() instead of being reported as a case.
+    module = load_target_module(TARGET_PATH)
+    torch.manual_seed(42)  # Seed is set before the model is constructed.
+    model = build_model(module)
+    results: list[CaseResult] = []
+    state: dict[str, object] = {}  # Carries outputs and keyword hits from TC-03 and TC-04 over to TC-05.
+
+    def tc01_load_public_api() -> str:
+        check(model is not None, "MemLLM.load() did not produce a model instance")  # NOTE(review): build_model() has already returned by now, so this check looks unable to fail — confirm intent.
+        return f"Loaded public model API successfully with {MODEL_NAME}"
+
+    def tc02_generate_without_memory() -> str:  # Generation on the shared model before any write() call.
+        output = model.generate("Hello", mt=15, greedy=True)
+        check(isinstance(output, str), "generate() did not return a string")
+        check(output.startswith("Hello"), "Generated text does not preserve the prompt prefix")
+        check(len(output) > len("Hello"), "Generated text did not extend the prompt")
+        return f"Output: {output!r}"
+
+    def tc03_baseline_music_prompt_before_memory() -> str:  # Records the pre-memory baseline; only the prompt prefix is asserted.
+        output = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
+        continuation = output[len(MUSIC_PROMPT) :]  # Text after the prompt, so keywords in the prompt itself are not counted.
+        hits = keyword_hits(continuation, MUSIC_KEYWORDS)
+        state["baseline_music_output"] = output
+        state["baseline_music_hits"] = hits
+        check(output.startswith(MUSIC_PROMPT), "Prompt prefix was not preserved in the baseline run")  # State is stored before this check, so TC-05 sees it even if this fails.
+        return f"Baseline output: {output!r}\nBaseline keyword hits: {hits}"
+
+    def tc04_write_and_ground_music_domain() -> str:  # Writes MUSIC_MEMORIES into the shared model, so every later case runs with memory present.
+        write_lines = []
+        for text in MUSIC_MEMORIES:
+            stored, gates = model.write(text, training_mode=True)  # Result is unpacked as (stored, gates); the checks below require stored == 1 and one finite gate.
+            check(stored == 1, f"training_mode=True should store the input, got stored={stored}")
+            check(len(gates) == 1, f"Expected exactly one gate value, got {gates}")
+            check(math.isfinite(gates[0]), f"Gate value is not finite: {gates[0]}")
+            write_lines.append(f"stored={stored}, gate={gates[0]:.6f}, text={text!r}")
+
+        output = model.generate(MUSIC_PROMPT, mt=20, greedy=True)
+        continuation = output[len(MUSIC_PROMPT) :]
+        hits = keyword_hits(continuation, MUSIC_KEYWORDS)
+        state["post_memory_music_output"] = output
+        state["post_memory_music_hits"] = hits
+        check(output.startswith(MUSIC_PROMPT), "Prompt prefix was not preserved after writing memory")
+        check(hits, f"No music-domain grounding detected in continuation: {continuation!r}")  # An empty hit list is falsy and fails the case.
+        return "\n".join(write_lines + [f"Output: {output!r}", f"Keyword hits: {hits}"])
+
+    def tc05_memory_improves_domain_signal() -> str:  # Compares hit counts only; generates nothing itself.
+        baseline_hits = state.get("baseline_music_hits")  # None when TC-03 raised before storing its state.
+        post_hits = state.get("post_memory_music_hits")  # None when TC-04 raised before storing its state.
+        check(isinstance(baseline_hits, list), "Baseline music output was not recorded")
+        check(isinstance(post_hits, list), "Post-memory music output was not recorded")
+        check(
+            len(post_hits) > len(baseline_hits),  # Strictly more distinct keywords than the baseline; a tie fails.
+            f"Music-domain signal did not improve: baseline={baseline_hits}, post={post_hits}",
+        )
+        return (
+            f"Baseline hits: {baseline_hits}\n"
+            f"Post-memory hits: {post_hits}\n"
+            f"Baseline output: {state['baseline_music_output']!r}\n"
+            f"Post-memory output: {state['post_memory_music_output']!r}"
+        )
+
+    def tc06_save_load_roundtrip() -> str:  # Saves the shared model's memory, loads it into a fresh model, and checks grounding there.
+        fd, memory_path = tempfile.mkstemp(prefix="agent-memory-", suffix=".pt")
+        os.close(fd)  # Only the path is used; save_memory() receives the path, not the descriptor.
+        try:
+            model.save_memory(memory_path)
+            check(os.path.exists(memory_path), "save_memory() did not create a file")  # mkstemp already created the file, so the size check below is the meaningful one.
+            file_size = os.path.getsize(memory_path)
+            check(file_size > 0, "save_memory() created an empty file")
+
+            torch.manual_seed(42)  # Re-seed so the fresh instance is built under the same seed as the first one.
+            reloaded = build_model(module)
+            reloaded.load_memory(memory_path)
+            output = reloaded.generate(MUSIC_PROMPT, mt=20, greedy=True)
+            continuation = output[len(MUSIC_PROMPT) :]
+            hits = keyword_hits(continuation, MUSIC_KEYWORDS)
+            check(output.startswith(MUSIC_PROMPT), "Reloaded model did not preserve the prompt prefix")
+            check(hits, f"No music-domain grounding after reload: {continuation!r}")
+            return (
+                f"Saved file: {memory_path} ({file_size} bytes)\n"
+                f"Output after reload: {output!r}\n"
+                f"Keyword hits after reload: {hits}"
+            )
+        finally:  # Remove the temp file whether the case passed or failed.
+            if os.path.exists(memory_path):
+                os.remove(memory_path)
+
+    results.append(run_case("TC-01", "load public API", tc01_load_public_api))  # Order matters: TC-03 must precede TC-04, and both must precede TC-05.
+    results.append(run_case("TC-02", "generate without memory", tc02_generate_without_memory))
+    results.append(run_case("TC-03", "baseline music prompt before memory", tc03_baseline_music_prompt_before_memory))
+    results.append(run_case("TC-04", "write memory and observe domain grounding", tc04_write_and_ground_music_domain))
+    results.append(run_case("TC-05", "memory improves domain signal", tc05_memory_improves_domain_signal))
+    results.append(run_case("TC-06", "save/load memory roundtrip", tc06_save_load_roundtrip))
+
+    passed = sum(1 for result in results if result.passed)
+    failed = len(results) - passed
+    total_duration = time.perf_counter() - overall_start
+
+    print("=" * 72)
+    for result in results:
+        status = "PASS" if result.passed else "FAIL"
+        print(f"[{status}] {result.case_id} - {result.title} ({result.duration_s:.2f}s)")
+        print(result.details)
+        print("-" * 72)
+    print(f"Summary: {passed}/{len(results)} passed, {failed} failed")
+    print(f"Total duration: {total_duration:.2f}s")
+
+    return 0 if failed == 0 else 1  # Becomes the process exit code via SystemExit at the bottom of the file.
+
+
+if __name__ == "__main__":  # Run the cases only when this file is executed as a script.
+    raise SystemExit(main())  # main()'s int result (0 if every case passed, else 1) becomes the process exit code.
diff --git a/reports/agent_memory_blackbox_extended_summary.md b/reports/agent_memory_blackbox_extended_summary.md
new file mode 100644
index 0000000..8e56005
--- /dev/null
+++ b/reports/agent_memory_blackbox_extended_summary.md
@@ -0,0 +1,33 @@
+# AgentMemorySystem 扩展黑盒测试总览
+
+## 1. 本轮完成内容
+
+- 第二轮 full 覆盖
+- 放大版跨域污染热图
+- 第三轮 full 覆盖（边界输入、异常输入、性能/时延）
+
+## 2. 第二轮 full 结果
+
+- PASS: 7
+- WARN: 2
+- FAIL: 0
+
+## 3. 第三轮 full 结果
+
+- PASS: 10
+- WARN: 1
+- FAIL: 1
+
+## 4. 放大版污染热图结论
+
+- cooking: own=2, foreign=5, ratio=2.50, verdict=high-contamination
+- finance: own=2, foreign=3, ratio=1.50, verdict=high-contamination
+- music: own=8, foreign=2, ratio=0.25, verdict=mixed
+- space: own=7, foreign=4, ratio=0.57, verdict=mixed
+
+## 5. 最高优先级发现
+
+- P1: 空 prompt `generate("")` 会直接崩溃。
+- P1: `transformers 5.x` 兼容性失败仍然成立。
+- P2: 跨域污染在 dual-domain、four-way 以及放大版热图中均被稳定复现。
+- P3: `mt < 0` 缺少显式参数校验，当前表现为直接返回原 prompt。
diff --git a/reports/agent_memory_blackbox_round2_execution_report.md b/reports/agent_memory_blackbox_round2_execution_report.md
new file mode 100644
index 0000000..4fc9e49
--- /dev/null
+++ b/reports/agent_memory_blackbox_round2_execution_report.md
@@ -0,0 +1,214 @@
+# AgentMemorySystem 第二轮黑盒测试执行报告
+
+## 1. 执行范围
+
+本次执行基于第二轮矩阵 runner：
+
+- `/workspace/blackbox_test_agent_memory_round2.py`
+
+执行套件：
+
+- `representative`
+
+代表集覆盖四类风险面各 1 个场景：
+
+1. 压力测试
+2. 长文本测试
+3. 跨域污染测试
+4. 稳定性测试
+
+## 2. 执行环境
+
+- OS: Linux 6.1.147
+- Python: 3.12.3
+- Torch: 2.11.0+cu130
+- Transformers: 4.57.6
+- Base model: `gpt2`
+
+## 3. 执行命令
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round2.py \
+  --suite representative \
+  --json-out /workspace/reports/agent_memory_blackbox_round2_results.json
+```
+
+## 4. 总体结果
+
+- PASS: 3
+- WARN: 1
+- FAIL: 0
+- 总耗时: `321.42s`
+
+结论：
+
+- 第二轮代表集在兼容环境下整体可执行
+- 没有出现新的阻断性故障
+- 跨域污染问题被稳定复现并分类为 `WARN`
+
+## 5. 分场景结果
+
+### R2-STRESS-01 repeated write/generate pressure
+
+**类型**：压力测试  
+**结果**：PASS
+
+**结论**
+
+- 在两轮连续写入四个领域语料、并对四个 prompt 连续生成的压力下，没有出现崩溃
+- `write()` 返回 gate 值有限
+- `generate()` 能持续返回有效字符串，并保留 prompt 前缀
+
+**关键指标**
+
+- rounds: `2`
+- total_writes: `24`
+- total_generations: `8`
+- avg_gate: `0.564429`
+
+**样例输出**
+
+- music:
+  `The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is`
+- space:
+  `The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this`
+
+**观察**
+
+功能层面通过，但混合域压力下输出中已能看到跨域词汇渗透，这与后续污染场景结果一致。
+
+---
+
+### R2-LONG-01 long memory write grounding
+
+**类型**：长文本测试  
+**结果**：PASS
+
+**结论**
+
+- 单条超长记忆文本写入后，系统仍能对目标 prompt 产生可见的音乐领域接地
+
+**关键指标**
+
+- long_text_chars: `4099`
+- gate: `0.293301`
+- keyword_hits:
+  - `concert`
+  - `music`
+  - `musical`
+  - `piano`
+  - `practice`
+  - `theory`
+
+**输出样例**
+
+```text
+The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour
+ a- in this evening The
+```
+
+**观察**
+
+长文本写入并未导致接口失效，说明至少在这一级别的长输入下，系统仍保持外部可用性。
+
+---
+
+### R2-CROSS-01 dual-domain contamination diagnostic
+
+**类型**：跨域污染测试  
+**结果**：WARN
+
+**结论**
+
+- 双域混合写入后，音乐 prompt 与太空 prompt 都出现明显的跨域串扰
+- 该问题不是崩溃类问题，但会影响语义隔离质量
+
+**music prompt 结果**
+
+- own hits:
+  - `music`
+  - `musical`
+- foreign hits:
+  - `mission`
+  - `planet`
+
+**space prompt 结果**
+
+- own hits:
+  - `mission`
+  - `planet`
+- foreign hits:
+  - `music`
+  - `musical`
+  - `theory`
+
+**样例输出**
+
+- music:
+  `The piano performance musical music the mission of a, and planets-\n. in that is an all other's to`
+- space:
+  `The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. to`
+
+**判定原因**
+
+该场景是 diagnostic 场景，不把污染直接判成 FAIL；但由于 own-domain 与 foreign-domain 词同时明显出现，因此记为 `WARN`。
+
+---
+
+### R2-STABLE-01 fresh instance determinism
+
+**类型**：稳定性测试  
+**结果**：PASS
+
+**结论**
+
+- 在相同 seed、相同写入顺序、相同 greedy prompt 下，全新实例之间的输出完全一致
+
+**输出**
+
+```text
+The piano performance musical music the and violin, a- is an in that's.
+ The other " it has
+```
+
+**意义**
+
+这说明在当前兼容环境中，初始化路径和 greedy 解码路径具备可重复性。
+
+## 6. 结果解读
+
+### 6.1 压力层面
+
+第二轮代表集没有发现新的崩溃型问题。  
+在较高调用密度下，公开接口仍然可用。
+
+### 6.2 长文本层面
+
+超长记忆写入场景通过，说明记忆写入链路对较长输入具备一定耐受性。
+
+### 6.3 语义隔离层面
+
+跨域污染问题被再次稳定复现，是本轮最重要的质量风险。  
+它不会阻止系统“运行”，但会削弱“提示词所属领域 -> 目标领域输出”的纯度。
+
+### 6.4 稳定性层面
+
+新实例确定性通过，说明在兼容环境和 greedy 模式下，结果具备良好的可复现性。
+
+## 7. 后续建议
+
+如果继续做第三轮，可以优先补这几项：
+
+1. `R2-STRESS-02 repeated save/load pressure` 的全量执行
+2. `R2-LONG-02 long prompt resilience` 的边界行为验证
+3. `R2-CROSS-02 four-domain contamination matrix` 的全量污染热图
+4. `R2-STABLE-02` 与 `R2-STABLE-03` 的完整执行
+
+## 8. 关联文件
+
+- 第二轮矩阵设计：
+  `/workspace/reports/agent_memory_blackbox_round2_matrix.md`
+- 第二轮代表集结果 JSON：
+  `/workspace/reports/agent_memory_blackbox_round2_results.json`
+- 第二轮执行报告：
+  `/workspace/reports/agent_memory_blackbox_round2_execution_report.md`
diff --git a/reports/agent_memory_blackbox_round2_full_execution_report.md b/reports/agent_memory_blackbox_round2_full_execution_report.md
new file mode 100644
index 0000000..35a210b
--- /dev/null
+++ b/reports/agent_memory_blackbox_round2_full_execution_report.md
@@ -0,0 +1,259 @@
+# AgentMemorySystem 第二轮 full 黑盒测试执行报告
+
+## 1. 执行说明
+
+第二轮 full 结果由以下两部分聚合而成：
+
+- 已执行的 representative 场景 4 个
+- 本轮补跑的 full-only 场景 5 个
+
+这样覆盖了第二轮矩阵中的全部 9 个场景。
+
+## 2. 环境
+
+- Python: 3.12.3
+- Torch: 2.11.0+cu130
+- Transformers: 4.57.6
+- Model: gpt2
+
+## 3. 汇总结果
+
+- PASS: 7
+- WARN: 2
+- FAIL: 0
+- 聚合总耗时: `851.07s`
+
+## 4. 分场景结果
+
+### R2-STRESS-01 repeated write/generate pressure
+
+- 类型: stress
+- 状态: PASS
+- 耗时: `70.97s`
+- 结论: Completed repeated write/generate pressure loop without crash.
+
+```json
+{
+  "avg_gate": 0.564429,
+  "rounds": 2,
+  "sample_outputs": {
+    "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+    "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all",
+    "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+    "space": "The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this"
+  },
+  "total_generations": 8,
+  "total_writes": 24
+}
+```
+
+### R2-STRESS-02 repeated save/load pressure
+
+- 类型: stress
+- 状态: PASS
+- 耗时: `194.67s`
+- 结论: Repeated save/load cycles preserved externally valid generation behavior.
+
+```json
+{
+  "avg_gate": 0.564429,
+  "cycle_outputs": [
+    {
+      "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+      "finance": "The market outlook musical market the mission of increased a, and reduce in that's reduced to be more\n- or",
+      "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+      "space": "The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's\n"
+    },
+    {
+      "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+      "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all",
+      "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+      "space": "The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this"
+    }
+  ],
+  "cycles": 2
+}
+```
+
+### R2-LONG-01 long memory write grounding
+
+- 类型: long-text
+- 状态: PASS
+- 耗时: `63.66s`
+- 结论: Long memory write remained usable for downstream generation.
+
+```json
+{
+  "gate": 0.293301,
+  "keyword_hits": [
+    "concert",
+    "music",
+    "musical",
+    "piano",
+    "practice",
+    "theory"
+  ],
+  "long_text_chars": 4099,
+  "output": "The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour\n a- in this evening The"
+}
+```
+
+### R2-LONG-02 long prompt resilience
+
+- 类型: long-text
+- 状态: PASS
+- 耗时: `68.72s`
+- 结论: Long prompt generation completed without crashing.
+
+```json
+{
+  "output_chars": 4186,
+  "prompt_chars": 4139
+}
+```
+
+### R2-CROSS-01 dual-domain contamination diagnostic
+
+- 类型: cross-domain
+- 状态: WARN
+- 耗时: `65.59s`
+- 结论: Dual-domain run showed contamination or weak own-domain separation.
+
+```json
+{
+  "music_foreign_hits": [
+    "mission",
+    "planet"
+  ],
+  "music_output": "The piano performance musical music the mission of a, and planets-\n. in that is an all other's to",
+  "music_own_hits": [
+    "music",
+    "musical"
+  ],
+  "space_foreign_hits": [
+    "music",
+    "musical",
+    "theory"
+  ],
+  "space_output": "The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. to",
+  "space_own_hits": [
+    "mission",
+    "planet"
+  ]
+}
+```
+
+### R2-CROSS-02 four-domain contamination matrix
+
+- 类型: cross-domain
+- 状态: WARN
+- 耗时: `68.97s`
+- 结论: Four-way contamination matrix detected cross-domain bleed or weak own-domain signal.
+
+```json
+{
+  "matrix": {
+    "cooking": {
+      "cooking": [
+        "chef",
+        "pasta"
+      ],
+      "finance": [],
+      "music": [
+        "music",
+        "musical"
+      ],
+      "space": [
+        "mission"
+      ]
+    },
+    "finance": {
+      "cooking": [],
+      "finance": [
+        "market"
+      ],
+      "music": [
+        "music",
+        "musical"
+      ],
+      "space": [
+        "mission"
+      ]
+    },
+    "music": {
+      "cooking": [],
+      "finance": [],
+      "music": [
+        "music",
+        "musical"
+      ],
+      "space": [
+        "mission"
+      ]
+    },
+    "space": {
+      "cooking": [],
+      "finance": [],
+      "music": [
+        "music",
+        "musical"
+      ],
+      "space": [
+        "mission",
+        "orbit",
+        "planet"
+      ]
+    }
+  }
+}
+```
+
+### R2-STABLE-01 fresh instance determinism
+
+- 类型: stability
+- 状态: PASS
+- 耗时: `121.20s`
+- 结论: Fresh seeded instances were exactly deterministic under greedy generation.
+
+```json
+{
+  "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+  "prompt": "The piano performance"
+}
+```
+
+### R2-STABLE-02 same instance repeatability
+
+- 类型: stability
+- 状态: PASS
+- 耗时: `65.30s`
+- 结论: Repeated greedy calls on the same instance were identical.
+
+```json
+{
+  "outputs": [
+    "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+    "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+    "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has"
+  ]
+}
+```
+
+### R2-STABLE-03 save/load exactness
+
+- 类型: stability
+- 状态: PASS
+- 耗时: `131.99s`
+- 结论: Greedy output stayed exactly stable across save/load roundtrip.
+
+```json
+{
+  "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has"
+}
+```
+
+## 5. 关键发现
+
+- 第二轮 full 没有阻断性 FAIL。
+- WARN 全部集中在跨域污染场景。
+- 四域污染矩阵显示 music/space 仍可保留一定 own-domain 信号，但 finance/cooking 更容易被其它域压制。
diff --git a/reports/agent_memory_blackbox_round2_full_results.json b/reports/agent_memory_blackbox_round2_full_results.json
new file mode 100644
index 0000000..ddc0a74
--- /dev/null
+++ b/reports/agent_memory_blackbox_round2_full_results.json
@@ -0,0 +1,229 @@
+{
+  "aggregation_mode": "representative-plus-full-only-scenarios",
+  "python": "3.12.3",
+  "results": [
+    {
+      "category": "stress",
+      "duration_s": 70.97084128700044,
+      "metrics": {
+        "avg_gate": 0.564429,
+        "rounds": 2,
+        "sample_outputs": {
+          "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+          "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all",
+          "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+          "space": "The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this"
+        },
+        "total_generations": 8,
+        "total_writes": 24
+      },
+      "scenario_id": "R2-STRESS-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Completed repeated write/generate pressure loop without crash.",
+      "title": "repeated write/generate pressure"
+    },
+    {
+      "category": "stress",
+      "duration_s": 194.665789562001,
+      "metrics": {
+        "avg_gate": 0.564429,
+        "cycle_outputs": [
+          {
+            "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+            "finance": "The market outlook musical market the mission of increased a, and reduce in that's reduced to be more\n- or",
+            "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+            "space": "The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's\n"
+          },
+          {
+            "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+            "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all",
+            "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+            "space": "The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this"
+          }
+        ],
+        "cycles": 2
+      },
+      "scenario_id": "R2-STRESS-02",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Repeated save/load cycles preserved externally valid generation behavior.",
+      "title": "repeated save/load pressure"
+    },
+    {
+      "category": "long-text",
+      "duration_s": 63.66024329799984,
+      "metrics": {
+        "gate": 0.293301,
+        "keyword_hits": [
+          "concert",
+          "music",
+          "musical",
+          "piano",
+          "practice",
+          "theory"
+        ],
+        "long_text_chars": 4099,
+        "output": "The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour\n a- in this evening The"
+      },
+      "scenario_id": "R2-LONG-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Long memory write remained usable for downstream generation.",
+      "title": "long memory write grounding"
+    },
+    {
+      "category": "long-text",
+      "duration_s": 68.71787648099962,
+      "metrics": {
+        "output_chars": 4186,
+        "prompt_chars": 4139
+      },
+      "scenario_id": "R2-LONG-02",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Long prompt generation completed without crashing.",
+      "title": "long prompt resilience"
+    },
+    {
+      "category": "cross-domain",
+      "duration_s": 65.58546234499954,
+      "metrics": {
+        "music_foreign_hits": [
+          "mission",
+          "planet"
+        ],
+        "music_output": "The piano performance musical music the mission of a, and planets-\n. in that is an all other's to",
+        "music_own_hits": [
+          "music",
+          "musical"
+        ],
+        "space_foreign_hits": [
+          "music",
+          "musical",
+          "theory"
+        ],
+        "space_output": "The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. to",
+        "space_own_hits": [
+          "mission",
+          "planet"
+        ]
+      },
+      "scenario_id": "R2-CROSS-01",
+      "status": "WARN",
+      "suite": "representative",
+      "summary": "Dual-domain run showed contamination or weak own-domain separation.",
+      "title": "dual-domain contamination diagnostic"
+    },
+    {
+      "category": "cross-domain",
+      "duration_s": 68.97480458300015,
+      "metrics": {
+        "matrix": {
+          "cooking": {
+            "cooking": [
+              "chef",
+              "pasta"
+            ],
+            "finance": [],
+            "music": [
+              "music",
+              "musical"
+            ],
+            "space": [
+              "mission"
+            ]
+          },
+          "finance": {
+            "cooking": [],
+            "finance": [
+              "market"
+            ],
+            "music": [
+              "music",
+              "musical"
+            ],
+            "space": [
+              "mission"
+            ]
+          },
+          "music": {
+            "cooking": [],
+            "finance": [],
+            "music": [
+              "music",
+              "musical"
+            ],
+            "space": [
+              "mission"
+            ]
+          },
+          "space": {
+            "cooking": [],
+            "finance": [],
+            "music": [
+              "music",
+              "musical"
+            ],
+            "space": [
+              "mission",
+              "orbit",
+              "planet"
+            ]
+          }
+        }
+      },
+      "scenario_id": "R2-CROSS-02",
+      "status": "WARN",
+      "suite": "full",
+      "summary": "Four-way contamination matrix detected cross-domain bleed or weak own-domain signal.",
+      "title": "four-domain contamination matrix"
+    },
+    {
+      "category": "stability",
+      "duration_s": 121.19981672600079,
+      "metrics": {
+        "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+        "prompt": "The piano performance"
+      },
+      "scenario_id": "R2-STABLE-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Fresh seeded instances were exactly deterministic under greedy generation.",
+      "title": "fresh instance determinism"
+    },
+    {
+      "category": "stability",
+      "duration_s": 65.30302968500109,
+      "metrics": {
+        "outputs": [
+          "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+          "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+          "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has"
+        ]
+      },
+      "scenario_id": "R2-STABLE-02",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Repeated greedy calls on the same instance were identical.",
+      "title": "same instance repeatability"
+    },
+    {
+      "category": "stability",
+      "duration_s": 131.99397509499977,
+      "metrics": {
+        "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has"
+      },
+      "scenario_id": "R2-STABLE-03",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Greedy output stayed exactly stable across save/load roundtrip.",
+      "title": "save/load exactness"
+    }
+  ],
+  "suite": "full",
+  "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md",
+  "torch": "2.11.0+cu130",
+  "total_duration_s": 851.0718390620023,
+  "transformers": "4.57.6"
+}
\ No newline at end of file
diff --git a/reports/agent_memory_blackbox_round2_matrix.md b/reports/agent_memory_blackbox_round2_matrix.md
new file mode 100644
index 0000000..172cd7d
--- /dev/null
+++ b/reports/agent_memory_blackbox_round2_matrix.md
@@ -0,0 +1,183 @@
+# AgentMemorySystem 第二轮黑盒测试矩阵设计
+
+## 1. 目标
+
+在第一轮黑盒测试已经覆盖“加载、基础生成、记忆写入、持久化回环”的基础上，第二轮继续扩展以下四类风险面：
+
+1. 压力测试
+2. 长文本测试
+3. 跨域污染测试
+4. 稳定性测试
+
+本轮仍坚持以下边界：
+
+- 不修改被测实现
+- 不使用 mock
+- 不调用内部 `test()` / `test_*()` 自测函数
+- 不读取内部 memory tree、缓存、私有状态
+- 不把测试绑定到某个固定完整输出句子
+
+## 2. 黑盒准则
+
+### 2.1 允许观察的对象
+
+只观察公开调用及其外部结果：
+
+- `MemLLM.load()`
+- `MemLLM.write()`
+- `MemLLM.generate()`
+- `MemLLM.save_memory()`
+- `MemLLM.load_memory()`
+
+### 2.2 判定方式
+
+第二轮测试矩阵使用三档状态：
+
+- `PASS`：满足场景既定验收规则
+- `WARN`：场景可执行，但暴露出行为风险或质量问题
+- `FAIL`：场景违反硬性要求，或发生未接受的外部异常
+
+其中：
+
+- **gating 场景**：以 `FAIL` 作为阻断结论
+- **diagnostic 场景**：允许 `WARN`，用于暴露质量风险而非直接阻断
+
+## 3. 领域测试数据
+
+为了做跨域和污染观察，第二轮矩阵使用四组真实文本域：
+
+- `music`
+- `space`
+- `finance`
+- `cooking`
+
+每个域使用 3 条真实自然语言样本文本，以及 1 条对应 prompt。
+
+## 4. 测试矩阵
+
+| ID | 类型 | 场景 | 性质 | 核心刺激 | 主要观察指标 | 验收规则 |
+|---|---|---|---|---|---|---|
+| R2-STRESS-01 | 压力 | repeated write/generate pressure | gating | 多轮写入 + 多 prompt 连续生成 | 是否崩溃、是否保留 prompt 前缀、输出是否扩展、gate 是否有限 | 全流程无崩溃且输出有效 |
+| R2-STRESS-02 | 压力 | repeated save/load pressure | gating | 混合域写入后反复 save/load | 每轮回载后生成是否仍有效 | 每轮均可生成有效输出 |
+| R2-LONG-01 | 长文本 | long memory write grounding | gating | 写入超长单条记忆文本 | 长文本写入后是否仍能对目标 prompt 产生领域词 | 输出有效且命中目标域关键词 |
+| R2-LONG-02 | 长文本 | long prompt resilience | diagnostic | 超长 prompt 直接生成 | 是否能完成生成；若失败，失败类型是什么 | 不崩溃为 PASS；崩溃记 WARN |
+| R2-CROSS-01 | 跨域污染 | dual-domain contamination diagnostic | diagnostic | 同时写入 music + space | own-domain hits、foreign hits | 无污染为 PASS；有污染或 own signal 弱则 WARN |
+| R2-CROSS-02 | 跨域污染 | four-domain contamination matrix | diagnostic | 同时写入四域并逐 prompt 生成 | 四域命中矩阵、foreign hit 分布 | own hits 足且 foreign 低为 PASS，否则 WARN |
+| R2-STABLE-01 | 稳定性 | fresh instance determinism | gating | 相同 seed、相同输入、不同新实例 | greedy 输出是否完全一致 | 必须完全一致 |
+| R2-STABLE-02 | 稳定性 | same instance repeatability | diagnostic | 同一实例重复 greedy 生成 | 重复调用是否漂移 | 完全一致为 PASS；漂移则 WARN |
+| R2-STABLE-03 | 稳定性 | save/load exactness | gating | 生成前后做 save/load 回环 | roundtrip 前后 greedy 输出是否完全一致 | 必须完全一致 |
+
+## 5. 每类测试的设计意图
+
+### 5.1 压力测试
+
+第一轮只验证了小规模调用链路。第二轮压力测试关注：
+
+- 连续写入后是否出现崩溃
+- 连续生成后是否出现无输出或 prompt 破坏
+- 多次 save/load 后是否出现状态损坏
+
+这类测试更接近真实系统运行中的“记忆不断被写入和读取”的场景。
+
+### 5.2 长文本测试
+
+第一轮没有覆盖长样本。第二轮要验证：
+
+- 单条超长 memory text 写入时是否还能正常使用
+- 超长 prompt 输入是否会触发位置编码、上下文窗口或形状错误
+
+其中长 prompt 场景被定义为 diagnostic，因为模型上下文本来可能存在边界限制；这类用例的重点是暴露边界行为，而不是强行把所有边界都定义为功能缺陷。
+
+### 5.3 跨域污染测试
+
+第一轮已经观察到混合域输入存在串扰风险。第二轮将其系统化，形成矩阵：
+
+- 双域污染：快速定位最明显的串扰
+- 四域污染：观察污染是否随域数量上升而恶化
+
+注意：这类用例不要求“生成完全纯净”，而是通过 own-hit / foreign-hit 的黑盒指标判断隔离质量。
+
+### 5.4 稳定性测试
+
+稳定性分三层：
+
+1. **新实例确定性**：相同 seed、相同写入顺序、相同 greedy prompt，是否给出完全相同结果
+2. **同实例重复性**：同一实例上重复调用是否会自发漂移
+3. **持久化精确性**：save/load 之后 greedy 输出是否保持完全一致
+
+这三类一起能区分：
+
+- 初始化不稳定
+- 运行时状态漂移
+- 持久化恢复偏差
+
+## 6. 非 overfit 说明
+
+第二轮仍然不要求“输出精确等于某句话”，因为那会把测试绑定到模型偶然文案。
+
+本轮采用的稳定观测指标包括：
+
+- prompt 前缀是否保留
+- 是否成功扩展输出
+- 是否出现目标领域关键词
+- 是否出现非目标领域关键词
+- greedy 输出是否在重复条件下完全一致
+
+这类指标更稳健，也更符合黑盒验收目标。
+
+## 7. 可执行资产
+
+第二轮矩阵的可执行 runner：
+
+- `/workspace/blackbox_test_agent_memory_round2.py`
+
+支持两种运行模式：
+
+### representative
+
+执行每类 1 个代表场景：
+
+- `R2-STRESS-01`
+- `R2-LONG-01`
+- `R2-CROSS-01`
+- `R2-STABLE-01`
+
+适合日常回归。
+
+### full
+
+执行全部矩阵场景：
+
+- `R2-STRESS-01`
+- `R2-STRESS-02`
+- `R2-LONG-01`
+- `R2-LONG-02`
+- `R2-CROSS-01`
+- `R2-CROSS-02`
+- `R2-STABLE-01`
+- `R2-STABLE-02`
+- `R2-STABLE-03`
+
+适合完整验证或发布前检查。
+
+## 8. 推荐执行命令
+
+代表集：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round2.py --suite representative
+```
+
+全量集：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round2.py --suite full
+```
+
+输出 JSON：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round2.py \
+  --suite representative \
+  --json-out /workspace/reports/agent_memory_blackbox_round2_results.json
+```
diff --git a/reports/agent_memory_blackbox_round2_results.json b/reports/agent_memory_blackbox_round2_results.json
new file mode 100644
index 0000000..79f19a2
--- /dev/null
+++ b/reports/agent_memory_blackbox_round2_results.json
@@ -0,0 +1,96 @@
+{
+  "python": "3.12.3",
+  "results": [
+    {
+      "category": "stress",
+      "duration_s": 70.97084128700044,
+      "metrics": {
+        "avg_gate": 0.564429,
+        "rounds": 2,
+        "sample_outputs": {
+          "cooking": "The chef prepared culinary chef increased pastry chefs reduced mission of reduce pasta cook and musicals, the team.\n a",
+          "finance": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all",
+          "music": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+          "space": "The space telescope planets around musicals mission- the and increased. reduce team of a, that's\n in this"
+        },
+        "total_generations": 8,
+        "total_writes": 24
+      },
+      "scenario_id": "R2-STRESS-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Completed repeated write/generate pressure loop without crash.",
+      "title": "repeated write/generate pressure"
+    },
+    {
+      "category": "long-text",
+      "duration_s": 63.66024329799984,
+      "metrics": {
+        "gate": 0.293301,
+        "keyword_hits": [
+          "concert",
+          "music",
+          "musical",
+          "piano",
+          "practice",
+          "theory"
+        ],
+        "long_text_chars": 4099,
+        "output": "The piano performance piano Music Practice music practice musical theory concerting hard, night and hours of the morning hour\n a- in this evening The"
+      },
+      "scenario_id": "R2-LONG-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Long memory write remained usable for downstream generation.",
+      "title": "long memory write grounding"
+    },
+    {
+      "category": "cross-domain",
+      "duration_s": 65.58546234499954,
+      "metrics": {
+        "music_foreign_hits": [
+          "mission",
+          "planet"
+        ],
+        "music_output": "The piano performance musical music the mission of a, and planets-\n. in that is an all other's to",
+        "music_own_hits": [
+          "music",
+          "musical"
+        ],
+        "space_foreign_hits": [
+          "music",
+          "musical",
+          "theory"
+        ],
+        "space_output": "The space telescope planets beyond the musical theory of a- and mission\n, in that is an all other. to",
+        "space_own_hits": [
+          "mission",
+          "planet"
+        ]
+      },
+      "scenario_id": "R2-CROSS-01",
+      "status": "WARN",
+      "suite": "representative",
+      "summary": "Dual-domain run showed contamination or weak own-domain separation.",
+      "title": "dual-domain contamination diagnostic"
+    },
+    {
+      "category": "stability",
+      "duration_s": 121.19981672600079,
+      "metrics": {
+        "output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+        "prompt": "The piano performance"
+      },
+      "scenario_id": "R2-STABLE-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Fresh seeded instances were exactly deterministic under greedy generation.",
+      "title": "fresh instance determinism"
+    }
+  ],
+  "suite": "representative",
+  "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md",
+  "torch": "2.11.0+cu130",
+  "total_duration_s": 321.41653306199987,
+  "transformers": "4.57.6"
+}
\ No newline at end of file
diff --git a/reports/agent_memory_blackbox_round3_execution_report.md b/reports/agent_memory_blackbox_round3_execution_report.md
new file mode 100644
index 0000000..2bcb877
--- /dev/null
+++ b/reports/agent_memory_blackbox_round3_execution_report.md
@@ -0,0 +1,191 @@
+# AgentMemorySystem 第三轮 full 黑盒测试执行报告
+
+## 1. 执行说明
+
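+第三轮 full 采用逐场景执行并聚合的方式完成，覆盖边界输入、异常输入、性能/时延基线共 12 个场景。注意：本报告沿用结果 JSON 中的场景 ID（`R3-BOUND-*`、`R3-ABN-*`），其编号与部分场景内容与矩阵设计文档（`R3-BOUNDARY-*`、`R3-EXC-*`）并非一一对应；例如空 prompt 场景在矩阵中为 diagnostic（异常记 WARN），而本次执行记为 FAIL。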
+
+## 2. 环境
+
+- Python: 3.12.3
+- Torch: 2.11.0+cu130
+- Transformers: 4.57.6
+- Model: gpt2
+
+## 3. 汇总结果
+
+- PASS: 10
+- WARN: 1
+- FAIL: 1
+- 聚合总耗时: `798.45s`
+
+## 4. 分场景结果
+
+### R3-BOUND-01 empty prompt generation
+
+- 类型: boundary-input
+- 状态: FAIL
+- 耗时: `61.69s`
+- 结论: RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous
+
+### R3-BOUND-02 single-character prompt generation
+
+- 类型: boundary-input
+- 状态: PASS
+- 耗时: `60.23s`
+- 结论: Single-character prompt generation remained valid.
+
+```json
+{
+  "output": "A the I ami- \"I-k, ands"
+}
+```
+
+### R3-BOUND-03 whitespace prompt generation
+
+- 类型: boundary-input
+- 状态: PASS
+- 耗时: `57.95s`
+- 结论: Whitespace prompt generation completed.
+
+```json
+{
+  "output": "   ia the world, I- The other\n ("
+}
+```
+
+### R3-BOUND-04 multiline prompt generation
+
+- 类型: boundary-input
+- 状态: PASS
+- 耗时: `61.25s`
+- 结论: Multi-line prompt generation completed.
+
+```json
+{
+  "output": "Line one.\nLine two. the I have a,\n"
+}
+```
+
+### R3-ABN-01 write None input
+
+- 类型: abnormal-input
+- 状态: PASS
+- 耗时: `59.30s`
+- 结论: write(None, ...) raised an externally visible exception as expected.
+
+```json
+{
+  "error_message": "You need to specify either `text` or `text_target`.",
+  "error_type": "ValueError"
+}
+```
+
+### R3-ABN-02 generate None input
+
+- 类型: abnormal-input
+- 状态: PASS
+- 耗时: `62.67s`
+- 结论: generate(None, ...) raised an externally visible exception as expected.
+
+```json
+{
+  "error_message": "You need to specify either `text` or `text_target`.",
+  "error_type": "ValueError"
+}
+```
+
+### R3-ABN-03 negative max tokens handling
+
+- 类型: abnormal-input
+- 状态: WARN
+- 耗时: `66.16s`
+- 结论: Negative mt returned the original prompt without explicit validation.
+
+```json
+{
+  "output": "Hello"
+}
+```
+
+### R3-ABN-04 load missing memory file
+
+- 类型: abnormal-input
+- 状态: PASS
+- 耗时: `61.28s`
+- 结论: load_memory() on a missing path raised an externally visible exception.
+
+```json
+{
+  "error_message": "[Errno 2] No such file or directory: '/tmp/agent-memory-nonexistent-file.pt'",
+  "error_type": "FileNotFoundError"
+}
+```
+
+### R3-PERF-01 cold load latency baseline
+
+- 类型: performance
+- 状态: PASS
+- 耗时: `61.67s`
+- 结论: Cold load latency baseline recorded.
+
+```json
+{
+  "cold_load_s": 61.623
+}
+```
+
+### R3-PERF-02 write latency baseline
+
+- 类型: performance
+- 状态: PASS
+- 耗时: `61.81s`
+- 结论: Write latency baseline recorded.
+
+```json
+{
+  "avg_write_s": 0.081,
+  "max_write_s": 0.181,
+  "min_write_s": 0.028,
+  "write_count": 3
+}
+```
+
+### R3-PERF-03 generate latency baseline
+
+- 类型: performance
+- 状态: PASS
+- 耗时: `64.74s`
+- 结论: Generate latency baseline recorded.
+
+```json
+{
+  "avg_generate_s": 0.957,
+  "generate_count": 3,
+  "max_generate_s": 1.002,
+  "min_generate_s": 0.887,
+  "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has"
+}
+```
+
+### R3-PERF-04 save/load latency baseline
+
+- 类型: performance
+- 状态: PASS
+- 耗时: `119.69s`
+- 结论: Save/load latency baseline recorded.
+
+```json
+{
+  "load_s": 0.003,
+  "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+  "save_s": 0.002
+}
+```
+
+## 5. 关键发现
+
+- `R3-BOUND-01` 暴露明确 FAIL：空字符串 prompt 会触发真实运行时崩溃。
+- `R3-ABN-03` 记为 WARN：`mt < 0` 不会报错，而是直接返回原 prompt，说明缺少显式参数校验。
+- 其余边界输入（单字符、空白、多行）可运行。
+- 异常输入（`None`、缺失文件）均能以外部可见异常形式返回。
+- 当前环境下的冷启动基线约为 61.6s；有记忆的 greedy 生成平均约 0.96s。
diff --git a/reports/agent_memory_blackbox_round3_matrix.md b/reports/agent_memory_blackbox_round3_matrix.md
new file mode 100644
index 0000000..a8d3b6f
--- /dev/null
+++ b/reports/agent_memory_blackbox_round3_matrix.md
@@ -0,0 +1,108 @@
+# AgentMemorySystem 第三轮黑盒测试矩阵设计
+
+## 1. 目标
+
+第三轮黑盒测试在前两轮基础上继续扩展三类能力面：
+
+1. 边界输入测试
+2. 异常输入测试
+3. 性能 / 时延基线
+
+仍坚持黑盒边界：
+
+- 不修改被测实现
+- 不使用 mock
+- 不读取内部 memory tree、缓存或私有状态
+- 不调用源码内置 `test()` / `test_*()` 自测函数
+- 不把断言写成固定完整文本匹配
+
+## 2. 公开调用面
+
+第三轮只使用以下公开接口：
+
+- `MemLLM.load()`
+- `MemLLM.write()`
+- `MemLLM.generate()`
+- `MemLLM.save_memory()`
+- `MemLLM.load_memory()`
+
+## 3. 状态语义
+
+- `PASS`：满足场景验收规则
+- `WARN`：场景完成但暴露风险、边界不稳或性能超出经验阈值
+- `FAIL`：违反硬性要求，或出现场景验收规则未允许的外部异常
+
+## 4. 第三轮测试矩阵
+
+| ID | 类型 | 场景 | 性质 | 核心刺激 | 主要观察指标 | 验收规则 |
+|---|---|---|---|---|---|---|
+| R3-BOUNDARY-01 | 边界输入 | empty prompt generate | diagnostic | `generate("")` | 是否报错、是否产生输出 | 不崩溃为 PASS；异常为 WARN |
+| R3-BOUNDARY-02 | 边界输入 | punctuation-only prompt | diagnostic | 只含标点的 prompt | 前缀保持、输出扩展 | 完成生成为 PASS |
+| R3-BOUNDARY-03 | 边界输入 | whitespace-heavy prompt | diagnostic | 空格/换行密集 prompt | 是否报错、输出长度 | 完成生成为 PASS |
+| R3-BOUNDARY-04 | 边界输入 | minimal memory write | gating | 极短文本写入 | `write()` 是否返回有限 gate | 无异常且 gate 有限 |
+| R3-EXC-01 | 异常输入 | non-string write input | gating | `write(None)` / `write(123)` | 是否抛出外部异常 | 必须抛出异常且进程不挂死 |
+| R3-EXC-02 | 异常输入 | non-string generate input | gating | `generate(None)` / `generate(123)` | 是否抛出外部异常 | 必须抛出异常且进程不挂死 |
+| R3-EXC-03 | 异常输入 | missing memory load path | gating | `load_memory("/tmp/not-found.pt")` | 是否抛出文件错误 | 必须抛出异常 |
+| R3-EXC-04 | 异常输入 | invalid memory file load | gating | 对随机文本文件执行 `load_memory()` | 是否抛出解析异常 | 必须抛出异常 |
+| R3-PERF-01 | 性能基线 | cold load baseline | diagnostic | 全新实例 `load("gpt2")` | 加载耗时 | 记录基线；超经验阈值记 WARN |
+| R3-PERF-02 | 性能基线 | write latency baseline | diagnostic | 连续写入 3 条记忆 | 单次与平均耗时 | 记录基线；异常慢记 WARN |
+| R3-PERF-03 | 性能基线 | generate latency baseline | diagnostic | 空记忆和有记忆两种生成 | 生成耗时 | 记录基线；异常慢记 WARN |
+| R3-PERF-04 | 性能基线 | save/load roundtrip latency | diagnostic | 保存+读取记忆 | save/load 各自耗时 | 记录基线；异常慢记 WARN |
+
+## 5. 设计意图
+
+### 5.1 边界输入
+
+边界输入不是为了证明模型“语义上合理”，而是为了观察：
+
+- 接口是否在非常规输入下崩溃
+- 是否破坏 prompt 前缀约定
+- 是否出现空输出、死循环或明显外部异常
+
+### 5.2 异常输入
+
+第三轮把“错误输入是否被外部稳定处理”纳入黑盒验证范围。
+
+这里不要求实现必须给出优雅错误文案，但要求：
+
+- 异常是**可观察、可终止、可定位**的
+- 不应导致进程卡死
+- 不应悄悄吞错并给出伪正常结果
+
+### 5.3 性能 / 时延基线
+
+性能场景重点不是追求绝对快，而是建立一条黑盒基线，回答：
+
+- 冷启动大约多慢
+- 单次写入大约多慢
+- 生成大约多慢
+- save/load 回环大约多慢
+
+这些数据可为后续版本做对比回归。
+
+## 6. 可执行资产
+
+- 第三轮 runner：
+  `/workspace/blackbox_test_agent_memory_round3.py`
+
+## 7. 推荐执行命令
+
+代表集：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round3.py --suite representative
+```
+
+全量集：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round3.py --suite full
+```
+
+导出 JSON：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_round3.py \
+  --suite full \
+  --json-out /workspace/reports/agent_memory_blackbox_round3_results.json
+```
diff --git a/reports/agent_memory_blackbox_round3_results.json b/reports/agent_memory_blackbox_round3_results.json
new file mode 100644
index 0000000..410a2a1
--- /dev/null
+++ b/reports/agent_memory_blackbox_round3_results.json
@@ -0,0 +1,165 @@
+{
+  "aggregation_mode": "per-scenario-execution",
+  "python": "3.12.3",
+  "results": [
+    {
+      "category": "boundary-input",
+      "duration_s": 61.6888816799983,
+      "metrics": {},
+      "scenario_id": "R3-BOUND-01",
+      "status": "FAIL",
+      "suite": "representative",
+      "summary": "RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous",
+      "title": "empty prompt generation"
+    },
+    {
+      "category": "boundary-input",
+      "duration_s": 60.232560045002174,
+      "metrics": {
+        "output": "A the I ami- \"I-k, ands"
+      },
+      "scenario_id": "R3-BOUND-02",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Single-character prompt generation remained valid.",
+      "title": "single-character prompt generation"
+    },
+    {
+      "category": "boundary-input",
+      "duration_s": 57.94982666999931,
+      "metrics": {
+        "output": "   ia the world, I- The other\n ("
+      },
+      "scenario_id": "R3-BOUND-03",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Whitespace prompt generation completed.",
+      "title": "whitespace prompt generation"
+    },
+    {
+      "category": "boundary-input",
+      "duration_s": 61.2545259260005,
+      "metrics": {
+        "output": "Line one.\nLine two. the I have a,\n"
+      },
+      "scenario_id": "R3-BOUND-04",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Multi-line prompt generation completed.",
+      "title": "multiline prompt generation"
+    },
+    {
+      "category": "abnormal-input",
+      "duration_s": 59.29971606500112,
+      "metrics": {
+        "error_message": "You need to specify either `text` or `text_target`.",
+        "error_type": "ValueError"
+      },
+      "scenario_id": "R3-ABN-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "write(None, ...) raised an externally visible exception as expected.",
+      "title": "write None input"
+    },
+    {
+      "category": "abnormal-input",
+      "duration_s": 62.671864920997905,
+      "metrics": {
+        "error_message": "You need to specify either `text` or `text_target`.",
+        "error_type": "ValueError"
+      },
+      "scenario_id": "R3-ABN-02",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "generate(None, ...) raised an externally visible exception as expected.",
+      "title": "generate None input"
+    },
+    {
+      "category": "abnormal-input",
+      "duration_s": 66.16167209299965,
+      "metrics": {
+        "output": "Hello"
+      },
+      "scenario_id": "R3-ABN-03",
+      "status": "WARN",
+      "suite": "full",
+      "summary": "Negative mt returned the original prompt without explicit validation.",
+      "title": "negative max tokens handling"
+    },
+    {
+      "category": "abnormal-input",
+      "duration_s": 61.27568913600044,
+      "metrics": {
+        "error_message": "[Errno 2] No such file or directory: '/tmp/agent-memory-nonexistent-file.pt'",
+        "error_type": "FileNotFoundError"
+      },
+      "scenario_id": "R3-ABN-04",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "load_memory() on a missing path raised an externally visible exception.",
+      "title": "load missing memory file"
+    },
+    {
+      "category": "performance",
+      "duration_s": 61.66670775899911,
+      "metrics": {
+        "cold_load_s": 61.623
+      },
+      "scenario_id": "R3-PERF-01",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Cold load latency baseline recorded.",
+      "title": "cold load latency baseline"
+    },
+    {
+      "category": "performance",
+      "duration_s": 61.81413885800066,
+      "metrics": {
+        "avg_write_s": 0.081,
+        "max_write_s": 0.181,
+        "min_write_s": 0.028,
+        "write_count": 3
+      },
+      "scenario_id": "R3-PERF-02",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Write latency baseline recorded.",
+      "title": "write latency baseline"
+    },
+    {
+      "category": "performance",
+      "duration_s": 64.73673818600219,
+      "metrics": {
+        "avg_generate_s": 0.957,
+        "generate_count": 3,
+        "max_generate_s": 1.002,
+        "min_generate_s": 0.887,
+        "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has"
+      },
+      "scenario_id": "R3-PERF-03",
+      "status": "PASS",
+      "suite": "representative",
+      "summary": "Generate latency baseline recorded.",
+      "title": "generate latency baseline"
+    },
+    {
+      "category": "performance",
+      "duration_s": 119.6933355800029,
+      "metrics": {
+        "load_s": 0.003,
+        "sample_output": "The piano performance musical music the and violin, a- is an in that's.\n The other \" it has",
+        "save_s": 0.002
+      },
+      "scenario_id": "R3-PERF-04",
+      "status": "PASS",
+      "suite": "full",
+      "summary": "Save/load latency baseline recorded.",
+      "title": "save/load latency baseline"
+    }
+  ],
+  "suite": "full",
+  "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md",
+  "torch": "2.11.0+cu130",
+  "total_duration_s": 798.4456569190043,
+  "transformers": "4.57.6"
+}
\ No newline at end of file
diff --git a/reports/agent_memory_blackbox_test_report.md b/reports/agent_memory_blackbox_test_report.md
new file mode 100644
index 0000000..fb7a4bc
--- /dev/null
+++ b/reports/agent_memory_blackbox_test_report.md
@@ -0,0 +1,316 @@
+# AgentMemorySystem 黑盒测试报告
+
+## 1. 测试目标
+
+对上传文件 `/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md` 中的实现进行**黑盒测试**，要求满足：
+
+- 不使用 mock
+- 不简化被测逻辑
+- 不修改被测代码
+- 不写依赖内部实现细节的 overfit 测试
+- 不依赖 fallback 路径掩盖真实问题
+
+本次测试只通过其**公开可见运行行为**进行验证，不读取内部树结构、缓存、权重张量或私有辅助函数结果。
+
+## 2. 被测对象
+
+上传文档中包含完整可执行 Python 实现。黑盒测试仅使用以下公开调用方式：
+
+- `MemLLM.load()`
+- `MemLLM.write()`
+- `MemLLM.generate()`
+- `MemLLM.save_memory()`
+- `MemLLM.load_memory()`
+
+## 3. 测试环境
+
+- OS: Linux 6.1.147
+- Python: 3.12.3
+- Torch: 2.11.0+cu130
+- 模型: `gpt2`
+
+本次测试先后验证了两套 `transformers` 环境：
+
+1. `transformers 5.5.4`
+2. `transformers 4.57.6`
+
+## 4. 测试方法说明
+
+### 4.1 黑盒边界
+
+为了保持黑盒属性，测试中：
+
+- 不调用被测文件自带的 `test()`、`test_*()` 内部测试函数
+- 不读取 `amm.tree.store`、`_wte_neighbor_cache` 等内部状态
+- 不通过 monkey patch、stub、fake model、替身 tokenizer 等方式替换真实依赖
+- 不改源码、不降级功能、不删除逻辑
+
+### 4.2 真实执行方式
+
+测试采用真实依赖和真实模型执行：
+
+- 真实加载 `gpt2`
+- 真实调用 `write()` 写入记忆
+- 真实调用 `generate()` 观察文本输出
+- 真实保存/加载记忆文件
+
+### 4.3 非 overfit 原则
+
+断言不绑定某个固定完整句子，而只检查稳定且对外有意义的行为，例如：
+
+- 是否成功加载
+- 是否保留 prompt 前缀
+- 是否在写入记忆后出现目标领域词
+- 保存/加载后是否保留该领域响应能力
+
+这避免了把测试写成“必须生成某一字不差文本”的脆弱用例。
+
+## 5. 测试过程
+
+### 步骤 A：定位公开入口
+
+确认上传文档是完整可执行实现，并识别公开调用面：
+
+- `load("gpt2")`
+- `write(text, training_mode=True)`
+- `generate(prompt, mt=..., greedy=True)`
+- `save_memory(path)`
+- `load_memory(path)`
+
+### 步骤 B：环境准备
+
+初始环境缺少 `torch` 和 `transformers`，先安装真实运行所需依赖。
+
+### 步骤 C：兼容性复现
+
+在 `transformers 5.5.4` 下直接进行真实调用，结果 `generate()` 崩溃，报错为：
+
+`IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)`
+
+崩溃栈位于 GPT-2 block 前向过程中，说明当前实现与 `transformers 5.x` 存在兼容性问题。
+
+### 步骤 D：在兼容环境下执行正式黑盒用例
+
+将 `transformers` 切换到 `4.57.6` 后重新执行同样的真实调用，功能恢复正常，然后运行独立黑盒测试驱动：
+
+- 文件：`/workspace/blackbox_test_agent_memory_system.py`
+
+## 6. 正式测试用例与结果
+
+### TC-01 加载公开 API
+
+**目标**  
+验证 `MemLLM.load("gpt2")` 可在真实环境成功完成初始化。
+
+**结果**  
+通过。
+
+---
+
+### TC-02 空记忆下生成
+
+**目标**  
+验证未写入记忆时，`generate()` 可返回非空字符串，并保留输入 prompt 前缀。
+
+**输入**  
+`prompt = "Hello"`
+
+**结果**  
+通过。
+
+**输出样例**
+
+```text
+'Hello the other a- I have in this, (the.\n "I'
+```
+
+---
+
+### TC-03 写入前的音乐提示基线
+
+**目标**  
+记录未写入音乐记忆前，对音乐 prompt 的基线输出。
+
+**输入**  
+`prompt = "The piano performance"`
+
+**结果**  
+通过。
+
+**输出样例**
+
+```text
+'The piano performance of the and, "The world (the-theon the other people in a. The on'
+```
+
+**观察**  
+基线输出未命中任何音乐领域关键词。
+
+---
+
+### TC-04 写入音乐记忆后观察领域接地
+
+**目标**  
+在写入真实音乐语料后，验证 `generate()` 是否出现可观察的音乐领域信号。
+
+**写入内容**
+
+1. `He practiced piano for hours perfecting a difficult Chopin nocturne.`
+2. `She studied music theory and harmonic progression at the conservatory.`
+3. `The orchestra rehearsed the symphony before the evening concert.`
+
+**结果**  
+通过。
+
+**门控返回值**
+
+- `0.552463`
+- `0.654567`
+- `0.569074`
+
+**输出样例**
+
+```text
+'The piano performance musical music the and violin, a- is an in that\'s.\n The other " it has'
+```
+
+**命中关键词**
+
+- `music`
+- `musical`
+- `violin`
+
+**结论**  
+从黑盒角度看，写入记忆后，生成结果出现了明确的音乐领域词，说明外部可观察的领域接地增强成立。
+
+---
+
+### TC-05 记忆前后领域信号增强
+
+**目标**  
+比较 TC-03 与 TC-04，确认写入记忆后领域信号相对增强。
+
+**结果**  
+通过。
+
+**对比**
+
+- 写入前关键词命中：`[]`
+- 写入后关键词命中：`['music', 'musical', 'violin']`
+
+**结论**  
+在黑盒观察层面，写入记忆前后确实产生了显著可见差异。
+
+---
+
+### TC-06 记忆保存/加载回环
+
+**目标**  
+验证 `save_memory()` 与 `load_memory()` 后，模型仍保留可观察的音乐领域响应能力。
+
+**结果**  
+通过。
+
+**中间文件**
+
+- 大小：`25116 bytes`
+
+**重载后输出样例**
+
+```text
+'The piano performance musical music the and violin, a- is an in that\'s.\n The other " it has'
+```
+
+**重载后关键词**
+
+- `music`
+- `musical`
+- `violin`
+
+**结论**  
+从外部行为看，记忆持久化与恢复功能成立。
+
+## 7. 汇总结果
+
+在 `transformers 4.57.6` 环境下，正式黑盒测试结果：
+
+- 通过：6
+- 失败：0
+
+总耗时约：
+
+- `130.65s`
+
+## 8. 发现的问题与风险
+
+### P1：与 `transformers 5.x` 不兼容
+
+**现象**  
+在 `transformers 5.5.4` 环境中，公开接口 `generate()` 真实执行时直接崩溃。
+
+**外部影响**  
+这意味着如果用户在较新的 `transformers` 环境部署该实现，核心生成能力不可用。
+
+**复现条件**
+
+1. 安装 `torch 2.11.0+cu130`
+2. 安装 `transformers 5.5.4`
+3. 加载上传实现
+4. 执行：
+
+```python
+m = MemLLM(Cfg())
+m.load("gpt2")
+m.generate("Hello", mt=15, greedy=True)
+```
+
+**结果**  
+抛出：
+
+```text
+IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)
+```
+
+### P2：跨域污染风险
+
+在额外探测中，如果同时写入音乐和太空两组记忆，`"The piano performance"` 与 `"The space telescope"` 两个 prompt 都可能混入另一领域词汇。
+
+这说明该系统从黑盒现象上存在一定的**领域边界串扰**。  
+本问题不影响本次主测试的“功能是否可用”结论，但会影响更高要求的语义隔离质量。
+
+## 9. 结论
+
+### 9.1 功能结论
+
+在**兼容环境 `transformers 4.57.6`** 下，被测实现从黑盒角度表现为：
+
+- 可以真实加载
+- 可以在空记忆下生成
+- 可以写入记忆并改变后续生成
+- 可以在目标 prompt 上体现领域接地增强
+- 可以保存和恢复记忆行为
+
+### 9.2 质量结论
+
+该实现具备可运行的外部功能闭环，但存在一个明确的工程风险（另有第 8 节 P2 所述的跨域污染质量风险）：
+
+- 对 `transformers 5.x` 的兼容性失败
+
+因此，如果用于真实交付或部署，建议至少将运行环境版本要求显式固定，或后续再做兼容性修复验证。
+
+## 10. 交付物
+
+本次新增的测试资产：
+
+- 黑盒测试驱动：`/workspace/blackbox_test_agent_memory_system.py`
+- 测试报告：`/workspace/reports/agent_memory_blackbox_test_report.md`
+
+## 11. 复现命令
+
+在当前仓库根目录执行：
+
+```bash
+python3 /workspace/blackbox_test_agent_memory_system.py
+```
+
+如果要复现兼容性问题，可在 `transformers 5.x` 环境下执行真实调用进行验证。
diff --git a/reports/agent_memory_cross_domain_heatmap.json b/reports/agent_memory_cross_domain_heatmap.json
new file mode 100644
index 0000000..34e414b
--- /dev/null
+++ b/reports/agent_memory_cross_domain_heatmap.json
@@ -0,0 +1,368 @@
+{
+  "avg_gate": 0.564429484307766,
+  "duration_s": 75.81960409800013,
+  "matrix": {
+    "cooking::p1": {
+      "cooking": {
+        "count": 1,
+        "hits": [
+          "chef"
+        ]
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 2,
+        "hits": [
+          "music",
+          "musical"
+        ]
+      },
+      "space": {
+        "count": 0,
+        "hits": []
+      }
+    },
+    "cooking::p2": {
+      "cooking": {
+        "count": 1,
+        "hits": [
+          "pasta"
+        ]
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 3,
+        "hits": [
+          "music",
+          "musical",
+          "practice"
+        ]
+      },
+      "space": {
+        "count": 0,
+        "hits": []
+      }
+    },
+    "cooking::p3": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 0,
+        "hits": []
+      },
+      "space": {
+        "count": 0,
+        "hits": []
+      }
+    },
+    "finance::p1": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 1,
+        "hits": [
+          "market"
+        ]
+      },
+      "music": {
+        "count": 2,
+        "hits": [
+          "music",
+          "musical"
+        ]
+      },
+      "space": {
+        "count": 1,
+        "hits": [
+          "mission"
+        ]
+      }
+    },
+    "finance::p2": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 1,
+        "hits": [
+          "portfolio"
+        ]
+      },
+      "music": {
+        "count": 0,
+        "hits": []
+      },
+      "space": {
+        "count": 0,
+        "hits": []
+      }
+    },
+    "finance::p3": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 0,
+        "hits": []
+      },
+      "space": {
+        "count": 0,
+        "hits": []
+      }
+    },
+    "music::p1": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 2,
+        "hits": [
+          "music",
+          "musical"
+        ]
+      },
+      "space": {
+        "count": 1,
+        "hits": [
+          "mission"
+        ]
+      }
+    },
+    "music::p2": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 3,
+        "hits": [
+          "music",
+          "musical",
+          "violin"
+        ]
+      },
+      "space": {
+        "count": 0,
+        "hits": []
+      }
+    },
+    "music::p3": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 3,
+        "hits": [
+          "music",
+          "musical",
+          "theory"
+        ]
+      },
+      "space": {
+        "count": 1,
+        "hits": [
+          "mission"
+        ]
+      }
+    },
+    "space::p1": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 2,
+        "hits": [
+          "music",
+          "musical"
+        ]
+      },
+      "space": {
+        "count": 3,
+        "hits": [
+          "mission",
+          "orbit",
+          "planet"
+        ]
+      }
+    },
+    "space::p2": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 0,
+        "hits": []
+      },
+      "space": {
+        "count": 2,
+        "hits": [
+          "mission",
+          "planet"
+        ]
+      }
+    },
+    "space::p3": {
+      "cooking": {
+        "count": 0,
+        "hits": []
+      },
+      "finance": {
+        "count": 0,
+        "hits": []
+      },
+      "music": {
+        "count": 2,
+        "hits": [
+          "music",
+          "musical"
+        ]
+      },
+      "space": {
+        "count": 2,
+        "hits": [
+          "mission",
+          "planet"
+        ]
+      }
+    }
+  },
+  "model_name": "gpt2",
+  "outputs": {
+    "cooking::p1": {
+      "output": "The chef prepared culinary chef of increased pastry, musical the team and reduce a.\n- or that's in this",
+      "prompt": "The chef prepared"
+    },
+    "cooking::p2": {
+      "output": "The pasta course pasta practice musical, the team of a and increased by an- or in that is to be all",
+      "prompt": "The pasta course"
+    },
+    "cooking::p3": {
+      "output": "The dessert service service services team and increased the information, a new- is an in that's.\n of it",
+      "prompt": "The dessert service"
+    },
+    "finance::p1": {
+      "output": "The market outlook musical market the mission of increased a, and reduce in that is an-\n. to be all",
+      "prompt": "The market outlook"
+    },
+    "finance::p2": {
+      "output": "The portfolio manager increased portfolio management team the and reduced a, or that's reduce stock-\n. in this is",
+      "prompt": "The portfolio manager"
+    },
+    "finance::p3": {
+      "output": "The quarterly earnings call increased reduce the team and reduced-, a decreased in this lowered its\n. of an all that",
+      "prompt": "The quarterly earnings call"
+    },
+    "music::p1": {
+      "output": "The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is",
+      "prompt": "The piano performance"
+    },
+    "music::p2": {
+      "output": "The violin recital violin musical music the team, a- and increased in this is an all of that's.\n",
+      "prompt": "The violin recital"
+    },
+    "music::p3": {
+      "output": "The music theory lesson theory musical mission of the \" and increased., a-\n in this is an I am more",
+      "prompt": "The music theory lesson"
+    },
+    "space::p1": {
+      "output": "The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's\n",
+      "prompt": "The space telescope"
+    },
+    "space::p2": {
+      "output": "The Mars mission mission missions planets. Mission Missions mission\n reduce team of the world, and increased to a- or",
+      "prompt": "The Mars mission"
+    },
+    "space::p3": {
+      "output": "The orbital research team mission team musical. increased the planets and reduce, a- or in this is an all of that",
+      "prompt": "The orbital research team"
+    }
+  },
+  "own_vs_foreign": {
+    "cooking": {
+      "foreign_hits_count": 5,
+      "foreign_to_own_ratio": 2.5,
+      "own_hits_count": 2,
+      "verdict": "high-contamination"
+    },
+    "finance": {
+      "foreign_hits_count": 3,
+      "foreign_to_own_ratio": 1.5,
+      "own_hits_count": 2,
+      "verdict": "high-contamination"
+    },
+    "music": {
+      "foreign_hits_count": 2,
+      "foreign_to_own_ratio": 0.25,
+      "own_hits_count": 8,
+      "verdict": "mixed"
+    },
+    "space": {
+      "foreign_hits_count": 4,
+      "foreign_to_own_ratio": 0.5714285714285714,
+      "own_hits_count": 7,
+      "verdict": "mixed"
+    }
+  },
+  "prompt_rows": [
+    "music::p1",
+    "music::p2",
+    "music::p3",
+    "space::p1",
+    "space::p2",
+    "space::p3",
+    "finance::p1",
+    "finance::p2",
+    "finance::p3",
+    "cooking::p1",
+    "cooking::p2",
+    "cooking::p3"
+  ],
+  "python": "3.12.3",
+  "target_path": "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md",
+  "torch": "2.11.0+cu130",
+  "transformers": "4.57.6"
+}
\ No newline at end of file
diff --git a/reports/agent_memory_cross_domain_heatmap.md b/reports/agent_memory_cross_domain_heatmap.md
new file mode 100644
index 0000000..a0a26ac
--- /dev/null
+++ b/reports/agent_memory_cross_domain_heatmap.md
@@ -0,0 +1,267 @@
+# AgentMemorySystem 跨域污染热图报告
+
+## 1. 执行环境
+
+- Python: 3.12.3
+- Torch: 2.11.0+cu130
+- Transformers: 4.57.6
+- Model: gpt2
+- 总耗时: `75.82s`
+
+## 2. 说明
+
+该报告通过黑盒方式同时写入多个领域语料，再对各领域多个 prompt 变体进行生成，
+统计 continuation 中命中的各领域关键词数量。
+
+热图符号说明：
+
+- `0`: 无命中
+- `1`: 低污染/低命中
+- `2`: 中等
+- `4`: 高
+
+## 3. 放大版关键词命中计数矩阵
+
+| prompt variant\\keyword | music | space | finance | cooking |
+|---|---|---|---|---|
+| music::p1 | 2 (2) | 1 (1) | 0 (0) | 0 (0) |
+| music::p2 | 3 (2) | 0 (0) | 0 (0) | 0 (0) |
+| music::p3 | 3 (2) | 1 (1) | 0 (0) | 0 (0) |
+| space::p1 | 2 (2) | 3 (2) | 0 (0) | 0 (0) |
+| space::p2 | 0 (0) | 2 (2) | 0 (0) | 0 (0) |
+| space::p3 | 2 (2) | 2 (2) | 0 (0) | 0 (0) |
+| finance::p1 | 2 (2) | 1 (1) | 1 (1) | 0 (0) |
+| finance::p2 | 0 (0) | 0 (0) | 1 (1) | 0 (0) |
+| finance::p3 | 0 (0) | 0 (0) | 0 (0) | 0 (0) |
+| cooking::p1 | 2 (2) | 0 (0) | 0 (0) | 1 (1) |
+| cooking::p2 | 3 (2) | 0 (0) | 0 (0) | 1 (1) |
+| cooking::p3 | 0 (0) | 0 (0) | 0 (0) | 0 (0) |
+
+## 4. 按领域汇总的 own-domain vs foreign-domain
+
+| prompt domain | own hits | foreign hits | foreign/own ratio | verdict |
+|---|---:|---:|---:|---|
+| music | 8 | 2 | 0.25 | mixed |
+| space | 7 | 4 | 0.57 | mixed |
+| finance | 2 | 3 | 1.50 | high-contamination |
+| cooking | 2 | 5 | 2.50 | high-contamination |
+
+## 5. 各 prompt 变体生成样例
+
+### music::p1
+
+**Prompt**: `The piano performance`
+
+**Output**:
+
+```text
+The piano performance musical music the mission of increased, and reduce a- or in this reduced to be more that is
+```
+
+**命中详情**:
+
+- music: ['music', 'musical']
+- space: ['mission']
+- finance: []
+- cooking: []
+
+### music::p2
+
+**Prompt**: `The violin recital`
+
+**Output**:
+
+```text
+The violin recital violin musical music the team, a- and increased in this is an all of that's.
+
+```
+
+**命中详情**:
+
+- music: ['music', 'musical', 'violin']
+- space: []
+- finance: []
+- cooking: []
+
+### music::p3
+
+**Prompt**: `The music theory lesson`
+
+**Output**:
+
+```text
+The music theory lesson theory musical mission of the " and increased., a-
+ in this is an I am more
+```
+
+**命中详情**:
+
+- music: ['music', 'musical', 'theory']
+- space: ['mission']
+- finance: []
+- cooking: []
+
+### space::p1
+
+**Prompt**: `The space telescope`
+
+**Output**:
+
+```text
+The space telescope planets orbit around musical- the team, and mission of increased to reduce a. in that's
+
+```
+
+**命中详情**:
+
+- music: ['music', 'musical']
+- space: ['mission', 'orbit', 'planet']
+- finance: []
+- cooking: []
+
+### space::p2
+
+**Prompt**: `The Mars mission`
+
+**Output**:
+
+```text
+The Mars mission mission missions planets. Mission Missions mission
+ reduce team of the world, and increased to a- or
+```
+
+**命中详情**:
+
+- music: []
+- space: ['mission', 'planet']
+- finance: []
+- cooking: []
+
+### space::p3
+
+**Prompt**: `The orbital research team`
+
+**Output**:
+
+```text
+The orbital research team mission team musical. increased the planets and reduce, a- or in this is an all of that
+```
+
+**命中详情**:
+
+- music: ['music', 'musical']
+- space: ['mission', 'planet']
+- finance: []
+- cooking: []
+
+### finance::p1
+
+**Prompt**: `The market outlook`
+
+**Output**:
+
+```text
+The market outlook musical market the mission of increased a, and reduce in that is an-
+. to be all
+```
+
+**命中详情**:
+
+- music: ['music', 'musical']
+- space: ['mission']
+- finance: ['market']
+- cooking: []
+
+### finance::p2
+
+**Prompt**: `The portfolio manager`
+
+**Output**:
+
+```text
+The portfolio manager increased portfolio management team the and reduced a, or that's reduce stock-
+. in this is
+```
+
+**命中详情**:
+
+- music: []
+- space: []
+- finance: ['portfolio']
+- cooking: []
+
+### finance::p3
+
+**Prompt**: `The quarterly earnings call`
+
+**Output**:
+
+```text
+The quarterly earnings call increased reduce the team and reduced-, a decreased in this lowered its
+. of an all that
+```
+
+**命中详情**:
+
+- music: []
+- space: []
+- finance: []
+- cooking: []
+
+### cooking::p1
+
+**Prompt**: `The chef prepared`
+
+**Output**:
+
+```text
+The chef prepared culinary chef of increased pastry, musical the team and reduce a.
+- or that's in this
+```
+
+**命中详情**:
+
+- music: ['music', 'musical']
+- space: []
+- finance: []
+- cooking: ['chef']
+
+### cooking::p2
+
+**Prompt**: `The pasta course`
+
+**Output**:
+
+```text
+The pasta course pasta practice musical, the team of a and increased by an- or in that is to be all
+```
+
+**命中详情**:
+
+- music: ['music', 'musical', 'practice']
+- space: []
+- finance: []
+- cooking: ['pasta']
+
+### cooking::p3
+
+**Prompt**: `The dessert service`
+
+**Output**:
+
+```text
+The dessert service service services team and increased the information, a new- is an in that's.
+ of it
+```
+
+**命中详情**:
+
+- music: []
+- space: []
+- finance: []
+- cooking: []
+
+## 6. 结论
+
+如果 foreign hits 在多个 prompt 上持续显著非零，则说明系统存在跨域污染。
+如果 own hits 明显高于 foreign hits，则说明仍保留一定的领域接地能力。
diff --git a/reports/generate_cross_domain_contamination_heatmap.py b/reports/generate_cross_domain_contamination_heatmap.py
new file mode 100644
index 0000000..8977711
--- /dev/null
+++ b/reports/generate_cross_domain_contamination_heatmap.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""Generate a black-box cross-domain contamination heatmap report.
+
+The script uses only the public runtime behavior of the uploaded
+AgentMemorySystem implementation:
+
+- MemLLM.load()
+- MemLLM.write()
+- MemLLM.generate()
+
+It writes all domain corpora into a single model instance, probes each domain
+prompt, counts keyword hits by domain, and emits both JSON and Markdown
+artifacts for heatmap-style inspection.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import platform
+import time
+from importlib.machinery import SourceFileLoader
+
+import torch
+import transformers
+
+
+TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"  # executed as Python source by load_target_module()
+MODEL_NAME = "gpt2"  # name handed to the model's load() in build_model()
+DEFAULT_SEED = 42  # torch RNG seed applied in build_model()
+JSON_OUT = "/workspace/reports/agent_memory_cross_domain_heatmap.json"  # JSON payload written by main()
+MD_OUT = "/workspace/reports/agent_memory_cross_domain_heatmap.md"  # Markdown rendering of the same payload
+
+PROMPTS = {  # domain -> probe prompts; main() labels them "<domain>::p<n>" in list order
+    "music": [
+        "The piano performance",
+        "The violin recital",
+        "The music theory lesson",
+    ],
+    "space": [
+        "The space telescope",
+        "The Mars mission",
+        "The orbital research team",
+    ],
+    "finance": [
+        "The market outlook",
+        "The portfolio manager",
+        "The quarterly earnings call",
+    ],
+    "cooking": [
+        "The chef prepared",
+        "The pasta course",
+        "The dessert service",
+    ],
+}
+
+CORPORA = {  # domain -> sentences main() writes into the model (via stable_write) before probing
+    "music": [
+        "He practiced piano for hours perfecting a difficult Chopin nocturne.",
+        "She studied music theory and harmonic progression at the conservatory.",
+        "The orchestra rehearsed the symphony before the evening concert.",
+    ],
+    "space": [
+        "Astronauts trained for the Mars mission in simulated zero gravity.",
+        "The telescope revealed distant galaxies beyond the Milky Way.",
+        "Mission control tracked the spacecraft during orbital insertion.",
+    ],
+    "finance": [
+        "Investors monitored inflation data before the central bank meeting.",
+        "The portfolio manager reduced exposure to volatile growth stocks.",
+        "Quarterly earnings guidance shifted sentiment across the market.",
+    ],
+    "cooking": [
+        "The chef reduced the sauce slowly before plating the duck.",
+        "Fresh basil and olive oil brightened the pasta at the finish.",
+        "The pastry team tempered chocolate for the dessert service.",
+    ],
+}
+
+KEYWORDS = {  # domain -> lowercase terms; keyword_hits() matches them as substrings of the continuation
+    "music": {
+        "music",
+        "musical",
+        "violin",
+        "concert",
+        "symphony",
+        "guitar",
+        "practice",
+        "practicing",
+        "piano",
+        "theory",
+    },
+    "space": {
+        "space",
+        "telescope",
+        "galax",
+        "orbit",
+        "orbital",
+        "mars",
+        "mission",
+        "astronaut",
+        "spacecraft",
+        "planet",
+    },
+    "finance": {
+        "market",
+        "stocks",
+        "portfolio",
+        "inflation",
+        "bank",
+        "earnings",
+        "investor",
+        "trading",
+        "equity",
+        "sentiment",
+    },
+    "cooking": {
+        "chef",
+        "sauce",
+        "pasta",
+        "dessert",
+        "olive",
+        "basil",
+        "plating",
+        "chocolate",
+        "kitchen",
+        "roasted",
+    },
+}
+
+HEATMAP_SCALE = [  # NOTE(review): never referenced in this script; heat_symbol() has its own thresholds and emits no "3"
+    (0, "0", "none"),
+    (1, "1", "low"),
+    (2, "2", "moderate"),
+    (3, "3", "moderate"),
+    (4, "4", "high"),
+]
+
+
+def ensure(condition: bool, message: str) -> None:  # guard helper: raises AssertionError(message) when condition is falsy
+    if not condition:
+        raise AssertionError(message)
+
+
+def load_target_module(path: str):
+    """Load the file at ``path`` with SourceFileLoader, execute it, and return the module object."""
+    file_loader = SourceFileLoader("agent_memory_heatmap", path)
+    if (module_spec := importlib.util.spec_from_loader(file_loader.name, file_loader)) is None:
+        raise RuntimeError(f"Unable to create import spec for {path}")
+    loaded = importlib.util.module_from_spec(module_spec)
+    file_loader.exec_module(loaded)
+    return loaded
+
+
+def build_model(module):  # returns module.MemLLM(module.Cfg()) after load(MODEL_NAME)
+    torch.manual_seed(DEFAULT_SEED)  # seeded before the model is constructed
+    model = module.MemLLM(module.Cfg())
+    model.load(MODEL_NAME)
+    return model
+
+
+def keyword_hits(text: str, keywords: set[str]) -> list[str]:
+    """Return the sorted keywords that occur as substrings of the lower-cased ``text``."""
+    return sorted(filter(text.lower().__contains__, keywords))  # substring test: "music" also hits inside "musical"
+
+
+def continuation(prompt: str, output: str) -> str:  # text after the prompt; ensure() raises if output lacks that prefix
+    ensure(output.startswith(prompt), f"Output does not preserve prompt prefix: {output!r}")
+    return output[len(prompt) :]
+
+
+def stable_write(model, texts: list[str]) -> list[float]:  # one write per text; returns the gate values in write order
+    collected: list[float] = []
+    for entry in texts:
+        n_stored, entry_gates = model.write(entry, training_mode=True)
+        ensure(n_stored == 1, f"Expected one stored memory for training_mode=True, got {n_stored}")
+        ensure(len(entry_gates) == 1, f"Expected one gate value, got {entry_gates}")
+        collected += entry_gates
+    return collected
+
+
+def heat_symbol(count: int) -> str:
+    """Bucket a hit count into a legend symbol: <=0 -> "0", 1 -> "1", 2-3 -> "2", larger -> "4"."""
+    if count <= 0:
+        return "0"
+    if count == 1:
+        return "1"
+    # Counts of 2 and 3 share one symbol; anything above 3 saturates at "4".
+    return "2" if count <= 3 else "4"
+
+
+def build_markdown(payload: dict) -> str:
+    """Render the heatmap payload assembled by ``main()`` as a Markdown report.
+
+    Reads ``matrix``, ``outputs``, ``own_vs_foreign``, ``prompt_rows`` and the
+    environment fields (``python``, ``torch``, ``transformers``, ``model_name``,
+    ``duration_s``) from ``payload``. Column order follows ``PROMPTS``.
+    Returns the whole report as a single newline-joined string.
+    """
+    domains = list(PROMPTS)
+    matrix = payload["matrix"]
+    outputs = payload["outputs"]
+    own_foreign = payload["own_vs_foreign"]
+    prompt_rows = payload["prompt_rows"]
+
+    lines = [
+        "# AgentMemorySystem 跨域污染热图报告",
+        "",
+        "## 1. 执行环境",
+        "",
+        f"- Python: {payload['python']}",
+        f"- Torch: {payload['torch']}",
+        f"- Transformers: {payload['transformers']}",
+        f"- Model: {payload['model_name']}",
+        f"- 总耗时: `{payload['duration_s']:.2f}s`",
+        "",
+        "## 2. 说明",
+        "",
+        "该报告通过黑盒方式同时写入多个领域语料，再对各领域多个 prompt 变体进行生成，",
+        "统计 continuation 中命中的各领域关键词数量。",
+        "",
+        "热图符号说明：",
+        "",
+        "- `0`: 无命中",
+        "- `1`: 低污染/低命中",
+        "- `2`: 中等",
+        "- `4`: 高",
+        "",
+        "## 3. 放大版关键词命中计数矩阵",
+        "",
+    ]
+
+    header = "| prompt variant\\\\keyword | " + " | ".join(domains) + " |"
+    sep = "|" + "---|" * (len(domains) + 1)
+    lines.extend([header, sep])
+    for row_id in prompt_rows:
+        row_cells = [row_id]
+        for keyword_domain in domains:
+            # Each cell shows the raw hit count followed by its bucketed legend symbol.
+            count = matrix[row_id][keyword_domain]["count"]
+            row_cells.append(f"{count} ({heat_symbol(count)})")
+        lines.append("| " + " | ".join(row_cells) + " |")
+
+    lines += [
+        "",
+        "## 4. 按领域汇总的 own-domain vs foreign-domain",
+        "",
+        "| prompt domain | own hits | foreign hits | foreign/own ratio | verdict |",
+        "|---|---:|---:|---:|---|",
+    ]
+    for domain in domains:
+        summary = own_foreign[domain]
+        ratio = summary["foreign_to_own_ratio"]
+        # A None ratio (main() stores it when own hits are zero) is displayed as "inf".
+        ratio_text = "inf" if ratio is None else f"{ratio:.2f}"
+        lines.append(
+            f"| {domain} | {summary['own_hits_count']} | {summary['foreign_hits_count']} | "
+            f"{ratio_text} | {summary['verdict']} |"
+        )
+
+    lines.extend(["", "## 5. 各 prompt 变体生成样例", ""])
+    for row_id in prompt_rows:
+        lines += [
+            f"### {row_id}",
+            "",
+            f"**Prompt**: `{outputs[row_id]['prompt']}`",
+            "",
+            "**Output**:",
+            "",
+            "```text",
+            outputs[row_id]["output"],  # inserted verbatim; embedded newlines stay inside the fence
+            "```",
+            "",
+            "**命中详情**:",
+            "",
+        ]
+        for keyword_domain in domains:
+            lines.append(f"- {keyword_domain}: {matrix[row_id][keyword_domain]['hits']}")
+        lines.append("")
+
+    lines += [
+        "## 6. 结论",
+        "",
+        "如果 foreign hits 在多个 prompt 上持续显著非零，则说明系统存在跨域污染。",
+        "如果 own hits 明显高于 foreign hits，则说明仍保留一定的领域接地能力。",
+        "",
+    ]
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """Write every corpus into one model, probe each prompt, and emit the JSON/Markdown reports.
+
+    Returns 0 on success. A failed ``ensure`` check propagates as AssertionError
+    before any report file is written.
+    """
+    ensure(os.path.exists(TARGET_PATH), f"Target file does not exist: {TARGET_PATH}")
+    started = time.perf_counter()
+    module = load_target_module(TARGET_PATH)
+    model = build_model(module)
+    domains = list(PROMPTS)
+    gates: list[float] = []
+    for domain in domains:
+        gates.extend(stable_write(model, CORPORA[domain]))
+
+    # Probe phase: one greedy generation per prompt, scored against every domain's keywords.
+    matrix: dict[str, dict[str, dict[str, object]]] = {}
+    outputs: dict[str, dict[str, str]] = {}
+    prompt_rows: list[str] = []
+    own_totals = dict.fromkeys(domains, 0)
+    foreign_totals = dict.fromkeys(domains, 0)
+    for prompt_domain, prompts in PROMPTS.items():
+        for idx, prompt in enumerate(prompts, start=1):
+            row_id = f"{prompt_domain}::p{idx}"
+            prompt_rows.append(row_id)
+            output = model.generate(prompt, mt=20, greedy=True)
+            cont = continuation(prompt, output)  # score only the generated text, not the prompt
+            outputs[row_id] = {"prompt": prompt, "output": output}
+            row = {}
+            for keyword_domain in domains:
+                hits = keyword_hits(cont, KEYWORDS[keyword_domain])
+                row[keyword_domain] = {"hits": hits, "count": len(hits)}
+                if keyword_domain == prompt_domain:
+                    own_totals[prompt_domain] += len(hits)
+                else:
+                    foreign_totals[prompt_domain] += len(hits)
+            matrix[row_id] = row
+
+    # Per-domain verdicts from the totals accumulated over that domain's prompts.
+    own_vs_foreign: dict[str, dict[str, object]] = {}
+    for domain in domains:
+        own_count, foreign_count = own_totals[domain], foreign_totals[domain]
+        # NOTE(review): a domain with zero hits of any kind also lands in this first branch.
+        if own_count == 0 or foreign_count >= own_count:
+            verdict = "high-contamination"
+        elif foreign_count > 0:
+            verdict = "mixed"
+        else:
+            verdict = "clean"
+        own_vs_foreign[domain] = {
+            "own_hits_count": own_count,
+            "foreign_hits_count": foreign_count,
+            # None when there are no own-domain hits, since the ratio is undefined.
+            "foreign_to_own_ratio": foreign_count / own_count if own_count else None,
+            "verdict": verdict,
+        }
+
+    duration = time.perf_counter() - started
+    payload = {
+        "target_path": TARGET_PATH,
+        "model_name": MODEL_NAME,
+        "python": platform.python_version(),
+        "torch": torch.__version__,
+        "transformers": transformers.__version__,
+        "duration_s": duration,
+        "avg_gate": sum(gates) / len(gates) if gates else None,
+        "prompt_rows": prompt_rows,
+        "matrix": matrix,
+        "outputs": outputs,
+        "own_vs_foreign": own_vs_foreign,
+    }
+
+    # Persist the same payload twice: raw JSON first, then its Markdown rendering.
+    with open(JSON_OUT, "w", encoding="utf-8") as handle:
+        json.dump(payload, handle, indent=2, ensure_ascii=False, sort_keys=True)
+    with open(MD_OUT, "w", encoding="utf-8") as handle:
+        handle.write(build_markdown(payload))
+
+    print("Cross-domain contamination heatmap generated.")
+    print(f"JSON: {JSON_OUT}")
+    print(f"Markdown: {MD_OUT}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/reports/repro_transformers5_generate_failure.py b/reports/repro_transformers5_generate_failure.py
new file mode 100644
index 0000000..515b110
--- /dev/null
+++ b/reports/repro_transformers5_generate_failure.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+"""Minimal black-box reproducer for the transformers 5.x generate failure."""
+
+from __future__ import annotations
+
+import importlib.util
+from importlib.machinery import SourceFileLoader
+
+import torch
+import transformers
+
+
+TARGET_PATH = "/home/ubuntu/.cursor/projects/workspace/uploads/AgentMemorySystem.md"  # executed as Python source by load_target_module()
+
+
+def load_target_module(path: str):
+    """Load the file at ``path`` with SourceFileLoader, execute it, and return the module object."""
+    file_loader = SourceFileLoader("agent_memory_system_repro", path)
+    if (module_spec := importlib.util.spec_from_loader(file_loader.name, file_loader)) is None:
+        raise RuntimeError(f"Unable to create import spec for {path}")
+    loaded = importlib.util.module_from_spec(module_spec)
+    file_loader.exec_module(loaded)
+    return loaded
+
+
+def main() -> int:
+    """Print library versions, then load the target and run a single greedy ``generate`` call."""
+    print(f"torch={torch.__version__}", f"transformers={transformers.__version__}", sep="\n")
+
+    target = load_target_module(TARGET_PATH)
+    torch.manual_seed(42)
+    mem_llm = target.MemLLM(target.Cfg())
+    mem_llm.load("gpt2")
+    print(mem_llm.generate("Hello", mt=15, greedy=True))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status