Changes from all commits (42 commits)
66a0e8b
Implemented dataset with a summariser to connect expert-layman terms …
TPGCIG Oct 10, 2025
8c5ba34
added training data and rouge values to README.md
TPGCIG Oct 10, 2025
9e42f22
Train working for seq2seq on the flan-t5 model. Currently lacks sched…
TPGCIG Oct 13, 2025
cf9bc45
Implemented tokenizer and pretrained flan-t5 from HF in load_base_model.
TPGCIG Oct 13, 2025
cd5a99c
Implemented basic predict.py with ability to use weights to parse jar…
TPGCIG Oct 13, 2025
74ff2a3
Implemented text generation helper fully, finalised modules for now.
TPGCIG Oct 15, 2025
a8cffee
Finished prediction to support single and multi-line json file inputs.
TPGCIG Oct 15, 2025
fc0117a
Implemented a chatbot functionality for testing the predict.py and we…
TPGCIG Oct 15, 2025
1c6f8e3
added gitignore
TPGCIG Oct 15, 2025
6b9515b
Edited README to add title
TPGCIG Oct 16, 2025
6c25f29
README: added headings for topic but yet to fill.
TPGCIG Oct 16, 2025
1062c94
work on README. added overview, motivation, features and codebase lay…
TPGCIG Oct 16, 2025
3b713ee
added requirements.txt for training model.
TPGCIG Oct 16, 2025
b57e1f7
uploaded icon image for readme
TPGCIG Oct 17, 2025
592e0e0
moved brain image
TPGCIG Oct 17, 2025
3dfc029
Remove duplicate image from Project folder
TPGCIG Oct 17, 2025
9f22ed2
renamed image file braint5.png
TPGCIG Oct 17, 2025
3792b1c
Update README with project details and logo
TPGCIG Oct 17, 2025
db5d0a2
Added training usage to README
TPGCIG Oct 19, 2025
3c177a0
fixed small typo in README
TPGCIG Oct 19, 2025
4e35368
Added table of contents and touched up README
TPGCIG Oct 19, 2025
3f03b7d
added sample usage to README.md
TPGCIG Oct 20, 2025
3e30894
added images for T5 explanation
TPGCIG Oct 20, 2025
a2286c1
added lots of detail to README
TPGCIG Oct 20, 2025
5c106cf
removed a command line argument from train.py which was unnecessary
TPGCIG Oct 20, 2025
cf72462
Merge branch 'topic-recognition' of github.com:TPGCIG/PatternAnalysis…
TPGCIG Oct 20, 2025
db25202
added command line arg option to choose which weights you want to use.
TPGCIG Nov 2, 2025
b04e1fb
robustness changes and editing tool usage a bit as some functions hav…
TPGCIG Nov 2, 2025
d7e9192
truncating a few parameter options that are unintuitive and arent cri…
TPGCIG Nov 2, 2025
d5fbddf
small changes to align functionality of database tools with new train…
TPGCIG Nov 2, 2025
3edc346
added training data and graphs for README.md
TPGCIG Nov 2, 2025
54f254d
added dataset section to README
TPGCIG Nov 2, 2025
9c218ff
added explanation for ROUGE scores and removed table of contents sinc…
TPGCIG Nov 2, 2025
bd9a93b
fixed some unnecessary comments to improve clarity
TPGCIG Nov 2, 2025
aec136f
testing with another computer, fixed requirements and installation st…
TPGCIG Nov 2, 2025
239dbfb
added instructions on how to use chat.py
TPGCIG Nov 2, 2025
7b3ea6b
added comment headers to each file to clarify the functionality
TPGCIG Nov 2, 2025
7f4bd7d
small edits discussing data split functionalities
TPGCIG Nov 2, 2025
4363c5d
added information on how to use venv in installation
TPGCIG Nov 2, 2025
61bef03
added comments around codebase to improve clarity to different functi…
TPGCIG Nov 2, 2025
032ce58
slight edits to remove unnecessary command line args from README and …
TPGCIG Nov 2, 2025
22171a7
typo in train.py, added in justification for parameters in README
TPGCIG Nov 2, 2025
2 changes: 2 additions & 0 deletions recognition/Project13-TristanGreen/.gitignore
@@ -0,0 +1,2 @@
/runs
/__pycache__
369 changes: 369 additions & 0 deletions recognition/Project13-TristanGreen/README.md

Large diffs are not rendered by default.

(7 files in this diff could not be rendered in the viewer.)
58 changes: 58 additions & 0 deletions recognition/Project13-TristanGreen/chat.py
@@ -0,0 +1,58 @@
"""
------------------------------------------------------------
Interactive CLI for Brain-T5 (Chat Mode)
-----------------------------------------------------------
Description:
Lightweight interface for real-time summarization queries.
    Runs an inference loop over the fine-tuned LoRA FLAN-T5 model.

Usage:
$ python chat.py --model_dir runs/flan_t5_base_lora_biolaysumm

Notes:
- Press Enter to re-prompt; type 'exit' or 'quit' to stop.
------------------------------------------------------------
"""
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import argparse

p = argparse.ArgumentParser()

p.add_argument("--model_dir", required=True)

# --- config ---
ADAPTER_DIR = p.parse_args().model_dir
BASE_MODEL = "google/flan-t5-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PREFIX = "summarize: "
MAX_INPUT_LEN = 1024
MAX_NEW_TOKENS = 256
NUM_BEAMS = 4


# --- load model ---
print("Loading model...")
tok = AutoTokenizer.from_pretrained(ADAPTER_DIR)
base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
model = PeftModel.from_pretrained(base, ADAPTER_DIR).to(DEVICE).eval()
print("Ready.")

# --- chat loop ---
while True:
    user = input("\n🧠 You: ").strip()
    if not user:
        continue
    if user.lower() in {"exit", "quit", "q"}:
        print("Bye.")
        break

    enc = tok(PREFIX + user, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN).to(DEVICE)
    with torch.inference_mode():
        out = model.generate(**enc,
                             max_new_tokens=MAX_NEW_TOKENS,
                             num_beams=NUM_BEAMS,
                             no_repeat_ngram_size=3,
                             early_stopping=True)
    print("\n🤖 Model:", tok.decode(out[0], skip_special_tokens=True))
130 changes: 130 additions & 0 deletions recognition/Project13-TristanGreen/dataset.py
@@ -0,0 +1,130 @@
"""
------------------------------------------------------------
Dataset Loader and Preprocessing for Brain-T5
-----------------------------------------------------------
Description:
Handles dataset intake and preprocessing for FLAN-T5 fine-tuning.
Supports Hugging Face (BioLaySumm) datasets, CSV, or JSONL inputs.

Key Components:
- make_datasets(): loads and tokenizes splits (train/val/test).
- Seq2SeqCollatorFast: dynamic padding & label masking for T5.

Notes:
- Automatically prefixes "summarize: " to each input.
    - Pads dynamically to the longest sequence in each batch (inputs truncated to the configured max lengths).
- Masks <pad> tokens in labels with -100 for CrossEntropyLoss.
------------------------------------------------------------
"""
from __future__ import annotations
from typing import Optional, List, Dict
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.nn.utils.rnn import pad_sequence

DATASET_ID = "BioLaySumm/BioLaySumm2025-LaymanRRG-opensource-track"
INPUT_COL = "radiology_report"
TARGET_COL = "layman_report"

# Collator: batch pad inputs/labels and map pad tokens in labels to -100 (ignored by CE loss).
# pad_to_multiple_of lets you round sequence lengths (e.g., to 8/16/32) for Tensor Core efficiency.
class Seq2SeqCollatorFast:
    def __init__(self, tokenizer, label_pad_token_id=-100, pad_to_multiple_of=None):
        self.tok = tokenizer
        self.label_pad_token_id = label_pad_token_id
        self.pad_to_multiple_of = pad_to_multiple_of

    def _maybe_pad_to_multiple(self, tensor, pad_value):
        if self.pad_to_multiple_of is None:
            return tensor
        L = tensor.size(1)
        if L % self.pad_to_multiple_of == 0:
            return tensor
        add = self.pad_to_multiple_of - (L % self.pad_to_multiple_of)
        return torch.nn.functional.pad(tensor, (0, add), value=pad_value)

    def __call__(self, feats: List[Dict[str, torch.Tensor]]):
        # IMPORTANT: convert tokenizer pad tokens in labels to -100 so loss ignores padded positions.
        ids = [f["input_ids"] if isinstance(f["input_ids"], torch.Tensor) else torch.tensor(f["input_ids"]) for f in feats]
        am = [f["attention_mask"] if isinstance(f["attention_mask"], torch.Tensor) else torch.tensor(f["attention_mask"]) for f in feats]
        labs = [f["labels"] if isinstance(f["labels"], torch.Tensor) else torch.tensor(f["labels"]) for f in feats]

        pad_id = self.tok.pad_token_id
        ids = pad_sequence(ids, batch_first=True, padding_value=pad_id)
        am = pad_sequence(am, batch_first=True, padding_value=0)
        labs = pad_sequence(labs, batch_first=True, padding_value=pad_id)
        labs = labs.masked_fill(labs.eq(pad_id), self.label_pad_token_id)

        ids = self._maybe_pad_to_multiple(ids, pad_id)
        am = self._maybe_pad_to_multiple(am, 0)
        labs = self._maybe_pad_to_multiple(labs, self.label_pad_token_id)
        return {"input_ids": ids, "attention_mask": am, "labels": labs}
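
# Hypothetical usage sketch (not called anywhere in the project): builds a toy
# batch by hand and runs it through Seq2SeqCollatorFast to show the dynamic
# padding and -100 label masking described above. The example strings are made
# up; real features come from make_datasets() below.
def _demo_collator():
    tok = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=True)
    src = ["summarize: no acute cardiopulmonary findings", "summarize: mild cardiomegaly"]
    tgt = ["Nothing serious was found.", "The heart looks slightly enlarged."]
    feats = [
        {"input_ids": tok(s)["input_ids"],
         "attention_mask": tok(s)["attention_mask"],
         "labels": tok(text_target=t)["input_ids"]}
        for s, t in zip(src, tgt)
    ]
    batch = Seq2SeqCollatorFast(tok)(feats)
    # Both sequences are padded to the longest one in the batch; padded label
    # positions hold -100, so CrossEntropyLoss will skip them.
    print(batch["input_ids"].shape, batch["labels"])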

# Build tokenizer + HF datasets with optional self-split (80/10/10).
# Ensures input instruction prefix and truncation to max lengths.
def make_datasets(
    tokenizer_name: str = "google/flan-t5-base",
    train_split: str = "train",
    val_split: Optional[str] = "validation",
    test_split: Optional[str] = "test",
    max_input_len: int = 1024,
    max_target_len: int = 256,
    prefix_text: str = "summarize: ",
    *,
    self_split: bool = False,
    self_split_seed: int = 1337,
    self_split_val: float = 0.1,
    self_split_test: float = 0.1,
):

    tok = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    ds = load_dataset(DATASET_ID)

    from datasets import DatasetDict

    # Optionally re-split the original train set into train/val/test (default 80/10/10).
    if self_split:
        base = ds["train"].train_test_split(test_size=self_split_test, seed=self_split_seed)
        train_part = base["train"]
        test_part = base["test"]
        # Carve the validation share out of what remains after the test split.
        vt = train_part.train_test_split(
            test_size=self_split_val / (1.0 - self_split_test), seed=self_split_seed)
        ds = DatasetDict({
            "train": vt["train"],
            "validation": vt["test"],
            "test": test_part,
        })

    # Validate required columns exist
    for split in [s for s in [train_split, val_split, test_split] if s and s in ds]:
        cols = ds[split].column_names
        if INPUT_COL not in cols or TARGET_COL not in cols:
            raise KeyError(f"Expected columns '{INPUT_COL}', '{TARGET_COL}' in split '{split}', found {cols}")

    # Vectorize one batch: add instruction prefix, tokenize src/tgt independently, attach 'labels'.
    def encode_batch(batch):
        srcs = [prefix_text + s for s in batch[INPUT_COL]]
        enc = tok(srcs, max_length=max_input_len, truncation=True)
        tgt = tok(text_target=batch[TARGET_COL], max_length=max_target_len, truncation=True)
        enc["labels"] = tgt["input_ids"]
        return enc

    remove_cols = ds[train_split].column_names
    train_proc = ds[train_split].map(encode_batch, batched=True, remove_columns=remove_cols, desc="Tokenizing train")

    val_proc = None
    if val_split and val_split in ds:
        remove_cols_val = ds[val_split].column_names
        val_proc = ds[val_split].map(encode_batch, batched=True, remove_columns=remove_cols_val, desc="Tokenizing val")

    test_proc = None
    if test_split and test_split in ds:
        remove_cols_test = ds[test_split].column_names
        test_proc = ds[test_split].map(encode_batch, batched=True, remove_columns=remove_cols_test, desc="Tokenizing test")

    collator = Seq2SeqCollatorFast(tok, label_pad_token_id=-100, pad_to_multiple_of=None)
    return tok, train_proc, val_proc, test_proc, collator
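
# Hypothetical end-to-end sketch (not called anywhere in the project): wires
# make_datasets() into a PyTorch DataLoader with the returned collator. The
# batch size is arbitrary; train.py may use different settings.
def _demo_make_datasets():
    from torch.utils.data import DataLoader
    tok, train_proc, val_proc, test_proc, collator = make_datasets(self_split=True)
    loader = DataLoader(train_proc, batch_size=8, shuffle=True, collate_fn=collator)
    batch = next(iter(loader))
    # Each tensor is (batch, seq_len), with seq_len set by the longest item.
    print({k: tuple(v.shape) for k, v in batch.items()})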
108 changes: 108 additions & 0 deletions recognition/Project13-TristanGreen/modules.py
@@ -0,0 +1,108 @@
"""
------------------------------------------------------------
Model Utilities for Brain-T5
-----------------------------------------------------------
Description:
Provides helper functions for loading base models and attaching
LoRA adapters to target layers of FLAN-T5.

Key Functions:
- load_base_model(): loads pretrained T5/FLAN-T5 with dtype control.
- attach_lora(): injects trainable low-rank adapters for fine-tuning.

Notes:
- Uses PEFT (Parameter-Efficient Fine-Tuning) via Hugging Face.
- Keeps original model frozen except LoRA-injected parameters.
------------------------------------------------------------
"""
from __future__ import annotations
from typing import Optional, Dict, Any, List

import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

try:
    from peft import LoraConfig, get_peft_model, PeftModel
    PEFT_AVAILABLE = True
except Exception:
    PEFT_AVAILABLE = False

# Use fast tokenizer; default pad_token from eos_token if missing (required by T5 decoding).
def get_tokenizer(name: str = "google/flan-t5-base"):
    tok = AutoTokenizer.from_pretrained(name, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

# Load FLAN-T5 with dtype/device_map options.
# Ensure decoder_start_token_id is set so generation starts from a valid token.
def load_base_model(
    name: str = "google/flan-t5-base",
    dtype: Optional[torch.dtype] = torch.float16,
    device_map: Optional[str] = None,
):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        name,
        dtype=dtype,
        device_map=device_map,
    )
    if getattr(model.config, "decoder_start_token_id", None) is None:
        model.config.decoder_start_token_id = model.config.pad_token_id
    return model
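
# Hypothetical usage sketch (not called anywhere in the project): typical dtype
# choices for load_base_model(). float32 is the safe CPU default; float16 saves
# memory on a CUDA GPU. device_map="auto" additionally requires `accelerate`.
def _demo_load_base_model():
    if torch.cuda.is_available():
        return load_base_model(dtype=torch.float16, device_map="auto")
    return load_base_model(dtype=torch.float32)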

# Inject LoRA on attention projections (q/k/v/o). Bias=none keeps adapter minimal.
# r/alpha/dropout control rank, scaling, and regularization of the adapters.
def attach_lora(model, r: int = 8, alpha: int = 16, dropout: float = 0.05, target_modules: Optional[List[str]] = None):
    if not PEFT_AVAILABLE:
        raise RuntimeError("peft not installed. `pip install peft` to use LoRA.")
    if target_modules is None:
        target_modules = ["q", "k", "v", "o"]
    cfg = LoraConfig(
        r=r, lora_alpha=alpha, lora_dropout=dropout,
        target_modules=target_modules, bias="none", task_type="SEQ_2_SEQ_LM",
    )
    return get_peft_model(model, cfg)
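
# Hypothetical usage sketch (not called anywhere in the project): attaches LoRA
# adapters with the defaults above and reports how few parameters are trainable.
def _demo_attach_lora():
    base = load_base_model(dtype=torch.float32)
    lora_model = attach_lora(base, r=8, alpha=16, dropout=0.05)
    lora_model.print_trainable_parameters()  # prints trainable vs. total params
    return lora_model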

# Convenience generation wrapper (batched):
# - Applies optional "summarize: " prefix.
# - Pads/truncates, moves to device, decodes without special tokens.
# - Beam search defaults tuned for readability over speed.
@torch.no_grad()
def generate(
    model,
    tokenizer,
    inputs: List[str],
    max_input_len: int = 1024,
    max_new_tokens: int = 256,
    num_beams: int = 4,
    no_repeat_ngram_size: int = 3,
    length_penalty: float = 1.0,
    add_prefix: bool = True,
    prefix_text: str = "summarize: ",
    device: Optional[str] = None,
) -> List[str]:
    model.eval()
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    batch = [(prefix_text + x) if add_prefix else x for x in inputs]
    enc = tokenizer(
        batch,
        max_length=max_input_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    out = model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        early_stopping=True,
    )
    return tokenizer.batch_decode(out, skip_special_tokens=True)
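
# Hypothetical usage sketch (not called anywhere in the project): runs the
# generate() helper on the frozen base model with a made-up radiology report.
# For real predictions, the LoRA adapter would be loaded first (see predict.py).
def _demo_generate():
    tok = get_tokenizer()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_base_model(dtype=torch.float32).to(device)
    reports = ["The cardiomediastinal silhouette is within normal limits. No focal consolidation."]
    print(generate(model, tok, reports, max_new_tokens=128, num_beams=4, device=device))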