Changes from all commits (42 commits)
66a0e8b
Implemented dataset with a summariser to connect expert-layman terms …
TPGCIG Oct 10, 2025
8c5ba34
added training data and rouge values to README.md
TPGCIG Oct 10, 2025
9e42f22
Train working for seq2seq on the flan-t5 model. Currently lacks sched…
TPGCIG Oct 13, 2025
cf9bc45
Implemented tokenizer and pretrained flan-t5 from HF in load_base_model.
TPGCIG Oct 13, 2025
cd5a99c
Implemented basic predict.py with ability to use weights to parse jar…
TPGCIG Oct 13, 2025
74ff2a3
Implemented text generation helper fully, finalised modules for now.
TPGCIG Oct 15, 2025
a8cffee
Finished prediction to support single and multi-line json file inputs.
TPGCIG Oct 15, 2025
fc0117a
Implemented a chatbot functionality for testing the predict.py and we…
TPGCIG Oct 15, 2025
1c6f8e3
added gitignore
TPGCIG Oct 15, 2025
6b9515b
Edited README to add title
TPGCIG Oct 16, 2025
6c25f29
README: added headings for topic but yet to fill.
TPGCIG Oct 16, 2025
1062c94
work on README. added overview, motivation, features and codebase lay…
TPGCIG Oct 16, 2025
3b713ee
added requirements.txt for training model.
TPGCIG Oct 16, 2025
b57e1f7
uploaded icon image for readme
TPGCIG Oct 17, 2025
592e0e0
moved brain image
TPGCIG Oct 17, 2025
3dfc029
Remove duplicate image from Project folder
TPGCIG Oct 17, 2025
9f22ed2
renamed image file braint5.png
TPGCIG Oct 17, 2025
3792b1c
Update README with project details and logo
TPGCIG Oct 17, 2025
db5d0a2
Added training usage to README
TPGCIG Oct 19, 2025
3c177a0
fixed small typo in README
TPGCIG Oct 19, 2025
4e35368
Added table of contents and touched up README
TPGCIG Oct 19, 2025
3f03b7d
added sample usage to README.md
TPGCIG Oct 20, 2025
3e30894
added images for T5 explanation
TPGCIG Oct 20, 2025
a2286c1
added lots of detail to README
TPGCIG Oct 20, 2025
5c106cf
removed a command line argument from train.py which was unnecessary
TPGCIG Oct 20, 2025
cf72462
Merge branch 'topic-recognition' of github.com:TPGCIG/PatternAnalysis…
TPGCIG Oct 20, 2025
db25202
added command line arg option to choose which weights you want to use.
TPGCIG Nov 2, 2025
b04e1fb
robustness changes and editing tool usage a bit as some functions hav…
TPGCIG Nov 2, 2025
d7e9192
truncating a few parameter options that are unintuitive and arent cri…
TPGCIG Nov 2, 2025
d5fbddf
small changes to align functionality of database tools with new train…
TPGCIG Nov 2, 2025
3edc346
added training data and graphs for README.md
TPGCIG Nov 2, 2025
54f254d
added dataset section to README
TPGCIG Nov 2, 2025
9c218ff
added explanation for ROUGE scores and removed table of contents sinc…
TPGCIG Nov 2, 2025
bd9a93b
fixed some unnecessary comments to improve clarity
TPGCIG Nov 2, 2025
aec136f
testing with another computer, fixed requirements and installation st…
TPGCIG Nov 2, 2025
239dbfb
added instructions on how to use chat.py
TPGCIG Nov 2, 2025
7b3ea6b
added comment headers to each file to clarify the functionality
TPGCIG Nov 2, 2025
7f4bd7d
small edits discussing data split functionalities
TPGCIG Nov 2, 2025
4363c5d
added information on how to use venv in installation
TPGCIG Nov 2, 2025
61bef03
added comments around codebase to improve clarity to different functi…
TPGCIG Nov 2, 2025
032ce58
slight edits to remove unnecessary command line args from README and …
TPGCIG Nov 2, 2025
22171a7
typo in train.py, added in justification for parameters in README
TPGCIG Nov 2, 2025
2 changes: 2 additions & 0 deletions recognition/Project13-TristanGreen/.gitignore
@@ -0,0 +1,2 @@
/runs
/__pycache__
369 changes: 369 additions & 0 deletions recognition/Project13-TristanGreen/README.md

Large diffs are not rendered by default.

(7 files in this diff could not be rendered in the viewer.)
58 changes: 58 additions & 0 deletions recognition/Project13-TristanGreen/chat.py
@@ -0,0 +1,58 @@
"""
------------------------------------------------------------
Interactive CLI for Brain-T5 (Chat Mode)
-----------------------------------------------------------
Description:
Lightweight interface for real-time summarization queries.
    Runs an inference loop over the fine-tuned LoRA FLAN-T5 model.

Usage:
$ python chat.py --model_dir runs/flan_t5_base_lora_biolaysumm

Notes:
- Press Enter to re-prompt; type 'exit' or 'quit' to stop.
------------------------------------------------------------
"""
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import argparse

p = argparse.ArgumentParser()

p.add_argument("--model_dir", required=True)

# --- config ---
ADAPTER_DIR = p.parse_args().model_dir
BASE_MODEL = "google/flan-t5-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PREFIX = "summarize: "
MAX_INPUT_LEN = 1024
MAX_NEW_TOKENS = 256
NUM_BEAMS = 4


# --- load model ---
print("Loading model...")
tok = AutoTokenizer.from_pretrained(ADAPTER_DIR)
base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
model = PeftModel.from_pretrained(base, ADAPTER_DIR).to(DEVICE).eval()
print("Ready.")

# --- chat loop ---
while True:
    user = input("\n🧠 You: ").strip()
    if not user:
        continue
    if user.lower() in {"exit", "quit", "q"}:
        print("Bye.")
        break

    enc = tok(PREFIX + user, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN).to(DEVICE)
    with torch.inference_mode():
        out = model.generate(**enc,
                             max_new_tokens=MAX_NEW_TOKENS,
                             num_beams=NUM_BEAMS,
                             no_repeat_ngram_size=3,
                             early_stopping=True)
    print("\n🤖 Model:", tok.decode(out[0], skip_special_tokens=True))
130 changes: 130 additions & 0 deletions recognition/Project13-TristanGreen/dataset.py
@@ -0,0 +1,130 @@
"""
------------------------------------------------------------
Dataset Loader and Preprocessing for Brain-T5
-----------------------------------------------------------
Description:
Handles dataset intake and preprocessing for FLAN-T5 fine-tuning.
Supports Hugging Face (BioLaySumm) datasets, CSV, or JSONL inputs.

Key Components:
- make_datasets(): loads and tokenizes splits (train/val/test).
- Seq2SeqCollatorFast: dynamic padding & label masking for T5.

Notes:
- Automatically prefixes "summarize: " to each input.
    - Pads dynamically to the longest sequence in each batch (inputs truncated to the configured max lengths).
- Masks <pad> tokens in labels with -100 for CrossEntropyLoss.
------------------------------------------------------------
"""
from __future__ import annotations
from typing import Optional, List, Dict
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.nn.utils.rnn import pad_sequence

DATASET_ID = "BioLaySumm/BioLaySumm2025-LaymanRRG-opensource-track"
INPUT_COL = "radiology_report"
TARGET_COL = "layman_report"

# Collator: batch pad inputs/labels and map pad tokens in labels to -100 (ignored by CE loss).
# pad_to_multiple_of lets you round sequence lengths (e.g., to 8/16/32) for Tensor Core efficiency.
class Seq2SeqCollatorFast:
    def __init__(self, tokenizer, label_pad_token_id=-100, pad_to_multiple_of=None):
        self.tok = tokenizer
        self.label_pad_token_id = label_pad_token_id
        self.pad_to_multiple_of = pad_to_multiple_of

    def _maybe_pad_to_multiple(self, tensor, pad_value):
        if self.pad_to_multiple_of is None:
            return tensor
        L = tensor.size(1)
        if L % self.pad_to_multiple_of == 0:
            return tensor
        add = self.pad_to_multiple_of - (L % self.pad_to_multiple_of)
        return torch.nn.functional.pad(tensor, (0, add), value=pad_value)

    def __call__(self, feats: List[Dict[str, torch.Tensor]]):
        # IMPORTANT: convert tokenizer pad tokens in labels to -100 so loss ignores padded positions.
        ids = [f["input_ids"] if isinstance(f["input_ids"], torch.Tensor) else torch.tensor(f["input_ids"]) for f in feats]
        am = [f["attention_mask"] if isinstance(f["attention_mask"], torch.Tensor) else torch.tensor(f["attention_mask"]) for f in feats]
        labs = [f["labels"] if isinstance(f["labels"], torch.Tensor) else torch.tensor(f["labels"]) for f in feats]

        pad_id = self.tok.pad_token_id
        ids = pad_sequence(ids, batch_first=True, padding_value=pad_id)
        am = pad_sequence(am, batch_first=True, padding_value=0)
        labs = pad_sequence(labs, batch_first=True, padding_value=pad_id)
        labs = labs.masked_fill(labs.eq(pad_id), self.label_pad_token_id)

        ids = self._maybe_pad_to_multiple(ids, pad_id)
        am = self._maybe_pad_to_multiple(am, 0)
        labs = self._maybe_pad_to_multiple(labs, self.label_pad_token_id)
        return {"input_ids": ids, "attention_mask": am, "labels": labs}
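
# Hypothetical usage sketch (not called anywhere in the project): builds a toy
# batch by hand and runs it through Seq2SeqCollatorFast to show the dynamic
# padding and -100 label masking described above. The example strings are made
# up; real features come from make_datasets() below.
def _demo_collator():
    tok = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=True)
    src = ["summarize: no acute cardiopulmonary findings", "summarize: mild cardiomegaly"]
    tgt = ["Nothing serious was found.", "The heart looks slightly enlarged."]
    feats = [
        {"input_ids": tok(s)["input_ids"],
         "attention_mask": tok(s)["attention_mask"],
         "labels": tok(text_target=t)["input_ids"]}
        for s, t in zip(src, tgt)
    ]
    batch = Seq2SeqCollatorFast(tok)(feats)
    # Both sequences are padded to the longest one in the batch; padded label
    # positions hold -100, so CrossEntropyLoss will skip them.
    print(batch["input_ids"].shape, batch["labels"])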

# Build tokenizer + HF datasets with optional self-split (80/10/10).
# Ensures input instruction prefix and truncation to max lengths.
def make_datasets(
    tokenizer_name: str = "google/flan-t5-base",
    train_split: str = "train",
    val_split: Optional[str] = "validation",
    test_split: Optional[str] = "test",
    max_input_len: int = 1024,
    max_target_len: int = 256,
    prefix_text: str = "summarize: ",
    *,
    self_split: bool = False,
    self_split_seed: int = 1337,
    self_split_val: float = 0.1,
    self_split_test: float = 0.1,
):

    tok = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    ds = load_dataset(DATASET_ID)

    from datasets import DatasetDict

    # Optionally re-split the original train set into train/val/test (default 80/10/10).
    if self_split:
        base = ds["train"].train_test_split(test_size=self_split_test, seed=self_split_seed)
        train_part = base["train"]
        test_part = base["test"]
        # Carve the validation share out of what remains after the test split.
        vt = train_part.train_test_split(
            test_size=self_split_val / (1.0 - self_split_test), seed=self_split_seed)
        ds = DatasetDict({
            "train": vt["train"],
            "validation": vt["test"],
            "test": test_part,
        })

    # Validate required columns exist
    for split in [s for s in [train_split, val_split, test_split] if s and s in ds]:
        cols = ds[split].column_names
        if INPUT_COL not in cols or TARGET_COL not in cols:
            raise KeyError(f"Expected columns '{INPUT_COL}', '{TARGET_COL}' in split '{split}', found {cols}")

    # Vectorize one batch: add instruction prefix, tokenize src/tgt independently, attach 'labels'.
    def encode_batch(batch):
        srcs = [prefix_text + s for s in batch[INPUT_COL]]
        enc = tok(srcs, max_length=max_input_len, truncation=True)
        tgt = tok(text_target=batch[TARGET_COL], max_length=max_target_len, truncation=True)
        enc["labels"] = tgt["input_ids"]
        return enc

    remove_cols = ds[train_split].column_names
    train_proc = ds[train_split].map(encode_batch, batched=True, remove_columns=remove_cols, desc="Tokenizing train")

    val_proc = None
    if val_split and val_split in ds:
        remove_cols_val = ds[val_split].column_names
        val_proc = ds[val_split].map(encode_batch, batched=True, remove_columns=remove_cols_val, desc="Tokenizing val")

    test_proc = None
    if test_split and test_split in ds:
        remove_cols_test = ds[test_split].column_names
        test_proc = ds[test_split].map(encode_batch, batched=True, remove_columns=remove_cols_test, desc="Tokenizing test")

    collator = Seq2SeqCollatorFast(tok, label_pad_token_id=-100, pad_to_multiple_of=None)
    return tok, train_proc, val_proc, test_proc, collator
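
# Hypothetical end-to-end sketch (not called anywhere in the project): wires
# make_datasets() into a PyTorch DataLoader with the returned collator. The
# batch size is arbitrary; train.py may use different settings.
def _demo_make_datasets():
    from torch.utils.data import DataLoader
    tok, train_proc, val_proc, test_proc, collator = make_datasets(self_split=True)
    loader = DataLoader(train_proc, batch_size=8, shuffle=True, collate_fn=collator)
    batch = next(iter(loader))
    # Each tensor is (batch, seq_len), with seq_len set by the longest item.
    print({k: tuple(v.shape) for k, v in batch.items()})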
108 changes: 108 additions & 0 deletions recognition/Project13-TristanGreen/modules.py
@@ -0,0 +1,108 @@
"""
------------------------------------------------------------
Model Utilities for Brain-T5
-----------------------------------------------------------
Description:
Provides helper functions for loading base models and attaching
LoRA adapters to target layers of FLAN-T5.

Key Functions:
- load_base_model(): loads pretrained T5/FLAN-T5 with dtype control.
- attach_lora(): injects trainable low-rank adapters for fine-tuning.

Notes:
- Uses PEFT (Parameter-Efficient Fine-Tuning) via Hugging Face.
- Keeps original model frozen except LoRA-injected parameters.
------------------------------------------------------------
"""
from __future__ import annotations
from typing import Optional, Dict, Any, List

import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

try:
    from peft import LoraConfig, get_peft_model, PeftModel
    PEFT_AVAILABLE = True
except Exception:
    PEFT_AVAILABLE = False

# Use fast tokenizer; default pad_token from eos_token if missing (required by T5 decoding).
def get_tokenizer(name: str = "google/flan-t5-base"):
    tok = AutoTokenizer.from_pretrained(name, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

# Load FLAN-T5 with dtype/device_map options.
# Ensure decoder_start_token_id is set so generation starts from a valid token.
def load_base_model(
    name: str = "google/flan-t5-base",
    dtype: Optional[torch.dtype] = torch.float16,
    device_map: Optional[str] = None,
):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        name,
        dtype=dtype,
        device_map=device_map,
    )
    if getattr(model.config, "decoder_start_token_id", None) is None:
        model.config.decoder_start_token_id = model.config.pad_token_id
    return model
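
# Hypothetical usage sketch (not called anywhere in the project): typical dtype
# choices for load_base_model(). float32 is the safe CPU default; float16 saves
# memory on a CUDA GPU. device_map="auto" additionally requires `accelerate`.
def _demo_load_base_model():
    if torch.cuda.is_available():
        return load_base_model(dtype=torch.float16, device_map="auto")
    return load_base_model(dtype=torch.float32)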

# Inject LoRA on attention projections (q/k/v/o). Bias=none keeps adapter minimal.
# r/alpha/dropout control rank, scaling, and regularization of the adapters.
def attach_lora(model, r: int = 8, alpha: int = 16, dropout: float = 0.05, target_modules: Optional[List[str]] = None):
    if not PEFT_AVAILABLE:
        raise RuntimeError("peft not installed. `pip install peft` to use LoRA.")
    if target_modules is None:
        target_modules = ["q", "k", "v", "o"]
    cfg = LoraConfig(
        r=r, lora_alpha=alpha, lora_dropout=dropout,
        target_modules=target_modules, bias="none", task_type="SEQ_2_SEQ_LM",
    )
    return get_peft_model(model, cfg)
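
# Hypothetical usage sketch (not called anywhere in the project): attaches LoRA
# adapters with the defaults above and reports how few parameters are trainable.
def _demo_attach_lora():
    base = load_base_model(dtype=torch.float32)
    lora_model = attach_lora(base, r=8, alpha=16, dropout=0.05)
    lora_model.print_trainable_parameters()  # prints trainable vs. total params
    return lora_model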

# Convenience generation wrapper (batched):
# - Applies optional "summarize: " prefix.
# - Pads/truncates, moves to device, decodes without special tokens.
# - Beam search defaults tuned for readability over speed.
@torch.no_grad()
def generate(
    model,
    tokenizer,
    inputs: List[str],
    max_input_len: int = 1024,
    max_new_tokens: int = 256,
    num_beams: int = 4,
    no_repeat_ngram_size: int = 3,
    length_penalty: float = 1.0,
    add_prefix: bool = True,
    prefix_text: str = "summarize: ",
    device: Optional[str] = None,
) -> List[str]:
    model.eval()
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    batch = [(prefix_text + x) if add_prefix else x for x in inputs]
    enc = tokenizer(
        batch,
        max_length=max_input_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    out = model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        early_stopping=True,
    )
    return tokenizer.batch_decode(out, skip_special_tokens=True)
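
# Hypothetical usage sketch (not called anywhere in the project): runs the
# generate() helper on the frozen base model with a made-up radiology report.
# For real predictions, the LoRA adapter would be loaded first (see predict.py).
def _demo_generate():
    tok = get_tokenizer()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_base_model(dtype=torch.float32).to(device)
    reports = ["The cardiomediastinal silhouette is within normal limits. No focal consolidation."]
    print(generate(model, tok, reports, max_new_tokens=128, num_beams=4, device=device))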