1,120 changes: 1,120 additions & 0 deletions TRAINING_GUIDE_KO.md

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions configs/models/en-ko-qe/approach1_referenceless_scratch.yaml
@@ -0,0 +1,100 @@
# ============================================================
# Approach 1: ReferencelessRegression, trained from scratch
# ============================================================
# Trains a reference-free QE model from scratch with an XLM-RoBERTa-large
# encoder and a newly initialized feed-forward regression head on top.
#
# Architecture:
#   - Encoder: XLM-RoBERTa-large (frozen, then unfrozen)
#   - Input: src + mt (each encoded separately)
#   - Features: [mt_emb, src_emb, mt*src, |mt-src|] (4 * 1024 = 4096 dim)
#   - Head: 4096 -> 2048 -> 1024 -> 1
#
# Required GPUs: 1-2x A100 (80GB) or 2-4x V100 (32GB)
# Expected VRAM: ~20-30GB (batch_size=16, fp32)
#
# Run:
#   comet-train --cfg configs/models/en-ko-qe/approach1_referenceless_scratch.yaml
# ============================================================

referenceless_regression_metric:
  class_path: comet.models.ReferencelessRegression
  init_args:
    # --- Encoder ---
    encoder_model: XLM-RoBERTa
    pretrained_model: xlm-roberta-large   # 560M params

    # --- Freezing strategy ---
    nr_frozen_epochs: 0.3                 # keep the encoder frozen for the first 30% of epoch 1
    keep_embeddings_frozen: True          # keep the embedding layer frozen throughout (saves memory)

    # --- Optimizer ---
    optimizer: AdamW
    encoder_learning_rate: 1.0e-06        # encoder learning rate (very small)
    learning_rate: 1.5e-05                # head learning rate
    layerwise_decay: 0.95                 # learning rate shrinks toward lower layers
    warmup_steps: 0

    # --- Layer settings ---
    pool: avg                             # average pooling
    layer: mix                            # weighted sum over all layers
    layer_transformation: sparsemax       # sparse layer weights (only a few layers stay active)
    layer_norm: False

    # --- Loss ---
    loss: mse                             # mean squared error

    # --- Regression head ---
    hidden_sizes:
      - 2048
      - 1024
    activations: Tanh
    dropout: 0.1

    # --- Data ---
    batch_size: 16
    train_data:
      - data/en-ko-qe/train.csv
    validation_data:
      - data/en-ko-qe/val.csv

# --- Trainer settings ---
trainer:
  class_path: pytorch_lightning.trainer.trainer.Trainer
  init_args:
    accelerator: gpu
    devices: 1                            # number of GPUs (adjust to your environment)
    # strategy: ddp                       # uncomment for multi-GPU training
    accumulate_grad_batches: 4            # effective batch size: 16 * 4 = 64
    max_epochs: 5
    min_epochs: 1
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    check_val_every_n_epoch: 1
    log_every_n_steps: 100
    enable_progress_bar: true
    enable_model_summary: true
    num_sanity_val_steps: 3
    deterministic: false

# --- Early stopping ---
early_stopping:
  class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
  init_args:
    monitor: val_kendall                  # monitor the Kendall tau correlation
    min_delta: 0.0
    patience: 2                           # stop after 2 epochs without improvement
    mode: max
    verbose: False

# --- Model checkpoint ---
model_checkpoint:
  class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
  init_args:
    filename: '{epoch}-{step}-{val_kendall:.4f}'
    monitor: val_kendall
    save_top_k: 3
    mode: max
    save_weights_only: True
    every_n_epochs: 1
    verbose: True
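The two CSV files referenced above (train.csv / val.csv) follow COMET's referenceless data layout: one row per segment with `src`, `mt`, and a sentence-level `score` column. A minimal sketch of producing such a file with pandas; the rows below are made-up placeholders, not real annotations:

```python
# Minimal sketch: write a COMET-style QE training CSV (columns: src, mt, score).
# The example rows and scores are placeholders for illustration only.
from pathlib import Path
import pandas as pd

rows = [
    {"src": "The rotor is fixed to the shaft.",
     "mt": "로터는 샤프트에 고정된다.",
     "score": 0.9},
    {"src": "A method for manufacturing a semiconductor device.",
     "mt": "반도체 소자를 제조하는 방법.",
     "score": 0.8},
]

Path("data/en-ko-qe").mkdir(parents=True, exist_ok=True)
pd.DataFrame(rows).to_csv("data/en-ko-qe/train.csv", index=False)  # same path as train_data above
```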
109 changes: 109 additions & 0 deletions configs/models/en-ko-qe/approach2_unified_qe_scratch.yaml
@@ -0,0 +1,109 @@
# ============================================================
# Approach 2: UnifiedMetric in QE mode, trained from scratch
# ============================================================
# Uses the same UnifiedMetric architecture as COMETKiwi, but sets
# input_segments: [mt, src] so the model trains in reference-free mode.
#
# Architecture:
#   - Encoder: InfoXLM-large (an XLM-R variant, used by COMETKiwi)
#   - Input: [mt SEP src] (concatenated into a single sequence)
#   - Features: CLS token embedding (1024 dim)
#   - Head: 1024 -> 3072 -> 1024 -> 1
#
# Differences from ReferencelessRegression:
#   - src and mt are joined into one sequence (cross-attention between them)
#   - uses the CLS token instead of average pooling
#   - same structure as COMETKiwi (easy to switch to fine-tuning later)
#
# Required GPUs: 1-2x A100 (80GB) or 2-4x V100 (32GB)
#
# Run:
#   comet-train --cfg configs/models/en-ko-qe/approach2_unified_qe_scratch.yaml
# ============================================================

unified_metric:
  class_path: comet.models.UnifiedMetric
  init_args:
    # --- Encoder ---
    encoder_model: XLM-RoBERTa
    pretrained_model: microsoft/infoxlm-large   # same encoder as COMETKiwi

    # --- Freezing strategy ---
    nr_frozen_epochs: 0.3
    keep_embeddings_frozen: True

    # --- Optimizer ---
    optimizer: AdamW
    encoder_learning_rate: 1.0e-06
    learning_rate: 1.5e-05
    layerwise_decay: 0.95
    warmup_steps: 0

    # --- Layer settings ---
    sent_layer: mix                       # sentence level: weighted sum over all layers
    layer_transformation: sparsemax
    layer_norm: True
    word_layer: 24                        # word level (unused here, but must be set)

    # --- Loss ---
    loss: mse

    # --- Regression head ---
    hidden_sizes:
      - 3072
      - 1024
    activations: Tanh
    dropout: 0.1

    # --- QE settings (the key part!) ---
    input_segments:                       # reference-free: use mt + src only
      - mt
      - src
    word_level_training: False            # disable word-level training

    # --- Data ---
    batch_size: 16
    train_data:
      - data/en-ko-qe/train.csv
    validation_data:
      - data/en-ko-qe/val.csv

# --- Trainer settings ---
trainer:
  class_path: pytorch_lightning.trainer.trainer.Trainer
  init_args:
    accelerator: gpu
    devices: 1
    accumulate_grad_batches: 4
    max_epochs: 5
    min_epochs: 1
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    check_val_every_n_epoch: 1
    log_every_n_steps: 100
    enable_progress_bar: true
    enable_model_summary: true
    num_sanity_val_steps: 3
    deterministic: false

# --- Early stopping ---
early_stopping:
  class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
  init_args:
    monitor: val_kendall
    min_delta: 0.0
    patience: 2
    mode: max
    verbose: False

# --- Model checkpoint ---
model_checkpoint:
  class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
  init_args:
    filename: '{epoch}-{step}-{val_kendall:.4f}'
    monitor: val_kendall
    save_top_k: 3
    mode: max
    save_weights_only: True
    every_n_epochs: 1
    verbose: True
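To make the architectural difference between approaches 1 and 2 concrete, here is a minimal torch-only sketch. Random tensors stand in for the encoder outputs and the variable names are illustrative, not COMET internals: approach 1 pools src and mt separately and concatenates four 1024-dim views into a 4096-dim feature, while approach 2 encodes the joined [mt SEP src] sequence and feeds the 1024-dim CLS embedding straight to the head.

```python
# Minimal sketch (assumes torch); random embeddings stand in for the encoder.
import torch

hidden = 1024                        # XLM-R-large / InfoXLM-large hidden size

# Approach 1 (ReferencelessRegression): encode src and mt separately,
# average-pool each, then combine into a 4 * 1024 = 4096-dim feature.
mt_emb = torch.randn(1, hidden)      # stand-in for the pooled mt encoding
src_emb = torch.randn(1, hidden)     # stand-in for the pooled src encoding
features_a1 = torch.cat(
    [mt_emb, src_emb, mt_emb * src_emb, (mt_emb - src_emb).abs()], dim=-1)
print(features_a1.shape)             # torch.Size([1, 4096]) -> head 4096 -> 2048 -> 1024 -> 1

# Approach 2 (UnifiedMetric, QE mode): mt and src are joined into one
# sequence, and the CLS embedding of that sequence feeds the head.
cls_emb = torch.randn(1, hidden)     # stand-in for the CLS token of the joint sequence
print(cls_emb.shape)                 # torch.Size([1, 1024]) -> head 1024 -> 3072 -> 1024 -> 1
```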
114 changes: 114 additions & 0 deletions configs/models/en-ko-qe/approach3_finetune_cometkiwi.yaml
@@ -0,0 +1,114 @@
# ============================================================
# Approach 3: Fine-tuning from COMETKiwi (recommended)
# ============================================================
# Loads the pretrained COMETKiwi (wmt22-cometkiwi-da) checkpoint and
# continues training on English-Korean patent-domain data.
#
# Advantages of this approach:
#   - starts from weights already optimized for QE
#   - reuses the multilingual knowledge from the WMT training data
#   - adapts to the domain with only a few epochs
#   - highest expected performance
#
# Caveats:
#   - replace the checkpoint path with your actual path
#   - raise nr_frozen_epochs to prevent catastrophic forgetting
#   - keep the learning rate low
#
# Required GPUs: 1-2x A100 (80GB) or 2-4x V100 (32GB)
#
# Run (two steps):
#   1. Download the checkpoint first:
#      python scripts/download_checkpoint.py --model Unbabel/wmt22-cometkiwi-da
#
#   2. Start training:
#      comet-train --cfg configs/models/en-ko-qe/approach3_finetune_cometkiwi.yaml \
#                  --load_from_checkpoint checkpoints/wmt22-cometkiwi-da/checkpoints/model.ckpt
# ============================================================

unified_metric:
  class_path: comet.models.UnifiedMetric
  init_args:
    # --- Encoder (identical to the original COMETKiwi) ---
    encoder_model: XLM-RoBERTa
    pretrained_model: microsoft/infoxlm-large

    # --- Freezing strategy (adjusted for fine-tuning) ---
    nr_frozen_epochs: 0.5                 # freeze longer (guards against forgetting)
    keep_embeddings_frozen: True

    # --- Optimizer (lower learning rates for fine-tuning) ---
    optimizer: AdamW
    encoder_learning_rate: 5.0e-07        # encoder: half the original value
    learning_rate: 1.0e-05                # head: lower than the original
    layerwise_decay: 0.95
    warmup_steps: 100                     # add warmup for stability

    # --- Layer settings ---
    sent_layer: mix
    layer_transformation: sparsemax
    layer_norm: True
    word_layer: 24

    # --- Loss ---
    loss: mse

    # --- Regression head ---
    hidden_sizes:
      - 3072
      - 1024
    activations: Tanh
    dropout: 0.1

    # --- QE settings ---
    input_segments:
      - mt
      - src
    word_level_training: False

    # --- Data ---
    batch_size: 16
    train_data:
      - data/en-ko-qe/train.csv
    validation_data:
      - data/en-ko-qe/val.csv

# --- Trainer settings (fine-tuning) ---
trainer:
  class_path: pytorch_lightning.trainer.trainer.Trainer
  init_args:
    accelerator: gpu
    devices: 1
    accumulate_grad_batches: 4
    max_epochs: 3                         # few epochs, since this is fine-tuning
    min_epochs: 1
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    check_val_every_n_epoch: 1
    log_every_n_steps: 100
    enable_progress_bar: true
    enable_model_summary: true
    num_sanity_val_steps: 3
    deterministic: false

# --- Early stopping ---
early_stopping:
  class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
  init_args:
    monitor: val_kendall
    min_delta: 0.0
    patience: 2
    mode: max
    verbose: False

# --- Model checkpoint ---
model_checkpoint:
  class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
  init_args:
    filename: 'cometkiwi-ft-{epoch}-{step}-{val_kendall:.4f}'
    monitor: val_kendall
    save_top_k: 3
    mode: max
    save_weights_only: True
    every_n_epochs: 1
    verbose: True
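Before fine-tuning, it is worth scoring a few pairs with the unmodified wmt22-cometkiwi-da checkpoint to establish a baseline for the domain-adapted model. A minimal sketch using the public comet Python API (`download_model`, `load_from_checkpoint`, `predict`); note that wmt22-cometkiwi-da is a gated model, so a Hugging Face login/token is assumed, and the example pair is a placeholder:

```python
# Minimal sketch: baseline QE scores from the pretrained COMETKiwi checkpoint.
from comet import download_model, load_from_checkpoint

ckpt_path = download_model("Unbabel/wmt22-cometkiwi-da")  # step 1 in the header above
model = load_from_checkpoint(ckpt_path)

data = [
    {"src": "The invention relates to a battery pack.",
     "mt": "본 발명은 배터리 팩에 관한 것이다."},
]
out = model.predict(data, batch_size=8, gpus=1)
print(out.scores)        # per-segment QE scores, roughly in [0, 1]
print(out.system_score)  # corpus-level average
```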