1,120 changes: 1,120 additions & 0 deletions TRAINING_GUIDE_KO.md

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions configs/models/en-ko-qe/approach1_referenceless_scratch.yaml
@@ -0,0 +1,100 @@
# ============================================================
# Approach 1: ReferencelessRegression, trained from scratch
# ============================================================
# Trains a reference-free QE model from scratch with an XLM-RoBERTa-large
# encoder and a newly initialized feed-forward regression head on top.
#
# Architecture:
#   - Encoder: XLM-RoBERTa-large (frozen, then unfrozen)
#   - Input: src + mt (each encoded separately)
#   - Features: [mt_emb, src_emb, mt*src, |mt-src|] (4 * 1024 = 4096 dim)
#   - Head: 4096 -> 2048 -> 1024 -> 1
#
# Required GPUs: 1-2x A100 (80GB) or 2-4x V100 (32GB)
# Expected VRAM: ~20-30GB (batch_size=16, fp32)
#
# Run:
#   comet-train --cfg configs/models/en-ko-qe/approach1_referenceless_scratch.yaml
# ============================================================

referenceless_regression_metric:
  class_path: comet.models.ReferencelessRegression
  init_args:
    # --- Encoder ---
    encoder_model: XLM-RoBERTa
    pretrained_model: xlm-roberta-large   # 560M params

    # --- Freezing strategy ---
    nr_frozen_epochs: 0.3                 # keep the encoder frozen for the first 30% of epoch 1
    keep_embeddings_frozen: True          # keep the embedding layer frozen throughout (saves memory)

    # --- Optimizer ---
    optimizer: AdamW
    encoder_learning_rate: 1.0e-06        # encoder learning rate (very small)
    learning_rate: 1.5e-05                # head learning rate
    layerwise_decay: 0.95                 # learning rate shrinks toward lower layers
    warmup_steps: 0

    # --- Layer settings ---
    pool: avg                             # average pooling
    layer: mix                            # weighted sum over all layers
    layer_transformation: sparsemax       # sparse layer weights (only a few layers stay active)
    layer_norm: False

    # --- Loss ---
    loss: mse                             # mean squared error

    # --- Regression head ---
    hidden_sizes:
      - 2048
      - 1024
    activations: Tanh
    dropout: 0.1

    # --- Data ---
    batch_size: 16
    train_data:
      - data/en-ko-qe/train.csv
    validation_data:
      - data/en-ko-qe/val.csv

# --- Trainer settings ---
trainer:
  class_path: pytorch_lightning.trainer.trainer.Trainer
  init_args:
    accelerator: gpu
    devices: 1                            # number of GPUs (adjust to your environment)
    # strategy: ddp                       # uncomment for multi-GPU training
    accumulate_grad_batches: 4            # effective batch size: 16 * 4 = 64
    max_epochs: 5
    min_epochs: 1
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    check_val_every_n_epoch: 1
    log_every_n_steps: 100
    enable_progress_bar: true
    enable_model_summary: true
    num_sanity_val_steps: 3
    deterministic: false

# --- Early stopping ---
early_stopping:
  class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
  init_args:
    monitor: val_kendall                  # monitor the Kendall tau correlation
    min_delta: 0.0
    patience: 2                           # stop after 2 epochs without improvement
    mode: max
    verbose: False

# --- Model checkpoint ---
model_checkpoint:
  class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
  init_args:
    filename: '{epoch}-{step}-{val_kendall:.4f}'
    monitor: val_kendall
    save_top_k: 3
    mode: max
    save_weights_only: True
    every_n_epochs: 1
    verbose: True
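The two CSV files referenced above (train.csv / val.csv) follow COMET's referenceless data layout: one row per segment with `src`, `mt`, and a sentence-level `score` column. A minimal sketch of producing such a file with pandas; the rows below are made-up placeholders, not real annotations:

```python
# Minimal sketch: write a COMET-style QE training CSV (columns: src, mt, score).
# The example rows and scores are placeholders for illustration only.
from pathlib import Path
import pandas as pd

rows = [
    {"src": "The rotor is fixed to the shaft.",
     "mt": "로터는 샤프트에 고정된다.",
     "score": 0.9},
    {"src": "A method for manufacturing a semiconductor device.",
     "mt": "반도체 소자를 제조하는 방법.",
     "score": 0.8},
]

Path("data/en-ko-qe").mkdir(parents=True, exist_ok=True)
pd.DataFrame(rows).to_csv("data/en-ko-qe/train.csv", index=False)  # same path as train_data above
```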
109 changes: 109 additions & 0 deletions configs/models/en-ko-qe/approach2_unified_qe_scratch.yaml
@@ -0,0 +1,109 @@
# ============================================================
# Approach 2: UnifiedMetric in QE mode, trained from scratch
# ============================================================
# Uses the same UnifiedMetric architecture as COMETKiwi, but sets
# input_segments: [mt, src] so the model trains in reference-free mode.
#
# Architecture:
#   - Encoder: InfoXLM-large (an XLM-R variant, used by COMETKiwi)
#   - Input: [mt SEP src] (concatenated into a single sequence)
#   - Features: CLS token embedding (1024 dim)
#   - Head: 1024 -> 3072 -> 1024 -> 1
#
# Differences from ReferencelessRegression:
#   - src and mt are joined into one sequence (cross-attention between them)
#   - uses the CLS token instead of average pooling
#   - same structure as COMETKiwi (easy to switch to fine-tuning later)
#
# Required GPUs: 1-2x A100 (80GB) or 2-4x V100 (32GB)
#
# Run:
#   comet-train --cfg configs/models/en-ko-qe/approach2_unified_qe_scratch.yaml
# ============================================================

unified_metric:
  class_path: comet.models.UnifiedMetric
  init_args:
    # --- Encoder ---
    encoder_model: XLM-RoBERTa
    pretrained_model: microsoft/infoxlm-large   # same encoder as COMETKiwi

    # --- Freezing strategy ---
    nr_frozen_epochs: 0.3
    keep_embeddings_frozen: True

    # --- Optimizer ---
    optimizer: AdamW
    encoder_learning_rate: 1.0e-06
    learning_rate: 1.5e-05
    layerwise_decay: 0.95
    warmup_steps: 0

    # --- Layer settings ---
    sent_layer: mix                       # sentence level: weighted sum over all layers
    layer_transformation: sparsemax
    layer_norm: True
    word_layer: 24                        # word level (unused here, but must be set)

    # --- Loss ---
    loss: mse

    # --- Regression head ---
    hidden_sizes:
      - 3072
      - 1024
    activations: Tanh
    dropout: 0.1

    # --- QE settings (the key part!) ---
    input_segments:                       # reference-free: use mt + src only
      - mt
      - src
    word_level_training: False            # disable word-level training

    # --- Data ---
    batch_size: 16
    train_data:
      - data/en-ko-qe/train.csv
    validation_data:
      - data/en-ko-qe/val.csv

# --- Trainer settings ---
trainer:
  class_path: pytorch_lightning.trainer.trainer.Trainer
  init_args:
    accelerator: gpu
    devices: 1
    accumulate_grad_batches: 4
    max_epochs: 5
    min_epochs: 1
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    check_val_every_n_epoch: 1
    log_every_n_steps: 100
    enable_progress_bar: true
    enable_model_summary: true
    num_sanity_val_steps: 3
    deterministic: false

# --- Early stopping ---
early_stopping:
  class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
  init_args:
    monitor: val_kendall
    min_delta: 0.0
    patience: 2
    mode: max
    verbose: False

# --- Model checkpoint ---
model_checkpoint:
  class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
  init_args:
    filename: '{epoch}-{step}-{val_kendall:.4f}'
    monitor: val_kendall
    save_top_k: 3
    mode: max
    save_weights_only: True
    every_n_epochs: 1
    verbose: True
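To make the architectural difference between approaches 1 and 2 concrete, here is a minimal torch-only sketch. Random tensors stand in for the encoder outputs and the variable names are illustrative, not COMET internals: approach 1 pools src and mt separately and concatenates four 1024-dim views into a 4096-dim feature, while approach 2 encodes the joined [mt SEP src] sequence and feeds the 1024-dim CLS embedding straight to the head.

```python
# Minimal sketch (assumes torch); random embeddings stand in for the encoder.
import torch

hidden = 1024                        # XLM-R-large / InfoXLM-large hidden size

# Approach 1 (ReferencelessRegression): encode src and mt separately,
# average-pool each, then combine into a 4 * 1024 = 4096-dim feature.
mt_emb = torch.randn(1, hidden)      # stand-in for the pooled mt encoding
src_emb = torch.randn(1, hidden)     # stand-in for the pooled src encoding
features_a1 = torch.cat(
    [mt_emb, src_emb, mt_emb * src_emb, (mt_emb - src_emb).abs()], dim=-1)
print(features_a1.shape)             # torch.Size([1, 4096]) -> head 4096 -> 2048 -> 1024 -> 1

# Approach 2 (UnifiedMetric, QE mode): mt and src are joined into one
# sequence, and the CLS embedding of that sequence feeds the head.
cls_emb = torch.randn(1, hidden)     # stand-in for the CLS token of the joint sequence
print(cls_emb.shape)                 # torch.Size([1, 1024]) -> head 1024 -> 3072 -> 1024 -> 1
```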
114 changes: 114 additions & 0 deletions configs/models/en-ko-qe/approach3_finetune_cometkiwi.yaml
@@ -0,0 +1,114 @@
# ============================================================
# Approach 3: Fine-tuning from COMETKiwi (recommended)
# ============================================================
# Loads the pretrained COMETKiwi (wmt22-cometkiwi-da) checkpoint and
# continues training on English-Korean patent-domain data.
#
# Advantages of this approach:
#   - starts from weights already optimized for QE
#   - reuses the multilingual knowledge from the WMT training data
#   - adapts to the domain with only a few epochs
#   - highest expected performance
#
# Caveats:
#   - replace the checkpoint path with your actual path
#   - raise nr_frozen_epochs to prevent catastrophic forgetting
#   - keep the learning rate low
#
# Required GPUs: 1-2x A100 (80GB) or 2-4x V100 (32GB)
#
# Run (two steps):
#   1. Download the checkpoint first:
#      python scripts/download_checkpoint.py --model Unbabel/wmt22-cometkiwi-da
#
#   2. Start training:
#      comet-train --cfg configs/models/en-ko-qe/approach3_finetune_cometkiwi.yaml \
#                  --load_from_checkpoint checkpoints/wmt22-cometkiwi-da/checkpoints/model.ckpt
# ============================================================

unified_metric:
  class_path: comet.models.UnifiedMetric
  init_args:
    # --- Encoder (identical to the original COMETKiwi) ---
    encoder_model: XLM-RoBERTa
    pretrained_model: microsoft/infoxlm-large

    # --- Freezing strategy (adjusted for fine-tuning) ---
    nr_frozen_epochs: 0.5                 # freeze longer (guards against forgetting)
    keep_embeddings_frozen: True

    # --- Optimizer (lower learning rates for fine-tuning) ---
    optimizer: AdamW
    encoder_learning_rate: 5.0e-07        # encoder: half the original value
    learning_rate: 1.0e-05                # head: lower than the original
    layerwise_decay: 0.95
    warmup_steps: 100                     # add warmup for stability

    # --- Layer settings ---
    sent_layer: mix
    layer_transformation: sparsemax
    layer_norm: True
    word_layer: 24

    # --- Loss ---
    loss: mse

    # --- Regression head ---
    hidden_sizes:
      - 3072
      - 1024
    activations: Tanh
    dropout: 0.1

    # --- QE settings ---
    input_segments:
      - mt
      - src
    word_level_training: False

    # --- Data ---
    batch_size: 16
    train_data:
      - data/en-ko-qe/train.csv
    validation_data:
      - data/en-ko-qe/val.csv

# --- Trainer settings (fine-tuning) ---
trainer:
  class_path: pytorch_lightning.trainer.trainer.Trainer
  init_args:
    accelerator: gpu
    devices: 1
    accumulate_grad_batches: 4
    max_epochs: 3                         # few epochs, since this is fine-tuning
    min_epochs: 1
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    check_val_every_n_epoch: 1
    log_every_n_steps: 100
    enable_progress_bar: true
    enable_model_summary: true
    num_sanity_val_steps: 3
    deterministic: false

# --- Early stopping ---
early_stopping:
  class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
  init_args:
    monitor: val_kendall
    min_delta: 0.0
    patience: 2
    mode: max
    verbose: False

# --- Model checkpoint ---
model_checkpoint:
  class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
  init_args:
    filename: 'cometkiwi-ft-{epoch}-{step}-{val_kendall:.4f}'
    monitor: val_kendall
    save_top_k: 3
    mode: max
    save_weights_only: True
    every_n_epochs: 1
    verbose: True
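Before fine-tuning, it is worth scoring a few pairs with the unmodified wmt22-cometkiwi-da checkpoint to establish a baseline for the domain-adapted model. A minimal sketch using the public comet Python API (`download_model`, `load_from_checkpoint`, `predict`); note that wmt22-cometkiwi-da is a gated model, so a Hugging Face login/token is assumed, and the example pair is a placeholder:

```python
# Minimal sketch: baseline QE scores from the pretrained COMETKiwi checkpoint.
from comet import download_model, load_from_checkpoint

ckpt_path = download_model("Unbabel/wmt22-cometkiwi-da")  # step 1 in the header above
model = load_from_checkpoint(ckpt_path)

data = [
    {"src": "The invention relates to a battery pack.",
     "mt": "본 발명은 배터리 팩에 관한 것이다."},
]
out = model.predict(data, batch_size=8, gpus=1)
print(out.scores)        # per-segment QE scores, roughly in [0, 1]
print(out.system_score)  # corpus-level average
```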