Balance classes faster: read labels from a CSV instead of iterating the dataset

KMCzajkowski · web-flow · commit 1145efaf6964 · 2025-12-19T09:08:51.000+01:00
from michalpiasecki0
diff --git a/timm/data/loader.py b/timm/data/loader.py
@@ -16,6 +16,7 @@
 
 import torch.utils.data
 import numpy as np
+import pandas as pd
 
 from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .dataset import IterableImageDataset, ImageDataset
@@ -228,6 +229,7 @@ def create_loader(
         worker_seeding: str = 'all',
         tf_preprocessing: bool = False,
         balance_classes: bool = False,
+        dataset_csv_path: Optional[str] = None 
 ):
     """
 
@@ -272,10 +274,12 @@ def create_loader(
         worker_seeding: Control worker random seeding at init.
         tf_preprocessing: Use TF 1.0 inference preprocessing for testing model ports.
         balance_classes: Sample classes with uniform probability
+        dataset_csv_path: Path to dataset csv, used for class balancing 
 
     Returns:
         DataLoader
     """
+    
     re_num_splits = 0
     if re_split:
         # apply RE to second half of batch if no aug split otherwise line up with aug split
@@ -329,7 +333,9 @@ def create_loader(
     else:
         assert num_aug_repeats == 0, "RepeatAugment not currently supported in non-distributed or IterableDataset use"
         if balance_classes:
-            all_labels = [c for (_, c) in dataset]
+            assert dataset_csv_path, "Provide csv with labels to use balance_classes."
+            dataset_csv = pd.read_csv(dataset_csv_path)
+            all_labels = dataset_csv["label"].values
             unique, counts = np.unique(all_labels, return_counts=True)
             unique_counts = {v: c for v, c in zip(unique, counts)}
             label_weights = np.array([1 / unique_counts[num] for num in all_labels])
diff --git a/timm/train.py b/timm/train.py
@@ -755,6 +755,7 @@ def train(config: dict[str, t.Any]):
         use_multi_epochs_loader=args.use_multi_epochs_loader,
         worker_seeding=args.worker_seeding,
         balance_classes=args.balance_classes,
+        dataset_csv_path=args.train_samples_csv_path
     )
 
     loader_eval = None

Original file line number	Diff line number	Diff line change
`@@ -755,6 +755,7 @@ def train(config: dict[str, t.Any]):`
`755`	`755`	`use_multi_epochs_loader=args.use_multi_epochs_loader,`
`756`	`756`	`worker_seeding=args.worker_seeding,`
`757`	`757`	`balance_classes=args.balance_classes,`
	`758`	`+ dataset_csv_path=args.train_samples_csv_path`
`758`	`759`	`)`
`759`	`760`
`760`	`761`	`loader_eval = None`