I had the same problem with Roboflow’s rebalancing of datasets. I wrote these functions to move YOLO images and their label files between the train and valid splits so that both splits end up with approximately the same class distribution. YMMV.
import os
import shutil
from collections import defaultdict
# Root directory of the YOLO-format dataset. `dataset` is not defined in this
# snippet and must already exist before this line runs
# (NOTE(review): presumably the object returned by a Roboflow dataset
# download -- confirm), or replace the right-hand side with a path string.
dataset_dir = dataset.location # Change to your dataset location
# Target fraction of each class's annotations that should end up in train.
TRAIN_SPLIT = 0.8
# Complementary validation fraction. Not read by the code in this snippet:
# the per-class validation target is computed as total minus the train target.
VAL_SPLIT = 0.2
# Label and image directories of the two splits, laid out as
# <dataset_dir>/<train|valid>/<labels|images>.
train_labels_dir = os.path.join(dataset_dir, 'train', 'labels')
val_labels_dir = os.path.join(dataset_dir, 'valid', 'labels')
train_images_dir = os.path.join(dataset_dir, 'train', 'images')
val_images_dir = os.path.join(dataset_dir, 'valid', 'images')
# Extension that identifies an annotation (label) file.
ann_ext = '.txt'
# Image extensions tried, in this order, when looking for the image that
# belongs to a label file. Matching is case-sensitive.
img_exts = ['.jpg', '.jpeg', '.png']
def get_class_counts_and_files(labels_dir, label_ext=None):
    """Count annotations per class and record which classes each label file holds.

    Every file in ``labels_dir`` whose name ends with the label extension is
    read line by line. The first whitespace-separated token of each non-blank
    line is taken as the class id (kept as a string) and counted once per
    line, so the counts are annotation counts, not file counts.

    Args:
        labels_dir: Directory containing the label files.
        label_ext: File-name suffix identifying label files. When ``None``
            (the default) the module-level ``ann_ext`` is used.

    Returns:
        A ``(class_counts, file_classes)`` tuple. ``class_counts`` is a
        ``defaultdict(int)`` mapping class id to its number of annotation
        lines. ``file_classes`` maps each label file name to the set of class
        ids found in it (an empty set for a file with no annotations).
    """
    if label_ext is None:
        label_ext = ann_ext

    class_counts = defaultdict(int)
    file_classes = {}
    # os.listdir() order is arbitrary; sorting makes the returned mapping
    # (and anything that iterates it) reproducible between runs.
    for fname in sorted(os.listdir(labels_dir)):
        if not fname.endswith(label_ext):
            continue
        classes_in_file = set()
        with open(os.path.join(labels_dir, fname)) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank or whitespace-only line (e.g. a trailing newline):
                    # nothing to count, and indexing [0] would raise.
                    continue
                cls = fields[0]
                class_counts[cls] += 1
                classes_in_file.add(cls)
        file_classes[fname] = classes_in_file
    return class_counts, file_classes
def move_file(fname, src_labels_dir, dst_labels_dir, src_images_dir, dst_images_dir,
              image_exts=None):
    """Move a label file and its matching image from one split to another.

    The label ``fname`` is moved from ``src_labels_dir`` to
    ``dst_labels_dir``. The image is looked up in ``src_images_dir`` under the
    label's base name combined with each candidate extension in turn; the
    first one that exists is moved to ``dst_images_dir``. If no candidate
    exists, only the label is moved.

    Args:
        fname: Label file name (not a path), e.g. ``"img_001.txt"``.
        src_labels_dir: Directory currently holding the label file.
        dst_labels_dir: Directory to move the label file into.
        src_images_dir: Directory searched for the matching image.
        dst_images_dir: Directory to move the matching image into.
        image_exts: Iterable of image extensions (with leading dot) to try, in
            order. When ``None`` (the default) the module-level ``img_exts``
            is used.

    Raises:
        OSError: If the label file cannot be moved (for example because it
            does not exist in ``src_labels_dir``).
    """
    if image_exts is None:
        image_exts = img_exts

    # A split may not have been created yet (e.g. a dataset exported without
    # a valid split); without these the move would fail on the missing folder.
    os.makedirs(dst_labels_dir, exist_ok=True)
    os.makedirs(dst_images_dir, exist_ok=True)

    # Move label file
    shutil.move(os.path.join(src_labels_dir, fname),
                os.path.join(dst_labels_dir, fname))

    # Move corresponding image file
    base = os.path.splitext(fname)[0]
    for ext in image_exts:
        src_img = os.path.join(src_images_dir, base + ext)
        if os.path.exists(src_img):
            shutil.move(src_img, os.path.join(dst_images_dir, base + ext))
            break  # Only move one image per label
def print_class_distribution(train_counts, val_counts, title="Class distribution"):
    """Print a per-class table of annotation counts and shares for both splits.

    Args:
        train_counts: Mapping of class id to annotation count in train.
        val_counts: Mapping of class id to annotation count in valid.
        title: Heading printed above the table.

    Each row shows a class's count in each split and that count as a
    percentage of the split's total. Classes are listed in numeric order;
    ids that are not integers are listed after the numeric ones, sorted as
    text. An empty split shows 0.00% everywhere, including its TOTAL cell.
    """
    def sort_key(cls):
        # Numeric ids first, in numeric order; anything int() rejects goes
        # last instead of aborting the whole report.
        try:
            return (0, int(cls), "")
        except (TypeError, ValueError):
            return (1, 0, str(cls))

    all_classes = sorted(set(train_counts) | set(val_counts), key=sort_key)
    train_total = sum(train_counts.values())
    val_total = sum(val_counts.values())
    # The total of a non-empty split is 100% of itself; an empty split has no
    # share to report, matching the 0.00% printed in its rows.
    train_total_pct = 100.0 if train_total else 0.0
    val_total_pct = 100.0 if val_total else 0.0

    print(f"\n{title}:")
    print(f"{'Class':>8} | {'Train Count':>11} | {'Train %':>7} | {'Val Count':>9} | {'Val %':>7}")
    print("-" * 54)
    for cls in all_classes:
        t_count = train_counts.get(cls, 0)
        v_count = val_counts.get(cls, 0)
        t_pct = (t_count / train_total * 100) if train_total else 0
        v_pct = (v_count / val_total * 100) if val_total else 0
        print(f"{cls:>8} | {t_count:11d} | {t_pct:6.2f}% | {v_count:9d} | {v_pct:6.2f}%")
    print(f"{'TOTAL':>8} | {train_total:11d} | {train_total_pct:6.2f}% | {val_total:9d} | {val_total_pct:6.2f}%\n")
def balance_classes():
    """Move files between train and valid so each class approaches TRAIN_SPLIT.

    Works on the module-level ``train_*_dir`` / ``val_*_dir`` directories and
    moves files on disk. For every class, the target number of train
    annotations is ``round(total * TRAIN_SPLIT)`` and the valid target is the
    remainder. If a split holds more than its target, label files containing
    that class (with their images) are moved to the other split one at a time
    until the split is at or below its target.

    This is a greedy, per-class pass: a moved file takes all of its
    annotations with it, so handling one class can shift the counts of
    classes handled earlier. The result is an approximation, not an exact
    split. Class distributions are printed before and after.
    """
    train_counts, train_files = get_class_counts_and_files(train_labels_dir)
    val_counts, val_files = get_class_counts_and_files(val_labels_dir)
    print_class_distribution(train_counts, val_counts, title="Initial class distribution")

    all_classes = set(train_counts) | set(val_counts)
    # Iterating the set directly would follow string-hash order, which changes
    # from run to run; a fixed order makes the outcome reproducible. Sorting by
    # (length, text) puts non-negative integer ids in numeric order.
    for cls in sorted(all_classes, key=lambda c: (len(c), c)):
        total = train_counts.get(cls, 0) + val_counts.get(cls, 0)
        train_target = int(round(total * TRAIN_SPLIT))
        val_target = total - train_target

        # Move from train to val if train has too many
        if train_counts.get(cls, 0) > train_target:
            # Sorted so the same files are chosen on every run.
            files = sorted(f for f, classes in train_files.items() if cls in classes)
            moved = 0
            for f in files:
                move_file(f, train_labels_dir, val_labels_dir, train_images_dir, val_images_dir)
                moved += 1
                # Re-read both splits: the moved file may carry several
                # annotations of several classes.
                train_counts, train_files = get_class_counts_and_files(train_labels_dir)
                val_counts, val_files = get_class_counts_and_files(val_labels_dir)
                if train_counts.get(cls, 0) <= train_target:
                    break
            print(f"Moved {moved} files with class {cls} from train to valid.")
        # Move from val to train if val has too many
        elif val_counts.get(cls, 0) > val_target:
            files = sorted(f for f, classes in val_files.items() if cls in classes)
            moved = 0
            for f in files:
                move_file(f, val_labels_dir, train_labels_dir, val_images_dir, train_images_dir)
                moved += 1
                train_counts, train_files = get_class_counts_and_files(train_labels_dir)
                val_counts, val_files = get_class_counts_and_files(val_labels_dir)
                if val_counts.get(cls, 0) <= val_target:
                    break
            print(f"Moved {moved} files with class {cls} from valid to train.")

    print("Balancing complete.")
    print_class_distribution(train_counts, val_counts, title="Final class distribution")
# Run the rebalancing immediately; this moves label and image files on disk.
balance_classes()
Here’s example output:
Initial class distribution:
Class | Train Count | Train % | Val Count | Val %
------------------------------------------------------
0 | 354 | 5.18% | 24 | 4.84%
1 | 9 | 0.13% | 1 | 0.20%
2 | 450 | 6.58% | 24 | 4.84%
3 | 264 | 3.86% | 13 | 2.62%
4 | 30 | 0.44% | 0 | 0.00%
5 | 3 | 0.04% | 20 | 4.03%
6 | 0 | 0.00% | 1 | 0.20%
7 | 12 | 0.18% | 1 | 0.20%
8 | 99 | 1.45% | 2 | 0.40%
9 | 336 | 4.92% | 18 | 3.63%
10 | 15 | 0.22% | 2 | 0.40%
11 | 132 | 1.93% | 13 | 2.62%
12 | 258 | 3.78% | 8 | 1.61%
13 | 423 | 6.19% | 5 | 1.01%
14 | 615 | 9.00% | 31 | 6.25%
15 | 654 | 9.57% | 39 | 7.86%
16 | 189 | 2.77% | 10 | 2.02%
17 | 375 | 5.49% | 27 | 5.44%
18 | 0 | 0.00% | 3 | 0.60%
19 | 231 | 3.38% | 7 | 1.41%
20 | 1983 | 29.02% | 238 | 47.98%
21 | 195 | 2.85% | 8 | 1.61%
22 | 18 | 0.26% | 0 | 0.00%
23 | 189 | 2.77% | 1 | 0.20%
TOTAL | 6834 | 100.00% | 496 | 100.00%
Moved 3 files with class 5 from valid to train.
Moved 3 files with class 22 from train to valid.
Moved 2 files with class 18 from valid to train.
Moved 6 files with class 8 from train to valid.
Moved 1 files with class 4 from train to valid.
Moved 1 files with class 19 from valid to train.
Moved 14 files with class 9 from train to valid.
Moved 2 files with class 23 from train to valid.
Moved 2 files with class 10 from valid to train.
Moved 2 files with class 11 from train to valid.
Moved 4 files with class 13 from train to valid.
Moved 6 files with class 16 from valid to train.
Moved 22 files with class 17 from train to valid.
Moved 1 files with class 12 from train to valid.
Moved 1 files with class 7 from train to valid.
Moved 1 files with class 14 from train to valid.
Moved 6 files with class 3 from valid to train.
Moved 1 files with class 0 from train to valid.
Moved 7 files with class 2 from train to valid.
Moved 13 files with class 20 from valid to train.
Moved 2 files with class 21 from train to valid.
Moved 15 files with class 15 from train to valid.
Balancing complete.
Final class distribution:
Class | Train Count | Train % | Val Count | Val %
------------------------------------------------------
0 | 288 | 5.21% | 90 | 5.01%
1 | 9 | 0.16% | 1 | 0.06%
2 | 361 | 6.52% | 113 | 6.29%
3 | 228 | 4.12% | 49 | 2.73%
4 | 20 | 0.36% | 10 | 0.56%
5 | 15 | 0.27% | 8 | 0.45%
6 | 1 | 0.02% | 0 | 0.00%
7 | 8 | 0.14% | 5 | 0.28%
8 | 82 | 1.48% | 19 | 1.06%
9 | 268 | 4.84% | 86 | 4.79%
10 | 11 | 0.20% | 6 | 0.33%
11 | 112 | 2.02% | 33 | 1.84%
12 | 182 | 3.29% | 84 | 4.67%
13 | 288 | 5.21% | 140 | 7.79%
14 | 505 | 9.13% | 141 | 7.85%
15 | 533 | 9.63% | 160 | 8.90%
16 | 156 | 2.82% | 43 | 2.39%
17 | 303 | 5.48% | 99 | 5.51%
18 | 2 | 0.04% | 1 | 0.06%
19 | 198 | 3.58% | 40 | 2.23%
20 | 1655 | 29.91% | 566 | 31.50%
21 | 152 | 2.75% | 51 | 2.84%
22 | 11 | 0.20% | 7 | 0.39%
23 | 145 | 2.62% | 45 | 2.50%
TOTAL | 5533 | 100.00% | 1797 | 100.00%
It would be nice if Roboflow offered this functionality instead of randomly choosing files for each split.