1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
| import os import random import shutil
def split_dataset(source_dir, dest_dir, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Split an image dataset into train, validation and test subsets.

    Every sub-directory of ``source_dir`` is treated as one class. The files
    of each class are shuffled, partitioned by the given ratios and copied to
    ``dest_dir/<subset>/<class_name>/`` where ``<subset>`` is ``train``,
    ``val`` or ``test``. The source files are left untouched.

    Args:
        source_dir: Path of the original dataset; each class must be a
            sub-directory. Entries that are not directories are ignored.
        dest_dir: Output path; ``train``/``val``/``test`` sub-directories are
            created below it as needed.
        train_ratio: Fraction of each class assigned to the training set
            (default 0.8).
        val_ratio: Fraction of each class assigned to the validation set
            (default 0.1). The test set receives the remainder,
            ``1 - train_ratio - val_ratio``.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used, so callers that seed ``random`` themselves keep working.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Tolerance for binary floating-point error (e.g. 0.7 + 0.2 evaluates to
    # 0.8999999999999999, which would otherwise truncate to the wrong index).
    eps = 1e-9

    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + eps:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # Only directories are classes; stray files such as README or .DS_Store
    # are skipped. Sorting makes the processing order independent of the
    # filesystem's listing order.
    class_names = sorted(
        name
        for name in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, name))
    )

    for class_name in class_names:
        class_dir = os.path.join(source_dir, class_name)

        # Only regular files can be copied; nested directories are skipped.
        # Sorting before shuffling makes a seeded split reproducible.
        files = sorted(
            name
            for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )
        shuffle(files)

        total = len(files)
        train_end = int(total * train_ratio + eps)
        val_end = int(total * (train_ratio + val_ratio) + eps)

        subsets = {
            'train': files[:train_end],
            'val': files[train_end:val_end],
            'test': files[val_end:],
        }

        for subset, subset_files in subsets.items():
            subset_dir = os.path.join(dest_dir, subset, class_name)
            # Created even when the subset is empty so that every class
            # directory exists in every subset.
            os.makedirs(subset_dir, exist_ok=True)
            for file_name in subset_files:
                shutil.copy(
                    os.path.join(class_dir, file_name),
                    os.path.join(subset_dir, file_name),
                )
|