YOLO转COCO格式实战：手把手教你用Python脚本适配DETR训练需求

张

张建站

2026/6/14 1:28:26

10分钟阅读

# YOLO转COCO格式实战：Python脚本适配DETR训练全流程解析

当目标检测遇上Transformer架构，DETR以其端到端的特性正在重塑检测任务的实现范式。但许多研究者在尝试将自有YOLO格式数据集迁移至DETR框架时，往往在格式转换环节遭遇“水土不服”。本文将深入剖析YOLO与COCO格式的核心差异，通过可复用的Python解决方案，带你完整实现从数据转换到训练调优的全链路实践。

## 1. 理解格式转换的本质需求

在计算机视觉领域，数据格式就像不同国家间的语言体系。YOLO采用的归一化中心坐标和宽高表示法（如 `0 0.5 0.5 0.2 0.3`）与COCO要求的绝对坐标边界框（如 `[100,150,40,60]`）存在本质差异。这种差异不仅体现在数值表达上，更反映在整体数据结构中：

**YOLO格式特征**

- 每个图像对应一个 `.txt` 标签文件
- 使用相对坐标（0-1范围）
- 类别索引从0开始连续编号
- 无统一元数据描述文件

**COCO格式特征**

- 所有标注集中存储在 `.json` 文件
- 使用绝对像素坐标
- 需要完整的 `categories` 元数据
- 包含图像尺寸等附加信息

```python
# YOLO标签示例：class x_center y_center width height
0 0.45 0.32 0.12 0.15
1 0.67 0.29 0.08 0.20

# 对应COCO标注结构
{
    "images": [{"id": 1, "file_name": "image1.jpg", "width": 640, "height": 480}],
    "annotations": [{
        "id": 1,
        "image_id": 1,
        "category_id": 0,
        "bbox": [249.6, 117.6, 76.8, 72.0],  # x,y,w,h
        "area": 5529.6,
        "iscrowd": 0
    }],
    "categories": [{"id": 0, "name": "person"}, {"id": 1, "name": "car"}]
}
```

> 关键提示：COCO格式的bbox采用 `[x_top_left, y_top_left, width, height]` 形式，与YOLO的中心点坐标需要精确转换，任何四舍五入都可能导致训练时出现框体偏移。

## 2. 构建健壮的转换脚本

实现格式转换需要处理三个核心问题：坐标系统转换、文件结构重组和元数据生成。以下代码展示了如何通过Python字典操作和OpenCV/PIL混合处理实现高精度转换：

```python
import os
import json
from PIL import Image
import numpy as np


class YOLOToCOCOConverter:
    def __init__(self, class_mapping):
        self.categories = [{"id": i, "name": name} for i, name in enumerate(class_mapping)]
        self.image_id = 1
        self.annotation_id = 1

    def _get_image_dimensions(self, image_path):
        """支持多种图像格式的尺寸获取"""
        try:
            with Image.open(image_path) as img:
                return img.size  # (width, height)
        except Exception as e:
            print(f"Error reading {image_path}: {str(e)}")
            return None

    def convert_single_label(self, yolo_line, img_width, img_height):
        """将单行YOLO标签转换为COCO标注格式"""
        parts = yolo_line.strip().split()
        if len(parts) != 5:
            return None

        class_id, x_center, y_center, width, height = map(float, parts)

        # 坐标转换核心算法
        abs_x = x_center * img_width
        abs_y = y_center * img_height
        abs_w = width * img_width
        abs_h = height * img_height

        return {
            "id": self.annotation_id,
            "image_id": self.image_id,
            "category_id": int(class_id),
            "bbox": [abs_x - abs_w/2, abs_y - abs_h/2, abs_w, abs_h],
            "area": abs_w * abs_h,
            "iscrowd": 0
        }

    def process_directory(self, img_dir, label_dir, output_json):
        """处理整个目录的转换"""
        coco_data = {
            "images": [],
            "annotations": [],
            "categories": self.categories
        }

        for label_file in os.listdir(label_dir):
            if not label_file.endswith(".txt"):
                continue

            base_name = os.path.splitext(label_file)[0]
            img_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
            img_path = None
            for ext in img_extensions:
                temp_path = os.path.join(img_dir, base_name + ext)
                if os.path.exists(temp_path):
                    img_path = temp_path
                    break

            if not img_path:
                continue

            img_width, img_height = self._get_image_dimensions(img_path)
            if not img_width:
                continue

            coco_data["images"].append({
                "id": self.image_id,
                "file_name": os.path.basename(img_path),
                "width": img_width,
                "height": img_height
            })

            with open(os.path.join(label_dir, label_file), "r") as f:
                for line in f:
                    annotation = self.convert_single_label(line, img_width, img_height)
                    if annotation:
                        coco_data["annotations"].append(annotation)
                        self.annotation_id += 1

            self.image_id += 1

        os.makedirs(os.path.dirname(output_json), exist_ok=True)
        with open(output_json, "w") as f:
            json.dump(coco_data, f, indent=2)
```

关键改进点解析：

- **多图像格式支持**：自动检测 `.jpg`/`.png` 等不同后缀，避免因格式问题导致转换失败
- **异常处理机制**：对损坏图像或错误标签进行跳过处理并记录日志
- **内存优化**：采用流式处理，避免大文件内存溢出
- **坐标精度保留**：全程使用浮点运算，仅在最后写入JSON时进行类型转换

## 3. 典型问题排查指南

在实际转换过程中，开发者常会遇到以下几类问题：

### 3.1 图像与标签匹配异常

**症状**：生成的JSON文件中 `images` 列表为空，或 `annotations` 与 `images` 不对应。

**解决方案**：

- 使用文件名严格匹配策略（区分大小写）
- 实现交叉验证检查：

```python
def validate_matching(img_dir, label_dir):
    img_files = {os.path.splitext(f)[0] for f in os.listdir(img_dir)}
    label_files = {os.path.splitext(f)[0] for f in os.listdir(label_dir)}
    print(f"仅在图像目录存在的文件: {img_files - label_files}")
    print(f"仅在标签目录存在的文件: {label_files - img_files}")
```

### 3.2 坐标转换错误

**症状**：可视化时边界框明显偏移或尺寸异常。

**调试方法**：

- 打印中间转换值：

```python
print(f"原始YOLO值: {parts} -> 转换后COCO: {abs_x},{abs_y},{abs_w},{abs_h}")
```

- 使用OpenCV可视化验证：

```python
import cv2

img = cv2.imread(img_path)
x, y, w, h = map(int, bbox)
cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
cv2.imshow("debug", img)
cv2.waitKey(0)
```

### 3.3 类别ID不匹配

**症状**：训练时出现类别预测混乱或损失值异常。

**处理方案**：

- 建立双向映射字典：

```python
category_map = {cat["name"]: cat["id"] for cat in categories}
id_to_name = {cat["id"]: cat["name"] for cat in categories}
```

- 在转换前验证YOLO类别ID是否超出范围：

```python
if int(class_id) >= len(self.categories):
    print(f"警告: 非法类别ID {class_id} 在文件 {label_file}")
```

## 4. DETR训练适配技巧

成功转换COCO格式后，还需针对DETR特性进行专项优化：

### 4.1 预训练权重调整

DETR默认使用COCO预训练权重（91类），需要修改分类头适配自有数据集：

```python
def adapt_pretrained_weights(pretrained_path, num_classes, output_path):
    import torch

    state_dict = torch.load(pretrained_path)

    # 调整分类层维度
    original_cls_weight = state_dict["model"]["class_embed.weight"]
    original_cls_bias = state_dict["model"]["class_embed.bias"]

    new_cls_weight = torch.zeros((num_classes + 1, 256))  # +1 for background
    new_cls_bias = torch.zeros(num_classes + 1)

    # 保持背景类参数不变
    new_cls_weight[0] = original_cls_weight[0]
    new_cls_bias[0] = original_cls_bias[0]

    # 随机初始化新增类别参数
    nn.init.normal_(new_cls_weight[1:], mean=0, std=0.01)

    state_dict["model"]["class_embed.weight"] = new_cls_weight
    state_dict["model"]["class_embed.bias"] = new_cls_bias
    torch.save(state_dict, output_path)
```

### 4.2 超参数配置建议

基于不同数据集规模的经验参数：

| 数据规模 | 推荐batch_size | 基础学习率 | 训练epochs | 学习率衰减策略 |
| --- | --- | --- | --- | --- |
| <1k | 4-8 | 1e-5 | 300 | 200epoch后×0.1 |
| 1k-10k | 8-16 | 2e-5 | 150-200 | 120epoch后×0.1 |
| >10k | 16-32 | 3e-5 | 100-150 | 80epoch后×0.1 |

### 4.3 关键训练命令示例

```bash
python main.py \
    --dataset_file coco \
    --coco_path /path/to/converted_dataset \
    --epochs 300 \
    --lr 1e-4 \
    --batch_size 8 \
    --num_workers 4 \
    --output_dir logs \
    --resume adapted_weights.pth \
    --lr_drop 200 \
    --clip_max_norm 0.1
```

> 注意事项：DETR对学习率非常敏感，建议使用 `--lr_backbone` 参数将骨干网络学习率设为主学习率的1/10，例如 `--lr 1e-4 --lr_backbone 1e-5`。

## 5. 可视化与验证流程

为确保转换质量，建议建立三级验证体系：

**基础完整性检查**

```python
def check_coco_anns(json_path):
    with open(json_path) as f:
        data = json.load(f)

    print(f"图像数量: {len(data['images'])}")
    print(f"标注数量: {len(data['annotations'])}")
    print(f"平均每图标注: {len(data['annotations'])/len(data['images']):.1f}")
    print("类别分布:")
    for cat in data["categories"]:
        count = sum(1 for ann in data["annotations"] if ann["category_id"] == cat["id"])
        print(f"{cat['name']}: {count}")
```

**随机样本可视化**

```python
def visualize_random_samples(coco_json, img_dir, num_samples=3):
    from pycocotools.coco import COCO
    import random

    coco = COCO(coco_json)
    img_ids = coco.getImgIds()
    for img_id in random.sample(img_ids, num_samples):
        img_info = coco.loadImgs(img_id)[0]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        annotations = coco.loadAnns(ann_ids)

        img = Image.open(os.path.join(img_dir, img_info["file_name"]))
        draw = ImageDraw.Draw(img)
        for ann in annotations:
            x, y, w, h = ann["bbox"]
            draw.rectangle([x, y, x+w, y+h], outline="red", width=2)
            draw.text((x, y), coco.loadCats(ann["category_id"])[0]["name"], fill="white")

        plt.imshow(img)
        plt.show()
```

**Dataloader验证**

```python
from torch.utils.data import DataLoader
from datasets import build_dataset


def test_dataloader(coco_path):
    dataset = build_dataset(image_set="train", args=argparse.Namespace(
        coco_path=coco_path,
        dataset_file="coco",
        masks=False
    ))
    loader = DataLoader(dataset, batch_size=2, collate_fn=utils.collate_fn)
    for batch in loader:
        print(f"图像张量形状: {batch[0].tensors.shape}")
        print(f"标注数量: {len(batch[1])}")
        targets = batch[1]
        print(f"示例标注: {targets[0]}")
        break
```

在完成全部转换和验证后，建议将处理好的数据集打包为以下标准结构，便于团队共享和版本控制：

```
mydataset_coco/
├── annotations/
│   ├── instances_train.json
│   └── instances_val.json
├── train2017/
│   ├── image1.jpg
│   └── ...
└── val2017/
    ├── image100.jpg
    └── ...
```

实际项目中，我们曾遇到YOLO标签文件与图像文件名大小写不一致导致的转换失败案例（如 `Image1.JPG` 对应 `image1.txt`），最终通过添加大小写不敏感匹配解决了问题。这也提醒我们，在构建转换工具时必须考虑实际数据中可能存在的各种边界情况。