"""Hands-on YOLO v1 reproduction: build and train your own detector with PyTorch.

Object detection has long been one of the most challenging tasks in computer
vision. The YOLO (You Only Look Once) algorithm proposed by Joseph Redmon et
al. in 2016 changed the rules of the game: unlike traditional two-stage
detectors, YOLO treats detection as a regression problem and achieves
end-to-end, real-time detection. This article walks through a complete
from-scratch PyTorch implementation of YOLO v1 and trains it on Pascal VOC.
"""

# 1. Environment setup and data loading
# 1.1 Basic environment
# Make sure Python 3.7+ is installed, then install the core dependencies:
#
#     pip install torch==1.10.0 torchvision==0.11.1
#     pip install opencv-python numpy tqdm matplotlib
#
# For GPU acceleration, CUDA 11.3 is recommended with the PyTorch version
# above. The environment can be verified with the snippet below.
import os
import xml.etree.ElementTree as ET

import numpy as np
import torch
from torch.utils.data import Dataset

if __name__ == "__main__":
    # Guarded so that importing this module has no side effects.
    print(f"PyTorch版本: {torch.__version__}")
    print(f"CUDA可用: {torch.cuda.is_available()}")


# 1.2 Pascal VOC dataset handling
# The original YOLO v1 paper uses Pascal VOC 2007+2012. We implement a custom
# Dataset class for it.
class VOCDataset(Dataset):
    """Pascal VOC detection dataset yielding (image, YOLO grid target) pairs.

    Args:
        root: Path to the ``VOCdevkit`` directory.
        split: Name of the image-set file (``ImageSets/Main/{split}.txt``).
        image_size: Side length of the square network input.
        S: Number of grid cells per side.
        B: Number of boxes predicted per grid cell.
        C: Number of object classes.
        years: VOC release years to load; each must exist under ``root``.

    Raises:
        FileNotFoundError: If an image-set file for one of ``years`` is missing.
    """

    def __init__(self, root="data/VOCdevkit", split="train", image_size=448,
                 S=7, B=2, C=20, years=(2007, 2012)):
        self.image_size = image_size
        self.split = split
        # The target-encoding method reads these, so they must live on the
        # instance.
        self.S = S
        self.B = B
        self.C = C
        self.annotations = []

        # Index the VOC annotation files.
        for year in years:
            anno_dir = os.path.join(root, f"VOC{year}/Annotations")
            img_dir = os.path.join(root, f"VOC{year}/JPEGImages")
            ids_file = os.path.join(root, f"VOC{year}/ImageSets/Main/{split}.txt")
            with open(ids_file) as f:
                for line in f:
                    img_id = line.strip()
                    if not img_id:
                        # A blank line would otherwise yield a bogus ".jpg" entry.
                        continue
                    self.annotations.append({
                        "img_path": os.path.join(img_dir, f"{img_id}.jpg"),
                        "anno_path": os.path.join(anno_dir, f"{img_id}.xml"),
                    })

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            A ``(image, target)`` tuple: ``image`` is a float tensor of shape
            ``(3, H, W)`` scaled to [0, 1]; ``target`` is the float32 array
            produced by ``_convert_to_yolo_format``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        # Imported lazily so the index can be built without OpenCV installed.
        import cv2

        annotation = self.annotations[idx]
        img = cv2.imread(annotation["img_path"])
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                f"Could not read image: {annotation['img_path']}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self._resize_image(img)

        # Parse the XML annotation.
        boxes, labels = self._parse_annotation(annotation["anno_path"])

        # Convert to the YOLO grid target. float32 keeps collated batches in
        # the same dtype as the model output.
        target = np.asarray(
            self._convert_to_yolo_format(boxes, labels), dtype=np.float32)

        image = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0
        return image, target


# Note: a complete implementation still needs the image-resize, annotation
# parsing and YOLO-format conversion methods (_resize_image,
# _parse_annotation, _convert_to_yolo_format); they are covered in detail in
# a later section.
# 2.
# YOLO network architecture
# 2.1 Backbone design
# YOLO v1 uses 24 convolutional layers followed by 2 fully connected layers.
# We start with the basic convolution block.
import torch.nn as nn


class ConvBlock(nn.Module):
    """Conv2d -> BatchNorm2d -> LeakyReLU(0.1)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply convolution, batch norm and the leaky ReLU in sequence."""
        return self.leakyrelu(self.bn(self.conv(x)))


# Full YOLO v1 backbone. Each tuple is (out_channels, kernel_size, stride) for
# a ConvBlock; "M" is a 2x2 max-pool with stride 2. Padding is always
# kernel_size // 2, so only strides and pools change the spatial size:
# 448 -> 224 -> 112 -> 56 -> 28 -> 14 -> 7.
YOLOV1_BACKBONE = (
    [(64, 7, 2), "M",
     (192, 3, 1), "M",
     (128, 1, 1), (256, 3, 1), (256, 1, 1), (512, 3, 1), "M"]
    + [(256, 1, 1), (512, 3, 1)] * 4
    + [(512, 1, 1), (1024, 3, 1), "M"]
    + [(512, 1, 1), (1024, 3, 1)] * 2
    + [(1024, 3, 1), (1024, 3, 2), (1024, 3, 1), (1024, 3, 1)]
)


# With this building block we can assemble the complete YOLO network.
class YOLOv1(nn.Module):
    """YOLO v1 detector: 24 convolutional layers plus 2 fully connected layers.

    Args:
        S: Number of grid cells per side.
        B: Number of boxes predicted per grid cell.
        C: Number of object classes.

    The detection head is sized for a 7x7x1024 feature map, i.e. a
    448x448 input image.
    """

    def __init__(self, S=7, B=2, C=20):
        super().__init__()
        self.S = S  # grid size
        self.B = B  # boxes predicted per grid cell
        self.C = C  # number of classes

        # Feature extractor: the complete layer list rather than an elided
        # one, so the channel counts chain up correctly (3 -> ... -> 1024).
        self.features = self._make_backbone(YOLOV1_BACKBONE)

        # Detection head.
        self.detector = nn.Sequential(
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, S * S * (B * 5 + C)),
            nn.Sigmoid(),  # squash outputs to the 0-1 range
        )

    @staticmethod
    def _make_backbone(cfg, in_channels=3):
        """Build the convolutional feature extractor from a layer config."""
        layers = []
        for spec in cfg:
            if spec == "M":
                layers.append(nn.MaxPool2d(2, stride=2))
                continue
            out_channels, kernel_size, stride = spec
            layers.append(ConvBlock(in_channels, out_channels, kernel_size,
                                    stride=stride, padding=kernel_size // 2))
            in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions of shape (batch, S, S, B*5 + C)."""
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.detector(x)
        return x.view(-1, self.S, self.S, self.B * 5 + self.C)


# 2.2 Key implementation details
# A few points deserve attention when building the network:
# - The last layer uses a Sigmoid so coordinates and confidences fall in 0-1.
# - LeakyReLU slope: the paper uses a negative slope of 0.1.
# - Dropout of 0.5 before the final fully connected layer limits overfitting.
# 3.
# Loss function implementation
# The YOLO loss has five components; each one must be implemented carefully.
import torch


def _cell_iou(pred_xywh, gt_xywh, S):
    """IoU between predicted and ground-truth boxes of the same grid cell.

    Both inputs hold (x, y, w, h) in the last dimension, with x/y relative to
    the cell and w/h relative to the image. Because both boxes share a cell,
    the cell origin cancels out and only x/S, y/S are needed. Shapes
    broadcast, e.g. (N, S, S, B, 4) against (N, S, S, 1, 4) -> (N, S, S, B).
    """
    def corners(box):
        cx, cy = box[..., 0] / S, box[..., 1] / S
        half_w, half_h = box[..., 2] / 2, box[..., 3] / 2
        return cx - half_w, cy - half_h, cx + half_w, cy + half_h

    px1, py1, px2, py2 = corners(pred_xywh)
    gx1, gy1, gx2, gy2 = corners(gt_xywh)
    inter_w = (torch.min(px2, gx2) - torch.max(px1, gx1)).clamp(min=0)
    inter_h = (torch.min(py2, gy2) - torch.max(py1, gy1)).clamp(min=0)
    inter = inter_w * inter_h
    area_p = (px2 - px1).clamp(min=0) * (py2 - py1).clamp(min=0)
    area_g = (gx2 - gx1).clamp(min=0) * (gy2 - gy1).clamp(min=0)
    # Epsilon keeps degenerate (zero-area) boxes from producing 0/0.
    return inter / (area_p + area_g - inter + 1e-6)


def yolo_loss(predictions, targets, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
    """YOLO v1 sum-of-squares loss.

    Args:
        predictions: Model output of shape (batch_size, S, S, B*5 + C).
        targets: Ground truth of shape (batch_size, S, S, B*5 + C). The
            ground-truth box (x, y, w, h, conf) of a cell is read from the
            first five channels; class scores start at channel B*5.
        S: Grid size.
        B: Boxes predicted per cell.
        C: Number of classes.
        lambda_coord: Weight of the coordinate terms.
        lambda_noobj: Weight of the no-object confidence term.

    Returns:
        Scalar tensor: the loss summed over the batch.

    Raises:
        ValueError: If the last dimension of either input is not B*5 + C.
    """
    depth = B * 5 + C
    if predictions.shape[-1] != depth or targets.shape[-1] != depth:
        raise ValueError(
            f"expected last dimension {depth}, got "
            f"{predictions.shape[-1]} and {targets.shape[-1]}")

    # Targets built from NumPy arrive as float64; match the model dtype.
    targets = targets.to(predictions.dtype)
    eps = 1e-6

    pred_boxes = predictions[..., :B * 5].reshape(-1, S, S, B, 5)
    gt_box = targets[..., :5].reshape(-1, S, S, 1, 5)
    # Cells that contain an object, kept 4-D so it broadcasts over B.
    obj_mask = (targets[..., 4] > 0).to(predictions.dtype).reshape(-1, S, S, 1)

    # Per object cell, only the predictor with the highest IoU against the
    # ground truth is "responsible". The selection carries no gradient.
    with torch.no_grad():
        ious = _cell_iou(pred_boxes[..., :4], gt_box[..., :4], S)
        best = ious.argmax(dim=-1, keepdim=True)
        responsible = torch.zeros_like(ious).scatter_(-1, best, 1.0) * obj_mask

    # Centre-coordinate loss.
    xy_err = ((pred_boxes[..., :2] - gt_box[..., :2]) ** 2).sum(dim=-1)
    xy_loss = (responsible * xy_err).sum()

    # Width/height loss on square roots. Clamping keeps sqrt and its
    # gradient finite for zero or negative inputs.
    pred_wh = torch.sqrt(pred_boxes[..., 2:4].clamp(min=eps))
    gt_wh = torch.sqrt(gt_box[..., 2:4].clamp(min=eps))
    wh_loss = (responsible * ((pred_wh - gt_wh) ** 2).sum(dim=-1)).sum()

    # Confidence loss for responsible predictors.
    pred_conf = pred_boxes[..., 4]
    conf_loss = (responsible * (pred_conf - gt_box[..., 4]) ** 2).sum()

    # No-object confidence loss: every other predictor, in every cell and
    # every box slot, is pushed towards confidence 0.
    noobj_loss = ((1 - responsible) * pred_conf ** 2).sum()

    # Class loss, only for cells that contain an object.
    class_err = ((predictions[..., B * 5:] - targets[..., B * 5:]) ** 2).sum(dim=-1)
    class_loss = (obj_mask.reshape(-1, S, S) * class_err.reshape(-1, S, S)).sum()

    # Weighted combination.
    total_loss = (
        lambda_coord * (xy_loss + wh_loss)
        + conf_loss
        + lambda_noobj * noobj_loss
        + class_loss
    )
    return total_loss


# Tip: in practice, monitor each loss component separately to make the model
# easier to debug.
# 4.
# Training pipeline and tricks
# 4.1 Data preprocessing details
# The VOCDataset class still needs a few key methods; they are written here
# as functions taking `self` so they can be placed inside the class.
import xml.etree.ElementTree as ET

import numpy as np
import torch

try:
    from torchvision import transforms
except ImportError:  # torchvision is only needed for the augmentation pipeline
    transforms = None

# Pascal VOC has 20 classes; the index in this tuple is the class id.
VOC_CLASSES = (
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
)


def _letterbox_geometry(width, height, size):
    """Return (new_w, new_h, x_offset, y_offset) for an aspect-preserving fit.

    The image is scaled to fit inside a ``size`` x ``size`` canvas and centred.
    Rounding (rather than truncating) avoids losing a pixel to float error,
    e.g. 200 * (448 / 200) evaluating to 447.999....
    """
    scale = min(size / width, size / height)
    new_w = min(size, max(1, round(width * scale)))
    new_h = min(size, max(1, round(height * scale)))
    return new_w, new_h, (size - new_w) // 2, (size - new_h) // 2


def _resize_image(self, img):
    """Resize keeping the aspect ratio and pad the borders with grey.

    Args:
        img: Image array of shape (H, W, 3).

    Returns:
        float32 array of shape (image_size, image_size, 3).
    """
    import cv2  # imported lazily; only image handling needs OpenCV

    h, w = img.shape[:2]
    new_w, new_h, x1, y1 = _letterbox_geometry(w, h, self.image_size)
    resized = cv2.resize(img, (new_w, new_h))

    canvas = np.full((self.image_size, self.image_size, 3), 128, dtype=np.uint8)
    # Place the resized image at the centre of the canvas.
    canvas[y1:y1 + new_h, x1:x1 + new_w] = resized
    return canvas.astype(np.float32)


def _parse_annotation(self, xml_path):
    """Parse a VOC XML annotation file.

    The XML stores absolute pixel coordinates of the original image. They are
    mapped through the same letterbox geometry as ``_resize_image`` and
    divided by ``image_size``, so the returned boxes are relative (0-1)
    coordinates on the network input.

    Returns:
        ``(boxes, labels)``: ``boxes`` is an (n, 4) array of
        (xmin, ymin, xmax, ymax); ``labels`` is an (n,) array of class ids.

    Raises:
        ValueError: If the file has no ``<size>`` element or names a class
            that is not in ``VOC_CLASSES``.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()

    size_node = root.find("size")
    if size_node is None:
        raise ValueError(f"{xml_path}: missing <size> element")
    width = float(size_node.find("width").text)
    height = float(size_node.find("height").text)
    new_w, new_h, x_off, y_off = _letterbox_geometry(width, height, self.image_size)
    sx, sy = new_w / width, new_h / height

    boxes = []
    labels = []
    for obj in root.iter("object"):
        # Class index (Pascal VOC has 20 classes).
        cls = obj.find("name").text.strip()
        cls_idx = VOC_CLASSES.index(cls)

        bbox = obj.find("bndbox")
        xmin = float(bbox.find("xmin").text)
        ymin = float(bbox.find("ymin").text)
        xmax = float(bbox.find("xmax").text)
        ymax = float(bbox.find("ymax").text)

        boxes.append([
            (xmin * sx + x_off) / self.image_size,
            (ymin * sy + y_off) / self.image_size,
            (xmax * sx + x_off) / self.image_size,
            (ymax * sy + y_off) / self.image_size,
        ])
        labels.append(cls_idx)

    # reshape keeps a well-formed (0, 4) array when there are no objects.
    return (np.array(boxes, dtype=np.float64).reshape(-1, 4),
            np.array(labels, dtype=np.int64))


def _convert_to_yolo_format(self, boxes, labels):
    """Convert VOC boxes to the YOLO training target.

    Args:
        boxes: Iterable of (xmin, ymin, xmax, ymax) in relative 0-1 coordinates.
        labels: Iterable of class ids, aligned with ``boxes``.

    Returns:
        float32 array of shape (S, S, B*5 + C). Channels 0-4 hold
        (x, y, w, h, conf) with x/y relative to the cell and w/h relative to
        the image; the class one-hot starts at channel B*5.
    """
    target = np.zeros((self.S, self.S, self.B * 5 + self.C), dtype=np.float32)
    class_offset = self.B * 5  # class scores come after all B box slots

    for box, label in zip(boxes, labels):
        # Grid cell containing the box centre. Clamped so that a centre
        # lying exactly on the right/bottom edge stays inside the grid.
        x_center, y_center = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
        grid_x = min(max(int(x_center * self.S), 0), self.S - 1)
        grid_y = min(max(int(y_center * self.S), 0), self.S - 1)

        # Centre coordinates relative to the cell.
        box_x = x_center * self.S - grid_x
        box_y = y_center * self.S - grid_y

        # Width and height relative to the whole image.
        box_w = box[2] - box[0]
        box_h = box[3] - box[1]

        # Fill the target; each cell takes the first object only.
        if target[grid_y, grid_x, 4] == 0:
            target[grid_y, grid_x, :4] = [box_x, box_y, box_w, box_h]
            target[grid_y, grid_x, 4] = 1  # confidence
            target[grid_y, grid_x, class_offset + int(label)] = 1  # class probability

    return target


# 4.2 Training loop
# The core of the training procedure:
def train(model, dataloader, criterion, optimizer, epochs=50, max_grad_norm=None):
    """Train ``model`` and save a checkpoint after every epoch.

    Args:
        model: Network to train; its parameters determine the device.
        dataloader: Iterable yielding ``(images, targets)`` batches.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.
        max_grad_norm: If given, gradients are clipped to this norm before
            each optimizer step.
    """
    from tqdm import tqdm  # imported lazily; only the training loop needs it

    model.train()
    device = next(model.parameters()).device

    for epoch in range(epochs):
        running_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for step, (images, targets) in enumerate(progress_bar, start=1):
            images = images.to(device)
            targets = targets.to(device)

            # Forward pass.
            outputs = model(images)
            loss = criterion(outputs, targets)

            # Backward pass and optimisation.
            optimizer.zero_grad()
            loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()

            running_loss += loss.item()
            # `step` counts completed batches exactly, so this is the mean
            # loss of the epoch so far.
            progress_bar.set_postfix(loss=running_loss / step)

        # Save the model after every epoch.
        torch.save(model.state_dict(), f"yolov1_epoch{epoch+1}.pth")


# 4.3 Training tricks and tuning
# From practical experience, the following noticeably improve training:
# - Learning-rate schedule: start at 0.001 and decay to 1/10 every 10 epochs.
# - The Adam optimizer is more stable than SGD.
# - Data augmentation: see `transform` below.
#   NOTE(review): RandomHorizontalFlip and RandomAffine move pixels but not
#   the bounding boxes. Applied to the image alone they desynchronise image
#   and target; transform the boxes jointly or keep only ColorJitter.
transform = None if transforms is None else transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1)),
])
# - Gradient clipping: pass max_grad_norm=5.0 to train(), which then calls
#   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0) between
#   backward() and step().


# 5. Model evaluation and prediction
# 5.1 Non-maximum suppression (NMS)
# The key post-processing step after prediction:
def non_max_suppression(predictions, conf_thresh=0.5, iou_thresh=0.4, B=2,
                        class_agnostic=True):
    """Run non-maximum suppression on one YOLO output.

    Args:
        predictions: Model output for one image, shape (S, S, B*5 + C).
        conf_thresh: Boxes with confidence below this are discarded.
        iou_thresh: A box is suppressed when its IoU with a kept box reaches
            this value.
        B: Number of boxes per grid cell.
        class_agnostic: If True, overlapping boxes suppress each other
            regardless of class; if False, only boxes of the same class do.

    Returns:
        List of 1-D tensors (x1, y1, x2, y2, score, class_idx) in relative
        image coordinates, sorted by descending score. Empty list if nothing
        passes ``conf_thresh``.
    """
    S = predictions.shape[0]
    cell_size = 1.0 / S
    class_offset = B * 5

    # Convert predictions to (x1, y1, x2, y2, score, class).
    candidates = []
    for i in range(S):
        for j in range(S):
            # The most likely class is shared by all boxes of the cell.
            cls_probs = predictions[i, j, class_offset:]
            cls_idx = int(torch.argmax(cls_probs))
            cls_prob = float(cls_probs[cls_idx])

            for b in range(B):
                pred = predictions[i, j, b * 5:(b + 1) * 5]
                if pred[4] < conf_thresh:
                    continue

                # Absolute (image-relative) coordinates.
                x = (j + float(pred[0])) * cell_size
                y = (i + float(pred[1])) * cell_size
                w = float(pred[2])
                h = float(pred[3])

                candidates.append([
                    x - w / 2, y - h / 2, x + w / 2, y + h / 2,
                    float(pred[4]) * cls_prob, float(cls_idx),
                ])

    if not candidates:
        return []

    boxes = torch.tensor(candidates, dtype=torch.float32)
    # Sort by score, best first.
    boxes = boxes[boxes[:, 4].argsort(descending=True)]

    keep = []
    while boxes.shape[0] > 0:
        best = boxes[0]
        keep.append(best)
        rest = boxes[1:]
        if rest.shape[0] == 0:
            break

        ious = box_iou(best[:4].unsqueeze(0), rest[:, :4])[0]
        suppress = ious >= iou_thresh
        if not class_agnostic:
            suppress = suppress & (rest[:, 5] == best[5])
        boxes = rest[~suppress]

    return keep


def box_iou(box1, box2):
    """Pairwise IoU between two sets of boxes.

    Args:
        box1: Tensor (N, >=4) whose first four columns are (x1, y1, x2, y2).
        box2: Tensor (M, >=4) in the same layout.

    Returns:
        Tensor (N, M); entry [i, j] is the IoU of box1[i] and box2[j].
        Pairs of zero-area boxes yield 0 rather than NaN.
    """
    a = box1[:, None, :4]
    b = box2[None, :, :4]

    # Intersection.
    inter_x1 = torch.max(a[..., 0], b[..., 0])
    inter_y1 = torch.max(a[..., 1], b[..., 1])
    inter_x2 = torch.min(a[..., 2], b[..., 2])
    inter_y2 = torch.min(a[..., 3], b[..., 3])
    inter_area = (torch.clamp(inter_x2 - inter_x1, min=0)
                  * torch.clamp(inter_y2 - inter_y1, min=0))

    # Union.
    area1 = (a[..., 2] - a[..., 0]) * (a[..., 3] - a[..., 1])
    area2 = (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
    union = area1 + area2 - inter_area

    return inter_area / union.clamp(min=1e-12)


# 5.2 Visualising predictions
# Finally, a helper to inspect the model's predictions:
def visualize_prediction(image, boxes, class_names=VOC_CLASSES):
    """Draw predicted boxes on a copy of ``image`` and return it.

    Args:
        image: Image array of shape (H, W, 3).
        boxes: Iterable of (x1, y1, x2, y2, conf, cls_idx) with relative
            coordinates, e.g. the output of ``non_max_suppression``.
        class_names: Sequence mapping class id to display name.
    """
    import cv2  # imported lazily; only drawing needs OpenCV

    img = image.copy()
    height, width = img.shape[:2]

    for box in boxes:
        x1, y1, x2, y2, conf, cls_idx = (float(v) for v in box)
        x1, x2 = int(x1 * width), int(x2 * width)
        y1, y2 = int(y1 * height), int(y2 * height)

        # Draw the bounding box.
        color = (0, 255, 0)  # green
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

        # Label and confidence; kept inside the image when the box touches
        # the top edge.
        label = f"{class_names[int(cls_idx)]}: {conf:.2f}"
        cv2.putText(img, label, (x1, max(y1 - 10, 12)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return img


# In a real project, training a full YOLO v1 model on Pascal VOC takes about
# 2-3 days on a single RTX 3090 GPU. Early on, validate the code on a small
# subset of the data and confirm that the loss decreases as expected before
# starting a full training run.