U-Net Semantic Segmentation (像素语义分割) — PASCAL VOC 实例
本文为“SCNet Faster R-CNN Transfer Learning Object Detection PASCAL VOC实例” 的延展为使用U-Net即由一个CNN卷积网络将原图像降维至数个小pixel size 的feature以此使得kernel可以在embedding space获得全域更大范围视野后Enconder network再通过一个卷积神经网络Conv2dReLU module, Decoder network将图像还原为原大小并在pixel层面识别图像中的不同objects。这与 DeepLab v3有所不同然而DeepLab v3通常需要大量样本而U-Net通常仅需要较小样本即可完成优化因此U-Net通常作为Medical Image医疗影像处理的首选神经网络而DeepLab v3更倾向于大规模城市场景的应用。本文为公益 类 代码由DeepSeek辅助生成经过实例测试。使用平台为SCNet详见https://blog.csdn.net/YucongCai/article/details/159696147?spm1001.2014.3001.5502https://blog.csdn.net/YucongCai/article/details/159696147?spm1001.2014.3001.5502为了下载U-Net pretrained mode建议安装 python library。segmentation-models-pytorch1. 下载数据集显示两个示例并将数据集准备给神经网络#!/usr/bin/env python3 Complete example: Semantic segmentation on Pascal VOC using U‑Net (PyTorch) import os import random import numpy as np import matplotlib.pyplot as plt from tqdm import tqdm import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader, random_split import torchvision.transforms as transforms from torchvision.datasets import VOCSegmentation import segmentation_models_pytorch as smp # ------------------------------- # 1. Download the dataset (torchvision will download automatically) # ------------------------------- data_root ./data # where the dataset will be stored download True # set to False if you already have the data # The dataset is ~2 GB, download only once train_dataset VOCSegmentation( rootdata_root, year2012, image_settrain, downloaddownload, transformNone, # we apply transforms later target_transformNone ) val_dataset VOCSegmentation( rootdata_root, year2012, image_setval, downloaddownload, transformNone, target_transformNone ) print(fTraining samples: {len(train_dataset)}) print(fValidation samples: {len(val_dataset)}) # ------------------------------- # 2. 
Print two random examples with their segmentation masks # ------------------------------- def show_image_and_mask(dataset, idx): Display an image and its segmentation mask side by side. img, mask dataset[idx] # both are PIL images fig, axes plt.subplots(1, 2, figsize(10, 5)) axes[0].imshow(img) axes[0].set_title(Original image) axes[0].axis(off) axes[1].imshow(mask) axes[1].set_title(Ground truth mask) axes[1].axis(off) plt.tight_layout() plt.show() print(\nRandom example #1:) rand_idx1 random.randint(0, len(train_dataset)-1) show_image_and_mask(train_dataset, rand_idx1) print(\nRandom example #2:) rand_idx2 random.randint(0, len(train_dataset)-1) show_image_and_mask(train_dataset, rand_idx2) # ------------------------------- # 3. Prepare the data for neural network # ------------------------------- # Transformations: # - Resize all images to 256x256 (to keep memory manageable) # - Convert to PyTorch tensors and normalise images to [0,1] # - The mask is kept as a PIL image; later we convert it to a LongTensor image_size 256 transform_img transforms.Compose([ transforms.Resize((image_size, image_size)), transforms.ToTensor(), # scales to [0,1] ]) transform_mask transforms.Compose([ transforms.Resize((image_size, image_size)), transforms.ToTensor(), # yields a float tensor, but we need Long later ]) def transform_train(image, mask): Apply transforms to both image and mask. image transform_img(image) mask transform_mask(mask) # mask is now a float tensor in [0,1]; we need class indices (0‑20) # Multiply by 255 and round to nearest integer, then convert to Long mask (mask * 255).long().squeeze(0) # shape: (H, W) return image, mask def transform_val(image, mask): Same transforms for validation. 
return transform_train(image, mask) # Apply the transforms by wrapping the datasets class VOCDatasetWrapper(torch.utils.data.Dataset): def __init__(self, voc_dataset, transformNone): self.voc_dataset voc_dataset self.transform transform def __len__(self): return len(self.voc_dataset) def __getitem__(self, idx): image, mask self.voc_dataset[idx] if self.transform: image, mask self.transform(image, mask) return image, mask train_dataset_transformed VOCDatasetWrapper(train_dataset, transformtransform_train) val_dataset_transformed VOCDatasetWrapper(val_dataset, transformtransform_val)2. 下载pretrained 一个U-Net神经网络其中Enconder的backbone为pretrained 在ImageNet数据集的ResNet34PASCAL VOC 2012 segmentation 的输出目标为21种不同的class定义模型后开始训练。要注意的是PASCAL VOC 2012采用了255作为图像的特殊voidlabel, 需要借助“criterion nn.CrossEntropyLoss(ignore_index255) # ignore void label” 对loss function进行处理。# Create DataLoaders batch_size 4 train_loader DataLoader(train_dataset_transformed, batch_sizebatch_size, shuffleTrue, num_workers2) val_loader DataLoader(val_dataset_transformed, batch_sizebatch_size, shuffleFalse, num_workers2) # ------------------------------- # 4. 
Define or import the U‑Net for training # ------------------------------- # Pascal VOC has 21 classes (20 objects background) n_classes 21 # Use a pre‑trained U‑Net with a ResNet34 encoder (good trade‑off) model smp.Unet( encoder_nameresnet34, # encoder architecture encoder_weightsimagenet, # use pre‑trained weights on ImageNet in_channels3, # RGB input classesn_classes, # output channels number of classes activationNone # well use CrossEntropyLoss which expects raw logits ) # Move model to GPU if available device torch.device(cuda if torch.cuda.is_available() else cpu) model model.to(device) print(f\nUsing device: {device}) # Loss function: standard CrossEntropy for multi‑class segmentation # criterion nn.CrossEntropyLoss() criterion nn.CrossEntropyLoss(ignore_index255) # ignore void label # Optimizer: Adam with a small learning rate optimizer optim.Adam(model.parameters(), lr1e-4) # Optional learning rate scheduler scheduler optim.lr_scheduler.StepLR(optimizer, step_size5, gamma0.5) # ------------------------------- # 5. Start training # ------------------------------- num_epochs 30 # increase for better performance (e.g. 30‑50) best_val_loss float(inf) print(\nStarting training...) 
for epoch in range(1, num_epochs1): # Training phase model.train() train_loss 0.0 loop tqdm(train_loader, descfEpoch {epoch}/{num_epochs} [Train]) for images, masks in loop: images images.to(device) masks masks.to(device) # shape: (B, H, W), dtype long optimizer.zero_grad() outputs model(images) # shape: (B, 21, H, W) loss criterion(outputs, masks) loss.backward() optimizer.step() train_loss loss.item() * images.size(0) loop.set_postfix(lossloss.item()) train_loss train_loss / len(train_loader.dataset) # Validation phase model.eval() val_loss 0.0 with torch.no_grad(): loop tqdm(val_loader, descfEpoch {epoch}/{num_epochs} [Val]) for images, masks in loop: images images.to(device) masks masks.to(device) outputs model(images) loss criterion(outputs, masks) val_loss loss.item() * images.size(0) loop.set_postfix(lossloss.item()) val_loss val_loss / len(val_loader.dataset) print(fEpoch {epoch:2d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}) # Save the best model based on validation loss if val_loss best_val_loss: best_val_loss val_loss torch.save(model.state_dict(), best_unet_voc.pth) print( - Saved new best model) scheduler.step() print(Training finished.)3. 保存训练后的模型并用训练后的模型进行分类任务。# ------------------------------- # 6. Save the model, load it and use it for two random examples # ------------------------------- # Load the best saved model model.load_state_dict(torch.load(best_unet_voc.pth, map_locationdevice)) model.eval() print(\nLoaded best model from best_unet_voc.pth) # Helper function to visualise predictions (3 rows: original, ground truth, prediction) def predict_and_plot(model, dataset, idx): Plot original image, ground truth mask, and predicted mask for one sample. 
image, true_mask dataset[idx] # image is already a tensor of shape (3, H, W), true_mask is (H, W) # Add batch dimension and send to device with torch.no_grad(): image_batch image.unsqueeze(0).to(device) # (1,3,H,W) output model(image_batch) # (1,21,H,W) pred_mask torch.argmax(output, dim1).squeeze(0) # (H,W) # Move everything to CPU for plotting img_np image.cpu().permute(1,2,0).numpy() true_np true_mask.cpu().numpy() pred_np pred_mask.cpu().numpy() fig, axes plt.subplots(1, 3, figsize(15, 5)) axes[0].imshow(img_np) axes[0].set_title(Original image) axes[0].axis(off) axes[1].imshow(true_np, cmapjet, vmin0, vmax20) axes[1].set_title(Ground truth mask) axes[1].axis(off) axes[2].imshow(pred_np, cmapjet, vmin0, vmax20) axes[2].set_title(Predicted mask) axes[2].axis(off) plt.tight_layout() plt.show() # Pick two random indices from the validation set (unseen data) rand_val_idx1 random.randint(0, len(val_dataset_transformed)-1) rand_val_idx2 random.randint(0, len(val_dataset_transformed)-1) print(\nPrediction on random validation sample #1:) predict_and_plot(model, val_dataset_transformed, rand_val_idx1) print(\nPrediction on random validation sample #2:) predict_and_plot(model, val_dataset_transformed, rand_val_idx2) print(\nDone.)至此一个简单的通过U-Net Pretrained, Transfer Learning)完成的Semantic Segmentation (像素语义分割) 的模型便完成了。我在找工作HR或项目合作请联系yucongcai_businessoutlook.com与科研相关的请联系yucongcai_researchoutlook.com