YOLOv2 PyTorch Implementation Explained
@[TOC](YOLOv2 PyTorch Version)
https://blog.csdn.net/qq_35732321/article/details/127511193
Differences between YOLOv2 and YOLOv1:
1. backbone : darknet19, with feature maps at two scales (26x26xchannels and 13x13xchannels).
2. loss     : the box loss is the MSE between the predicted offsets and the ground-truth offsets.
3. anchor   : prior (anchor) boxes are added to improve localization accuracy, which requires encoding and decoding the data (see the sketch below).
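Point 3 is the core change: a box is encoded as offsets relative to a grid cell and an anchor, and decoded back at inference time. A minimal standalone sketch of that round trip (toy values; the anchor is one of the defaults used later in this post):
```python
import numpy as np

# YOLOv2 box parameterization (paper notation):
#   bx = sigmoid(tx) + cx,  by = sigmoid(ty) + cy   (offset inside a grid cell)
#   bw = pw * exp(tw),      bh = ph * exp(th)       (scaling of an anchor pw, ph)
# Encoding inverts these formulas; the loss in point 2 is the MSE between the
# network's (tx, ty, tw, th) and these encoded targets.
def encode(box, anchor_wh, cell_xy):
    cx, cy, w, h = box                            # box in grid units (0..13)
    tx, ty = cx - cell_xy[0], cy - cell_xy[1]     # target for sigmoid(tx), sigmoid(ty)
    tw, th = np.log(w / anchor_wh[0]), np.log(h / anchor_wh[1])
    return tx, ty, tw, th

def decode(t, anchor_wh, cell_xy):
    tx, ty, tw, th = t
    return (cell_xy[0] + tx, cell_xy[1] + ty,
            anchor_wh[0] * np.exp(tw), anchor_wh[1] * np.exp(th))

box = (6.4, 7.2, 3.0, 5.0)                        # toy ground-truth box, grid units
anchor = (3.33843, 5.47434)                       # one of the default anchors
print(decode(encode(box, anchor, (6, 7)), anchor, (6, 7)))  # recovers the box
```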
```
Contents
1. Data processing
    1.1 Dataset split
    1.2 Converting the data to HDF5
    1.3 Encoding
2. Network model
    2.1 DarkNet19
    2.2 yolo_body + decoder
3. Loss function
    3.1 Positive-sample loss
    3.2 Negative-sample loss
    3.3 Class loss
    3.4 Box loss
4. Training
    4.1 Loading the data
    4.2 Loading the model
    4.3 Loss function
    4.4 Updating the parameters
5. Prediction
    5.1 Data processing
    5.2 Prediction
    5.3 Filtering
    5.4 Drawing boxes
```
## 1. Data Processing
### 1.1 Dataset Split
data_process/datasets_split_1.py
```
aim    : split the dataset into training, validation, and test sets; each split file stores only image names.
input  : xml_path, base_path, trainval_ratio, train_ratio
output : base_path/trainval.txt, base_path/train.txt, base_path/val.txt, base_path/test.txt
process:
        1. Collect all sample names from the XML files under xml_path.
        2. Compute each split's size from trainval_ratio and train_ratio, then randomly sample the indices belonging to each split.
        3. Write each sample name into the file of the split its index falls in.
```
```python
import random, os

xml_path = '../VOCdevkit/VOC2007/Annotations'      # all samples
base_path = '../VOCdevkit/VOC2007/ImageSets/Main'
trainval_ratio = 0.9   # fraction of all samples used for train+val
train_ratio = 0.9      # fraction of train+val used for training
names_list = []
img_names = os.listdir(xml_path)
for name in img_names:
    if name.endswith('.xml'):
        names_list.append(name[:-4])
N = len(names_list)                          # total number of samples
trainval_num = int(N * trainval_ratio)       # train+val size
train_num = int(trainval_num * train_ratio)  # training-set size
trainval_idx = random.sample(range(N), trainval_num)  # train+val indices
train_idx = random.sample(trainval_idx, train_num)    # training indices
# sets make the membership tests below O(1) instead of O(N)
trainval_set, train_set = set(trainval_idx), set(train_idx)
# split files
ftrain_val = open(os.path.join(base_path, 'trainval.txt'), 'w')
ftrain = open(os.path.join(base_path, 'train.txt'), 'w')
fval = open(os.path.join(base_path, 'val.txt'), 'w')
ftest = open(os.path.join(base_path, 'test.txt'), 'w')
# write the names
for i in range(N):
    name = names_list[i] + '\n'
    if i in trainval_set:
        ftrain_val.write(name)
        if i in train_set:
            ftrain.write(name)
        else:
            fval.write(name)
    else:
        ftest.write(name)
ftrain_val.close()
ftrain.close()
fval.close()
ftest.close()
```
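A quick sanity check for the split (a sketch; paths identical to the script above): train and val must partition trainval, and trainval must not overlap test.
```python
import os

base_path = '../VOCdevkit/VOC2007/ImageSets/Main'
splits = {}
for name in ('trainval', 'train', 'val', 'test'):
    with open(os.path.join(base_path, name + '.txt')) as f:
        splits[name] = set(f.read().split())
assert splits['train'] | splits['val'] == splits['trainval']
assert not (splits['trainval'] & splits['test'])
print({k: len(v) for k, v in splits.items()})
```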
### 1.2 Converting the Data to HDF5
data_process/data2hdf5_2.py
```
input  : the dataset splits
output : pascal_voc_07_12_LS.hdf5
process:
        1. Collect the sample ids of each split: train_set --> get_ids(voc_path, train_set) --> train_ids.
        2. Create voc_h5file with variable-length dtypes for the image bytes and the boxes; create one group per split; store 'classes' as a file attribute; in each group create the 'images' and 'boxes' datasets.
        3. train_ids, train_images, train_boxes --> add_to_dataset();
           img_id --> get_img(voc_path, year, img_id); get_boxes(voc_path, year, img_id) --> img_data; img_box
```
Code
```python
import numpy as np
import os, h5py, argparse
import xml.etree.ElementTree as ElementTree

sets_from_2007 = [('2007', 'train'), ('2007', 'val')]
train_set = [('2007', 'train')]
val_set = [('2007', 'val')]
test_set = [('2007', 'test')]
classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
parser = argparse.ArgumentParser(description='Convert Pascal VOC 2007 detection dataset to HDF5')
parser.add_argument('-p', '--path_to_voc', help='path to VOCdevkit directory',
                    default='../VOCdevkit')

def get_ids(voc_path, datasets):
    '''Collect the sample ids of the given splits.'''
    ids = []
    for year, image_set in datasets:  # renamed from `set`, which shadowed the builtin
        id_path = os.path.join(voc_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, image_set))
        print(id_path)
        with open(id_path, 'r') as f:
            ids.extend(f.read().strip().split())
    return ids

def get_img(voc_path, year, img_id):
    '''Read one image as raw JPEG bytes.'''
    img_path = os.path.join(voc_path, 'VOC%s/JPEGImages/%s.jpg' % (year, img_id))
    with open(img_path, 'rb') as f:
        data = f.read()
    return np.frombuffer(data, dtype='uint8')  # [n,]

def get_boxes(voc_path, year, img_id):
    '''Read the boxes of one image from its annotation XML.'''
    boxes_path = os.path.join(voc_path, 'VOC%s/Annotations/%s.xml' % (year, img_id))
    with open(boxes_path, 'r') as f:
        xml_tree = ElementTree.parse(f)
    root = xml_tree.getroot()
    boxes = []
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        xml_box = obj.find('bndbox')
        bbox = (int(xml_box.find('xmin').text),
                int(xml_box.find('ymin').text),
                int(xml_box.find('xmax').text),
                int(xml_box.find('ymax').text),
                classes.index(cls))
        boxes.extend(bbox)
    return np.array(boxes)  # flat [5*n,], reshaped to [n, 5] later

def add_to_dataset(voc_path, year, ids, images, boxes, start=0):
    '''Iterate over the ids and store each image and its boxes.'''
    for i, img_id in enumerate(ids):
        img_data = get_img(voc_path, year, img_id)
        img_box = get_boxes(voc_path, year, img_id)
        images[start + i] = img_data
        boxes[start + i] = img_box
    return i

def _main(args):
    voc_path = os.path.expanduser(args.path_to_voc)
    # 1 collect the sample ids of each split
    train_ids = get_ids(voc_path, train_set)
    val_ids = get_ids(voc_path, val_set)
    test_ids = get_ids(voc_path, test_set)
    train_ids_2007 = get_ids(voc_path, sets_from_2007)
    total_train_ids = len(train_ids) + len(train_ids_2007)
    # 2 create voc_h5file, the vlen dtypes, and one group per split
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_07_12_LS.hdf5')
    voc_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(vlen=np.dtype('uint8'))  # variable-length uint8
    int_dt = h5py.special_dtype(vlen=np.dtype(int))
    train_group = voc_h5file.create_group('train')
    val_group = voc_h5file.create_group('val')
    test_group = voc_h5file.create_group('test')
    # store the class names (not used elsewhere in this project)
    voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))
    # 3 create the images/boxes datasets
    train_images = train_group.create_dataset('images', shape=(total_train_ids,), dtype=uint8_dt)
    val_images = val_group.create_dataset('images', shape=(len(val_ids),), dtype=uint8_dt)
    test_images = test_group.create_dataset('images', shape=(len(test_ids),), dtype=uint8_dt)
    train_boxes = train_group.create_dataset('boxes', shape=(total_train_ids,), dtype=int_dt)
    val_boxes = val_group.create_dataset('boxes', shape=(len(val_ids),), dtype=int_dt)
    test_boxes = test_group.create_dataset('boxes', shape=(len(test_ids),), dtype=int_dt)
    # 4 load the data (only VOC 2007 is actually used here)
    print('Processing Pascal VOC 2007 train+val for the training set.')
    last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, train_boxes)
    print('Processing Pascal VOC 2007 train set.')
    add_to_dataset(voc_path, '2007', train_ids, train_images, train_boxes, start=last_2007 + 1)
    print('Processing Pascal VOC 2007 val set.')
    add_to_dataset(voc_path, '2007', val_ids, val_images, val_boxes)
    print('Processing Pascal VOC 2007 test set.')
    add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
    print('Closing HDF5 file.')
    voc_h5file.close()
    print('Done.')

if __name__ == '__main__':
    _main(parser.parse_args())
    # voc_path = parser.parse_args().path_to_voc
    # datasets = [('2007','train')]
    # ids = get_ids(voc_path, datasets)
    # img = get_img(voc_path, year='2007', img_id='000025')
    # box = get_boxes(voc_path, year='2007', img_id='000025')
    # print(box.reshape(-1, 5))
```
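To verify the conversion, one sample can be read back like this (a sketch assuming the file written above; images are stored as raw JPEG bytes, boxes as flat `(xmin, ymin, xmax, ymax, class_idx)` records):
```python
import io
import h5py
from PIL import Image

with h5py.File('../VOCdevkit/pascal_voc_07_12_LS.hdf5', 'r') as f:
    jpeg_bytes = f['train/images'][0]           # raw JPEG bytes (vlen uint8)
    boxes = f['train/boxes'][0].reshape(-1, 5)  # (xmin, ymin, xmax, ymax, cls)
img = Image.open(io.BytesIO(jpeg_bytes.tobytes()))
print(img.size, boxes)
```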
### 1.3 Encoding
data_process/data_encoder_3.py
```
input  : data_path, anchors_path, idx
output : processed_images[n,3,416,416], out[n,13,13,5,4+1+5]
process:
        1. Read the image, box, and class data: processed_images, processed_boxes = self.process_data(idx)
        2. Encode the boxes into ground-truth offsets plus the class id: out = self.encoder(processed_boxes)
```
Code
```python
import numpy as np
import io, os, warnings, PIL, h5py
from PIL import Image
import torch
import torch.utils.data as data

YOLO_ANCHORS = np.array(
    ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434),
     (7.88282, 3.52778), (9.77052, 9.16828)))

def get_classes(classes_path):
    with open(classes_path) as f:
        class_name = f.read().strip().split()
    return class_name

def get_anchors(anchors_path):
    if os.path.isfile(anchors_path):
        with open(anchors_path) as f:
            anchors = f.read().strip().split()
        return np.array(list(map(float, anchors))).reshape(-1, 2)
    else:
        warnings.warn('Could not open anchors file, using default.')
        return YOLO_ANCHORS

class yoloDataset(data.Dataset):
    image_size = [416, 416]
    def __init__(self, data_path, anchors_path):
        self.anchors = self.get_anchors(anchors_path)
        h5_file = h5py.File(data_path, 'r')
        self.images = h5_file['train/images'][:]
        self.boxes = h5_file['train/boxes'][:]
        # 1 find the maximum number of boxes in any single image
        self.max_num = 0
        self.num_samples = len(self.boxes)
        self.flag = self.boxes is not None
        if self.flag:
            for i in range(self.num_samples):
                self.boxes[i] = self.boxes[i].reshape(-1, 5)
                if self.max_num < self.boxes[i].shape[0]:
                    self.max_num = self.boxes[i].shape[0]
    def __len__(self):
        return self.num_samples
    def __getitem__(self, idx):
        processed_images, processed_boxes = self.process_data(idx)
        out = self.encoder(processed_boxes)
        return torch.tensor(processed_images), torch.tensor(out)
    def get_anchors(self, anchors_path):
        if os.path.isfile(anchors_path):
            with open(anchors_path) as f:
                anchors = f.read().strip().split()
            return np.array(list(map(float, anchors))).reshape(-1, 2)
        else:
            warnings.warn('Could not open anchors file, using default.')
            return YOLO_ANCHORS
    def process_data(self, idx):
        '''
        aim : 1. Normalize the image to 0-1 and move channels first.
              2. box [x1,y1,x2,y2] --> [cx,cy,w,h], relative to the original image;
                 each image's boxes are zero-padded to shape [max_num, 5].
        inputs : idx
        outputs: np.array(img), np.array(new_box)
        '''
        images = self.images[idx]
        boxes = self.boxes[idx]
        img = Image.open(io.BytesIO(images))
        img_shape = np.array(img.size)                         # (w, h) of the original image
        img = img.resize(self.image_size, PIL.Image.BICUBIC)   # (416, 416)
        img = np.array(img, np.float32) / 255.                 # np.float was removed in NumPy 1.24
        img = np.transpose(img, (2, 0, 1))
        if self.flag:
            box = np.concatenate([(boxes[:, 2:4] + boxes[:, :2]) * 0.5 / img_shape,
                                  (boxes[:, 2:4] - boxes[:, :2]) / img_shape,
                                  boxes[:, 4:5]], 1)
            new_box = np.zeros((self.max_num, 5), dtype=np.float32)
            new_box[:len(box), :] = box                        # box (cx,cy,w,h,cls)
            return np.array(img), np.array(new_box)
        else:
            return np.array(img), None
    def encoder(self, boxes):
        '''one picture
        aim   : map the ground-truth boxes onto the feature map:
                1. the boxes' relative values (true_boxes);
                2. the cell/anchor indices each box falls into (detectors_mask);
                3. the ground-truth offsets (matching_true_boxes).
        inputs:
            boxes[max_num, 5(cx,cy,w,h,cls)], anchors[5,2]; image_size=[416,416]
        outputs (concatenated along the last axis):
            true_boxes          : (h, w, num_anchors, 4)   eg (13, 13, 5, 4)
            detectors_mask      : (h, w, num_anchors, 1)   eg (13, 13, 5, 1)
            matching_true_boxes : (h, w, num_anchors, 5)   eg (13, 13, 5, 5)
        '''
        # 1 build the templates
        h, w = self.image_size
        num_anchors = len(self.anchors)
        num_box_params = boxes.shape[1]
        assert h % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        assert w % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        grid_h = h // 32  # 13
        grid_w = w // 32
        true_boxes = np.zeros([grid_h, grid_w, num_anchors, 4], dtype=np.float32)
        detectors_mask = np.zeros([grid_h, grid_w, num_anchors, 1], dtype=np.float32)  # (13, 13, 5, 1)
        matching_true_boxes = np.zeros([grid_h, grid_w, num_anchors, num_box_params], dtype=np.float32)  # (13, 13, 5, 5)
        # drop the zero-padded rows; otherwise they would be encoded at cell
        # (0,0) and np.log(0) below would produce -inf
        boxes = boxes[(boxes[:, 2] > 0) & (boxes[:, 3] > 0)]
        if len(boxes) == 0:
            return np.concatenate([true_boxes, detectors_mask, matching_true_boxes], -1)
        # 2 encode
        box_class = boxes[:, 4]
        box = boxes[:, :4] * np.array([grid_w, grid_h, grid_w, grid_h])  # grid units
        i, j = list(map(int, box[:, 0])), list(map(int, box[:, 1]))
        # note: dim 0 is indexed by x (i); yolo_body transposes its output the
        # same way, so target and prediction layouts stay consistent
        best_idx = self.iou_wh(box[:, 2:4], self.anchors)  # match on (w,h), not (cx,cy): (n,2),(5,2) --> (n,)
        true_boxes[i, j, best_idx] = boxes[:, :4]          # already relative (cx,cy,w,h); no extra division
        detectors_mask[i, j, best_idx] = 1
        adjusted_box = np.array(
            [
                box[:, 0] - i, box[:, 1] - j,
                np.log(box[:, 2] / self.anchors[best_idx][:, 0]),
                np.log(box[:, 3] / self.anchors[best_idx][:, 1]), box_class
            ],
            dtype=np.float32).T
        matching_true_boxes[i, j, best_idx] = adjusted_box
        out = np.concatenate([np.array(true_boxes), np.array(detectors_mask), np.array(matching_true_boxes)], -1)
        return out  # true_boxes, detectors_mask, matching_true_boxes  # (13, 13, 5, 10)
    def iou_wh(self, boxes_wh, anchors_wh):
        '''boxes_wh[n,2], anchors_wh[m,2] --> best anchor index per box'''
        boxes_wh = np.expand_dims(boxes_wh, 1)      # [n,1,2]
        anchors_wh = np.expand_dims(anchors_wh, 0)  # [1,m,2]
        box_max = boxes_wh / 2.
        box_min = -box_max
        anchor_max = anchors_wh / 2.
        anchor_min = -anchor_max
        inter_mins = np.maximum(box_min, anchor_min)   # [n,m,2]
        inter_maxs = np.minimum(box_max, anchor_max)
        inter_wh = np.maximum(inter_maxs - inter_mins, 0.)
        inter_area = inter_wh[..., 0] * inter_wh[..., 1]   # [n,m]
        boxes_area = boxes_wh[..., 0] * boxes_wh[..., 1]
        anchors_area = anchors_wh[..., 0] * anchors_wh[..., 1]  # [1,m]
        iou = inter_area / (boxes_area + anchors_area - inter_area)  # [n,m]
        best_iou = np.max(iou, 1)
        best_idx = np.argmax(iou, 1)
        return list(best_idx * (best_iou > 0))

if __name__ == '__main__':
    from torch.utils.data import DataLoader
    data_path = '../VOCdevkit/pascal_voc_07_12_LS.hdf5'
    anchors_path = '../model_data/anchors.txt'
    train_dataset = yoloDataset(data_path, anchors_path)  # [3, 416, 416], [13, 13, 5, 10]
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
    for i, (img, boxes) in enumerate(train_loader):
        print(img.shape)    # torch.Size([1, 3, 416, 416])
        print(boxes.shape)  # torch.Size([1, 13, 13, 5, 10]) 4+1+5
```
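The encoding can be sanity-checked by inverting it: wherever `detectors_mask` is 1, `matching_true_boxes` plus the matched anchor should reproduce the relative `(cx, cy, w, h)` stored in `true_boxes`. A sketch (assumes `out` is one sample from `yoloDataset`, shape [13, 13, 5, 10], and `anchors` is the same [5, 2] array used to encode):
```python
import numpy as np

def decode_targets(out, anchors, grid=13):
    """Invert encoder(): recover relative (cx, cy, w, h) from the offsets."""
    mask = out[..., 4] > 0                  # detectors_mask channel
    ii, jj, aa = np.nonzero(mask)           # cell (x, y) and anchor indices
    t = out[ii, jj, aa, 5:9]                # (tx, ty, tw, th) targets
    cx = (ii + t[:, 0]) / grid              # dim 0 is x, matching the encoder
    cy = (jj + t[:, 1]) / grid
    wh = anchors[aa] * np.exp(t[:, 2:4]) / grid
    return np.stack([cx, cy, wh[:, 0], wh[:, 1]], 1)

# usage: should match true_boxes at the masked positions
# np.testing.assert_allclose(decode_targets(out, anchors),
#                            out[out[..., 4] > 0][:, :4], atol=1e-5)
```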
## 2. Network Model
### 2.1 DarkNet19
nets/darketnet19.py
```
input  : img[b,3,416,416]
output : feas[b,1024,13,13]
process:
    1. features_26 = (cov_bn_leaky3 --> maxpool)*2 -->
       (bottleneck_block --> maxpool)*2 -->
       bottleneck_x2_block
    2. features_13 = features_26 --> maxpool --> bbx22 (a second bottleneck_x2_block)
```
Code
```python
import torch
import torch.nn as nn
import math

def cov_bn_leaky3(inplanes, outplanes):
    '''3x3 conv + BN + LeakyReLU'''
    return nn.Sequential(
        nn.Conv2d(inplanes, outplanes, kernel_size=3, padding=1),
        nn.BatchNorm2d(outplanes),
        nn.LeakyReLU(0.1)
    )

def cov_bn_leaky1(inplanes, outplanes):
    '''1x1 conv + BN + LeakyReLU'''
    return nn.Sequential(
        nn.Conv2d(inplanes, outplanes, kernel_size=1),
        nn.BatchNorm2d(outplanes),
        nn.LeakyReLU(0.1)
    )

def bottleneck_block(inplanes, outplanes, bottleneck_filters):
    return nn.Sequential(
        cov_bn_leaky3(inplanes, outplanes),
        cov_bn_leaky1(outplanes, bottleneck_filters),
        cov_bn_leaky3(bottleneck_filters, outplanes)
    )

def bottleneck_x2_block(inplanes, outplanes, bottleneck_filters):
    return nn.Sequential(
        bottleneck_block(inplanes, outplanes, bottleneck_filters),
        cov_bn_leaky1(outplanes, bottleneck_filters),
        cov_bn_leaky3(bottleneck_filters, outplanes)
    )

class darknet_body(nn.Module):
    def __init__(self):
        super(darknet_body, self).__init__()
        self.cbl1 = cov_bn_leaky3(3, 32)
        self.cbl2 = cov_bn_leaky3(32, 64)
        self.bb1 = bottleneck_block(64, 128, 64)
        self.bb2 = bottleneck_block(128, 256, 128)
        self.bbx21 = bottleneck_x2_block(256, 512, 256)
        self.bbx22 = bottleneck_x2_block(512, 1024, 512)
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.features_26 = nn.Sequential(self.cbl1, self.maxpool, self.cbl2, self.maxpool, self.bb1,
                                         self.maxpool, self.bb2, self.maxpool, self.bbx21)
        self.features_13 = nn.Sequential(self.features_26, self.maxpool, self.bbx22)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))  # He initialization
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
    def forward(self, x):
        # layer-by-layer trace:
        # out = self.cbl1(x)       # [1, 32, 416, 416]
        # out = self.maxpool(out)  # [1, 32, 208, 208]
        # out = self.cbl2(out)     # [1, 64, 208, 208]
        # out = self.maxpool(out)  # [1, 64, 104, 104]
        # out = self.bb1(out)      # [1, 128, 104, 104]
        # out = self.maxpool(out)  # [1, 128, 52, 52]
        # out = self.bb2(out)      # [1, 256, 52, 52]
        # out = self.maxpool(out)  # [1, 256, 26, 26]
        # out = self.bbx21(out)    # [1, 512, 26, 26]
        # out = self.maxpool(out)  # [1, 512, 13, 13]
        # out = self.bbx22(out)    # [1, 1024, 13, 13]
        x = self.features_13(x)
        return x

def darknet19(inputs):
    """Generate Darknet-19 model for ImageNet classification."""
    body = darknet_body()(inputs)
    logits = nn.Conv2d(1024, 1000, (1, 1))(body)
    logits = nn.Softmax(1)(logits)
    return logits

if __name__ == '__main__':
    x = torch.randn([1, 3, 416, 416])
    # y = cov_bn_leaky1(3, 10)(x)
    # y = bottleneck_block(3, 30, 20)
    # y = bottleneck_x2_block(3, 30, 20)(x)
    y = darknet_body()
    print('y.features_26 :', y.features_26)
    print('\n')
    print('y.bbx22 :', y.bbx22)
    # for i in y.children():
    #     print(i)
```
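A quick shape check of the two feature taps that `yolo_body` consumes in the next section (a sketch; run from the project root so the import resolves):
```python
import torch
from nets.darketnet19 import darknet_body

net = darknet_body()
x = torch.randn(1, 3, 416, 416)
with torch.no_grad():
    print(net.features_26(x).shape)  # expected: torch.Size([1, 512, 26, 26])
    print(net.features_13(x).shape)  # expected: torch.Size([1, 1024, 13, 13])
```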
### 2.2 yolo_body + decoder
nets/yolo_model.py
```
(1) yolo_body
input  : [1,3,416,416]
output : [1, 13, 13, 125]
process:
1. compute fea_26 and fea_13
2. torch.cat([fea_26, fea_13], 1) -->
   cov_bn_leaky3, cov_bn_leaky1 -->
   transpose
(2) yolo_decoder
inputs:
        feats: tensor, [None,13,13,125]
        anchors: array-like, anchor box widths and heights, (5,2)
        num_classes: int, number of target classes, 20
outputs:
        box_xy[1, 13, 13, 5, 2]
        box_wh[1, 13, 13, 5, 2]
        box_conf[1, 13, 13, 5, 1]
        box_class_pred[1, 13, 13, 5, 20]
process:
        applies the YOLOv2 decoding formulas; the inverse of the encoding step.
```
Code
```python
import sys
sys.path.append('..')  # make `nets` importable when this file is run directly
import numpy as np
import torch
import torch.nn as nn
from nets.darketnet19 import cov_bn_leaky1, cov_bn_leaky3, darknet_body

voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41],
                        [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def grid(h, w):
    '''Cell-index grid: cx varies along dim 0, cy along dim 1.'''
    cx = torch.repeat_interleave(torch.arange(h), w).view(-1, 1)
    cy = torch.Tensor.repeat(torch.arange(w), h).view(-1, 1)
    return torch.cat([cx, cy], 1)

class yolo_body(nn.Module):
    def __init__(self, num_anchors=5, num_classes=20):
        super(yolo_body, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        self.darknet = darknet_body()
        self.fea_13 = nn.Sequential(self.darknet.features_13, cov_bn_leaky3(1024, 1024),
                                    cov_bn_leaky3(1024, 1024))
        self.fea_26 = nn.Sequential(self.darknet.features_26, cov_bn_leaky1(512, 64))
        # the head must be built here, not in forward(): layers created inside
        # forward() are re-initialized on every call and never get trained
        self.head = nn.Sequential(
            cov_bn_leaky3(1280, 1024),
            cov_bn_leaky1(1024, self.num_anchors * (self.num_classes + 5)))
    def pass_through(self, x):
        '''space-to-depth: every 2x2 spatial block becomes 4 channels'''
        return torch.cat([x[:, :, ::2, ::2], x[:, :, ::2, 1::2],
                          x[:, :, 1::2, ::2], x[:, :, 1::2, 1::2]], 1)
    def forward(self, x):
        fea_13 = self.fea_13(x)              # [b, 1024, 13, 13]
        fea_26 = self.fea_26(x)              # [b, 64, 26, 26]
        fea_26 = self.pass_through(fea_26)   # [b, 256, 13, 13]
        out = torch.cat([fea_26, fea_13], 1) # [b, 1280, 13, 13]
        out = self.head(out)                 # [b, 125, 13, 13]
        out = torch.transpose(out, 1, 3)
        return out  # inputs:[1,3,416,416] --> outputs:[1, 13, 13, 125]

'''
# functional version kept for reference (same layers-in-forward pitfall):
def yolo_body(inputs, num_anchors=5, num_classes=20):
    darknet = darknet_body()
    features_26 = darknet.features_26
    features_13 = darknet.features_13
    fea_13 = nn.Sequential(features_13, cov_bn_leaky3(1024, 1024),
                           cov_bn_leaky3(1024, 1024))(inputs)
    fea_26 = nn.Sequential(features_26, cov_bn_leaky1(512, 64))(inputs)
    fea_26 = pass_through(fea_26)
    out = torch.cat([fea_26, fea_13], 1)
    out = cov_bn_leaky3(1280, 1024)(out)
    out = cov_bn_leaky1(1024, num_anchors * (num_classes + 5))(out)
    out = torch.transpose(out, 1, 3)
    print('out.shape:', out.shape)
    return out  # inputs:[1,3,416,416] --> outputs:[1, 13, 13, 125]
'''

def yolo_decoder(feats, anchors, num_classes):
    '''Convert final-layer features to bounding-box parameters.
    inputs:
        feats: tensor, [None,13,13,125]
        anchors: array-like, anchor box widths and heights.
        num_classes: int, number of target classes.
    outputs:
        box_xy, box_wh, box_confidence, box_class_probs
    '''
    grids = feats.shape[1:3]     # torch.Size([13, 13])
    num_anchors = len(anchors)   # 5
    anchors_wh = torch.from_numpy(anchors).float().view(1, 1, 1, num_anchors, 2)   # [1, 1, 1, 5, 2]
    anchors_cxy = grid(grids[0], grids[1]).view(-1, grids[0], grids[1], 1, 2)      # [1, 13, 13, 1, 2]
    feats = feats.view(-1, grids[0], grids[1], num_anchors, num_classes + 5)  # [1,13,13,125]-->[1,13,13,5,25]
    box_xy = torch.sigmoid(feats[..., :2])               # [1,13,13,5,2]
    box_wh = torch.exp(feats[..., 2:4])                  # [1,13,13,5,2]
    box_confidence = torch.sigmoid(feats[..., 4:5])      # [1,13,13,5,1]
    box_class_probs = torch.softmax(feats[..., 5:], -1)  # [1,13,13,5,20]
    box_xy = (box_xy + anchors_cxy) / torch.tensor(list(grids))  # [1, 13, 13, 5, 2]
    box_wh = box_wh * anchors_wh / torch.tensor(list(grids))     # [1, 13, 13, 5, 2]
    return box_xy, box_wh, box_confidence, box_class_probs

if __name__ == '__main__':
    x = torch.randn([1, 3, 416, 416])
    net = yolo_body()
    print(net(x).shape)  # torch.Size([1, 13, 13, 125])
    # box_xy, box_wh, box_confidence, box_class_probs = yolo_decoder(feats=net(x), anchors=voc_anchors, num_classes=20)
```
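The `pass_through` (reorg) layer above is a space-to-depth rearrangement: every 2x2 spatial block becomes 4 channels, so [1, 64, 26, 26] becomes [1, 256, 13, 13], which is why the concatenated tensor has 256 + 1024 = 1280 channels. A standalone sketch:
```python
import torch

# values are preserved, only rearranged: channels x4, spatial /2
x = torch.arange(2 * 4 * 4).float().view(1, 2, 4, 4)
y = torch.cat([x[:, :, ::2, ::2], x[:, :, ::2, 1::2],
               x[:, :, 1::2, ::2], x[:, :, 1::2, 1::2]], 1)
print(x.shape, '->', y.shape)  # torch.Size([1, 2, 4, 4]) -> torch.Size([1, 8, 2, 2])
```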
## 3. Loss Function
loss.py
```
input  : pred(b, 13, 13, 125), target(b, 13, 13, 5, 10)
output : total_loss
process:
    1. prepare the data
       target --> true_boxes, detectors_mask, matching_true_boxes
       pred --> sigmoid --> pred_d_boxes
       pred --> yolo_decoder() --> pred_xy, pred_wh, pred_confidence, pred_class_prob
    2. positive-sample loss: best_iou (or 1) - pred_confidence, detectors_mask --> objects_loss
    3. negative-sample loss: (pred_xy, pred_wh), true_boxes --> iou --> object_detections;
                             object_detections, detectors_mask, pred_confidence --> no_objects_loss
    4. class loss: matching_true_boxes[...,-1], pred_class_prob, detectors_mask --> classification_loss
    5. box loss  : matching_true_boxes[...,:4], pred_d_boxes, detectors_mask --> coordinates_loss
```
Code
```python
import torch
import numpy as np
import torch.nn as nn
from nets.yolo_model import yolo_decoder

voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41],
                        [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
'''
model_body.output     (b, 13, 13, 125)
detectors_mask_input  (b, 13, 13, 5, 1)
matching_boxes_input  (b, 13, 13, 5, 5)
'''
class yoloLoss(nn.Module):
    def __init__(self, object_scale, no_object_scale, class_scale,
                 coordinates_scale, anchors, num_classes,
                 rescore_confidence=False, print_loss=False):
        super(yoloLoss, self).__init__()
        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.class_scale = class_scale
        self.coordinates_scale = coordinates_scale
        self.rescore_confidence = rescore_confidence
        self.print_loss = print_loss
        self.anchors = anchors
        self.num_classes = num_classes
    def compute_iou(self, box_t, box_p):
        '''box_t[b,13,13,5,4], box_p[b,13,13,5,4], both (x1,y1,x2,y2); element-wise IoU'''
        # 1 lt, rd --> wh --> inter + areas --> iou
        lt = torch.maximum(box_t[..., :2], box_p[..., :2])
        rd = torch.minimum(box_t[..., 2:], box_p[..., 2:])
        wh = rd - lt
        wh[wh < 0] = 0                       # [b,13,13,5,2]
        inter = wh[..., 0] * wh[..., 1]      # [b,13,13,5]
        area_t = (box_t[..., 3] - box_t[..., 1]) * (box_t[..., 2] - box_t[..., 0])
        area_p = (box_p[..., 3] - box_p[..., 1]) * (box_p[..., 2] - box_p[..., 0])
        iou = inter / (area_t + area_p - inter)
        return iou                           # [b,13,13,5]
    def yolo_loss(self, pred, target):
        # 1 prepare the data
        num_anchors = len(self.anchors)
        yolo_output = pred                      # [b, 13, 13, 125 ]
        true_boxes = target[..., :4]            # [b, 13, 13, 5, 4]
        detectors_mask = target[..., 4:5]       # [b, 13, 13, 5, 1]
        matching_true_boxes = target[..., 5:]   # [b, 13, 13, 5, 5]
        # decode with the same anchors used for encoding, not a separate global
        pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_decoder(
            yolo_output, anchors=self.anchors, num_classes=self.num_classes)
        # predicted offsets
        yolo_output_shape = yolo_output.shape[1:3]  # torch.Size([13, 13])
        feats = yolo_output.view(-1, yolo_output_shape[0], yolo_output_shape[1],
                                 num_anchors, self.num_classes + 5)  # [b, 13, 13, 5, 25]
        pred_d_boxes = torch.cat((torch.sigmoid(feats[..., 0:2]), feats[..., 2:4]), dim=-1)  # [b, 13, 13, 5, 4]
        # 2 IoU between the assigned true boxes and (pred_xy, pred_wh)
        true_box = torch.cat([(true_boxes[..., :2] - true_boxes[..., 2:4] / 2.),
                              (true_boxes[..., :2] + true_boxes[..., 2:4] / 2.)], -1)
        pred_box = torch.cat([(pred_xy - pred_wh / 2.), (pred_xy + pred_wh / 2.)], -1)
        iou = self.compute_iou(true_box, pred_box)    # [b, 13, 13, 5]
        best_iou = iou.unsqueeze(-1)                  # [b, 13, 13, 5, 1]
        object_detections = (best_iou > 0.6).float()  # [b, 13, 13, 5, 1]
        # 3 loss
        # 3.1 no_obj loss: cells that neither hold a target nor overlap one strongly
        no_objects_loss = self.no_object_scale * (1 - object_detections) * \
                          (1 - detectors_mask) * torch.square(pred_confidence)
        # 3.2 obj loss
        if self.rescore_confidence:
            objects_loss = self.object_scale * detectors_mask * torch.square(best_iou - pred_confidence)
        else:
            objects_loss = self.object_scale * detectors_mask * torch.square(1 - pred_confidence)
        # 3.3 confidence loss = obj loss + no_obj loss
        confidence_loss = (objects_loss + no_objects_loss).sum()
        # 3.4 cls loss; pred_class_prob [b,13,13,5,20]
        matching_classes = matching_true_boxes[..., 4]   # [b, 13, 13, 5]
        s1, s2, s3, s4 = matching_classes.shape
        one_hot = torch.eye(self.num_classes)
        # class ids are stored as floats, so cast before indexing
        matching_classes = one_hot[matching_classes.flatten().long()].view(s1, s2, s3, s4, self.num_classes)
        classification_loss = (self.class_scale * detectors_mask *
                               torch.square(matching_classes - pred_class_prob)).sum()
        # 3.5 box loss: MSE between true offsets and predicted offsets
        matching_boxes = matching_true_boxes[..., 0:4]
        coordinates_loss = (self.coordinates_scale * detectors_mask *
                            torch.square(matching_boxes - pred_d_boxes)).sum()
        total_loss = 0.5 * (confidence_loss + classification_loss + coordinates_loss)
        return total_loss

if __name__ == '__main__':
    print('PyCharm')
```
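A smoke test for the loss with random tensors (a sketch; the shapes match the encoder and `yolo_body` outputs used above):
```python
import torch
import numpy as np
from loss import yoloLoss

anchors = np.array([[1.08, 1.19], [3.42, 4.41], [6.63, 11.38],
                    [9.42, 5.11], [16.62, 10.52]])
criterion = yoloLoss(object_scale=5, no_object_scale=1, class_scale=1,
                     coordinates_scale=1, anchors=anchors, num_classes=20)
pred = torch.randn(2, 13, 13, 125)       # yolo_body output
target = torch.zeros(2, 13, 13, 5, 10)   # encoder output (here: no objects)
print(criterion.yolo_loss(pred, target))
```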
## 4. Training
train.py
```
process:
        1. load the data
        2. load the model
        3. set up the loss function
        4. update the parameters
```
Code
```python
import torch
import numpy as np
from loss import yoloLoss
from nets.yolo_model import yolo_body
from torch.utils.data import DataLoader
from data_process.data_encoder_3 import get_classes, get_anchors, yoloDataset

# 1 parameters
use_gpu = False
learning_rate = 0.001
num_epochs = 1
batch_size = 1
# 2 model
net = yolo_body()
params = []
params_dict = dict(net.named_parameters())
for k, v in params_dict.items():
    # placeholder for per-layer learning rates; both branches currently use the
    # same rate (and no parameter name here actually starts with 'features')
    if k.startswith('features'):
        params += [{'params': [v], 'lr': learning_rate * 1}]
    else:
        params += [{'params': [v], 'lr': learning_rate * 1}]

# 3 loss + optimizer
anchors_path = 'model_data/anchors.txt'
classes_path = 'model_data/pascal_classes.txt'
anchors = get_anchors(anchors_path)
classes = get_classes(classes_path)
num_classes = len(classes)
cost = yoloLoss(5, 1, 1, 1, anchors, num_classes)
optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=5e-4)
# 4 data (the dataset opens the HDF5 file itself)
data_path = 'VOCdevkit/pascal_voc_07_12_LS.hdf5'
train_dataset = yoloDataset(data_path, anchors_path)  # (n, 3, 416, 416), (n, 13, 13, 5, 10)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# 5 train
num_iter = 0
best_test_loss = np.inf
for epoch in range(num_epochs):
    net.train()
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    for params_group in optimizer.param_groups:
        params_group['lr'] = learning_rate
    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))
    total_loss = 0.
    for i, (img, targets) in enumerate(train_loader):
        imgs = img.to(torch.float32)   # [b, 3, 416, 416]; Variable() is deprecated
        pred = net(imgs)               # targets: [b, 13, 13, 5, 10]
        loss = cost.yolo_loss(pred, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()      # accumulate so the average below is meaningful
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_loader), loss.item(), total_loss / (i + 1)))
            num_iter += 1
```
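The loop above never saves weights. A checkpoint helper is a common addition (a sketch; the file name `yolov2_best.pth` is my own choice, not from the original code):
```python
import torch

def save_if_best(net, epoch_loss, best_loss, path='yolov2_best.pth'):
    """Save the model whenever the average epoch loss improves."""
    if epoch_loss < best_loss:
        torch.save(net.state_dict(), path)
        return epoch_loss
    return best_loss

# at the end of each epoch:
# best_test_loss = save_if_best(net, total_loss / len(train_loader), best_test_loss)
```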
## 5. Prediction
predict.py
```
process:
    1. data processing
    2. prediction
    3. filtering
    4. drawing boxes
```
Code
```python
'''
# 1 img process
# 2 predict --> decoder
# 3 filter_boxes
# 4 draw
'''
import torchvision.transforms as transforms
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import colorsys, os, torch, cv2
from nets.yolo_model import yolo_body, yolo_decoder
from data_process.data_encoder_3 import get_classes, get_anchors

def yolo_boxes_to_corners(box_xy, box_wh):
    '''(cx,cy,w,h) --> (ymin,xmin,ymax,xmax)'''
    box_mins = box_xy - (box_wh / 2.)
    box_maxes = box_xy + (box_wh / 2.)
    return torch.cat([box_mins[..., 1:2], box_mins[..., 0:1],
                      box_maxes[..., 1:2], box_maxes[..., 0:1]], -1)

def yolo_filter_boxes(boxes, box_confidence, box_class_probs, threshold=.6):
    '''
    inputs:
        boxes           [1,13,13,5,4 ]
        box_confidence  [1,13,13,5,1 ]
        box_class_probs [1,13,13,5,20]
    outputs:
        boxes[n,4], scores[n], classes[n]
    '''
    box_scores = box_confidence * box_class_probs                  # [1,13,13,5,20]
    box_class_scores, box_classes = torch.max(box_scores, dim=-1)  # [1,13,13,5], [1,13,13,5]
    prediction_mask = box_class_scores >= threshold                # [1,13,13,5]
    boxes = boxes[prediction_mask]              # [n,4]
    scores = box_class_scores[prediction_mask]  # [n]
    classes = box_classes[prediction_mask]      # [n]
    return boxes, scores, classes

def nms(bboxes, scores, threshold=0.5):
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    _, order = scores.sort(0, descending=True)
    keep = []
    while order.numel() > 0:
        if order.numel() > 1:
            i = order[0]
        else:
            i = order
        keep.append(int(i))
        if order.numel() == 1:
            break
        xx1 = x1[order[1:]].clamp(min=x1[i])
        yy1 = y1[order[1:]].clamp(min=y1[i])
        xx2 = x2[order[1:]].clamp(max=x2[i])  # was clamp(max=x1[i]): a bug
        yy2 = y2[order[1:]].clamp(max=y2[i])  # was clamp(max=y1[i]): a bug
        w = (xx2 - xx1).clamp(min=0)
        h = (yy2 - yy1).clamp(min=0)
        inter = w * h
        ove = inter / (areas[i] + areas[order[1:]] - inter)
        ids = torch.nonzero(ove <= threshold).squeeze()
        if ids.numel() == 0:
            break
        order = order[ids + 1]
    return torch.LongTensor(keep)

def yolo_eval(yolo_outputs, image_shape=[416, 416],
              score_threshold=.6, iou_threshold=.5):
    '''score filter + NMS
       box_xy[1,13,13,5,2], box_wh[1,13,13,5,2],
       box_confidence[1,13,13,5,1], box_class_probs[1,13,13,5,20]
    '''
    box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs
    boxes = yolo_boxes_to_corners(box_xy, box_wh)  # [1, 13, 13, 5, 4]
    # 1 score filter
    boxes, scores, classes = yolo_filter_boxes(
        boxes, box_confidence, box_class_probs, threshold=score_threshold)
    # map the relative boxes back onto the image; boxes are (y,x,y,x)
    boxes = boxes * torch.tensor([image_shape[0], image_shape[1], image_shape[0], image_shape[1]])
    # 2 NMS
    keep = nms(boxes, scores, iou_threshold)
    return boxes[keep], scores[keep], classes[keep]

def detect_img():
    # 1 img process
    image_name = '000015.jpg'
    image = cv2.imread('VOCdevkit/VOC2007/JPEGImages/' + image_name)  # eg (375, 500, 3)
    h, w, _ = image.shape
    img = cv2.resize(image, (416, 416))         # (416, 416, 3)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.array(img, np.float32) / 255.
    transform = transforms.Compose([transforms.ToTensor(), ])
    img = transform(img)                        # torch.Size([3, 416, 416])
    img = img[None, :, :, :]                    # Variable(..., volatile=True) is deprecated
    # 2 predict --> decoder
    net = yolo_body()
    net.eval()
    print('load model...')
    print('predicting...')
    with torch.no_grad():
        feas = net(img)
    anchors_path = 'model_data/anchors.txt'
    classes_path = 'model_data/pascal_classes.txt'
    anchors = get_anchors(anchors_path)
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    pred = yolo_decoder(feas, anchors, num_classes)
    # box_xy[1,13,13,5,2], box_wh[1,13,13,5,2], box_confidence[1,13,13,5,1], box_class_probs[1,13,13,5,20]
    # 3 filter_boxes: scale to the original image size (h, w), not 416
    boxes, scores, classes = yolo_eval(pred, image_shape=[h, w])  # [n,4],[n],[n]
    print(boxes.shape, scores.shape, classes.shape)
    # 4 draw (PIL drawing needs a PIL image, not the cv2 ndarray)
    image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    hsv_tuples = [(x / len(class_names), 1., 1.)
                  for x in range(len(class_names))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(
        map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
            colors))
    font = ImageFont.truetype(
        font='font/FiraMono-Medium.otf',
        size=np.floor(3e-2 * h + 0.5).astype('int32'))
    thickness = (h + w) // 300
    draw = ImageDraw.Draw(image)
    for i, c in reversed(list(enumerate(classes))):
        predicted_class = class_names[c]
        box = boxes[i]
        score = scores[i]
        label = '{} {:.2f}'.format(predicted_class, score)
        label_size = draw.textsize(label, font)
        top, left, bottom, right = box.tolist()  # plain floats for np.floor/.astype
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
        print(label, (left, top), (right, bottom))
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])
        # draw the rectangle `thickness` pixels thick; the label is drawn once
        for t in range(thickness):
            draw.rectangle(
                [left + t, top + t, right - t, bottom - t],
                outline=colors[c])
        draw.rectangle(
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=colors[c])
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
    del draw
    image.save(os.path.join('image', image_name), quality=90)

if __name__ == '__main__':
    detect_img()
```
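Note that `detect_img()` builds a fresh, untrained `yolo_body`, so its boxes are meaningless until weights are restored. A sketch of loading a checkpoint first (the file name matches the hypothetical helper from the training section):
```python
import torch
from nets.yolo_model import yolo_body

net = yolo_body()
net.load_state_dict(torch.load('yolov2_best.pth', map_location='cpu'))
net.eval()
```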
Code download for this article:
Link: [link](https://pan.baidu.com/s/1rFqB8tUU24fFBHrNmWzuTw?pwd=123a)
References:
https://github.com/abeardear/pytorch-YOLO-v1
https://github.com/allanzelener/yad2k