https://github.com/WongKinYiu/yolov7/tree/pose
1. Project Introduction
Keypoint positions:
0. Nose
1. Left eye
2. Right eye
3. Left ear
4. Right ear
5. Left shoulder
6. Right shoulder
7. Left elbow
8. Right elbow
9. Left wrist
10. Right wrist
11. Left hip
12. Right hip
13. Left knee
14. Right knee
15. Left ankle
16. Right ankle
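In the model output these 17 keypoints are packed per person as flat (x, y, confidence) triplets, so keypoint k sits at offset 3*k in the keypoint vector. A minimal sketch of pulling one joint out of such a vector (the helper name and dummy data below are illustrative, not part of the project):

import numpy as np

RIGHT_WRIST = 10  # index from the list above

def get_keypoint(kpts, kid, steps=3):
    # Return (x, y, conf) for keypoint `kid` from a flat keypoint vector.
    x, y = kpts[steps * kid], kpts[steps * kid + 1]
    conf = kpts[steps * kid + 2] if steps == 3 else 1.0
    return x, y, conf

kpts = np.zeros(17 * 3)  # dummy vector: 17 keypoints x (x, y, conf)
x, y, conf = get_keypoint(kpts, RIGHT_WRIST)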
2. Environment Setup
- Platform: Windows 10
- IDE: PyCharm
- CUDA 11.3
- cuDNN 8.2.0.53
conda create -n yolov7Pose python=3.8.0
conda activate yolov7Pose
pip install torch-1.11.0+cu113-cp38-cp38-win_amd64.whl
pip install torchvision-0.12.0+cu113-cp38-cp38-win_amd64.whl
pip install matplotlib==3.2.2 -i https://mirror.baidu.com/pypi/simple
pip install opencv-python -i https://mirror.baidu.com/pypi/simple
pip install Pillow -i https://mirror.baidu.com/pypi/simple
pip install PyYAML==5.3.1 -i https://mirror.baidu.com/pypi/simple
pip install scipy==1.4.1 -i https://mirror.baidu.com/pypi/simple
pip install tensorboard==2.4.1 -i https://mirror.baidu.com/pypi/simple
pip install seaborn==0.11.0 -i https://mirror.baidu.com/pypi/simple
pip install pandas -i https://mirror.baidu.com/pypi/simple
pip install thop -i https://mirror.baidu.com/pypi/simple
pip install tqdm -i https://mirror.baidu.com/pypi/simple
pip install numpy==1.21 -i https://mirror.baidu.com/pypi/simple
PyTorch wheel files can be downloaded from: https://download.pytorch.org/whl/torch_stable.html
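After installing, a quick sanity check confirms the CUDA build is active (a minimal sketch; the expected values assume the exact wheels listed above):

import torch
print(torch.__version__)          # expected: 1.11.0+cu113
print(torch.cuda.is_available())  # expected: True on a CUDA 11.3 machine
print(torch.version.cuda)         # expected: 11.3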
3. Running the Code
python test.py --data data/coco_kpts.yaml --img 960 --conf 0.001 --iou 0.65 --weights yolov7-w6-pose.pt --kpt-label
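yolov7-w6-pose.pt is not part of the repository checkout; it is published as a release asset. A hedged example of fetching it (the direct URL is taken from the project's v0.1 release and may change):

curl -L -o yolov7-w6-pose.pt https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6-pose.pt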
Create main.py and adapt the code as follows:
# -*- coding:utf-8 -*-
import torch
import cv2
import numpy as np
import time
import torchvision
from torchvision import transforms
import socket
import json
_CLASS_COLOR_MAP = [
    (0, 0, 255),    # Person (blue).
    (255, 0, 0),    # Bear (red).
    (0, 255, 0),    # Tree (lime).
    (255, 0, 255),  # Bird (fuchsia).
    (0, 255, 255),  # Sky (aqua).
    (255, 255, 0),  # Cat (yellow).
]
def xyxy2xywh(x):
# Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
y[:, 2] = x[:, 2] - x[:, 0] # width
y[:, 3] = x[:, 3] - x[:, 1] # height
return y
def xywh2xyxy(x):
# Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
return y
def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
# Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw # top left x
y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + padh # top left y
y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + padw # bottom right x
y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + padh # bottom right y
return y
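# Round-trip sanity check for the two converters above (illustrative values):
#   b = np.array([[320., 240., 100., 50.]])   # one box in xywh
#   np.allclose(xyxy2xywh(xywh2xyxy(b)), b)   # -> True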
def box_iou(box1, box2):
# https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
"""
Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Arguments:
box1 (Tensor[N, 4])
box2 (Tensor[M, 4])
Returns:
iou (Tensor[N, M]): the NxM matrix containing the pairwise
IoU values for every element in boxes1 and boxes2
"""
def box_area(box):
# box = 4xn
return (box[2] - box[0]) * (box[3] - box[1])
area1 = box_area(box1.T)
area2 = box_area(box2.T)
# inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
return inter / (area1[:, None] + area2 - inter) # iou = inter / (area1 + area2 - inter)
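# Example: two 2x2 boxes overlapping on half their area share IoU 1/3:
#   a = torch.tensor([[0., 0., 2., 2.]])
#   b = torch.tensor([[1., 0., 3., 2.]])
#   box_iou(a, b)  # inter=2, union=4+4-2=6 -> tensor([[0.3333]])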
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    # Resize and pad image while meeting stride-multiple constraints
shape = img.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
r = min(r, 1.0)
    # Compute padding
    ratio = r, r  # width, height ratios
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
if auto: # minimum rectangle
dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
    elif scaleFill:  # stretch
dw, dh = 0.0, 0.0
new_unpad = (new_shape[1], new_shape[0])
ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
return img, ratio, (dw, dh)
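# Example: a 1080x1920 frame letterboxed to 960 with stride=64 keeps the
# aspect ratio (r=0.5 -> 960x540) and pads the height up to the next
# multiple of 64, returning a 576x960 image with ratio (0.5, 0.5).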
def non_max_suppression_kpt(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, labels=(), kpt_label=False, nc=None, nkpt=None):
"""Runs Non-Maximum Suppression (NMS) on inference results
Returns:
        list of detections, one (n,6) tensor per image [xyxy, conf, cls]
"""
if nc is None:
nc = prediction.shape[2] - 5 if not kpt_label else prediction.shape[2] - 56 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# Settings
min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
max_det = 300 # maximum number of detections per image
max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
time_limit = 10.0 # seconds to quit after
redundant = True # require redundant detections
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
merge = False # use merge-NMS
t = time.time()
output = [torch.zeros((0,6), device=prediction.device)] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
x = x[xc[xi]] # confidence
        # cat apriori labels if autolabelling
if labels and len(labels[xi]):
l = labels[xi]
v = torch.zeros((len(l), nc + 5), device=x.device)
v[:, :4] = l[:, 1:5] # box
v[:, 4] = 1.0 # conf
v[range(len(l)), l[:, 0].long() + 5] = 1.0 # cls
x = torch.cat((x, v), 0)
        # If none remain, process the next image
if not x.shape[0]:
continue
# Compute conf
x[:, 5:5+nc] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, cls)
if multi_label:
i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
else: # best class only
if not kpt_label:
conf, j = x[:, 5:].max(1, keepdim=True)
x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
else:
kpts = x[:, 6:]
conf, j = x[:, 5:6].max(1, keepdim=True)
x = torch.cat((box, conf, j.float(), kpts), 1)[conf.view(-1) > conf_thres]
        # Filter by class
if classes is not None:
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
# Apply finite constraint
# if not torch.isfinite(x).all():
# x = x[torch.isfinite(x).all(1)]
# Check shape
n = x.shape[0] # number of boxes
if not n: # no boxes
continue
elif n > max_nms: # excess boxes
x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence
# Batched NMS
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
if i.shape[0] > max_det: # limit detections
i = i[:max_det]
if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
weights = iou * scores[None] # box weights
x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
output[xi] = x[i]
if (time.time() - t) > time_limit:
print(f'WARNING: NMS time limit {time_limit}s exceeded')
break # time limit exceeded
return output
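# With kpt_label=True each surviving row in the output is
#   [x1, y1, x2, y2, conf, cls, kpt0_x, kpt0_y, kpt0_conf, ..., kpt16_conf]
# i.e. 6 + 17*3 = 57 columns per detection.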
def output_to_keypoint(output):
    # Convert model output to target format [batch_id, class_id, x, y, w, h, conf, kpts...]
targets = []
for i, o in enumerate(output):
kpts = o[:,6:]
o = o[:,:6]
for index, (*box, conf, cls) in enumerate(o.detach().cpu().numpy()):
targets.append([i, cls, *list(*xyxy2xywh(np.array(box)[None])), conf, *list(kpts.detach().cpu().numpy()[index])])
return np.array(targets)
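# Each row of the returned array is laid out as
#   [batch_id, class_id, x_center, y_center, w, h, conf, kpt0_x, kpt0_y, kpt0_conf, ...]
# which is why the main loop below reads keypoints from column 7 onward
# (output[idx, 7:]).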
def plot_skeleton_kpts(im, kpts, steps, orig_shape=None):
# print(kpts)
    # Plot the skeleton and keypoints for the COCO dataset
palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102],
[230, 230, 0], [255, 153, 255], [153, 204, 255],
[255, 102, 255], [255, 51, 255], [102, 178, 255],
[51, 153, 255], [255, 153, 153], [255, 102, 102],
[255, 51, 51], [153, 255, 153], [102, 255, 102],
[51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
[255, 255, 255]])
skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
[7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
[1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
    # Limb colors
pose_limb_color = palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
    # Keypoint colors
pose_kpt_color = palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
radius = 5
num_kpts = len(kpts) // steps
for kid in range(num_kpts):
r, g, b = pose_kpt_color[kid]
if kid == 10:
x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
if not (x_coord % 640 == 0 or y_coord % 640 == 0):
if steps == 3:
conf = kpts[steps * kid + 2]
if conf < 0.01:
continue
cv2.circle(im, (int(x_coord), int(y_coord)), radius, (int(r), int(g), int(b)), -1)
for sk_id, sk in enumerate(skeleton):
r, g, b = pose_limb_color[sk_id]
pos1 = (int(kpts[(sk[0]-1)*steps]), int(kpts[(sk[0]-1)*steps+1]))
pos2 = (int(kpts[(sk[1]-1)*steps]), int(kpts[(sk[1]-1)*steps+1]))
        if steps == 3:
conf1 = kpts[(sk[0]-1)*steps+2]
conf2 = kpts[(sk[1]-1)*steps+2]
if conf1<0.01 or conf2<0.01:
continue
if pos1[0]%640 == 0 or pos1[1]%640==0 or pos1[0]<0 or pos1[1]<0:
continue
if pos2[0] % 640 == 0 or pos2[1] % 640 == 0 or pos2[0]<0 or pos2[1]<0:
continue
cv2.line(im, pos1, pos2, (int(r), int(g), int(b)), thickness=2)
    return x_coord, y_coord
def plot_skeleton_kpts1(im, kpts, steps, orig_shape=None):
# print(kpts)
    # Plot the skeleton and keypoints for the COCO dataset
palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102],
[230, 230, 0], [255, 153, 255], [153, 204, 255],
[255, 102, 255], [255, 51, 255], [102, 178, 255],
[51, 153, 255], [255, 153, 153], [255, 102, 102],
[255, 51, 51], [153, 255, 153], [102, 255, 102],
[51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
[255, 255, 255]])
skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
[7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
[1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
    # Limb colors
pose_limb_color = palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
    # Keypoint colors
pose_kpt_color = palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
radius = 5
num_kpts = len(kpts) // steps
for kid in range(num_kpts):
r, g, b = pose_kpt_color[kid]
if kid == 9:
x_coord1, y_coord1 = kpts[steps * kid], kpts[steps * kid + 1]
            if not (x_coord1 % 640 == 0 or y_coord1 % 640 == 0):
if steps == 3:
conf = kpts[steps * kid + 2]
if conf < 0.01:
continue
cv2.circle(im, (int(x_coord1), int(y_coord1)), radius, (int(r), int(g), int(b)), -1)
for sk_id, sk in enumerate(skeleton):
r, g, b = pose_limb_color[sk_id]
pos1 = (int(kpts[(sk[0]-1)*steps]), int(kpts[(sk[0]-1)*steps+1]))
pos2 = (int(kpts[(sk[1]-1)*steps]), int(kpts[(sk[1]-1)*steps+1]))
        if steps == 3:
conf1 = kpts[(sk[0]-1)*steps+2]
conf2 = kpts[(sk[1]-1)*steps+2]
if conf1<0.01 or conf2<0.01:
continue
if pos1[0]%640 == 0 or pos1[1]%640==0 or pos1[0]<0 or pos1[1]<0:
continue
if pos2[0] % 640 == 0 or pos2[1] % 640 == 0 or pos2[0]<0 or pos2[1]<0:
continue
cv2.line(im, pos1, pos2, (int(r), int(g), int(b)), thickness=2)
    return x_coord1, y_coord1
# --------------------------------------------------------------
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the model
weights = torch.load('yolov7-w6-pose.pt', map_location=device)
model = weights['model']
model = model.half().to(device)
_ = model.eval()
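# Note: inference below runs in FP16 (.half()); on a CPU-only machine drop the
# .half() calls here and on the input tensor in the loop, since many
# half-precision ops are unsupported on CPU.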
# Open the video stream (webcam 0)
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print('open failed.')
# Resolution
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))
# Letterbox resize
vid_write_image = letterbox(cap.read()[1], frame_width, stride=64, auto=True)[0]
resize_height, resize_width = vid_write_image.shape[:2]
# Writer for the result video
out = cv2.VideoWriter("result_keypoint.mp4",
cv2.VideoWriter_fourcc(*'mp4v'), 30,
(resize_width, resize_height))
frame_count = 0
total_fps = 0
str_json ={}
while cap.isOpened():
ret, frame = cap.read()
if ret:
orig_image = frame
image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
image = letterbox(image, (frame_width), stride=64, auto=True)[0]
image_ = image.copy()
image = transforms.ToTensor()(image)
image = torch.tensor(np.array([image.numpy()]))
image = image.to(device)
image = image.half()
start_time = time.time()
with torch.no_grad():
output, _ = model(image)
end_time = time.time()
        # Compute FPS
fps = 1 / (end_time - start_time)
total_fps += fps
frame_count += 1
output = non_max_suppression_kpt(output, 0.25, 0.65, nc=model.yaml['nc'], nkpt=model.yaml['nkpt'], kpt_label=True)
output = output_to_keypoint(output)
# print(output)
nimg = image[0].permute(1, 2, 0) * 255
nimg = nimg.cpu().numpy().astype(np.uint8)
nimg = cv2.cvtColor(nimg, cv2.COLOR_RGB2BGR)
alist = []
blist = []
alist1 = []
blist1 = []
        # Iterate over detected people
for idx in range(output.shape[0]):
            # Right wrist (keypoint 10) and left wrist (keypoint 9)
x_coord,y_coord = plot_skeleton_kpts(nimg, output[idx, 7:].T, 3)
x_coord1, y_coord1 = plot_skeleton_kpts1(nimg, output[idx, 7:].T, 3)
            # Pack [idx, x, y] entries (coordinates rescaled for the receiver) for sorting
alist = [idx, int(abs(x_coord*1.5)), int(y_coord*0.9)]
alist1 = [idx, int(abs(x_coord1 * 1.5)), int(y_coord1 * 0.9)]
            # Append both wrists to the list
            blist.append(alist)
            blist.append(alist1)
            # Sort by x
            result = sorted(blist, key=lambda t: t[1])
keys = [str(x) for x in np.arange(len(result))]
list_json = dict(zip(keys, result))
            str_json = json.dumps(list_json, ensure_ascii=False)  # dict -> JSON string
addr = ('127.0.0.1', 10000)
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
data = str(str_json).encode()
print(data)
s.sendto(data, addr)
s.close()
        # Draw FPS on the frame
cv2.putText(nimg, f"{fps:.3f} FPS", (15, 30), cv2.FONT_HERSHEY_SIMPLEX,
1, (0, 255, 0), 2)
        # Show and save the result
cv2.namedWindow('0', 0)
cv2.resizeWindow("0", 960, 576)
cv2.imshow('0', nimg)
out.write(nimg)
        # Press q to quit
if cv2.waitKey(1) & 0xFF == ord('q'):
break
else:
break
# Release resources
cap.release()
cv2.destroyAllWindows()
# Average FPS
avg_fps = total_fps / frame_count
print(f"Average FPS: {avg_fps:.3f}")
Core code:
def plot_skeleton_kpts(im, kpts, steps, orig_shape=None):
# print(kpts)
    # Plot the skeleton and keypoints for the COCO dataset
palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102],
[230, 230, 0], [255, 153, 255], [153, 204, 255],
[255, 102, 255], [255, 51, 255], [102, 178, 255],
[51, 153, 255], [255, 153, 153], [255, 102, 102],
[255, 51, 51], [153, 255, 153], [102, 255, 102],
[51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
[255, 255, 255]])
skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
[7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
[1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
    # Limb colors
pose_limb_color = palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
    # Keypoint colors
pose_kpt_color = palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
    # Keypoint radius
radius = 5
num_kpts = len(kpts) // steps
    # Iterate over keypoints
for kid in range(num_kpts):
r, g, b = pose_kpt_color[kid]
if kid == 10:
x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
if not (x_coord % 640 == 0 or y_coord % 640 == 0):
if steps == 3:
conf = kpts[steps * kid + 2]
if conf < 0.01:
continue
cv2.circle(im, (int(x_coord), int(y_coord)), radius, (int(r), int(g), int(b)), -1)
for sk_id, sk in enumerate(skeleton):
r, g, b = pose_limb_color[sk_id]
pos1 = (int(kpts[(sk[0]-1)*steps]), int(kpts[(sk[0]-1)*steps+1]))
pos2 = (int(kpts[(sk[1]-1)*steps]), int(kpts[(sk[1]-1)*steps+1]))
        if steps == 3:
conf1 = kpts[(sk[0]-1)*steps+2]
conf2 = kpts[(sk[1]-1)*steps+2]
if conf1<0.01 or conf2<0.01:
continue
if pos1[0]%640 == 0 or pos1[1]%640==0 or pos1[0]<0 or pos1[1]<0:
continue
if pos2[0] % 640 == 0 or pos2[1] % 640 == 0 or pos2[0]<0 or pos2[1]<0:
continue
cv2.line(im, pos1, pos2, (int(r), int(g), int(b)), thickness=2)
    return x_coord, y_coord
4. Results
5. Summary
The advantages of using YOLOv7-pose are:
1. It can recognize people at long range.
2. For body parts the camera cannot capture, it can estimate and fill in the limb keypoints.