OpenCV Recipes: Object Tracking

In this post, we are going to learn how to track an object in a live video stream.

Frame Differencing

This is probably the simplest technique for seeing which parts of a video are moving. When we work with a live video stream, the difference between consecutive frames gives us a lot of information.

import cv2

# Compute the frame difference
def frame_diff(prev_frame, cur_frame, next_frame):
    # Absolute difference between current frame and next frame
    diff_frames1 = cv2.absdiff(next_frame, cur_frame)

    # Absolute difference between current frame and
    # previous frame
    diff_frames2 = cv2.absdiff(cur_frame, prev_frame)

    # Return the result of bitwise 'AND' between the
    # above two resultant images to obtain a mask where
    # only the areas with white pixels are shown
    return cv2.bitwise_and(diff_frames1, diff_frames2)

# Capture the frame from webcam
def get_frame(cap, scaling_factor):
    # Capture the frame
    ret, frame = cap.read()
    # Resize the image
    frame = cv2.resize(frame, None, fx=scaling_factor,
            fy=scaling_factor, interpolation=cv2.INTER_AREA)

    return frame


if __name__=='__main__':
    cap = cv2.VideoCapture(0)
    scaling_factor = 0.5

    cur_frame, prev_frame, next_frame = None, None, None
    while True:
        frame = get_frame(cap, scaling_factor)
        prev_frame = cur_frame
        cur_frame = next_frame
        # Convert frame to grayscale image
        # (webcam frames are BGR, so use COLOR_BGR2GRAY)
        next_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_frame is not None:
            cv2.imshow("Object Movement", frame_diff(prev_frame, cur_frame,
                next_frame))

        key = cv2.waitKey(delay=10)
        if key == 27:
            break

    cv2.destroyAllWindows()

A 10 ms delay is used so that there is enough time between frames for a noticeable difference to appear.
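The raw difference image from the recipe above is often speckled with sensor noise. One optional refinement, not part of the original code, is to threshold it into a binary motion mask; the cutoff of 30 below is just an assumed starting value to tune for your lighting:

import cv2

# 'diff' is assumed to be the output of frame_diff() above.
# Pixels whose difference exceeds the assumed cutoff become white (255),
# everything else becomes black (0).
def motion_mask(diff, cutoff=30):
    _, mask = cv2.threshold(diff, cutoff, 255, cv2.THRESH_BINARY)
    return mask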

Colorspace-Based Tracking

Frame differencing gives us some useful information, but we cannot build a meaningful tracker with it. To build a good object tracker, we need to understand what kind of features can make our tracking more robust and accurate. So let's see how to use colorspaces to design a good tracker: we can convert the image to HSV space and then use colorspace thresholding to track a given object.

The tracker identifies a specific object in the video based on its color characteristics. To use this tracker, we need to know the color distribution of our target object (see the sketch after the code for one way to estimate it). Here is the code:

import cv2
import numpy as np

# Capture the frame from webcam
def get_frame(cap, scaling_factor):
    # Capture the frame
    ret, frame = cap.read()
    # Resize the image
    frame = cv2.resize(frame, None, fx=scaling_factor,
            fy=scaling_factor, interpolation=cv2.INTER_AREA)

    return frame


if __name__=='__main__':
    cap = cv2.VideoCapture(0)
    scaling_factor = 0.5

    # Define 'blue' range in HSV color space
    lower = np.array([60,100,100])
    upper = np.array([180,255,255])

    while True:
        frame = get_frame(cap, scaling_factor)

        # Convert to HSV color space
        hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)

        # Threshold the HSV image to get only blue color
        mask = cv2.inRange(hsv_frame, lower, upper)

        # Bitwise-AND mask and original image
        res = cv2.bitwise_and(frame, frame, mask=mask)
        res = cv2.medianBlur(res, ksize=5)

        cv2.imshow('Original image', frame)
        cv2.imshow('Color Detector', res)

        # Check if the user pressed ESC key
        c = cv2.waitKey(delay=10)
        if c == 27:
            break

    cv2.destroyAllWindows()
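If you are not sure which HSV values describe your object, one rough way to estimate them (a sketch, not part of the original recipe) is to sample a small patch of the object in a single frame and take the per-channel minimum and maximum; the patch coordinates below are assumptions for illustration:

import cv2

# 'frame' is a single BGR frame containing the object;
# (x0, y0, x1, y1) is an assumed rectangle drawn around it.
def estimate_hsv_range(frame, x0=100, y0=100, x1=150, y1=150):
    hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    patch = hsv[y0:y1, x0:x1].reshape(-1, 3)
    lower = patch.min(axis=0)
    upper = patch.max(axis=0)
    return lower, upper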

Building an Interactive Object Tracker

The colorspace-based tracker gives us the freedom to track colored objects, but we are restricted to predefined colors. What if we just want to pick an arbitrary object? How do we build a tracker that learns the characteristics of the selected object and then tracks it automatically? This is where the CAMShift (Continuously Adaptive Meanshift) algorithm comes in.

The concept behind Mean Shift is actually nice and simple. Suppose we select a region of interest and we want our tracker to follow that object. Within this region, we select a number of points based on the color histogram and compute the centroid. If the centroid lies at the center of the region, we know the object hasn't moved. But if the centroid is not at the center of the region, we know the object has moved in some direction, and the movement of the centroid tells us which direction. We then move the bounding box to a new location so that the new centroid becomes the center of the box. The algorithm is called Mean Shift because the mean (i.e. the centroid) keeps shifting. This way, we keep updating the current location of the object.
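OpenCV exposes plain Mean Shift directly as cv2.meanShift. A minimal sketch of the shifting step, assuming you already have a histogram back-projection image prob and an initial (x, y, w, h) window track_window, looks like this:

import cv2

# Stop after 10 iterations or when the window moves by less than 1 pixel
term_crit = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 1)

# 'prob' is an assumed histogram back-projection of the current frame,
# 'track_window' an assumed initial (x, y, w, h) selection.
num_iters, track_window = cv2.meanShift(prob, track_window, term_crit)
x, y, w, h = track_window  # same size as before; only the position shifts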

The problem with Mean Shift is that the size of the bounding box is not allowed to change. When you move the object away from the camera, it looks smaller to the human eye, but Mean Shift does not take this into account: the size of the bounding box stays the same for the entire tracking session. This is why we need CAMShift. The advantage of CAMShift is that it adapts the size of the bounding box to the size of the object. In addition, it can also track the orientation of the object.

import sys
import cv2
import numpy as np

class ObjectTracker():
    def __init__(self):
        # Initialize the video capture object
        # 0 -> indicates that frame should be captured
        # from webcam
        self.cap = cv2.VideoCapture(0)

        # Capture the frame from the webcam
        ret, self.frame = self.cap.read()

        # Downsampling factor for the input frame
        self.scaling_factor = 0.8
        self.frame = cv2.resize(self.frame, None, fx=self.scaling_factor,
                fy=self.scaling_factor, interpolation=cv2.INTER_AREA)

        cv2.namedWindow('Object Tracker')
        cv2.setMouseCallback('Object Tracker', self.mouse_event)

        self.selection = None
        self.drag_start = None
        self.tracking_state = 0

    # Method to track mouse events
    def mouse_event(self, event, x, y, flags, param):
        x, y = np.int16([x, y])

        # Detecting the mouse button down event
        if event == cv2.EVENT_LBUTTONDOWN:
            self.drag_start = (x, y)
            self.tracking_state = 0

        if self.drag_start:
            if event == cv2.EVENT_MOUSEMOVE:
                h, w = self.frame.shape[:2]
                xo, yo = self.drag_start
                x0, y0 = np.maximum(0, np.minimum([xo, yo], [x, y]))
                x1, y1 = np.minimum([w, h], np.maximum([xo, yo], [x, y]))
                self.selection = None

                if x1-x0 > 0 and y1-y0 > 0:
                    self.selection = (x0, y0, x1, y1)

            elif event == cv2.EVENT_LBUTTONUP:
                self.drag_start = None
                if self.selection is not None:
                    self.tracking_state = 1

    # Method to start tracking the object
    def start_tracking(self):
        # Iterate until the user presses the Esc key
        while True:
            # Capture the frame from webcam
            ret, self.frame = self.cap.read()
            # Resize the input frame
            self.frame = cv2.resize(self.frame, None, fx=self.scaling_factor,
                    fy=self.scaling_factor, interpolation=cv2.INTER_AREA)

            vis = self.frame.copy()

            # Convert to HSV colorspace
            hsv = cv2.cvtColor(self.frame, cv2.COLOR_BGR2HSV)

            # Create the mask based on predefined thresholds.
            mask = cv2.inRange(hsv, np.array((0., 60., 32.)),
                    np.array((180., 255., 255.)))

            if self.selection:
                x0, y0, x1, y1 = self.selection
                self.track_window = (x0, y0, x1-x0, y1-y0)
                hsv_roi = hsv[y0:y1, x0:x1]
                mask_roi = mask[y0:y1, x0:x1]

                # Compute the histogram
                hist = cv2.calcHist([hsv_roi], [0], mask_roi, [16], [0, 180])

                # Normalize and reshape the histogram
                cv2.normalize(hist, hist, 0, 255, cv2.NORM_MINMAX)
                self.hist = hist.reshape(-1)

                vis_roi = vis[y0:y1, x0:x1]
                cv2.bitwise_not(vis_roi, vis_roi)
                vis[mask == 0] = 0

            if self.tracking_state == 1:
                print('tracking')
                self.selection = None

                # Compute the histogram back projection
                prob = cv2.calcBackProject([hsv], [0], self.hist, [0, 180], 1)

                prob &= mask
                term_crit = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 1)

                # Apply CAMShift on 'prob'
                track_box, self.track_window = cv2.CamShift(prob,
                        self.track_window, term_crit)

                # Draw an ellipse around the object
                cv2.ellipse(vis, track_box, (0, 255, 0), 2)

            cv2.imshow('Object Tracker', vis)

            c = cv2.waitKey(delay=5)
            if c == 27:
                break

        cv2.destroyAllWindows()

if __name__ == '__main__':
    ObjectTracker().start_tracking()

Feature-Based Tracking

Feature-based tracking refers to tracking individual feature points across consecutive frames of a video. We use a technique called optical flow to track these features. Optical flow is one of the most popular techniques in computer vision.

Once we detect the feature points, we compute displacement vectors that show the motion of these keypoints between consecutive frames. These vectors are called motion vectors. There are many ways to do this, but the Lucas-Kanade method is probably the most popular of them all. We start the process by extracting feature points. For each feature point, we create a 3 x 3 patch centered on it. The assumption here is that all the points within each patch have similar motion. We can adjust the size of this window depending on the problem.

For each feature point in the current frame, we take the surrounding 3 x 3 patch as our reference. For each patch, we look in its neighborhood in the previous frame to find the best match. The neighborhood is usually bigger than 3 x 3 because we want to find the patch that is closest to the one under consideration. The path from the center pixel of the matched patch in the previous frame to the center pixel of the patch in the current frame then becomes the motion vector. We do this for all the feature points and extract all the motion vectors.
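The full tracker below builds on this idea, but the core step can be sketched in a few lines, assuming two consecutive grayscale frames prev_gray and next_gray: detect corner-like points in the first frame and let OpenCV's pyramidal Lucas-Kanade routine estimate where they end up in the second.

import cv2
import numpy as np

# 'prev_gray' and 'next_gray' are two consecutive grayscale frames (assumed).
def sketch_lucas_kanade(prev_gray, next_gray):
    # Detect corner-like feature points in the first frame
    pts = cv2.goodFeaturesToTrack(prev_gray, maxCorners=100,
            qualityLevel=0.3, minDistance=7)

    # Estimate where each point moved to in the second frame
    new_pts, status, err = cv2.calcOpticalFlowPyrLK(prev_gray, next_gray,
            pts, None, winSize=(11, 11), maxLevel=2)

    # Motion vectors for the points that were tracked successfully
    good_old = pts[status == 1]
    good_new = new_pts[status == 1]
    return good_new - good_old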

import cv2
import numpy as np

# Extract area of interest based on the tracking_paths
# In case there is none, entire frame is used
def calculate_region_of_interest(frame, tracking_paths):
    mask = np.zeros_like(frame)
    mask[:] = 255
    for x, y in [np.int32(tp[-1]) for tp in tracking_paths]:
        cv2.circle(mask, (x, y), 6, 0, -1)
    return mask

def add_tracking_paths(frame, tracking_paths):
    mask = calculate_region_of_interest(frame, tracking_paths)

    # Extract good features to track. You can learn more
    # about the parameters here: http://goo.gl/BI2Kml
    feature_points = cv2.goodFeaturesToTrack(frame, mask = mask,
        maxCorners = 500, qualityLevel = 0.3, minDistance = 7, blockSize = 7)

    if feature_points is not None:
        for x, y in np.float32(feature_points).reshape(-1, 2):
            tracking_paths.append([(x, y)])

def compute_feature_points(tracking_paths, prev_img, current_img):
    feature_points = [tp[-1] for tp in tracking_paths]
    # Vector of 2D points for which the flow needs to be found
    feature_points_0 = np.float32(feature_points).reshape(-1, 1, 2)

    feature_points_1, status_1, err_1 = cv2.calcOpticalFlowPyrLK(prev_img, current_img,
        feature_points_0, None, **tracking_params)
    feature_points_0_rev, status_2, err_2 = cv2.calcOpticalFlowPyrLK(current_img, prev_img,
        feature_points_1, None, **tracking_params)

    # Compute the difference of the feature points
    diff_feature_points = abs(feature_points_0-feature_points_0_rev).reshape(-1, 2).max(-1)

    # threshold and keep only the good points
    good_points = diff_feature_points < 1
    return feature_points_1.reshape(-1, 2), good_points

def start_tracking(cap, scaling_factor, num_frames_to_track,
        num_frames_jump, tracking_params):

    tracking_paths = []
    frame_index = 0

    # Iterate until the user presses the ESC key
    while True:
        # read the input frame
        ret, frame = cap.read()

        # downsample the input frame
        frame = cv2.resize(frame, None, fx=scaling_factor, fy=scaling_factor,
                interpolation=cv2.INTER_AREA)

        frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        output_img = frame.copy()

        if len(tracking_paths) > 0:
            prev_img, current_img = prev_gray, frame_gray
            # Compute feature points using optical flow. You can
            # refer to the documentation to learn more about the
            # parameters here: http://goo.gl/t6P4SE
            feature_points, good_points = compute_feature_points(
                tracking_paths, prev_img, current_img)

            new_tracking_paths = []
            for tp, (x, y), good_points_flag in zip(tracking_paths,
                    feature_points, good_points):

                if not good_points_flag: continue

                tp.append((x, y))

                # Using the queue structure i.e. first in, first out
                if len(tp) > num_frames_to_track: del tp[0]

                new_tracking_paths.append(tp)

                # draw green circles on top of the output image
                # (circle centers must be integer pixel coordinates)
                cv2.circle(output_img, (int(x), int(y)), 3, (0, 255, 0), -1)

            tracking_paths = new_tracking_paths

            # draw green lines on top of the output image
            point_paths = [np.int32(tp) for tp in tracking_paths]
            cv2.polylines(output_img, point_paths, False, (0, 150, 0))

        # 'if' condition to skip every 'n'th frame
        if not frame_index % num_frames_jump:
            add_tracking_paths(frame_gray, tracking_paths)

        frame_index += 1
        prev_gray = frame_gray

        cv2.imshow('Optical Flow', output_img)

        # Check if the user pressed the ESC key
        c = cv2.waitKey(1)
        if c == 27:
            break

if __name__ == '__main__':
    # Capture the input frame
    cap = cv2.VideoCapture(0)

    # Downsampling factor for the image
    scaling_factor = 0.5

    # Number of frames to keep in the buffer when you
    # are tracking. If you increase this number,
    # feature points will have more "inertia"
    num_frames_to_track = 5

    # Skip every 'n' frames. This is just to increase the speed.
    num_frames_jump = 2

    # 'winSize' refers to the size of each patch. These patches
    # are the smallest blocks on which we operate and track
    # the feature points. You can read more about the parameters
    # here: http://goo.gl/ulwqLk
    tracking_params = dict(winSize = (11, 11), maxLevel = 2,
        criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))

    start_tracking(cap, scaling_factor, num_frames_to_track,
        num_frames_jump, tracking_params)

    cv2.destroyAllWindows()

Background Subtraction

Background subtraction is very useful in video surveillance. It performs particularly well when we have to detect moving objects in a static scene. As the name suggests, the algorithm detects the background and subtracts it from the current frame to obtain the foreground, i.e. the moving objects.

To detect moving objects, we first need to build a model of the background. This is different from frame differencing because we are actually modeling the background and using that model to detect moving objects, so it performs much better than the simple frame differencing technique. The technique tries to detect the static parts of the scene and include them in the background model, which makes it an adaptive technique that can adjust itself to the scene.

import cv2
import numpy as np

# Capture the input frame
def get_frame(cap, scaling_factor=0.5):
    ret, frame = cap.read()

    # Resize the frame
    frame = cv2.resize(frame, None, fx=scaling_factor,
            fy=scaling_factor, interpolation=cv2.INTER_AREA)

    return frame

if __name__=='__main__':
    # Initialize the video capture object
    cap = cv2.VideoCapture(0)

    # Create the background subtractor object
    bgSubtractor = cv2.createBackgroundSubtractorMOG2()

    # This factor controls the learning rate of the algorithm.
    # The learning rate refers to the rate at which your model
    # will learn about the background. Higher value for
    # 'history' indicates a slower learning rate. You
    # can play with this parameter to see how it affects
    # the output.
    history = 100

    # Iterate until the user presses the ESC key
    while True:
        frame = get_frame(cap, 0.5)

        # Apply the background subtraction model to the input frame
        mask = bgSubtractor.apply(frame, learningRate=1.0/history)

        # Convert the mask from grayscale to a 3-channel image
        mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

        cv2.imshow('Input frame', frame)
        cv2.imshow('Moving Objects MOG', mask & frame)

        # Check if the user pressed the ESC key
        c = cv2.waitKey(delay=30)
        if c == 27:
            break

    cap.release()
    cv2.destroyAllWindows()

In the example above, we used a background subtraction method called BackgroundSubtractorMOG2, a Gaussian-mixture-based background/foreground segmentation algorithm. Another available method is BackgroundSubtractorGMG.
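If you want to experiment with the GMG variant, it lives in the bgsegm module of the opencv-contrib package rather than in core OpenCV. A rough sketch (assuming opencv-contrib-python is installed) only changes how the subtractor is created, and it helps to clean its noisy output with a morphological open:

import cv2

# Requires the opencv-contrib-python package for the bgsegm module
bgSubtractor = cv2.bgsegm.createBackgroundSubtractorGMG()
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))

cap = cv2.VideoCapture(0)
while True:
    ret, frame = cap.read()

    # The mask stays empty for the first frames (120 by default)
    # while the GMG model initializes
    mask = bgSubtractor.apply(frame)

    # GMG output tends to be noisy; a morphological open removes speckles
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

    cv2.imshow('Moving Objects GMG', mask)
    if cv2.waitKey(30) == 27:
        break

cap.release()
cv2.destroyAllWindows()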
