Perform YOLOv3 object detection using OpenCV in Python

This tutorial uses the yolov3 model with OpenCV's DNN module to perform object detection, first on a still image and then on the live camera feed.

Import the required modules:

import cv2
import numpy as np
import os
import time

Let's define some variables and parameters we need:

CONFIDENCE = 0.5
SCORE_THRESHOLD = 0.5
IOU_THRESHOLD = 0.5

# network configuration
config_path = "cfg/yolov3.cfg"
# YOLO net weights
weights_path = "weights/yolov3.weights"
# COCO class labels (objects)
labels = open("data/coco.names").read().strip().split("\n")
# a random color for each class's detection box
colors = np.random.randint(0, 255, size=(len(labels), 3), dtype="uint8")

config_path and weights_path hold the yolov3 model configuration and the corresponding pre-trained weights, respectively.
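
If you don't have these files locally, they can be fetched from the Darknet project. A minimal sketch, assuming the URLs below (they were the canonical download locations at the time of writing and may move):

import os
import urllib.request

# assumed canonical sources from the Darknet project; verify if downloads fail
files = {
    "cfg/yolov3.cfg": "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg",
    "weights/yolov3.weights": "https://pjreddie.com/media/files/yolov3.weights",  # roughly 240 MB
    "data/coco.names": "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names",
}
for path, url in files.items():
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        print("Downloading", path, "...")
        urllib.request.urlretrieve(url, path)

Run this before the snippet above, since coco.names is read as soon as labels is defined.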

labels is the list of all class names the model can detect. Because there are many classes, we generate a random color for each one, so boxes of different classes are easy to tell apart.

The following code loads the model:

net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

First, load a sample image:

# load the image from disk
path_name = "test.jpg"
image = cv2.imread(path_name)
file_name = os.path.basename(path_name)
filename, ext = file_name.split(".")
h, w = image.shape[:2]

Next, this image needs to be normalized, resized, and reshaped to make it suitable as input to the neural network:

# create 4D blob
blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)

This scales the pixel values to the range 0 to 1, resizes the image to (416, 416), and rearranges it into the 4D NCHW layout the network expects; we can verify the shapes:

print("image.shape:", image.shape)
print("blob.shape:", blob.shape)

image.shape: (1200, 1800, 3)
blob.shape: (1, 3, 416, 416)
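
For intuition, blobFromImage with these arguments is roughly equivalent to the following manual NumPy steps (a sketch for illustration; blobFromImage also supports mean subtraction and center-cropping, which we don't use here):

resized = cv2.resize(image, (416, 416))               # resize to the network input size
rgb = resized[:, :, ::-1].astype(np.float32)          # swapRB=True: BGR -> RGB
scaled = rgb / 255.0                                  # scalefactor 1/255: values in [0, 1]
manual_blob = scaled.transpose(2, 0, 1)[np.newaxis]   # HWC -> NCHW with a batch dimension
print(manual_blob.shape)                              # (1, 3, 416, 416)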

Now let us input this image into the neural network to obtain the output prediction:

# Set blob as the input of the network
net.setInput(blob)
# Get the names of the YOLO output layers; flatten() keeps this working
# across OpenCV versions, which return the unconnected-layer indices in different shapes
ln = net.getLayerNames()
ln = [ln[i - 1] for i in net.getUnconnectedOutLayers().flatten()]
# Run a forward pass and measure the time it takes
start = time.perf_counter()
layer_outputs = net.forward(ln)
time_took = time.perf_counter() - start
print(f"Time took: {time_took:.2f}s")

boxes, confidences, class_ids = [], [], []

# Loop over each of the layer outputs
for output in layer_outputs:
    # Loop over each detection. Each detection is a vector of length 85:
    # the first 4 values are the box center (x, y) plus its width and
    # height, the 5th is the objectness score, and the remaining 80 are
    # the class scores for the 80 COCO classes. For example, if the
    # detected object is a person, the first of those 80 values should be
    # the largest (the second is bicycle, the third car, and so on up to
    # the 80th class). np.argmax() returns the index of the maximum value
    # in that 80-length vector, which we use as the class id.
    for detection in output:
        # Extract the class id (label) and confidence (as a probability)
        # of the current object detection
        scores = detection[5:]
        class_id = np.argmax(scores)
        confidence = scores[class_id]
        if confidence > CONFIDENCE:
            # Scale the bounding box coordinates back relative to the
            # size of the image; remember that YOLO returns the center
            # (x, y) coordinates of the bounding box, followed by the
            # box's width and height
            box = detection[:4] * np.array([w, h, w, h])
            (centerX, centerY, width, height) = box.astype("int")

            # Use the center (x, y) coordinates to derive the top-left
            # corner of the bounding box
            x = int(centerX - (width / 2))
            y = int(centerY - (height / 2))

            # Update the lists of bounding box coordinates, confidences, and class IDs
            boxes.append([x, y, int(width), int(height)])
            confidences.append(float(confidence))
            class_ids.append(class_id)
# Perform non-maximum suppression using the score and IoU thresholds defined earlier
idxs = cv2.dnn.NMSBoxes(boxes, confidences, SCORE_THRESHOLD, IOU_THRESHOLD)
font_scale = 1
thickness = 1
# Make sure at least one detection survived the suppression
if len(idxs) > 0:
    # Loop over the indexes we are keeping
    for i in idxs.flatten():
        # Extract bounding box coordinates
        x, y = boxes[i][0], boxes[i][1]
        w, h = boxes[i][2], boxes[i][3]
        # Draw border rectangles and labels on the image
        color = [int(c) for c in colors[class_ids[i]]]
        cv2.rectangle(image, (x, y), (x + w, y + h), color=color, thickness=thickness)
        text = f"{labels[class_ids[i]]}: {confidences[i]:.2f}"
        # Calculate the text width and height to draw a transparent box as the text background
        (text_width, text_height) = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, fontScale=font_scale, thickness=thickness)[0]
        text_offset_x = x
        text_offset_y = y - 5
        box_coords = ((text_offset_x, text_offset_y), (text_offset_x + text_width + 2, text_offset_y - text_height))
        overlay = image.copy()
        cv2.rectangle(overlay, box_coords[0], box_coords[1], color=color, thickness=cv2.FILLED)
        # blend the filled box into the image for a semi-transparent label background
        image = cv2.addWeighted(overlay, 0.6, image, 0.4, 0)
        cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=font_scale, color=(0, 0, 0), thickness=thickness)

cv2.imwrite(filename + "_yolo3." + ext, image)
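
A note on the cv2.dnn.NMSBoxes call above: non-maximum suppression greedily keeps the highest-scoring box and discards any remaining box that overlaps it too much. A minimal NumPy sketch of the idea (a simplified illustration, not OpenCV's exact implementation):

def nms_sketch(boxes, scores, score_threshold, iou_threshold):
    """Greedy NMS over [x, y, w, h] boxes; returns the indexes to keep."""
    boxes = np.array(boxes, dtype=float)
    scores = np.array(scores, dtype=float)
    # drop boxes below the score threshold, then sort by score, best first
    idxs = np.where(scores >= score_threshold)[0]
    idxs = idxs[np.argsort(-scores[idxs])]
    picked = []
    while len(idxs) > 0:
        i, rest = idxs[0], idxs[1:]
        picked.append(i)
        # intersection-over-union of the best box with all remaining boxes
        x1 = np.maximum(boxes[i, 0], boxes[rest, 0])
        y1 = np.maximum(boxes[i, 1], boxes[rest, 1])
        x2 = np.minimum(boxes[i, 0] + boxes[i, 2], boxes[rest, 0] + boxes[rest, 2])
        y2 = np.minimum(boxes[i, 1] + boxes[i, 3], boxes[rest, 1] + boxes[rest, 3])
        inter = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
        union = boxes[i, 2] * boxes[i, 3] + boxes[rest, 2] * boxes[rest, 3] - inter
        iou = inter / np.maximum(union, 1e-9)
        # keep only the boxes that overlap the best box less than the threshold
        idxs = rest[iou < iou_threshold]
    return picked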

Result: the annotated image is saved as test_yolo3.jpg. A single image takes about 1.3 seconds on the CPU:

Time took: 1.32s
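
Over a second per image on the CPU is too slow for real-time video. If your OpenCV build includes the CUDA DNN backend (available since OpenCV 4.2; whether your installation has it is an assumption you should check), you can move inference to the GPU right after loading the network:

net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
# these calls only take effect on an OpenCV build compiled with CUDA support;
# otherwise OpenCV falls back to the default backend
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)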

Finally, we combine the above with OpenCV's camera capture to run the same detection on live video from the webcam:

import cv2
import numpy as np
import time
CONFIDENCE = 0.5
SCORE_THRESHOLD = 0.5
IOU_THRESHOLD = 0.5
config_path = "cfg/yolov3.cfg"
weights_path = "weights/yolov3.weights"
font_scale = 1
thickness = 1
LABELS = open("data/coco.names").read().strip().split("\n")
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")
net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
ln = net.getLayerNames()
ln = [ln[i - 1] for i in net.getUnconnectedOutLayers().flatten()]
cap = cv2.VideoCapture(0)
while True:
    _, image = cap.read()
    h, w = image.shape[:2]
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    start = time.perf_counter()
    layer_outputs = net.forward(ln)
    time_took = time.perf_counter() - start
    print("Time took:", time_took)
    boxes, confidences, class_ids = [], [], []
    for output in layer_outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > CONFIDENCE:
                box = detection[:4] * np.array([w, h, w, h])
                (centerX, centerY, width, height) = box.astype("int")
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                class_ids.append(class_id)
    idxs = cv2.dnn.NMSBoxes(boxes, confidences, SCORE_THRESHOLD, IOU_THRESHOLD)
    if len(idxs) > 0:
        for i in idxs.flatten():
            x, y = boxes[i][0], boxes[i][1]
            w, h = boxes[i][2], boxes[i][3]
            color = [int(c) for c in COLORS[class_ids[i]]]
            cv2.rectangle(image, (x, y), (x + w, y + h), color=color, thickness=thickness)
            text = f"{LABELS[class_ids[i]]}: {confidences[i]:.2f}"
            (text_width, text_height) = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, fontScale=font_scale, thickness=thickness)[0]
            text_offset_x = x
            text_offset_y = y - 5
            box_coords = ((text_offset_x, text_offset_y), (text_offset_x + text_width + 2, text_offset_y - text_height))
            overlay = image.copy()
            cv2.rectangle(overlay, box_coords[0], box_coords[1], color=color, thickness=cv2.FILLED)
            image = cv2.addWeighted(overlay, 0.6, image, 0.4, 0)
            cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=font_scale, color=(0, 0, 0), thickness=thickness)

    cv2.imshow("image", image)
    if ord("q") == cv2.waitKey(1):
        break

cap.release()
cv2.destroyAllWindows()

Result: object detection running on the live camera feed.
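
Since the loop already measures time_took per frame, you could also overlay the effective frame rate on the video. A small optional helper (hypothetical, not part of the original script), called as image = draw_fps(image, time_took) just before cv2.imshow:

def draw_fps(frame, seconds):
    # derive frames-per-second from the measured inference time and draw it
    fps = 1.0 / max(seconds, 1e-6)
    cv2.putText(frame, f"FPS: {fps:.1f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.8, color=(0, 255, 0), thickness=2)
    return frame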
