OpenShot Library | libopenshot  0.7.0
CVObjectDetection.cpp
Go to the documentation of this file.
1 
10 // Copyright (c) 2008-2019 OpenShot Studios, LLC
11 //
12 // SPDX-License-Identifier: LGPL-3.0-or-later
13 
14 #include <fstream>
15 #include <iomanip>
16 #include <iostream>
17 #include <algorithm>
18 
19 #include "CVObjectDetection.h"
20 #include "Exceptions.h"
21 #include "ZmqLogger.h"
22 
23 #define int64 int64_t
24 #define uint64 uint64_t
25 #include <opencv2/core/ocl.hpp>
26 #undef uint64
27 #undef int64
28 #include "objdetectdata.pb.h"
29 #include <google/protobuf/util/time_util.h>
30 
31 using namespace std;
32 using namespace openshot;
33 using google::protobuf::util::TimeUtil;
34 
35 namespace {
36 
37 bool LooksLikeTransposedYoloOutput(const cv::Mat& out, size_t classCount)
38 {
39  // YOLO26 segmentation exports without end-to-end postprocessing use
40  // [1, attributes, candidates], e.g. [1, 116, 8400]:
41  // 4 box channels + class scores + optional mask coefficients.
42  return out.dims == 3 && out.size[0] == 1 && out.size[1] >= 4 &&
43  out.size[2] > out.size[1] &&
44  (classCount == 0 || out.size[1] >= 4 + static_cast<int>(classCount));
45 }
46 
47 cv::Rect ScaledXYWHBox(
48  float centerX,
49  float centerY,
50  float width,
51  float height,
52  const cv::Size& frameDims,
53  int inputWidth,
54  int inputHeight)
55 {
56  if (centerX <= 1.0f && centerY <= 1.0f && width <= 1.0f && height <= 1.0f) {
57  centerX *= static_cast<float>(frameDims.width);
58  width *= static_cast<float>(frameDims.width);
59  centerY *= static_cast<float>(frameDims.height);
60  height *= static_cast<float>(frameDims.height);
61  } else {
62  const float xFactor = static_cast<float>(frameDims.width) / static_cast<float>(inputWidth);
63  const float yFactor = static_cast<float>(frameDims.height) / static_cast<float>(inputHeight);
64  centerX *= xFactor;
65  width *= xFactor;
66  centerY *= yFactor;
67  height *= yFactor;
68  }
69 
70  float left = centerX - width / 2.0f;
71  float top = centerY - height / 2.0f;
72  float right = centerX + width / 2.0f;
73  float bottom = centerY + height / 2.0f;
74 
75  left = std::max(0.0f, std::min(left, static_cast<float>(frameDims.width - 1)));
76  top = std::max(0.0f, std::min(top, static_cast<float>(frameDims.height - 1)));
77  right = std::max(0.0f, std::min(right, static_cast<float>(frameDims.width)));
78  bottom = std::max(0.0f, std::min(bottom, static_cast<float>(frameDims.height)));
79 
80  return cv::Rect(
81  static_cast<int>(left),
82  static_cast<int>(top),
83  std::max(0, static_cast<int>(right - left)),
84  std::max(0, static_cast<int>(bottom - top)));
85 }
86 
87 std::vector<uint32_t> EncodeBinaryMaskRLE(const std::vector<uint8_t>& mask)
88 {
89  std::vector<uint32_t> rle;
90  if (mask.empty())
91  return rle;
92 
93  uint8_t current = 0;
94  uint32_t count = 0;
95  for (uint8_t value : mask) {
96  value = value ? 1 : 0;
97  if (value == current) {
98  ++count;
99  } else {
100  rle.push_back(count);
101  current = value;
102  count = 1;
103  }
104  }
105  rle.push_back(count);
106  return rle;
107 }
108 
109 cv::Mat DecodeBinaryMaskRLE(const CVObjectMaskData& mask)
110 {
111  cv::Mat image(mask.height, mask.width, CV_8UC1, cv::Scalar(0));
112  if (!mask.HasData())
113  return image;
114 
115  const int total = mask.width * mask.height;
116  int offset = 0;
117  bool value = false;
118  uint8_t* data = image.ptr<uint8_t>();
119  for (uint32_t count : mask.rle) {
120  const int end = std::min(total, offset + static_cast<int>(count));
121  if (value) {
122  std::fill(data + offset, data + end, static_cast<uint8_t>(1));
123  }
124  offset = end;
125  value = !value;
126  if (offset >= total)
127  break;
128  }
129  return image;
130 }
131 
132 CVObjectMaskData TransformMaskToBox(
133  const CVObjectMaskData& sourceMask,
134  const cv::Rect_<float>& sourceBox,
135  const cv::Rect_<float>& targetBox,
136  const cv::Size& frameDims)
137 {
138  CVObjectMaskData result;
139  if (!sourceMask.HasData() || sourceBox.width <= 0.0f || sourceBox.height <= 0.0f ||
140  targetBox.width <= 0.0f || targetBox.height <= 0.0f ||
141  frameDims.width <= 0 || frameDims.height <= 0) {
142  return result;
143  }
144 
145  const float scaleX = sourceMask.width / static_cast<float>(frameDims.width);
146  const float scaleY = sourceMask.height / static_cast<float>(frameDims.height);
147  const cv::Rect_<float> sourceMaskBox(
148  sourceBox.x * scaleX,
149  sourceBox.y * scaleY,
150  sourceBox.width * scaleX,
151  sourceBox.height * scaleY);
152  const cv::Rect_<float> targetMaskBox(
153  targetBox.x * scaleX,
154  targetBox.y * scaleY,
155  targetBox.width * scaleX,
156  targetBox.height * scaleY);
157  if (sourceMaskBox.width <= 0.0f || sourceMaskBox.height <= 0.0f)
158  return result;
159 
160  const double xScale = targetMaskBox.width / sourceMaskBox.width;
161  const double yScale = targetMaskBox.height / sourceMaskBox.height;
162  cv::Mat transform = (cv::Mat_<double>(2, 3) <<
163  xScale, 0.0, targetMaskBox.x - xScale * sourceMaskBox.x,
164  0.0, yScale, targetMaskBox.y - yScale * sourceMaskBox.y);
165 
166  cv::Mat source = DecodeBinaryMaskRLE(sourceMask);
167  cv::Mat transformed;
168  cv::warpAffine(
169  source, transformed, transform, source.size(),
170  cv::INTER_NEAREST, cv::BORDER_CONSTANT, cv::Scalar(0));
171  if (cv::countNonZero(transformed) == 0)
172  return result;
173 
174  result.width = sourceMask.width;
175  result.height = sourceMask.height;
176  result.rle = EncodeBinaryMaskRLE(
177  std::vector<uint8_t>(transformed.data, transformed.data + transformed.total()));
178  return result;
179 }
180 
181 CVObjectMaskData BuildMaskFromPrototype(
182  const cv::Mat& prototype,
183  const std::vector<float>& coefficients,
184  const cv::Rect& box,
185  const cv::Size& frameDims)
186 {
187  CVObjectMaskData result;
188  if (prototype.dims != 4 || prototype.size[0] != 1 ||
189  prototype.size[1] != static_cast<int>(coefficients.size()))
190  return result;
191 
192  const int channels = prototype.size[1];
193  const int maskHeight = prototype.size[2];
194  const int maskWidth = prototype.size[3];
195  const int maskPixels = maskWidth * maskHeight;
196  const float* protoData = reinterpret_cast<const float*>(prototype.data);
197 
198  const int left = std::max(0, static_cast<int>(box.x * maskWidth / static_cast<float>(frameDims.width)));
199  const int top = std::max(0, static_cast<int>(box.y * maskHeight / static_cast<float>(frameDims.height)));
200  const int right = std::min(maskWidth, static_cast<int>((box.x + box.width) * maskWidth / static_cast<float>(frameDims.width)));
201  const int bottom = std::min(maskHeight, static_cast<int>((box.y + box.height) * maskHeight / static_cast<float>(frameDims.height)));
202  if (left >= right || top >= bottom)
203  return result;
204 
205  std::vector<uint8_t> binary(maskPixels, 0);
206  for (int y = top; y < bottom; ++y) {
207  for (int x = left; x < right; ++x) {
208  const int pixel = y * maskWidth + x;
209  float value = 0.0f;
210  for (int channel = 0; channel < channels; ++channel) {
211  value += coefficients[channel] * protoData[channel * maskPixels + pixel];
212  }
213  binary[pixel] = value > 0.0f ? 1 : 0;
214  }
215  }
216 
217  result.width = maskWidth;
218  result.height = maskHeight;
219  result.rle = EncodeBinaryMaskRLE(binary);
220  return result;
221 }
222 
223 std::string LoadONNXModel(std::string modelPath, cv::dnn::Net *net)
224 {
225 #if CV_VERSION_MAJOR < 4 || (CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 3)
226  return std::string("Failed to load ONNX model: YOLO requires OpenCV 4.3.0 or newer. "
227  "This OpenCV build is ") + CV_VERSION + ".";
228 #else
229  try {
230  cv::dnn::Net loaded_net = cv::dnn::readNetFromONNX(modelPath);
231  if (net) {
232  *net = loaded_net;
233  }
234  return "";
235  } catch (const cv::Exception& e) {
236  std::string error_text = std::string("Failed to load ONNX model: ") + e.what();
237  if (error_text.find("Unsupported data type: FLOAT16") != std::string::npos) {
238  error_text = "Failed to load ONNX model: FLOAT16 is not supported by this OpenCV build. "
239  "Please use an FP32 ONNX model.";
240  }
241  return error_text;
242  } catch (const std::exception& e) {
243  return std::string("Failed to load ONNX model: ") + e.what();
244  } catch (...) {
245  return "Failed to load ONNX model: unknown error";
246  }
247 #endif
248 }
249 
250 }
251 
252 CVObjectDetection::CVObjectDetection(std::string processInfoJson, ProcessingController &processingController)
253 : processingController(&processingController), processingDevice("CPU"), inpWidth(640), inpHeight(640), generateMasks(true){
254  confThreshold = 0.10;
255  nmsThreshold = 0.1;
256  SetJson(processInfoJson);
257 }
258 
259 std::string CVObjectDetection::ValidateONNXModel(std::string modelPath)
260 {
261  return LoadONNXModel(modelPath, nullptr);
262 }
263 
264 void CVObjectDetection::setProcessingDevice(){
265  const std::string requestedDevice = processingDevice;
266  if (processingDevice == "CPU") {
267  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
268  net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
269  ZmqLogger::Instance()->Log("Object Detection DNN device: requested CPU, selected CPU");
270  return;
271  }
272 
273  if(processingDevice == "GPU" || processingDevice == "GPU_AUTO" || processingDevice == "GPU_CUDA"){
274  try {
275  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
276  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
277  net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
278  net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
279  ZmqLogger::Instance()->Log("Object Detection DNN device: requested " + requestedDevice + ", selected CUDA");
280  return;
281  }
282  } catch (const cv::Exception&) {
283  }
284  }
285 
286  if(processingDevice == "GPU_OPENCL"){
287  try {
288  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
289  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
290  cv::ocl::setUseOpenCL(true);
291  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
292  net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
293  ZmqLogger::Instance()->Log("Object Detection DNN device: requested " + requestedDevice + ", selected OpenCL");
294  return;
295  }
296  } catch (const cv::Exception&) {
297  }
298  }
299 
300  processingDevice = "CPU";
301  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
302  net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
303  ZmqLogger::Instance()->Log("Object Detection DNN device: requested " + requestedDevice + ", selected CPU");
304 }
305 
306 void CVObjectDetection::detectObjectsClip(openshot::Clip &video, size_t _start, size_t _end, bool process_interval)
307 {
308 
309  start = _start; end = _end;
310 
311  video.Open();
312 
313  if(error){
314  return;
315  }
316 
317  processingController->SetError(false, "");
318 
319  if(modelPath.empty()) {
320  processingController->SetError(true, "Missing path to YOLO ONNX model file");
321  error = true;
322  return;
323  }
324  if(classesFile.empty()) {
325  processingController->SetError(true, "Missing path to class name file");
326  error = true;
327  return;
328  }
329 
330  std::ifstream model_file(modelPath);
331  if(!model_file.good()){
332  processingController->SetError(true, "Incorrect path to YOLO ONNX model file");
333  error = true;
334  return;
335  }
336  std::ifstream classes_file(classesFile);
337  if(!classes_file.good()){
338  processingController->SetError(true, "Incorrect path to class name file");
339  error = true;
340  return;
341  }
342 
343  // Load names of classes
344  classNames.clear();
345  std::string line;
346  while (std::getline(classes_file, line)) classNames.push_back(line);
347 
348  // Load the network
349  std::string error_text = LoadONNXModel(modelPath, &net);
350  if (!error_text.empty()) {
351  processingController->SetError(true, error_text);
352  error = true;
353  return;
354  }
355  setProcessingDevice();
356 
357  size_t frame_number;
358  if(!process_interval || end <= 1 || end-start == 0){
359  // Get total number of frames in video
360  start = (int)(video.Start() * video.Reader()->info.fps.ToFloat());
361  end = (int)(video.End() * video.Reader()->info.fps.ToFloat());
362  }
363 
364  for (frame_number = start; frame_number <= end; frame_number++)
365  {
366  // Stop the feature tracker process
367  if(processingController->ShouldStop()){
368  return;
369  }
370 
371  std::shared_ptr<openshot::Frame> f = video.GetFrame(frame_number);
372 
373  // Grab OpenCV Mat image
374  cv::Mat cvimage = f->GetImageCV();
375 
376  DetectObjects(cvimage, frame_number);
377 
378  // Update progress
379  processingController->SetProgress(uint(100*(frame_number-start)/(end-start)));
380 
381  }
382 }
383 
384 void CVObjectDetection::DetectObjects(const cv::Mat &frame, size_t frameId){
385  // Get frame as OpenCV Mat
386  cv::Mat blob;
387 
388  // Create a 4D blob from the frame.
389  cv::dnn::blobFromImage(frame, blob, 1/255.0, cv::Size(inpWidth, inpHeight), cv::Scalar(0,0,0), true, false);
390 
391  std::vector<cv::Mat> outs;
392  try {
393  // Sets the input to the network
394  net.setInput(blob);
395  // Runs the forward pass to get output of the output layers
396  net.forward(outs, getOutputsNames(net));
397  } catch (const cv::Exception& e) {
398  processingController->SetError(true, std::string("Object detection inference failed: ") + e.what());
399  error = true;
400  return;
401  }
402 
403  // Remove the bounding boxes with low confidence
404  postprocess(frame.size(), outs, frameId);
405 
406 }
407 
408 
409 // Remove the bounding boxes with low confidence using non-maxima suppression
410 void CVObjectDetection::postprocess(const cv::Size &frameDims, const std::vector<cv::Mat>& outs, size_t frameId)
411 {
412  std::vector<int> classIds;
413  std::vector<float> confidences;
414  std::vector<cv::Rect> boxes;
415  std::vector<std::vector<ClassScore>> detectionClassScores;
416  std::vector<CVObjectMaskData> detectionMasks;
417  std::vector<int> objectIds;
418  const int maxClassCandidates = 5;
419 
420  for (size_t i = 0; i < outs.size(); ++i) {
421  cv::Mat det = outs[i];
422 
423  if (LooksLikeTransposedYoloOutput(det, classNames.size())) {
424  const int attributes = det.size[1];
425  const int candidates = det.size[2];
426  const int classCount = !classNames.empty()
427  ? static_cast<int>(classNames.size())
428  : attributes - 4;
429  const int maskCoefficientCount = attributes - 4 - classCount;
430  const cv::Mat* prototype = nullptr;
431  if (generateMasks && maskCoefficientCount > 0) {
432  auto prototypeIt = std::find_if(outs.begin(), outs.end(),
433  [maskCoefficientCount](const cv::Mat& out) {
434  return out.dims == 4 && out.size[0] == 1 && out.size[1] == maskCoefficientCount;
435  });
436  if (prototypeIt != outs.end()) {
437  prototype = &(*prototypeIt);
438  }
439  }
440  const float* data = reinterpret_cast<const float*>(det.data);
441 
442  for (int candidateIndex = 0; candidateIndex < candidates; ++candidateIndex) {
443  std::vector<ClassScore> rowClassScores;
444  rowClassScores.reserve(maxClassCandidates);
445 
446  for (int classIndex = 0; classIndex < classCount; ++classIndex) {
447  const float classConfidence = data[(4 + classIndex) * candidates + candidateIndex];
448  if (rowClassScores.size() < static_cast<size_t>(maxClassCandidates)) {
449  rowClassScores.emplace_back(classIndex, classConfidence);
450  std::sort(rowClassScores.begin(), rowClassScores.end(),
451  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
452  } else if (classConfidence > rowClassScores.back().score) {
453  rowClassScores.back() = ClassScore(classIndex, classConfidence);
454  std::sort(rowClassScores.begin(), rowClassScores.end(),
455  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
456  }
457  }
458 
459  if (rowClassScores.empty() || rowClassScores.front().score <= confThreshold) {
460  continue;
461  }
462 
463  cv::Rect box = ScaledXYWHBox(
464  data[candidateIndex],
465  data[candidates + candidateIndex],
466  data[2 * candidates + candidateIndex],
467  data[3 * candidates + candidateIndex],
468  frameDims, inpWidth, inpHeight);
469  if (box.width <= 0 || box.height <= 0) {
470  continue;
471  }
472 
473  classIds.push_back(rowClassScores.front().classId);
474  confidences.push_back(rowClassScores.front().score);
475  boxes.push_back(box);
476  detectionClassScores.push_back(rowClassScores);
477  if (prototype) {
478  std::vector<float> coefficients;
479  coefficients.reserve(maskCoefficientCount);
480  for (int coefficientIndex = 0; coefficientIndex < maskCoefficientCount; ++coefficientIndex) {
481  coefficients.push_back(data[(4 + classCount + coefficientIndex) * candidates + candidateIndex]);
482  }
483  detectionMasks.push_back(BuildMaskFromPrototype(*prototype, coefficients, box, frameDims));
484  } else {
485  detectionMasks.push_back({});
486  }
487  }
488  continue;
489  }
490 
491  // YOLOv5-style ONNX output is usually [1, num_boxes, num_classes + 5].
492  if (det.dims == 3) {
493  det = det.reshape(1, det.size[1]);
494  }
495  if (det.dims != 2 || det.cols < 6) {
496  continue;
497  }
498 
499  const float xFactor = static_cast<float>(frameDims.width) / static_cast<float>(inpWidth);
500  const float yFactor = static_cast<float>(frameDims.height) / static_cast<float>(inpHeight);
501 
502  float* data = reinterpret_cast<float*>(det.data);
503  for (int j = 0; j < det.rows; ++j, data += det.cols) {
504  std::vector<ClassScore> rowClassScores;
505  rowClassScores.reserve(maxClassCandidates);
506  int classScoresEnd = det.cols;
507  if (!classNames.empty()) {
508  classScoresEnd = std::min(det.cols, 5 + static_cast<int>(classNames.size()));
509  }
510  for (int classIndex = 5; classIndex < classScoresEnd; ++classIndex) {
511  const float classConfidence = data[classIndex] * data[4];
512  if (rowClassScores.size() < static_cast<size_t>(maxClassCandidates)) {
513  rowClassScores.emplace_back(classIndex - 5, classConfidence);
514  std::sort(rowClassScores.begin(), rowClassScores.end(),
515  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
516  } else if (classConfidence > rowClassScores.back().score) {
517  rowClassScores.back() = ClassScore(classIndex - 5, classConfidence);
518  std::sort(rowClassScores.begin(), rowClassScores.end(),
519  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
520  }
521  }
522  if (rowClassScores.empty()) {
523  continue;
524  }
525 
526  float confidence = rowClassScores.front().score;
527 
528  if (confidence > confThreshold) {
529  int centerX = 0;
530  int centerY = 0;
531  int width = 0;
532  int height = 0;
533 
534  if (data[0] > 1.0f || data[1] > 1.0f || data[2] > 1.0f || data[3] > 1.0f) {
535  centerX = static_cast<int>(data[0] * xFactor);
536  centerY = static_cast<int>(data[1] * yFactor);
537  width = static_cast<int>(data[2] * xFactor);
538  height = static_cast<int>(data[3] * yFactor);
539  } else {
540  centerX = static_cast<int>(data[0] * frameDims.width);
541  centerY = static_cast<int>(data[1] * frameDims.height);
542  width = static_cast<int>(data[2] * frameDims.width);
543  height = static_cast<int>(data[3] * frameDims.height);
544  }
545 
546  int left = centerX - width / 2;
547  int top = centerY - height / 2;
548 
549  classIds.push_back(rowClassScores.front().classId);
550  confidences.push_back(confidence);
551  boxes.push_back(cv::Rect(left, top, width, height));
552  detectionClassScores.push_back(rowClassScores);
553  detectionMasks.push_back({});
554  }
555  }
556  }
557 
558  // Perform non maximum suppression to eliminate redundant overlapping boxes with
559  // lower confidences
560  std::vector<int> indices;
561  cv::dnn::NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
562 
563  // Pass boxes to SORT algorithm
564  std::vector<cv::Rect> sortBoxes;
565  std::vector<float> sortConfidences;
566  std::vector<int> sortClassIds;
567  std::vector<std::vector<ClassScore>> sortClassScores;
568  std::vector<CVObjectMaskData> sortMasks;
569  for(auto index : indices) {
570  sortBoxes.push_back(boxes[index]);
571  sortConfidences.push_back(confidences[index]);
572  sortClassIds.push_back(classIds[index]);
573  sortClassScores.push_back(detectionClassScores[index]);
574  sortMasks.push_back(index < static_cast<int>(detectionMasks.size()) ? detectionMasks[index] : CVObjectMaskData());
575  }
576  sort.update(sortBoxes, frameId, sqrt(pow(frameDims.width,2) + pow(frameDims.height, 2)), sortConfidences, sortClassIds, sortClassScores);
577 
578  // Clear data vectors
579  boxes.clear(); confidences.clear(); classIds.clear(); objectIds.clear();
580  std::vector<CVObjectMaskData> masks;
581  // Get SORT predicted boxes
582  for(auto TBox : sort.frameTrackingResult){
583  if(TBox.frame == frameId){
584  boxes.push_back(TBox.box);
585  confidences.push_back(TBox.confidence);
586  classIds.push_back(TBox.classId);
587  objectIds.push_back(TBox.id);
588  CVObjectMaskData mask;
589  double bestIoU = 0.0;
590  for (size_t maskIndex = 0; maskIndex < sortMasks.size(); ++maskIndex) {
591  if (!sortMasks[maskIndex].HasData() || sortClassIds[maskIndex] != TBox.classId)
592  continue;
593  double score = SortTracker::GetIOU(cv::Rect_<float>(sortBoxes[maskIndex]), TBox.box);
594  if (score > bestIoU) {
595  bestIoU = score;
596  mask = sortMasks[maskIndex];
597  }
598  }
599  if (mask.HasData()) {
600  recentObjectMasks[TBox.id] = CVTrackedMaskData{frameId, mask, TBox.box};
601  } else {
602  const auto recentMask = recentObjectMasks.find(TBox.id);
603  if (recentMask != recentObjectMasks.end() &&
604  frameId > recentMask->second.frameId &&
605  frameId - recentMask->second.frameId <= 5) {
606  mask = TransformMaskToBox(
607  recentMask->second.mask,
608  recentMask->second.box,
609  TBox.box,
610  frameDims);
611  if (mask.HasData()) {
612  recentObjectMasks[TBox.id] = CVTrackedMaskData{frameId, mask, TBox.box};
613  }
614  }
615  }
616  masks.push_back(mask);
617  }
618  }
619 
620  // Remove boxes based on controids distance
621  for(uint i = 0; i<boxes.size(); i++){
622  for(uint j = i+1; j<boxes.size(); j++){
623  int xc_1 = boxes[i].x + (int)(boxes[i].width/2), yc_1 = boxes[i].y + (int)(boxes[i].height/2);
624  int xc_2 = boxes[j].x + (int)(boxes[j].width/2), yc_2 = boxes[j].y + (int)(boxes[j].height/2);
625 
626  if(fabs(xc_1 - xc_2) < 10 && fabs(yc_1 - yc_2) < 10){
627  if(classIds[i] == classIds[j]){
628  if(confidences[i] >= confidences[j]){
629  boxes.erase(boxes.begin() + j);
630  classIds.erase(classIds.begin() + j);
631  confidences.erase(confidences.begin() + j);
632  objectIds.erase(objectIds.begin() + j);
633  masks.erase(masks.begin() + j);
634  break;
635  }
636  else{
637  boxes.erase(boxes.begin() + i);
638  classIds.erase(classIds.begin() + i);
639  confidences.erase(confidences.begin() + i);
640  objectIds.erase(objectIds.begin() + i);
641  masks.erase(masks.begin() + i);
642  i = 0;
643  break;
644  }
645  }
646  }
647  }
648  }
649 
650  // Remove boxes based in IOU score
651  for(uint i = 0; i<boxes.size(); i++){
652  for(uint j = i+1; j<boxes.size(); j++){
653 
654  if( iou(boxes[i], boxes[j])){
655  if(classIds[i] == classIds[j]){
656  if(confidences[i] >= confidences[j]){
657  boxes.erase(boxes.begin() + j);
658  classIds.erase(classIds.begin() + j);
659  confidences.erase(confidences.begin() + j);
660  objectIds.erase(objectIds.begin() + j);
661  masks.erase(masks.begin() + j);
662  break;
663  }
664  else{
665  boxes.erase(boxes.begin() + i);
666  classIds.erase(classIds.begin() + i);
667  confidences.erase(confidences.begin() + i);
668  objectIds.erase(objectIds.begin() + i);
669  masks.erase(masks.begin() + i);
670  i = 0;
671  break;
672  }
673  }
674  }
675  }
676  }
677 
678  // Normalize boxes coordinates
679  std::vector<cv::Rect_<float>> normalized_boxes;
680  for(auto box : boxes){
681  cv::Rect_<float> normalized_box;
682  normalized_box.x = (box.x)/(float)frameDims.width;
683  normalized_box.y = (box.y)/(float)frameDims.height;
684  normalized_box.width = (box.width)/(float)frameDims.width;
685  normalized_box.height = (box.height)/(float)frameDims.height;
686  normalized_boxes.push_back(normalized_box);
687  }
688 
689  detectionsData[frameId] = CVDetectionData(classIds, confidences, normalized_boxes, frameId, objectIds, masks);
690 }
691 
692 // Compute IOU between 2 boxes
693 bool CVObjectDetection::iou(cv::Rect pred_box, cv::Rect sort_box){
694  // Determine the (x, y)-coordinates of the intersection rectangle
695  int xA = std::max(pred_box.x, sort_box.x);
696  int yA = std::max(pred_box.y, sort_box.y);
697  int xB = std::min(pred_box.x + pred_box.width, sort_box.x + sort_box.width);
698  int yB = std::min(pred_box.y + pred_box.height, sort_box.y + sort_box.height);
699 
700  // Compute the area of intersection rectangle
701  int interArea = std::max(0, xB - xA + 1) * std::max(0, yB - yA + 1);
702 
703  // Compute the area of both the prediction and ground-truth rectangles
704  int boxAArea = (pred_box.width + 1) * (pred_box.height + 1);
705  int boxBArea = (sort_box.width + 1) * (sort_box.height + 1);
706 
707  // Compute the intersection over union by taking the intersection
708  float iou = interArea / (float)(boxAArea + boxBArea - interArea);
709 
710  // If IOU is above this value the boxes are very close (probably a variation of the same bounding box)
711  if(iou > 0.5)
712  return true;
713  return false;
714 }
715 
716 // Get the names of the output layers
717 std::vector<cv::String> CVObjectDetection::getOutputsNames(const cv::dnn::Net& net)
718 {
719  //Get the indices of the output layers, i.e. the layers with unconnected outputs
720  std::vector<int> outLayers = net.getUnconnectedOutLayers();
721 
722  //get the names of all the layers in the network
723  std::vector<cv::String> layersNames = net.getLayerNames();
724 
725  // Get the names of the output layers in names
726  std::vector<cv::String> names;
727  names.resize(outLayers.size());
728  for (size_t i = 0; i < outLayers.size(); ++i)
729  names[i] = layersNames[outLayers[i] - 1];
730  return names;
731 }
732 
734  // Check if the stabilizer info for the requested frame exists
735  if ( detectionsData.find(frameId) == detectionsData.end() ) {
736 
737  return CVDetectionData();
738  } else {
739 
740  return detectionsData[frameId];
741  }
742 }
743 
744 void CVObjectDetection::NormalizeTrackedClasses()
745 {
746  struct ClassEvidence {
747  float confidenceSum = 0.0f;
748  size_t count = 0;
749  };
750 
751  std::map<int, std::map<int, ClassEvidence>> objectClassEvidence;
752  for (const auto& frameData : detectionsData) {
753  const CVDetectionData& detections = frameData.second;
754  const size_t detectionCount = std::min(detections.objectIds.size(), detections.classIds.size());
755  for (size_t i = 0; i < detectionCount; ++i) {
756  const float confidence = i < detections.confidences.size() ? detections.confidences[i] : 1.0f;
757  ClassEvidence& evidence = objectClassEvidence[detections.objectIds[i]][detections.classIds[i]];
758  evidence.confidenceSum += confidence;
759  ++evidence.count;
760  }
761  }
762 
763  std::map<int, int> dominantClassByObject;
764  for (const auto& objectEvidence : objectClassEvidence) {
765  const int objectId = objectEvidence.first;
766  int bestClassId = -1;
767  ClassEvidence bestEvidence;
768  for (const auto& classEvidence : objectEvidence.second) {
769  const int classId = classEvidence.first;
770  const ClassEvidence& evidence = classEvidence.second;
771  if (bestClassId < 0 ||
772  evidence.confidenceSum > bestEvidence.confidenceSum ||
773  (evidence.confidenceSum == bestEvidence.confidenceSum && evidence.count > bestEvidence.count)) {
774  bestClassId = classId;
775  bestEvidence = evidence;
776  }
777  }
778  if (bestClassId >= 0) {
779  dominantClassByObject[objectId] = bestClassId;
780  }
781  }
782 
783  for (auto& frameData : detectionsData) {
784  CVDetectionData& detections = frameData.second;
785  const size_t detectionCount = std::min(detections.objectIds.size(), detections.classIds.size());
786  for (size_t i = 0; i < detectionCount; ++i) {
787  const auto dominantClass = dominantClassByObject.find(detections.objectIds[i]);
788  if (dominantClass != dominantClassByObject.end()) {
789  detections.classIds[i] = dominantClass->second;
790  }
791  }
792  }
793 }
794 
796  if(protobuf_data_path.empty()) {
797  cerr << "Missing path to object detection protobuf data file." << endl;
798  return false;
799  }
800 
801  NormalizeTrackedClasses();
802 
803  // Create tracker message
804  pb_objdetect::ObjDetect objMessage;
805 
806  //Save class names in protobuf message
807  for(int i = 0; i<classNames.size(); i++){
808  std::string* className = objMessage.add_classnames();
809  className->assign(classNames.at(i));
810  }
811 
812  // Iterate over all frames data and save in protobuf message
813  for(std::map<size_t,CVDetectionData>::iterator it=detectionsData.begin(); it!=detectionsData.end(); ++it){
814  CVDetectionData dData = it->second;
815  AddFrameDataToProto(objMessage.add_frame(), dData);
816  }
817 
818  // Add timestamp
819  *objMessage.mutable_last_updated() = TimeUtil::SecondsToTimestamp(time(NULL));
820 
821  {
822  // Write the new message to disk.
823  std::fstream output(protobuf_data_path, ios::out | ios::trunc | ios::binary);
824  if (!objMessage.SerializeToOstream(&output)) {
825  cerr << "Failed to write protobuf message." << endl;
826  return false;
827  }
828  }
829 
830  // Delete all global objects allocated by libprotobuf.
831  google::protobuf::ShutdownProtobufLibrary();
832 
833  return true;
834 
835 }
836 
837 // Add frame object detection into protobuf message.
838 void CVObjectDetection::AddFrameDataToProto(pb_objdetect::Frame* pbFrameData, CVDetectionData& dData) {
839 
840  // Save frame number and rotation
841  pbFrameData->set_id(dData.frameId);
842 
843  for(size_t i = 0; i < dData.boxes.size(); i++){
844  pb_objdetect::Frame_Box* box = pbFrameData->add_bounding_box();
845 
846  // Save bounding box data
847  box->set_x(dData.boxes.at(i).x);
848  box->set_y(dData.boxes.at(i).y);
849  box->set_w(dData.boxes.at(i).width);
850  box->set_h(dData.boxes.at(i).height);
851  box->set_classid(dData.classIds.at(i));
852  box->set_confidence(dData.confidences.at(i));
853  box->set_objectid(dData.objectIds.at(i));
854 
855  if (i < dData.masks.size() && dData.masks.at(i).HasData()) {
856  pb_objdetect::Frame_Box_Mask* mask = box->mutable_mask();
857  mask->set_width(dData.masks.at(i).width);
858  mask->set_height(dData.masks.at(i).height);
859  for (uint32_t count : dData.masks.at(i).rle) {
860  mask->add_rle(count);
861  }
862  }
863  }
864 }
865 
866 // Load JSON string into this object
867 void CVObjectDetection::SetJson(const std::string value) {
868  // Parse JSON string into JSON objects
869  try
870  {
871  const Json::Value root = openshot::stringToJson(value);
872  // Set all values that match
873 
874  SetJsonValue(root);
875  }
876  catch (const std::exception& e)
877  {
878  // Error parsing JSON (or missing keys)
879  // throw InvalidJSON("JSON is invalid (missing keys or invalid data types)");
880  std::cout<<"JSON is invalid (missing keys or invalid data types)"<<std::endl;
881  }
882 }
883 
884 // Load Json::Value into this object
885 void CVObjectDetection::SetJsonValue(const Json::Value root) {
886 
887  // Set data from Json (if key is found)
888  if (!root["protobuf_data_path"].isNull()){
889  protobuf_data_path = (root["protobuf_data_path"].asString());
890  }
891 
892  if (!root["processing-device"].isNull()){
893  processingDevice = (root["processing-device"].asString());
894  }
895  if (!root["processing_device"].isNull()){
896  processingDevice = (root["processing_device"].asString());
897  }
898  if (!root["class-names"].isNull()){
899  classesFile = (root["class-names"].asString());
900  }
901  if (!root["classes_file"].isNull()){
902  classesFile = (root["classes_file"].asString());
903  }
904  if (!root["model"].isNull()){
905  modelPath = (root["model"].asString());
906  }
907  if (!root["model_path"].isNull()){
908  modelPath = (root["model_path"].asString());
909  }
910  if (!root["input-width"].isNull()){
911  inpWidth = root["input-width"].asInt();
912  }
913  if (!root["input_width"].isNull()){
914  inpWidth = root["input_width"].asInt();
915  }
916  if (!root["input-height"].isNull()){
917  inpHeight = root["input-height"].asInt();
918  }
919  if (!root["input_height"].isNull()){
920  inpHeight = root["input_height"].asInt();
921  }
922  if (!root["confidence-threshold"].isNull()){
923  confThreshold = root["confidence-threshold"].asFloat();
924  }
925  if (!root["confidence_threshold"].isNull()){
926  confThreshold = root["confidence_threshold"].asFloat();
927  }
928  if (!root["nms-threshold"].isNull()){
929  nmsThreshold = root["nms-threshold"].asFloat();
930  }
931  if (!root["nms_threshold"].isNull()){
932  nmsThreshold = root["nms_threshold"].asFloat();
933  }
934  if (!root["generate-masks"].isNull()){
935  generateMasks = root["generate-masks"].asBool();
936  }
937  if (!root["generate_masks"].isNull()){
938  generateMasks = root["generate_masks"].asBool();
939  }
940 }
941 
942 /*
943 ||||||||||||||||||||||||||||||||||||||||||||||||||
944  ONLY FOR MAKE TEST
945 ||||||||||||||||||||||||||||||||||||||||||||||||||
946 */
947 
948 // Load protobuf data file
950  if(protobuf_data_path.empty()) {
951  cerr << "Missing path to object detection protobuf data file." << endl;
952  return false;
953  }
954 
955  // Create tracker message
956  pb_objdetect::ObjDetect objMessage;
957 
958  {
959  // Read the existing tracker message.
960  fstream input(protobuf_data_path, ios::in | ios::binary);
961  if (!objMessage.ParseFromIstream(&input)) {
962  cerr << "Failed to parse protobuf message." << endl;
963  return false;
964  }
965  }
966 
967  // Make sure classNames and detectionsData are empty
968  classNames.clear(); detectionsData.clear();
969 
970  // Get all classes names and assign a color to them
971  for(int i = 0; i < objMessage.classnames_size(); i++){
972  classNames.push_back(objMessage.classnames(i));
973  }
974 
975  // Iterate over all frames of the saved message
976  for (size_t i = 0; i < objMessage.frame_size(); i++) {
977  // Create protobuf message reader
978  const pb_objdetect::Frame& pbFrameData = objMessage.frame(i);
979 
980  // Get frame Id
981  size_t id = pbFrameData.id();
982 
983  // Load bounding box data
984  const google::protobuf::RepeatedPtrField<pb_objdetect::Frame_Box > &pBox = pbFrameData.bounding_box();
985 
986  // Construct data vectors related to detections in the current frame
987  std::vector<int> classIds;
988  std::vector<float> confidences;
989  std::vector<cv::Rect_<float>> boxes;
990  std::vector<int> objectIds;
991  std::vector<CVObjectMaskData> masks;
992 
993  for(int i = 0; i < pbFrameData.bounding_box_size(); i++){
994  // Get bounding box coordinates
995  float x = pBox.Get(i).x(); float y = pBox.Get(i).y();
996  float w = pBox.Get(i).w(); float h = pBox.Get(i).h();
997  // Create OpenCV rectangle with the bouding box info
998  cv::Rect_<float> box(x, y, w, h);
999 
1000  // Get class Id (which will be assign to a class name) and prediction confidence
1001  int classId = pBox.Get(i).classid(); float confidence = pBox.Get(i).confidence();
1002  // Get object Id
1003  int objectId = pBox.Get(i).objectid();
1004 
1005  // Push back data into vectors
1006  boxes.push_back(box); classIds.push_back(classId); confidences.push_back(confidence);
1007  objectIds.push_back(objectId);
1008  CVObjectMaskData mask;
1009  if (pBox.Get(i).has_mask()) {
1010  mask.width = pBox.Get(i).mask().width();
1011  mask.height = pBox.Get(i).mask().height();
1012  for (int rleIndex = 0; rleIndex < pBox.Get(i).mask().rle_size(); ++rleIndex) {
1013  mask.rle.push_back(pBox.Get(i).mask().rle(rleIndex));
1014  }
1015  }
1016  masks.push_back(mask);
1017  }
1018 
1019  // Assign data to object detector map
1020  detectionsData[id] = CVDetectionData(classIds, confidences, boxes, id, objectIds, masks);
1021  }
1022 
1023  // Delete all global objects allocated by libprotobuf.
1024  google::protobuf::ShutdownProtobufLibrary();
1025 
1026  return true;
1027 }
openshot::stringToJson
const Json::Value stringToJson(const std::string value)
Definition: Json.cpp:16
openshot::Clip::Open
void Open() override
Open the internal reader.
Definition: Clip.cpp:387
CVObjectDetection.h
Header file for CVObjectDetection class.
openshot::CVObjectMaskData::width
int width
Definition: CVObjectDetection.h:37
ProcessingController::ShouldStop
bool ShouldStop()
Definition: ProcessingController.h:68
ProcessingController::SetError
void SetError(bool err, std::string message)
Definition: ProcessingController.h:74
openshot
This namespace is the default namespace for all code in the openshot library.
Definition: AnimatedCurve.h:24
openshot::CVDetectionData::classIds
std::vector< int > classIds
Definition: CVObjectDetection.h:69
openshot::ZmqLogger::Log
void Log(std::string message)
Log message to all subscribers of this logger (if any)
Definition: ZmqLogger.cpp:103
openshot::Clip
This class represents a clip (used to arrange readers on the timeline)
Definition: Clip.h:89
openshot::CVDetectionData::masks
std::vector< CVObjectMaskData > masks
Definition: CVObjectDetection.h:73
openshot::Clip::End
float End() const override
Get end position (in seconds) of clip (trim end of video), which can be affected by the time curve.
Definition: Clip.cpp:423
openshot::CVObjectDetection::detectionsData
std::map< size_t, CVDetectionData > detectionsData
Definition: CVObjectDetection.h:127
openshot::Clip::GetFrame
std::shared_ptr< openshot::Frame > GetFrame(int64_t clip_frame_number) override
Get an openshot::Frame object for a specific frame number of this clip. The image size and number of ...
Definition: Clip.cpp:458
openshot::CVDetectionData::confidences
std::vector< float > confidences
Definition: CVObjectDetection.h:70
SortTracker::frameTrackingResult
std::vector< TrackingBox > frameTrackingResult
Definition: sort.hpp:57
openshot::CVObjectDetection::GetDetectionData
CVDetectionData GetDetectionData(size_t frameId)
Definition: CVObjectDetection.cpp:733
ZmqLogger.h
Header file for ZeroMQ-based Logger class.
openshot::CVObjectDetection::AddFrameDataToProto
void AddFrameDataToProto(pb_objdetect::Frame *pbFrameData, CVDetectionData &dData)
Definition: CVObjectDetection.cpp:838
openshot::CVDetectionData::objectIds
std::vector< int > objectIds
Definition: CVObjectDetection.h:72
ClassScore
Definition: KalmanTracker.h:20
openshot::CVObjectDetection::_LoadObjDetectdData
bool _LoadObjDetectdData()
Definition: CVObjectDetection.cpp:949
openshot::CVObjectMaskData::rle
std::vector< uint32_t > rle
Definition: CVObjectDetection.h:39
openshot::CVDetectionData::frameId
size_t frameId
Definition: CVObjectDetection.h:68
openshot::CVDetectionData
Definition: CVObjectDetection.h:51
openshot::ClipBase::Start
void Start(float value)
Set start position (in seconds) of clip (trim start of video)
Definition: ClipBase.cpp:42
openshot::CVObjectMaskData::HasData
bool HasData() const
Definition: CVObjectDetection.h:41
openshot::CVObjectMaskData
Definition: CVObjectDetection.h:36
SortTracker::GetIOU
static double GetIOU(cv::Rect_< float > bb_test, cv::Rect_< float > bb_gt)
Definition: sort.cpp:89
openshot::CVObjectDetection::SetJsonValue
void SetJsonValue(const Json::Value root)
Load Json::Value into this object.
Definition: CVObjectDetection.cpp:885
openshot::CVDetectionData::boxes
std::vector< cv::Rect_< float > > boxes
Definition: CVObjectDetection.h:71
openshot::ZmqLogger::Instance
static ZmqLogger * Instance()
Create or get an instance of this logger singleton (invoke the class with this method)
Definition: ZmqLogger.cpp:35
openshot::CVObjectDetection::SetJson
void SetJson(const std::string value)
Load JSON string into this object.
Definition: CVObjectDetection.cpp:867
ProcessingController
Definition: ProcessingController.h:20
openshot::CVTrackedMaskData
Definition: CVObjectDetection.h:44
openshot::CVObjectDetection::ValidateONNXModel
static std::string ValidateONNXModel(std::string modelPath)
Definition: CVObjectDetection.cpp:259
openshot::CVObjectDetection::SaveObjDetectedData
bool SaveObjDetectedData()
Protobuf Save and Load methods.
Definition: CVObjectDetection.cpp:795
SortTracker::update
void update(std::vector< cv::Rect > detection, int frame_count, double image_diagonal, std::vector< float > confidences, std::vector< int > classIds, std::vector< std::vector< ClassScore >> classScores={})
Definition: sort.cpp:151
openshot::Clip::Reader
void Reader(openshot::ReaderBase *new_reader)
Set the current reader.
Definition: Clip.cpp:340
ProcessingController::SetProgress
void SetProgress(uint p)
Definition: ProcessingController.h:52
openshot::CVObjectDetection::detectObjectsClip
void detectObjectsClip(openshot::Clip &video, size_t start=0, size_t end=0, bool process_interval=false)
Definition: CVObjectDetection.cpp:306
openshot::CVObjectMaskData::height
int height
Definition: CVObjectDetection.h:38
Exceptions.h
Header file for all Exception classes.