OpenShot Library | libopenshot  0.7.0
CVObjectMask.cpp
Go to the documentation of this file.
1 
9 // Copyright (c) 2026 OpenShot Studios, LLC
10 //
11 // SPDX-License-Identifier: LGPL-3.0-or-later
12 
13 #include "CVObjectMask.h"
14 
15 #include "Exceptions.h"
16 #include "ZmqLogger.h"
17 #include "objdetectdata.pb.h"
18 
19 #define int64 int64_t
20 #define uint64 uint64_t
21 #include <opencv2/core/ocl.hpp>
22 #undef uint64
23 #undef int64
24 
25 #include <algorithm>
26 #include <cctype>
27 #include <cmath>
28 #include <deque>
29 #include <fstream>
30 #include <iostream>
31 #include <cstring>
32 #include <limits>
33 #include <numeric>
34 
35 #include <google/protobuf/util/time_util.h>
36 
37 using namespace openshot;
38 using google::protobuf::util::TimeUtil;
39 
40 namespace {
41 
42 std::string LoadONNXModel(const std::string& modelPath, cv::dnn::Net* net)
43 {
44  try {
45  cv::dnn::Net loadedNet = cv::dnn::readNetFromONNX(modelPath);
46  if (net)
47  *net = loadedNet;
48  return "";
49  } catch (const cv::Exception& e) {
50  return std::string("Failed to load ONNX model: ") + e.what();
51  } catch (const std::exception& e) {
52  return std::string("Failed to load ONNX model: ") + e.what();
53  }
54 }
55 
56 std::vector<uint32_t> EncodeBinaryMaskRLE(const cv::Mat& mask)
57 {
58  std::vector<uint32_t> rle;
59  if (mask.empty())
60  return rle;
61 
62  uint8_t current = 0;
63  uint32_t count = 0;
64  for (int y = 0; y < mask.rows; ++y) {
65  const uint8_t* row = mask.ptr<uint8_t>(y);
66  for (int x = 0; x < mask.cols; ++x) {
67  const uint8_t value = row[x] ? 1 : 0;
68  if (value == current) {
69  ++count;
70  } else {
71  rle.push_back(count);
72  current = value;
73  count = 1;
74  }
75  }
76  }
77  rle.push_back(count);
78  return rle;
79 }
80 
81 struct EfficientSamPreprocessResult {
82  cv::Mat blob;
83  float scaleX = 1.0f;
84  float scaleY = 1.0f;
85 };
86 
87 EfficientSamPreprocessResult MakeEfficientSamBlob(const cv::Mat& bgr, int modelSize)
88 {
89  EfficientSamPreprocessResult result;
90  result.scaleX = static_cast<float>(modelSize) / static_cast<float>(bgr.cols);
91  result.scaleY = static_cast<float>(modelSize) / static_cast<float>(bgr.rows);
92 
93  cv::Mat resized;
94  cv::resize(bgr, resized, cv::Size(modelSize, modelSize), 0, 0, cv::INTER_LINEAR);
95 
96  const int shape[] = {1, 3, modelSize, modelSize};
97  result.blob = cv::Mat(4, shape, CV_32F);
98  float* dst = result.blob.ptr<float>();
99 
100  for (int y = 0; y < resized.rows; ++y) {
101  const cv::Vec3b* row = resized.ptr<cv::Vec3b>(y);
102  for (int x = 0; x < resized.cols; ++x) {
103  const float rgb[] = {
104  static_cast<float>(row[x][2]) / 255.0f,
105  static_cast<float>(row[x][1]) / 255.0f,
106  static_cast<float>(row[x][0]) / 255.0f,
107  };
108  for (int c = 0; c < 3; ++c)
109  dst[(c * modelSize + y) * modelSize + x] = rgb[c];
110  }
111  }
112 
113  return result;
114 }
115 
116 cv::Rect_<float> NormalizedBoundingBox(const cv::Mat& mask)
117 {
118  std::vector<cv::Point> points;
119  cv::findNonZero(mask, points);
120  if (points.empty())
121  return {};
122 
123  cv::Rect rect = cv::boundingRect(points);
124  return cv::Rect_<float>(
125  rect.x / static_cast<float>(mask.cols),
126  rect.y / static_cast<float>(mask.rows),
127  rect.width / static_cast<float>(mask.cols),
128  rect.height / static_cast<float>(mask.rows));
129 }
130 
131 cv::Mat EfficientSamMaskToFrameMask(const cv::Mat& modelMask, const cv::Size& frameSize, float maskThreshold)
132 {
133  cv::Mat fullSize;
134  cv::resize(modelMask, fullSize, frameSize, 0, 0, cv::INTER_LINEAR);
135 
136  cv::Mat binary;
137  cv::threshold(fullSize, binary, maskThreshold, 255.0, cv::THRESH_BINARY);
138  if (cv::countNonZero(binary) == 0) {
139  double maxValue = 0.0;
140  cv::minMaxLoc(fullSize, nullptr, &maxValue);
141  if (maxValue > 0.0) {
142  cv::threshold(fullSize, binary, maxValue * 0.5, 255.0, cv::THRESH_BINARY);
143  }
144  }
145  binary.convertTo(binary, CV_8U);
146  return binary;
147 }
148 
149 cv::Mat MakeEfficientSamPromptBlob(
150  const CVObjectMaskPromptSet& prompts,
151  const EfficientSamPreprocessResult& prep,
152  int promptSlots,
153  std::vector<cv::Point>& backgroundPoints,
154  std::vector<cv::Rect>& backgroundRects)
155 {
156  const int coordsShape[] = {1, 1, promptSlots, 2};
157  cv::Mat pointCoords(4, coordsShape, CV_32F, cv::Scalar(0.0f));
158 
159  float* coords = pointCoords.ptr<float>();
160  int promptIndex = 0;
161  for (const auto& rect : prompts.positiveRects) {
162  if (promptIndex + 1 >= promptSlots)
163  break;
164  coords[promptIndex * 2] = rect.x * prep.scaleX;
165  coords[promptIndex * 2 + 1] = rect.y * prep.scaleY;
166  ++promptIndex;
167  coords[promptIndex * 2] = (rect.x + rect.width) * prep.scaleX;
168  coords[promptIndex * 2 + 1] = (rect.y + rect.height) * prep.scaleY;
169  ++promptIndex;
170  }
171  for (const auto& point : prompts.positivePoints) {
172  if (promptIndex >= promptSlots)
173  break;
174  coords[promptIndex * 2] = point.x * prep.scaleX;
175  coords[promptIndex * 2 + 1] = point.y * prep.scaleY;
176  ++promptIndex;
177  }
178  for (const auto& point : prompts.negativePoints) {
179  backgroundPoints.emplace_back(
180  static_cast<int>(std::lround(point.x * prep.scaleX)),
181  static_cast<int>(std::lround(point.y * prep.scaleY)));
182  }
183  for (const auto& rect : prompts.negativeRects) {
184  const int x1 = static_cast<int>(std::floor(rect.x * prep.scaleX));
185  const int y1 = static_cast<int>(std::floor(rect.y * prep.scaleY));
186  const int x2 = static_cast<int>(std::ceil((rect.x + rect.width) * prep.scaleX));
187  const int y2 = static_cast<int>(std::ceil((rect.y + rect.height) * prep.scaleY));
188  const int modelWidth = prep.blob.size[3];
189  const int modelHeight = prep.blob.size[2];
190  const int left = std::max(0, std::min(modelWidth - 1, x1));
191  const int top = std::max(0, std::min(modelHeight - 1, y1));
192  const int right = std::max(left + 1, std::min(modelWidth, x2));
193  const int bottom = std::max(top + 1, std::min(modelHeight, y2));
194  backgroundRects.emplace_back(left, top, right - left, bottom - top);
195  }
196 
197  return pointCoords;
198 }
199 
200 cv::Mat MakeEfficientSamLabelBlob(const CVObjectMaskPromptSet& prompts, int promptSlots)
201 {
202  const int labelsShape[] = {1, 1, promptSlots, 1};
203  cv::Mat pointLabels(4, labelsShape, CV_32F, cv::Scalar(-1.0f));
204 
205  float* labels = pointLabels.ptr<float>();
206  int promptIndex = 0;
207  for (size_t i = 0; i < prompts.positiveRects.size() && promptIndex + 1 < promptSlots; ++i) {
208  labels[promptIndex++] = 2.0f;
209  labels[promptIndex++] = 3.0f;
210  }
211  for (size_t i = 0; i < prompts.positivePoints.size() && promptIndex < promptSlots; ++i, ++promptIndex)
212  labels[promptIndex] = 1.0f;
213 
214  return pointLabels;
215 }
216 
217 cv::Mat SelectEfficientSamMask(const cv::Mat& outputMasks, const cv::Mat& iouPredictions,
218  const std::vector<cv::Point>& backgroundPoints,
219  const std::vector<cv::Rect>& backgroundRects,
220  float maskThreshold)
221 {
222  if (outputMasks.dims != 5 || iouPredictions.empty())
223  return cv::Mat();
224 
225  const int candidateCount = outputMasks.size[2];
226  const int maskHeight = outputMasks.size[3];
227  const int maskWidth = outputMasks.size[4];
228  const float* ious = iouPredictions.ptr<float>();
229 
230  const float* masks = outputMasks.ptr<float>();
231  const size_t candidatePixels = static_cast<size_t>(maskHeight) * static_cast<size_t>(maskWidth);
232  cv::Mat bestMask;
233  float bestScore = -std::numeric_limits<float>::infinity();
234  for (int candidate = 0; candidate < candidateCount; ++candidate) {
235  cv::Mat mask(maskHeight, maskWidth, CV_32F,
236  const_cast<float*>(masks + static_cast<size_t>(candidate) * candidatePixels));
237 
238  int backgroundHits = 0;
239  for (const cv::Point& point : backgroundPoints) {
240  const int x = std::max(0, std::min(maskWidth - 1, point.x));
241  const int y = std::max(0, std::min(maskHeight - 1, point.y));
242  if (mask.at<float>(y, x) >= maskThreshold)
243  ++backgroundHits;
244  }
245 
246  float rectOverlapPenalty = 0.0f;
247  for (const cv::Rect& rect : backgroundRects) {
248  const cv::Rect clipped = rect & cv::Rect(0, 0, maskWidth, maskHeight);
249  const int area = clipped.area();
250  if (area <= 0)
251  continue;
252  int overlap = 0;
253  for (int y = clipped.y; y < clipped.y + clipped.height; ++y) {
254  const float* row = mask.ptr<float>(y);
255  for (int x = clipped.x; x < clipped.x + clipped.width; ++x) {
256  if (row[x] >= maskThreshold)
257  ++overlap;
258  }
259  }
260  rectOverlapPenalty += static_cast<float>(overlap) / static_cast<float>(area);
261  }
262 
263  const float pointPenalty = backgroundPoints.empty()
264  ? 0.0f
265  : static_cast<float>(backgroundHits) / static_cast<float>(backgroundPoints.size());
266  if (!backgroundRects.empty())
267  rectOverlapPenalty /= static_cast<float>(backgroundRects.size());
268 
269  const float score = ious[candidate] - (0.35f * pointPenalty) - (0.75f * rectOverlapPenalty);
270  if (bestMask.empty() || score > bestScore) {
271  bestScore = score;
272  bestMask = mask.clone();
273  }
274  }
275  return bestMask;
276 }
277 
278 CVObjectMaskFrameData FrameDataFromMask(const cv::Mat& mask, size_t frameId, float score)
279 {
280  CVObjectMaskFrameData frameData;
281  frameData.frameId = frameId;
282  frameData.objectId = 1;
283  if (mask.empty())
284  return frameData;
285 
286  frameData.score = score;
287  frameData.width = mask.cols;
288  frameData.height = mask.rows;
289  frameData.rle = EncodeBinaryMaskRLE(mask);
290  frameData.box = NormalizedBoundingBox(mask);
291  return frameData;
292 }
293 
294 cv::Point2f JsonPoint(const Json::Value& value)
295 {
296  if (!value.isObject() || value["x"].isNull() || value["y"].isNull())
297  return cv::Point2f(-1.0f, -1.0f);
298  return cv::Point2f(value["x"].asFloat(), value["y"].asFloat());
299 }
300 
301 bool IsValidPoint(const cv::Point2f& point)
302 {
303  return point.x >= 0.0f && point.y >= 0.0f;
304 }
305 
306 void AppendJsonPoints(const Json::Value& values, std::vector<cv::Point2f>& points)
307 {
308  if (!values.isArray())
309  return;
310  for (const auto& value : values) {
311  cv::Point2f point = JsonPoint(value);
312  if (IsValidPoint(point))
313  points.push_back(point);
314  }
315 }
316 
/// Convert a frame-number string to a frame index.
///
/// @param frameName Text parsed with std::stoi (leading integer prefix).
/// @return The parsed number, or 0 when it is negative, not a number, or
///         outside the range of int.
size_t JsonFrameNumber(const std::string& frameName)
{
    int parsed = 0;
    try {
        parsed = std::stoi(frameName);
    } catch (...) {
        // Unparsable or out-of-range input maps to frame 0.
        return 0;
    }

    if (parsed < 0)
        parsed = 0;
    return static_cast<size_t>(parsed);
}
325 
326 bool RectFromJson(const Json::Value& rect, cv::Rect_<float>& output)
327 {
328  if (!rect.isObject() || rect["x1"].isNull() || rect["y1"].isNull() ||
329  rect["x2"].isNull() || rect["y2"].isNull()) {
330  return false;
331  }
332 
333  const float x1 = std::min(rect["x1"].asFloat(), rect["x2"].asFloat());
334  const float y1 = std::min(rect["y1"].asFloat(), rect["y2"].asFloat());
335  const float x2 = std::max(rect["x1"].asFloat(), rect["x2"].asFloat());
336  const float y2 = std::max(rect["y1"].asFloat(), rect["y2"].asFloat());
337  cv::Point2f topLeft(x1, y1);
338  cv::Point2f bottomRight(x2, y2);
339  if (!IsValidPoint(topLeft) || !IsValidPoint(bottomRight) || x2 <= x1 || y2 <= y1)
340  return false;
341 
342  output = cv::Rect_<float>(x1, y1, x2 - x1, y2 - y1);
343  return true;
344 }
345 
346 void AppendJsonRects(const Json::Value& values, std::vector<cv::Rect_<float>>& rects)
347 {
348  if (!values.isArray())
349  return;
350  for (const auto& rect : values) {
351  cv::Rect_<float> parsed;
352  if (RectFromJson(rect, parsed))
353  rects.push_back(parsed);
354  }
355 }
356 
357 CVObjectMaskPromptSet PromptSetFromJson(const Json::Value& framePayload)
358 {
359  CVObjectMaskPromptSet prompts;
360  AppendJsonPoints(framePayload["positive_points"], prompts.positivePoints);
361  AppendJsonPoints(framePayload["negative_points"], prompts.negativePoints);
362  AppendJsonRects(framePayload["positive_rects"], prompts.positiveRects);
363  AppendJsonRects(framePayload["negative_rects"], prompts.negativeRects);
364  return prompts;
365 }
366 
367 cv::Mat MakeBlob(const std::vector<int>& shape, float value = 0.0f)
368 {
369  cv::Mat output(static_cast<int>(shape.size()), shape.data(), CV_32F);
370  output.setTo(value);
371  return output;
372 }
373 
374 std::string SetNetDevice(cv::dnn::Net& net, const std::string& processingDevice)
375 {
376  if (processingDevice == "CPU") {
377  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
378  net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
379  return "CPU";
380  }
381 
382  if (processingDevice == "GPU" || processingDevice == "GPU_AUTO" || processingDevice == "GPU_CUDA") {
383  try {
384  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
385  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
386  net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
387  net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
388  return "CUDA";
389  }
390  } catch (const cv::Exception&) {
391  }
392  }
393 
394  if (processingDevice == "GPU_OPENCL") {
395  try {
396  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
397  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
398  cv::ocl::setUseOpenCL(true);
399  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
400  net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
401  return "OpenCL";
402  }
403  } catch (const cv::Exception&) {
404  }
405  }
406 
407  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
408  net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
409  return "CPU";
410 }
411 
412 class CutiePropagator {
413 private:
414  static constexpr int memorySlots = 6;
415  int modelWidth = 640;
416  int modelHeight = 368;
417  int stride16Width = modelWidth / 16;
418  int stride16Height = modelHeight / 16;
419 
420  struct MemoryFrame {
421  cv::Mat key;
422  cv::Mat shrinkage;
423  cv::Mat value;
424  cv::Mat valid;
425  };
426 
427  struct LetterboxTransform {
428  cv::Size originalSize;
429  cv::Rect contentRect;
430  };
431 
432  cv::dnn::Net encodeKey;
433  cv::dnn::Net encodeValue;
434  cv::dnn::Net memoryReadout;
435  cv::dnn::Net decode;
436  cv::Mat sensory;
437  cv::Mat lastMask;
438  cv::Mat objectMemory;
439  MemoryFrame permanentMemory;
440  bool hasPermanentMemory = false;
441  std::deque<MemoryFrame> workingMemoryFrames;
442  int frameIndex = 0;
443  int lastMemoryFrame = -1000000;
444  int memEvery = 5;
445  int maxMemoryFrames = memorySlots;
446 
447  static bool ParseModelSize(const std::string& modelPath, int& width, int& height)
448  {
449  size_t xPos = modelPath.find('x');
450  while (xPos != std::string::npos) {
451  size_t widthStart = xPos;
452  while (widthStart > 0 && std::isdigit(static_cast<unsigned char>(modelPath[widthStart - 1])))
453  --widthStart;
454 
455  size_t heightEnd = xPos + 1;
456  while (heightEnd < modelPath.size() && std::isdigit(static_cast<unsigned char>(modelPath[heightEnd])))
457  ++heightEnd;
458 
459  if (widthStart != xPos && heightEnd != xPos + 1) {
460  width = std::stoi(modelPath.substr(widthStart, xPos - widthStart));
461  height = std::stoi(modelPath.substr(xPos + 1, heightEnd - xPos - 1));
462  if (width > 0 && height > 0 && width % 16 == 0 && height % 16 == 0)
463  return true;
464  }
465  xPos = modelPath.find('x', xPos + 1);
466  }
467  return false;
468  }
469 
470  void ConfigureModelSize(const std::string& modelPath)
471  {
472  int width = modelWidth;
473  int height = modelHeight;
474  if (!ParseModelSize(modelPath, width, height))
475  return;
476  modelWidth = width;
477  modelHeight = height;
478  stride16Width = modelWidth / 16;
479  stride16Height = modelHeight / 16;
480  }
481 
482  LetterboxTransform ComputeLetterbox(const cv::Size& sourceSize) const
483  {
484  LetterboxTransform transform;
485  transform.originalSize = sourceSize;
486  if (sourceSize.width <= 0 || sourceSize.height <= 0) {
487  transform.contentRect = cv::Rect(0, 0, modelWidth, modelHeight);
488  return transform;
489  }
490 
491  const float scaleX = static_cast<float>(modelWidth) / static_cast<float>(sourceSize.width);
492  const float scaleY = static_cast<float>(modelHeight) / static_cast<float>(sourceSize.height);
493  const float scale = std::min(scaleX, scaleY);
494 
495  const int resizedWidth = std::max(1, std::min(
496  modelWidth, static_cast<int>(std::lround(sourceSize.width * scale))));
497  const int resizedHeight = std::max(1, std::min(
498  modelHeight, static_cast<int>(std::lround(sourceSize.height * scale))));
499  const int offsetX = (modelWidth - resizedWidth) / 2;
500  const int offsetY = (modelHeight - resizedHeight) / 2;
501  transform.contentRect = cv::Rect(offsetX, offsetY, resizedWidth, resizedHeight);
502  return transform;
503  }
504 
505  cv::Mat MakeImageBlob(const cv::Mat& bgr, const LetterboxTransform& transform) const
506  {
507  cv::Mat resized;
508  cv::resize(bgr, resized, transform.contentRect.size(), 0, 0, cv::INTER_LINEAR);
509  cv::Mat canvas(modelHeight, modelWidth, bgr.type(), cv::Scalar::all(0));
510  resized.copyTo(canvas(transform.contentRect));
511 
512  const int shape[] = {1, 3, modelHeight, modelWidth};
513  cv::Mat blob(4, shape, CV_32F);
514  float* dst = blob.ptr<float>();
515  for (int y = 0; y < canvas.rows; ++y) {
516  const cv::Vec3b* row = canvas.ptr<cv::Vec3b>(y);
517  for (int x = 0; x < canvas.cols; ++x) {
518  dst[(0 * modelHeight + y) * modelWidth + x] = static_cast<float>(row[x][2]) / 255.0f;
519  dst[(1 * modelHeight + y) * modelWidth + x] = static_cast<float>(row[x][1]) / 255.0f;
520  dst[(2 * modelHeight + y) * modelWidth + x] = static_cast<float>(row[x][0]) / 255.0f;
521  }
522  }
523  return blob;
524  }
525 
526  cv::Mat MakeMaskBlob(const cv::Mat& mask, const LetterboxTransform& transform) const
527  {
528  cv::Mat resized;
529  cv::resize(mask, resized, transform.contentRect.size(), 0, 0, cv::INTER_NEAREST);
530  cv::Mat canvas(modelHeight, modelWidth, CV_8U, cv::Scalar(0));
531  resized.copyTo(canvas(transform.contentRect));
532 
533  const int shape[] = {1, 1, modelHeight, modelWidth};
534  cv::Mat blob(4, shape, CV_32F, cv::Scalar(0.0f));
535  float* dst = blob.ptr<float>();
536  for (int y = 0; y < canvas.rows; ++y) {
537  const uint8_t* row = canvas.ptr<uint8_t>(y);
538  for (int x = 0; x < canvas.cols; ++x)
539  dst[y * modelWidth + x] = row[x] ? 1.0f : 0.0f;
540  }
541  return blob;
542  }
543 
544  cv::Mat ForegroundFromProb(const cv::Mat& prob) const
545  {
546  const int shape[] = {1, 1, modelHeight, modelWidth};
547  cv::Mat foreground(4, shape, CV_32F);
548  const float* src = prob.ptr<float>();
549  float* dst = foreground.ptr<float>();
550  const int plane = modelWidth * modelHeight;
551  std::memcpy(dst, src + plane, sizeof(float) * plane);
552  return foreground;
553  }
554 
555  cv::Mat BinaryMaskFromForeground(const cv::Mat& foreground, const LetterboxTransform& transform) const
556  {
557  cv::Mat modelMask(modelHeight, modelWidth, CV_8U, cv::Scalar(0));
558  const float* src = foreground.ptr<float>();
559  for (int y = 0; y < modelMask.rows; ++y) {
560  uint8_t* row = modelMask.ptr<uint8_t>(y);
561  for (int x = 0; x < modelMask.cols; ++x)
562  row[x] = src[y * modelWidth + x] >= 0.5f ? 255 : 0;
563  }
564 
565  cv::Mat cropped = modelMask(transform.contentRect);
566  cv::Mat restored;
567  cv::resize(cropped, restored, transform.originalSize, 0, 0, cv::INTER_NEAREST);
568  return restored;
569  }
570 
571  cv::Mat ValidMaskFromLetterbox(const LetterboxTransform& transform) const
572  {
573  cv::Mat valid(stride16Height, stride16Width, CV_32F, cv::Scalar(0.0f));
574  for (int y = 0; y < stride16Height; ++y) {
575  float* row = valid.ptr<float>(y);
576  const int centerY = y * 16 + 8;
577  for (int x = 0; x < stride16Width; ++x) {
578  const int centerX = x * 16 + 8;
579  if (transform.contentRect.contains(cv::Point(centerX, centerY)))
580  row[x] = 1.0f;
581  }
582  }
583 
584  const int shape[] = {1, 1, stride16Height, stride16Width};
585  cv::Mat blob(4, shape, CV_32F);
586  std::memcpy(blob.ptr<float>(), valid.ptr<float>(), sizeof(float) * valid.total());
587  return blob;
588  }
589 
590  void CopyKeySlot(const cv::Mat& src, cv::Mat& dst, int slot, int channels) const
591  {
592  const float* in = src.ptr<float>();
593  float* out = dst.ptr<float>();
594  const int plane = stride16Width * stride16Height;
595  for (int c = 0; c < channels; ++c) {
596  std::memcpy(out + (c * memorySlots + slot) * plane,
597  in + c * plane,
598  sizeof(float) * plane);
599  }
600  }
601 
602  void CopyValueSlot(const cv::Mat& src, cv::Mat& dst, int slot) const
603  {
604  const float* in = src.ptr<float>();
605  float* out = dst.ptr<float>();
606  const int plane = stride16Width * stride16Height;
607  for (int c = 0; c < 256; ++c) {
608  std::memcpy(out + (c * memorySlots + slot) * plane,
609  in + c * plane,
610  sizeof(float) * plane);
611  }
612  }
613 
614  cv::Mat MemoryKeyBlob() const
615  {
616  cv::Mat output = MakeBlob({1, 64, memorySlots, stride16Height, stride16Width});
617  int slot = 0;
618  if (hasPermanentMemory)
619  CopyKeySlot(permanentMemory.key, output, slot++, 64);
620  for (int index = 0;
621  index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
622  ++index, ++slot)
623  CopyKeySlot(workingMemoryFrames[index].key, output, slot, 64);
624  return output;
625  }
626 
627  cv::Mat MemoryShrinkageBlob() const
628  {
629  cv::Mat output = MakeBlob({1, 1, memorySlots, stride16Height, stride16Width});
630  int slot = 0;
631  if (hasPermanentMemory)
632  CopyKeySlot(permanentMemory.shrinkage, output, slot++, 1);
633  for (int index = 0;
634  index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
635  ++index, ++slot)
636  CopyKeySlot(workingMemoryFrames[index].shrinkage, output, slot, 1);
637  return output;
638  }
639 
640  cv::Mat MemoryValueBlob() const
641  {
642  cv::Mat output = MakeBlob({1, 1, 256, memorySlots, stride16Height, stride16Width});
643  int slot = 0;
644  if (hasPermanentMemory)
645  CopyValueSlot(permanentMemory.value, output, slot++);
646  for (int index = 0;
647  index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
648  ++index, ++slot)
649  CopyValueSlot(workingMemoryFrames[index].value, output, slot);
650  return output;
651  }
652 
653  cv::Mat MemoryValidBlob() const
654  {
655  cv::Mat output = MakeBlob({1, 1, memorySlots, stride16Height, stride16Width});
656  float* data = output.ptr<float>();
657  const int plane = stride16Width * stride16Height;
658  auto copyValidSlot = [&](const cv::Mat& valid, int slot) {
659  std::memcpy(data + slot * plane, valid.ptr<float>(), sizeof(float) * plane);
660  };
661 
662  int slot = 0;
663  if (hasPermanentMemory)
664  copyValidSlot(permanentMemory.valid, slot++);
665  for (int index = 0;
666  index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
667  ++index, ++slot)
668  copyValidSlot(workingMemoryFrames[index].valid, slot);
669  return output;
670  }
671 
672  void AddMemory(const cv::Mat& key, const cv::Mat& shrinkage, const cv::Mat& value,
673  const cv::Mat& valid, bool asPermanent)
674  {
675  MemoryFrame frame;
676  frame.key = key.clone();
677  frame.shrinkage = shrinkage.clone();
678  frame.value = value.clone();
679  frame.valid = valid.clone();
680 
681  if (asPermanent || !hasPermanentMemory) {
682  permanentMemory = frame;
683  hasPermanentMemory = true;
684  return;
685  }
686 
687  workingMemoryFrames.push_back(frame);
688  const int workingCapacity = std::max(0, maxMemoryFrames - 1);
689  while (static_cast<int>(workingMemoryFrames.size()) > workingCapacity)
690  workingMemoryFrames.pop_front();
691  }
692 
693  void AddObjectMemory(const cv::Mat& value)
694  {
695  if (objectMemory.empty()) {
696  objectMemory = MakeBlob({1, 1, 1, 16, 257});
697  std::memcpy(objectMemory.ptr<float>(), value.ptr<float>(), sizeof(float) * value.total());
698  return;
699  }
700 
701  float* dst = objectMemory.ptr<float>();
702  const float* src = value.ptr<float>();
703  for (size_t i = 0; i < value.total(); ++i)
704  dst[i] += src[i];
705  }
706 
707 public:
708  void Load(const std::string& encodeKeyPath, const std::string& encodeValuePath,
709  const std::string& memoryReadoutPath, const std::string& decodePath)
710  {
711  ConfigureModelSize(encodeKeyPath);
712  encodeKey = cv::dnn::readNetFromONNX(encodeKeyPath);
713  encodeValue = cv::dnn::readNetFromONNX(encodeValuePath);
714  memoryReadout = cv::dnn::readNetFromONNX(memoryReadoutPath);
715  decode = cv::dnn::readNetFromONNX(decodePath);
716  sensory = MakeBlob({1, 1, 256, stride16Height, stride16Width});
717  }
718 
719  std::string SetDevice(const std::string& processingDevice)
720  {
721  std::string selected = SetNetDevice(encodeKey, processingDevice);
722  const std::string valueDevice = SetNetDevice(encodeValue, processingDevice);
723  const std::string readoutDevice = SetNetDevice(memoryReadout, processingDevice);
724  const std::string decodeDevice = SetNetDevice(decode, processingDevice);
725  if (selected != valueDevice || selected != readoutDevice || selected != decodeDevice)
726  return "Mixed";
727  return selected;
728  }
729 
730  void Reset()
731  {
732  sensory = MakeBlob({1, 1, 256, stride16Height, stride16Width});
733  lastMask.release();
734  objectMemory.release();
735  permanentMemory = MemoryFrame();
736  hasPermanentMemory = false;
737  workingMemoryFrames.clear();
738  frameIndex = 0;
739  lastMemoryFrame = -1000000;
740  }
741 
742  bool HasMemory() const
743  {
744  return hasPermanentMemory || !workingMemoryFrames.empty();
745  }
746 
747  cv::Mat Step(const cv::Mat& frame, const cv::Mat& seedMask = cv::Mat())
748  {
749  const LetterboxTransform transform = ComputeLetterbox(frame.size());
750  const cv::Mat validMask = ValidMaskFromLetterbox(transform);
751  cv::Mat image = MakeImageBlob(frame, transform);
752 
753  encodeKey.setInput(image, "image");
754  std::vector<cv::Mat> keyOutputs;
755  encodeKey.forward(keyOutputs, std::vector<cv::String>{"f16", "f8", "f4", "pix_feat", "key", "shrinkage", "selection"});
756  cv::Mat f8 = keyOutputs[1];
757  cv::Mat f4 = keyOutputs[2];
758  cv::Mat pixFeat = keyOutputs[3];
759  cv::Mat key = keyOutputs[4];
760  cv::Mat shrinkage = keyOutputs[5];
761  cv::Mat selection = keyOutputs[6];
762 
763  cv::Mat foreground;
764  if (!seedMask.empty()) {
765  foreground = MakeMaskBlob(seedMask, transform);
766  } else if (HasMemory()) {
767  memoryReadout.setInput(key, "query_key");
768  memoryReadout.setInput(selection, "query_selection");
769  memoryReadout.setInput(MemoryKeyBlob(), "memory_key");
770  memoryReadout.setInput(MemoryShrinkageBlob(), "memory_shrinkage");
771  memoryReadout.setInput(MemoryValueBlob(), "memory_value");
772  memoryReadout.setInput(MemoryValidBlob(), "memory_valid");
773  memoryReadout.setInput(objectMemory, "object_memory");
774  memoryReadout.setInput(pixFeat, "pix_feat");
775  memoryReadout.setInput(sensory, "sensory");
776  memoryReadout.setInput(lastMask, "last_mask");
777  std::vector<cv::Mat> readoutOutputs;
778  memoryReadout.forward(readoutOutputs, std::vector<cv::String>{"memory_readout"});
779 
780  decode.setInput(f8, "f8");
781  decode.setInput(f4, "f4");
782  decode.setInput(readoutOutputs[0], "memory_readout");
783  decode.setInput(sensory, "sensory");
784  std::vector<cv::Mat> decodeOutputs;
785  decode.forward(decodeOutputs, std::vector<cv::String>{"new_sensory", "logits", "prob"});
786  sensory = decodeOutputs[0].clone();
787  foreground = ForegroundFromProb(decodeOutputs[2]);
788  } else {
789  ++frameIndex;
790  return cv::Mat();
791  }
792 
793  const bool isMemoryFrame = !seedMask.empty() || frameIndex - lastMemoryFrame >= memEvery;
794  if (isMemoryFrame) {
795  encodeValue.setInput(image, "image");
796  encodeValue.setInput(pixFeat, "pix_feat");
797  encodeValue.setInput(sensory, "sensory");
798  encodeValue.setInput(foreground, "mask");
799  std::vector<cv::Mat> valueOutputs;
800  encodeValue.forward(valueOutputs, std::vector<cv::String>{"mask_value", "new_sensory", "object_memory"});
801  sensory = valueOutputs[1].clone();
802  AddObjectMemory(valueOutputs[2]);
803  AddMemory(key, shrinkage, valueOutputs[0], validMask, !seedMask.empty());
804  lastMemoryFrame = frameIndex;
805  }
806 
807  lastMask = foreground.clone();
808  cv::Mat outputMask = BinaryMaskFromForeground(foreground, transform);
809  ++frameIndex;
810  return outputMask;
811  }
812 };
813 
814 }
815 
816 CVObjectMask::CVObjectMask(std::string processInfoJson, ProcessingController& controller)
817  : processingController(&controller)
818 {
819  SetJson(processInfoJson);
820 }
821 
822 std::string CVObjectMask::ValidateONNXModel(std::string modelPath)
823 {
824  return LoadONNXModel(modelPath, nullptr);
825 }
826 
827 std::shared_ptr<Frame> CVObjectMask::PreviewSeedMask(std::shared_ptr<Frame> frame)
828 {
829  if (!frame || efficientSamModelPath.empty() || promptKeyframes.empty())
830  return std::shared_ptr<Frame>();
831 
832  std::string loadError = LoadONNXModel(efficientSamModelPath, &efficientSam);
833  if (!loadError.empty())
834  return std::shared_ptr<Frame>();
835  SetProcessingDevice();
836 
837  CVObjectMaskPromptSet prompts = promptKeyframes.begin()->second;
838  cv::Mat frameImage = frame->GetImageCV();
839  cv::Mat seedMask = CreateEfficientSAMSeedMask(frameImage, prompts);
840  if (seedMask.empty())
841  return std::shared_ptr<Frame>();
842 
843  auto maskImage = std::make_shared<QImage>(
844  seedMask.cols, seedMask.rows, QImage::Format_RGBA8888_Premultiplied);
845  maskImage->fill(Qt::transparent);
846  for (int y = 0; y < seedMask.rows; ++y) {
847  const uint8_t* src = seedMask.ptr<uint8_t>(y);
848  QRgb* dst = reinterpret_cast<QRgb*>(maskImage->scanLine(y));
849  for (int x = 0; x < seedMask.cols; ++x)
850  dst[x] = src[x] ? qRgba(255, 255, 255, 255) : qRgba(0, 0, 0, 0);
851  }
852 
853  auto result = std::make_shared<Frame>(frame->number, seedMask.cols, seedMask.rows, "#000000");
854  result->AddImage(maskImage);
855  return result;
856 }
857 
858 void CVObjectMask::SetProcessingDevice()
859 {
860  const std::string requestedDevice = processingDevice;
861  if (processingDevice == "CPU") {
862  efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
863  efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
864  ZmqLogger::Instance()->Log("Object Mask EfficientSAM DNN device: requested CPU, selected CPU");
865  return;
866  }
867 
868  if (processingDevice == "GPU" || processingDevice == "GPU_AUTO" || processingDevice == "GPU_CUDA") {
869  try {
870  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
871  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
872  efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
873  efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
874  ZmqLogger::Instance()->Log("Object Mask EfficientSAM DNN device: requested " + requestedDevice + ", selected CUDA");
875  return;
876  }
877  } catch (const cv::Exception&) {
878  }
879  }
880 
881  if (processingDevice == "GPU_OPENCL") {
882  try {
883  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
884  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
885  cv::ocl::setUseOpenCL(true);
886  efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
887  efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
888  ZmqLogger::Instance()->Log("Object Mask EfficientSAM DNN device: requested " + requestedDevice + ", selected OpenCL");
889  return;
890  }
891  } catch (const cv::Exception&) {
892  }
893  }
894 
895  processingDevice = "CPU";
896  efficientSam.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
897  efficientSam.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
898  ZmqLogger::Instance()->Log("Object Mask EfficientSAM DNN device: requested " + requestedDevice + ", selected CPU");
899 }
900 
901 void CVObjectMask::maskClip(openshot::Clip& video, size_t _start, size_t _end, bool process_interval)
902 {
903  start = _start;
904  end = _end;
905 
906  video.Open();
907  processingController->SetError(false, "");
908 
909  if (efficientSamModelPath.empty()) {
910  processingController->SetError(true, "Missing path to EfficientSAM ONNX model file");
911  error = true;
912  return;
913  }
914  if (protobufDataPath.empty()) {
915  processingController->SetError(true, "Missing path to object mask protobuf data file");
916  error = true;
917  return;
918  }
919  if (promptKeyframes.empty()) {
920  processingController->SetError(true, "Missing positive prompt point for Object Mask preprocessing");
921  error = true;
922  return;
923  }
924 
925  std::string loadError = LoadONNXModel(efficientSamModelPath, &efficientSam);
926  if (!loadError.empty()) {
927  processingController->SetError(true, loadError);
928  error = true;
929  return;
930  }
931  SetProcessingDevice();
932 
933  CutiePropagator cutie;
934  if (cutieEncodeKeyModelPath.empty() && !cutieModelDir.empty())
935  cutieEncodeKeyModelPath = cutieModelDir + "/cutie-encode-key-640x368.onnx";
936  if (cutieEncodeValueModelPath.empty() && !cutieModelDir.empty())
937  cutieEncodeValueModelPath = cutieModelDir + "/cutie-encode-value-640x368.onnx";
938  if (cutieMemoryReadoutModelPath.empty() && !cutieModelDir.empty())
939  cutieMemoryReadoutModelPath = cutieModelDir + "/cutie-memory-readout-floatmask-valid-640x368-m6-topk30-opencv.onnx";
940  if (cutieDecodeModelPath.empty() && !cutieModelDir.empty())
941  cutieDecodeModelPath = cutieModelDir + "/cutie-decode-640x368.onnx";
942  if (cutieEncodeKeyModelPath.empty() || cutieEncodeValueModelPath.empty() ||
943  cutieMemoryReadoutModelPath.empty() || cutieDecodeModelPath.empty()) {
944  processingController->SetError(true, "Missing path to Cutie ONNX model files");
945  error = true;
946  return;
947  }
948  try {
949  cutie.Load(cutieEncodeKeyModelPath, cutieEncodeValueModelPath, cutieMemoryReadoutModelPath, cutieDecodeModelPath);
950  const std::string cutieDevice = cutie.SetDevice(processingDevice);
951  ZmqLogger::Instance()->Log("Object Mask Cutie DNN device: requested " + processingDevice + ", selected " + cutieDevice);
952  } catch (const cv::Exception& e) {
953  processingController->SetError(true, std::string("Failed to load Cutie ONNX models: ") + e.what());
954  error = true;
955  return;
956  } catch (const std::exception& e) {
957  processingController->SetError(true, std::string("Failed to load Cutie ONNX models: ") + e.what());
958  error = true;
959  return;
960  }
961 
962  if (!process_interval || end <= 1 || end - start == 0) {
963  start = static_cast<size_t>(video.Start() * video.Reader()->info.fps.ToFloat());
964  end = static_cast<size_t>(video.End() * video.Reader()->info.fps.ToFloat());
965  }
966  if (end < start)
967  end = start;
968 
969  CVObjectMaskPromptSet activePrompts;
970  auto promptBeforeStart = promptKeyframes.upper_bound(start);
971  if (promptBeforeStart != promptKeyframes.begin()) {
972  --promptBeforeStart;
973  activePrompts = promptBeforeStart->second;
974  }
975  auto firstPromptAtOrAfterStart = promptKeyframes.lower_bound(start);
976 
977  for (size_t frameNumber = start; frameNumber <= end; ++frameNumber) {
978  if (processingController->ShouldStop())
979  return;
980 
981  std::shared_ptr<openshot::Frame> frame = video.GetFrame(frameNumber);
982  if (!frame)
983  continue;
984 
985  auto promptIt = promptKeyframes.find(frameNumber);
986  bool isPromptKeyframe = promptIt != promptKeyframes.end();
987  if (promptIt != promptKeyframes.end()) {
988  activePrompts = promptIt->second;
989  cutie.Reset();
990  } else if (!activePrompts.HasPositivePrompt()) {
991  if (firstPromptAtOrAfterStart != promptKeyframes.end() && frameNumber >= firstPromptAtOrAfterStart->first) {
992  activePrompts = firstPromptAtOrAfterStart->second;
993  isPromptKeyframe = true;
994  cutie.Reset();
995  } else {
996  CVObjectMaskFrameData emptyFrame;
997  emptyFrame.frameId = frameNumber;
998  masksData[frameNumber] = emptyFrame;
999  continue;
1000  }
1001  }
1002 
1003  const cv::Mat frameImage = frame->GetImageCV();
1004  cv::Mat seedMask;
1005  if (isPromptKeyframe || !cutie.HasMemory()) {
1006  seedMask = CreateEfficientSAMSeedMask(frameImage, activePrompts);
1007  if (seedMask.empty()) {
1008  CVObjectMaskFrameData emptyFrame;
1009  emptyFrame.frameId = frameNumber;
1010  masksData[frameNumber] = emptyFrame;
1011  continue;
1012  }
1013  if (!isPromptKeyframe)
1014  cutie.Reset();
1015  }
1016 
1017  cv::Mat propagatedMask;
1018  try {
1019  propagatedMask = cutie.Step(frameImage, seedMask);
1020  } catch (const cv::Exception& e) {
1021  processingController->SetError(true, std::string("Failed to propagate Object Mask with Cutie: ") + e.what());
1022  error = true;
1023  return;
1024  }
1025 
1026  cv::Mat outputMask;
1027  if (!seedMask.empty()) {
1028  outputMask = seedMask;
1029  } else if (!propagatedMask.empty()) {
1030  cv::resize(propagatedMask, outputMask, frameImage.size(), 0, 0, cv::INTER_NEAREST);
1031  }
1032  masksData[frameNumber] = FrameDataFromMask(outputMask, frameNumber, 1.0f);
1033 
1034  const size_t range = std::max<size_t>(1, end - start);
1035  processingController->SetProgress(uint(100 * (frameNumber - start) / range));
1036  }
1037 }
1038 
1039 cv::Mat CVObjectMask::CreateEfficientSAMSeedMask(const cv::Mat& frame, const CVObjectMaskPromptSet& prompts)
1040 {
1041  EfficientSamPreprocessResult prep = MakeEfficientSamBlob(frame, modelSize);
1042 
1043  auto runPromptSet = [&](const CVObjectMaskPromptSet& promptSet) -> cv::Mat {
1044  std::vector<cv::Point> backgroundPoints;
1045  std::vector<cv::Rect> backgroundRects;
1046  cv::Mat pointCoords = MakeEfficientSamPromptBlob(promptSet, prep, promptSlots, backgroundPoints, backgroundRects);
1047  cv::Mat pointLabels = MakeEfficientSamLabelBlob(promptSet, promptSlots);
1048 
1049  efficientSam.setInput(prep.blob, "batched_images");
1050  efficientSam.setInput(pointCoords, "batched_point_coords");
1051  efficientSam.setInput(pointLabels, "batched_point_labels");
1052 
1053  std::vector<cv::Mat> outputs;
1054  efficientSam.forward(outputs, std::vector<cv::String>{"output_masks", "iou_predictions"});
1055  if (outputs.size() != 2)
1056  return cv::Mat();
1057 
1058  cv::Mat modelMask = SelectEfficientSamMask(outputs[0], outputs[1], backgroundPoints, backgroundRects, maskThreshold);
1059  if (modelMask.empty())
1060  return cv::Mat();
1061  return EfficientSamMaskToFrameMask(modelMask, frame.size(), maskThreshold);
1062  };
1063 
1064  if (prompts.positiveRects.size() <= 1)
1065  return runPromptSet(prompts);
1066 
1067  cv::Mat combinedMask(frame.rows, frame.cols, CV_8U, cv::Scalar(0));
1068  bool hasMask = false;
1069  for (const auto& rect : prompts.positiveRects) {
1070  CVObjectMaskPromptSet rectPrompt;
1071  rectPrompt.positiveRects.push_back(rect);
1072  rectPrompt.negativePoints = prompts.negativePoints;
1073  rectPrompt.negativeRects = prompts.negativeRects;
1074  cv::Mat rectMask = runPromptSet(rectPrompt);
1075  if (rectMask.empty())
1076  continue;
1077  cv::bitwise_or(combinedMask, rectMask, combinedMask);
1078  hasMask = true;
1079  }
1080 
1081  if (!prompts.positivePoints.empty()) {
1082  CVObjectMaskPromptSet pointPrompt;
1083  pointPrompt.positivePoints = prompts.positivePoints;
1084  pointPrompt.negativePoints = prompts.negativePoints;
1085  pointPrompt.negativeRects = prompts.negativeRects;
1086  cv::Mat pointMask = runPromptSet(pointPrompt);
1087  if (!pointMask.empty()) {
1088  cv::bitwise_or(combinedMask, pointMask, combinedMask);
1089  hasMask = true;
1090  }
1091  }
1092 
1093  return hasMask ? combinedMask : cv::Mat();
1094 }
1095 
1097 {
1098  if (protobufDataPath.empty()) {
1099  std::cerr << "Missing path to object mask protobuf data file." << std::endl;
1100  return false;
1101  }
1102  if (error)
1103  return false;
1104 
1105  pb_objdetect::ObjDetect objMessage;
1106  objMessage.add_classnames()->assign("object mask");
1107 
1108  for (const auto& frameData : masksData)
1109  AddFrameDataToProto(objMessage.add_frame(), frameData.second);
1110 
1111  *objMessage.mutable_last_updated() = TimeUtil::SecondsToTimestamp(time(NULL));
1112 
1113  std::fstream output(protobufDataPath, std::ios::out | std::ios::trunc | std::ios::binary);
1114  if (!objMessage.SerializeToOstream(&output)) {
1115  std::cerr << "Failed to write object mask protobuf message." << std::endl;
1116  return false;
1117  }
1118 
1119  return true;
1120 }
1121 
1122 void CVObjectMask::AddFrameDataToProto(pb_objdetect::Frame* pbFrameData, const CVObjectMaskFrameData& frameData)
1123 {
1124  pbFrameData->set_id(frameData.frameId);
1125  if (!frameData.HasMask())
1126  return;
1127 
1128  pb_objdetect::Frame_Box* box = pbFrameData->add_bounding_box();
1129  box->set_x(frameData.box.x);
1130  box->set_y(frameData.box.y);
1131  box->set_w(frameData.box.width);
1132  box->set_h(frameData.box.height);
1133  box->set_classid(0);
1134  box->set_confidence(frameData.score);
1135  box->set_objectid(frameData.objectId);
1136 
1137  pb_objdetect::Frame_Box_Mask* mask = box->mutable_mask();
1138  mask->set_width(frameData.width);
1139  mask->set_height(frameData.height);
1140  for (uint32_t count : frameData.rle)
1141  mask->add_rle(count);
1142 }
1143 
1144 void CVObjectMask::SetJson(const std::string value)
1145 {
1146  try {
1148  } catch (const std::exception&) {
1149  std::cout << "JSON is invalid (missing keys or invalid data types)" << std::endl;
1150  }
1151 }
1152 
1153 void CVObjectMask::SetJsonValue(const Json::Value root)
1154 {
1155  if (!root["protobuf_data_path"].isNull())
1156  protobufDataPath = root["protobuf_data_path"].asString();
1157  if (!root["efficient_sam_model"].isNull())
1158  efficientSamModelPath = root["efficient_sam_model"].asString();
1159  if (!root["efficient_sam_model_path"].isNull())
1160  efficientSamModelPath = root["efficient_sam_model_path"].asString();
1161  if (!root["sam_model"].isNull())
1162  efficientSamModelPath = root["sam_model"].asString();
1163  if (!root["sam_model_path"].isNull())
1164  efficientSamModelPath = root["sam_model_path"].asString();
1165  if (!root["encoder_model"].isNull())
1166  efficientSamModelPath = root["encoder_model"].asString();
1167  if (!root["encoder_model_path"].isNull())
1168  efficientSamModelPath = root["encoder_model_path"].asString();
1169  if (!root["cutie_model_dir"].isNull())
1170  cutieModelDir = root["cutie_model_dir"].asString();
1171  if (!root["cutie_encode_key_model"].isNull())
1172  cutieEncodeKeyModelPath = root["cutie_encode_key_model"].asString();
1173  if (!root["cutie_encode_key_model_path"].isNull())
1174  cutieEncodeKeyModelPath = root["cutie_encode_key_model_path"].asString();
1175  if (!root["cutie_encode_value_model"].isNull())
1176  cutieEncodeValueModelPath = root["cutie_encode_value_model"].asString();
1177  if (!root["cutie_encode_value_model_path"].isNull())
1178  cutieEncodeValueModelPath = root["cutie_encode_value_model_path"].asString();
1179  if (!root["cutie_memory_readout_model"].isNull())
1180  cutieMemoryReadoutModelPath = root["cutie_memory_readout_model"].asString();
1181  if (!root["cutie_memory_readout_model_path"].isNull())
1182  cutieMemoryReadoutModelPath = root["cutie_memory_readout_model_path"].asString();
1183  if (!root["cutie_decode_model"].isNull())
1184  cutieDecodeModelPath = root["cutie_decode_model"].asString();
1185  if (!root["cutie_decode_model_path"].isNull())
1186  cutieDecodeModelPath = root["cutie_decode_model_path"].asString();
1187  if (!root["processing-device"].isNull())
1188  processingDevice = root["processing-device"].asString();
1189  if (!root["processing_device"].isNull())
1190  processingDevice = root["processing_device"].asString();
1191  if (!root["prompt_slots"].isNull())
1192  promptSlots = std::max(1, std::min(6, root["prompt_slots"].asInt()));
1193  if (!root["mask_threshold"].isNull())
1194  maskThreshold = root["mask_threshold"].asFloat();
1195  if (!root["model_size"].isNull())
1196  modelSize = root["model_size"].asInt();
1197  promptKeyframes.clear();
1198  if (!root["object_mask_selection"].isNull()) {
1199  const Json::Value& selection = root["object_mask_selection"];
1200  const Json::Value& frames = selection["frames"];
1201  if (frames.isObject()) {
1202  for (const auto& frameName : frames.getMemberNames()) {
1203  const size_t frameNumber = JsonFrameNumber(frameName);
1204  if (frameNumber == 0)
1205  continue;
1206  CVObjectMaskPromptSet prompts = PromptSetFromJson(frames[frameName]);
1207  if (prompts.HasPositivePrompt())
1208  promptKeyframes[frameNumber] = prompts;
1209  }
1210  }
1211  }
1212 
1213  CVObjectMaskPromptSet legacyPrompts;
1214  if (!root["positive_points"].isNull())
1215  AppendJsonPoints(root["positive_points"], legacyPrompts.positivePoints);
1216  if (!root["negative_points"].isNull())
1217  AppendJsonPoints(root["negative_points"], legacyPrompts.negativePoints);
1218 
1219  if (!root["positive_x"].isNull() && !root["positive_y"].isNull()) {
1220  cv::Point2f point(root["positive_x"].asFloat(), root["positive_y"].asFloat());
1221  if (IsValidPoint(point) && legacyPrompts.positivePoints.empty())
1222  legacyPrompts.positivePoints.push_back(point);
1223  }
1224  if (!root["negative_x"].isNull() && !root["negative_y"].isNull()) {
1225  cv::Point2f point(root["negative_x"].asFloat(), root["negative_y"].asFloat());
1226  if (IsValidPoint(point) && legacyPrompts.negativePoints.empty())
1227  legacyPrompts.negativePoints.push_back(point);
1228  }
1229  if (!root["rect_x1"].isNull() && !root["rect_y1"].isNull() &&
1230  !root["rect_x2"].isNull() && !root["rect_y2"].isNull()) {
1231  Json::Value rect;
1232  rect["x1"] = root["rect_x1"];
1233  rect["y1"] = root["rect_y1"];
1234  rect["x2"] = root["rect_x2"];
1235  rect["y2"] = root["rect_y2"];
1236  cv::Rect_<float> parsed;
1237  if (RectFromJson(rect, parsed))
1238  legacyPrompts.positiveRects.push_back(parsed);
1239  }
1240  if (legacyPrompts.HasPositivePrompt() && promptKeyframes.empty())
1241  promptKeyframes[1] = legacyPrompts;
1242 }
openshot::stringToJson
const Json::Value stringToJson(const std::string value)
Definition: Json.cpp:16
openshot::Clip::Open
void Open() override
Open the internal reader.
Definition: Clip.cpp:387
CVObjectMask.h
Header file for CVObjectMask class.
openshot::CVObjectMaskFrameData::score
float score
Definition: CVObjectMask.h:36
openshot::CVObjectMaskFrameData::box
cv::Rect_< float > box
Definition: CVObjectMask.h:35
openshot::CVObjectMask::maskClip
void maskClip(openshot::Clip &video, size_t start=0, size_t end=0, bool process_interval=false)
Definition: CVObjectMask.cpp:901
ProcessingController::ShouldStop
bool ShouldStop()
Definition: ProcessingController.h:68
ProcessingController::SetError
void SetError(bool err, std::string message)
Definition: ProcessingController.h:74
openshot::CVObjectMaskFrameData::rle
std::vector< uint32_t > rle
Definition: CVObjectMask.h:40
openshot
This namespace is the default namespace for all code in the openshot library.
Definition: AnimatedCurve.h:24
openshot::CVObjectMaskPromptSet::negativePoints
std::vector< cv::Point2f > negativePoints
Definition: CVObjectMask.h:47
openshot::ZmqLogger::Log
void Log(std::string message)
Log message to all subscribers of this logger (if any)
Definition: ZmqLogger.cpp:103
openshot::Clip
This class represents a clip (used to arrange readers on the timeline)
Definition: Clip.h:89
openshot::Clip::End
float End() const override
Get end position (in seconds) of clip (trim end of video), which can be affected by the time curve.
Definition: Clip.cpp:423
openshot::Clip::GetFrame
std::shared_ptr< openshot::Frame > GetFrame(int64_t clip_frame_number) override
Get an openshot::Frame object for a specific frame number of this clip. The image size and number of ...
Definition: Clip.cpp:458
openshot::CVObjectMaskPromptSet
Definition: CVObjectMask.h:45
openshot::CVObjectMaskFrameData::HasMask
bool HasMask() const
Definition: CVObjectMask.h:42
openshot::CVObjectMask::SaveObjMaskData
bool SaveObjMaskData()
Definition: CVObjectMask.cpp:1096
openshot::CVObjectMaskFrameData::width
int width
Definition: CVObjectMask.h:38
ZmqLogger.h
Header file for ZeroMQ-based Logger class.
openshot::CVObjectMaskPromptSet::positivePoints
std::vector< cv::Point2f > positivePoints
Definition: CVObjectMask.h:46
openshot::CVObjectMask::PreviewSeedMask
std::shared_ptr< Frame > PreviewSeedMask(std::shared_ptr< Frame > frame)
Definition: CVObjectMask.cpp:827
openshot::CVObjectMaskFrameData::frameId
size_t frameId
Definition: CVObjectMask.h:34
openshot::CVObjectMaskFrameData
Definition: CVObjectMask.h:33
openshot::ClipBase::Start
void Start(float value)
Set start position (in seconds) of clip (trim start of video)
Definition: ClipBase.cpp:42
openshot::CVObjectMaskFrameData::objectId
int objectId
Definition: CVObjectMask.h:37
openshot::ZmqLogger::Instance
static ZmqLogger * Instance()
Create or get an instance of this logger singleton (invoke the class with this method)
Definition: ZmqLogger.cpp:35
openshot::CVObjectMask::SetJsonValue
void SetJsonValue(const Json::Value root)
Definition: CVObjectMask.cpp:1153
openshot::CVObjectMask::ValidateONNXModel
static std::string ValidateONNXModel(std::string modelPath)
Definition: CVObjectMask.cpp:822
openshot::CVObjectMask::CVObjectMask
CVObjectMask(std::string processInfoJson, ProcessingController &processingController)
Definition: CVObjectMask.cpp:816
openshot::CVObjectMaskPromptSet::negativeRects
std::vector< cv::Rect_< float > > negativeRects
Definition: CVObjectMask.h:49
openshot::CVObjectMaskFrameData::height
int height
Definition: CVObjectMask.h:39
ProcessingController
Definition: ProcessingController.h:20
openshot::CVObjectMaskPromptSet::HasPositivePrompt
bool HasPositivePrompt() const
Definition: CVObjectMask.h:51
openshot::Clip::Reader
void Reader(openshot::ReaderBase *new_reader)
Set the current reader.
Definition: Clip.cpp:340
openshot::CVObjectMask::masksData
std::map< size_t, CVObjectMaskFrameData > masksData
Definition: CVObjectMask.h:87
ProcessingController::SetProgress
void SetProgress(uint p)
Definition: ProcessingController.h:52
openshot::CVObjectMaskPromptSet::positiveRects
std::vector< cv::Rect_< float > > positiveRects
Definition: CVObjectMask.h:48
openshot::CVObjectMask::SetJson
void SetJson(const std::string value)
Definition: CVObjectMask.cpp:1144
Exceptions.h
Header file for all Exception classes.