Skip to content

Commit aea5685

Browse files
committed
feat(server): align vision pipeline with normalized events
1 parent ddeb92d commit aea5685

File tree

10 files changed

+96
-12
lines changed

10 files changed

+96
-12
lines changed

config.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
"everyNthFrame": 6,
3838
"minIntervalUsec": 200000,
3939
"queueDepth": 8,
40+
"normalize": {
41+
"width": 0,
42+
"height": 0,
43+
"pixelFmt": ""
44+
},
4045
"snapshots": {
4146
"enabled": false,
4247
"dir": "./recordings/snapshots",

config.rtsp.example.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@
3333
"everyNthFrame": 6,
3434
"minIntervalUsec": 200000,
3535
"queueDepth": 8,
36+
"normalize": {
37+
"width": 0,
38+
"height": 0,
39+
"pixelFmt": ""
40+
},
3641
"snapshots": {
3742
"enabled": true,
3843
"dir": "./recordings/snapshots",

packaging/config.example.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
"everyNthFrame": 6,
3838
"minIntervalUsec": 200000,
3939
"queueDepth": 8,
40+
"normalize": {
41+
"width": 0,
42+
"height": 0,
43+
"pixelFmt": ""
44+
},
4045
"snapshots": {
4146
"enabled": false,
4247
"dir": "./recordings/snapshots",

packaging/config.rtsp.example.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
"everyNthFrame": 6,
3838
"minIntervalUsec": 200000,
3939
"queueDepth": 8,
40+
"normalize": {
41+
"width": 0,
42+
"height": 0,
43+
"pixelFmt": ""
44+
},
4045
"snapshots": {
4146
"enabled": true,
4247
"dir": "./recordings/snapshots",

src/server/internal/config.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,15 @@ ConfigLoadResult loadConfigResult(const std::string& path)
129129
c.vision.everyNthFrame = v.value("everyNthFrame", c.vision.everyNthFrame);
130130
c.vision.minIntervalUsec = v.value("minIntervalUsec", c.vision.minIntervalUsec);
131131
c.vision.queueDepth = v.value("queueDepth", c.vision.queueDepth);
132+
if (v.contains("normalize")) {
133+
auto& normalize = v["normalize"];
134+
c.vision.normalize.width =
135+
normalize.value("width", c.vision.normalize.width);
136+
c.vision.normalize.height =
137+
normalize.value("height", c.vision.normalize.height);
138+
c.vision.normalize.pixelFmt =
139+
normalize.value("pixelFmt", c.vision.normalize.pixelFmt);
140+
}
132141
if (v.contains("motion")) {
133142
auto& motion = v["motion"];
134143
c.vision.motionGridWidth = motion.value("gridWidth", c.vision.motionGridWidth);

src/server/internal/config.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ struct Config
3030

3131
struct VisionConfig
3232
{
33+
struct NormalizerConfig
34+
{
35+
int width = 0;
36+
int height = 0;
37+
std::string pixelFmt;
38+
};
39+
3340
struct SnapshotConfig
3441
{
3542
bool enabled = false;
@@ -49,6 +56,7 @@ struct Config
4956
uint32_t everyNthFrame = 6;
5057
int64_t minIntervalUsec = 200000;
5158
int queueDepth = 8;
59+
NormalizerConfig normalize;
5260
uint32_t motionGridWidth = 32;
5361
uint32_t motionGridHeight = 18;
5462
uint32_t motionWarmupFrames = 2;

src/server/internal/media.cpp

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ std::string intelligenceSourceLabel(const Config& config, const std::string& pee
3131
return config.source.empty() ? peerId : config.source;
3232
}
3333

34+
35+
std::string intelligenceStreamId(const std::string& peerId)
36+
{
37+
return "stream/" + peerId;
38+
}
39+
3440
} // namespace
3541

3642

@@ -425,6 +431,12 @@ json::Value MediaSession::intelligenceStatus() const
425431
vision["sampledFrames"] = stats.forwarded;
426432
vision["sampledDropped"] = stats.dropped;
427433
}
434+
if (_visionNormalizer) {
435+
const auto stats = _visionNormalizer->stats();
436+
vision["normalizedFrames"] = stats.emitted;
437+
vision["normalizerDropped"] = stats.dropped;
438+
vision["normalizerConverted"] = stats.converted;
439+
}
428440
if (_visionQueue) {
429441
vision["queueDepth"] = static_cast<std::uint64_t>(_visionQueue->size());
430442
vision["queueDropped"] = static_cast<std::uint64_t>(_visionQueue->dropped());
@@ -577,6 +589,8 @@ void MediaSession::startStreaming()
577589

578590
if (_visionSampler)
579591
_visionSampler->reset();
592+
if (_visionNormalizer)
593+
_visionNormalizer->reset();
580594
if (_visionDetector)
581595
_visionDetector->reset();
582596
if (_visionArtifacts)
@@ -605,6 +619,7 @@ void MediaSession::setupIntelligence()
605619
return;
606620

607621
const auto sourceLabel = intelligenceSourceLabel(_config, _peerId);
622+
const auto streamId = intelligenceStreamId(_peerId);
608623

609624
if (_config.vision.enabled) {
610625
_visionArtifacts = std::make_unique<VisionArtifacts>(
@@ -623,9 +638,18 @@ void MediaSession::setupIntelligence()
623638
.everyNthFrame = _config.vision.everyNthFrame,
624639
.minIntervalUsec = _config.vision.minIntervalUsec,
625640
});
641+
_visionNormalizer = std::make_shared<vision::FrameNormalizer>(
642+
vision::FrameNormalizerConfig{
643+
.sourceId = sourceLabel,
644+
.streamId = streamId,
645+
.width = _config.vision.normalize.width,
646+
.height = _config.vision.normalize.height,
647+
.pixelFmt = _config.vision.normalize.pixelFmt,
648+
});
626649
_visionQueue = std::make_shared<vision::DetectionQueue>(_config.vision.queueDepth);
627650
_visionDetector = std::make_unique<vision::MotionDetector>(vision::MotionDetectorConfig{
628651
.source = sourceLabel,
652+
.streamId = streamId,
629653
.detectorName = "motion",
630654
.gridWidth = _config.vision.motionGridWidth,
631655
.gridHeight = _config.vision.motionGridHeight,
@@ -644,12 +668,16 @@ void MediaSession::setupIntelligence()
644668
_visionSampler->process(packet);
645669
};
646670
_visionSampler->emitter += [this](IPacket& packet) {
647-
auto* frame = dynamic_cast<av::PlanarVideoPacket*>(&packet);
671+
if (_visionNormalizer)
672+
_visionNormalizer->process(packet);
673+
};
674+
_visionNormalizer->emitter += [this](IPacket& packet) {
675+
auto* frame = dynamic_cast<vision::VisionFramePacket*>(&packet);
648676
if (frame && _visionQueue)
649677
_visionQueue->process(*frame);
650678
};
651679
_visionQueue->emitter += [this](IPacket& packet) {
652-
auto* frame = dynamic_cast<av::PlanarVideoPacket*>(&packet);
680+
auto* frame = dynamic_cast<vision::VisionFramePacket*>(&packet);
653681
if (frame && _visionDetector)
654682
_visionDetector->process(*frame);
655683
};

src/server/internal/media.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "icy/speech/voiceactivitydetector.h"
1414
#include "icy/symple/server.h"
1515
#include "icy/vision/detectionqueue.h"
16+
#include "icy/vision/framenormalizer.h"
1617
#include "icy/vision/framesampler.h"
1718
#include "icy/vision/motiondetector.h"
1819
#include "icy/webrtc/peersession.h"
@@ -113,6 +114,7 @@ class MediaSession : public std::enable_shared_from_this<MediaSession>
113114
std::shared_ptr<av::VideoPacketEncoder> _videoEncoder;
114115
std::shared_ptr<av::AudioPacketEncoder> _audioEncoder;
115116
std::shared_ptr<vision::FrameSampler> _visionSampler;
117+
std::shared_ptr<vision::FrameNormalizer> _visionNormalizer;
116118
std::shared_ptr<vision::DetectionQueue> _visionQueue;
117119
std::unique_ptr<vision::MotionDetector> _visionDetector;
118120
std::shared_ptr<speech::SpeechQueue> _speechQueue;

src/server/internal/visionartifacts.cpp

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ double fpsFromWindow(uint64_t frames, int64_t firstUsec, int64_t lastUsec)
2323
(static_cast<long double>(frames - 1) * 1000000.0L) / durationUsec);
2424
}
2525

26+
27+
int64_t eventFrameTimeUsec(const vision::VisionEvent& event)
28+
{
29+
if (event.frame.ptsUsec > 0)
30+
return event.frame.ptsUsec;
31+
return event.emittedAtUsec;
32+
}
33+
2634
} // namespace
2735

2836

@@ -152,22 +160,23 @@ void VisionArtifacts::onFrame(const av::PlanarVideoPacket& packet)
152160
VisionArtifactResult VisionArtifacts::onEvent(const vision::VisionEvent& event)
153161
{
154162
std::lock_guard lock(_mutex);
163+
const int64_t frameTimeUsec = eventFrameTimeUsec(event);
155164

156165
VisionArtifactResult result;
157-
result.latencyUsec = latencyForFrameLocked(event.frame.timeUsec);
166+
result.latencyUsec = latencyForFrameLocked(frameTimeUsec);
158167
_lastLatencyUsec = result.latencyUsec;
159168

160169
if (_config.snapshotsEnabled) {
161-
const int64_t sinceLastSnapshot = event.frame.timeUsec - _lastSnapshotTimeUsec;
170+
const int64_t sinceLastSnapshot = frameTimeUsec - _lastSnapshotTimeUsec;
162171
if (_lastSnapshotTimeUsec == 0 ||
163172
sinceLastSnapshot >= _config.snapshotMinIntervalUsec) {
164-
if (auto* frame = bestFrameLocked(event.frame.timeUsec)) {
165-
const auto path = makeSnapshotPathLocked(event.frame.timeUsec);
173+
if (auto* frame = bestFrameLocked(frameTimeUsec)) {
174+
const auto path = makeSnapshotPathLocked(frameTimeUsec);
166175
if (writeSnapshotLocked(*frame, path)) {
167176
result.snapshotPath = path;
168177
result.snapshotUrl = artifactUrlFor(
169178
fs::makePath("snapshots", fs::filename(path)));
170-
_lastSnapshotTimeUsec = event.frame.timeUsec;
179+
_lastSnapshotTimeUsec = frameTimeUsec;
171180
_lastSnapshotPath = result.snapshotPath;
172181
_lastSnapshotUrl = result.snapshotUrl;
173182
++_snapshotsWritten;
@@ -180,14 +189,14 @@ VisionArtifactResult VisionArtifacts::onEvent(const vision::VisionEvent& event)
180189
}
181190

182191
if (_config.clipsEnabled) {
183-
if (_clip && event.frame.timeUsec <= _clip->deadlineUsec) {
192+
if (_clip && frameTimeUsec <= _clip->deadlineUsec) {
184193
_clip->deadlineUsec = std::max(
185194
_clip->deadlineUsec,
186-
event.frame.timeUsec + _config.clipPostRollUsec);
195+
frameTimeUsec + _config.clipPostRollUsec);
187196
} else {
188197
finishClipLocked();
189-
startClipLocked(event.frame.timeUsec);
190-
flushBufferedFramesLocked(event.frame.timeUsec - _config.clipPreRollUsec);
198+
startClipLocked(frameTimeUsec);
199+
flushBufferedFramesLocked(frameTimeUsec - _config.clipPreRollUsec);
191200
}
192201

193202
if (_clip) {

web/src/app.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,15 @@ function buildArtifactLinks (event) {
461461
}
462462

463463
function formatEventTime (event) {
464-
const usec = Number(event?.time || event?.audio?.time || event?.frame?.time || 0)
464+
const usec = Number(
465+
event?.frame?.ptsUsec ??
466+
event?.audio?.timeUsec ??
467+
event?.time ??
468+
event?.audio?.time ??
469+
event?.frame?.time ??
470+
event?.emittedAtUsec ??
471+
0
472+
)
465473
if (!Number.isFinite(usec) || usec <= 0) {
466474
return 'live'
467475
}

0 commit comments

Comments (0)