[FFmpeg-devel] [PATCH 1/4] avcodec/vvcdec: refactor out deblock boundary strength stage
Nuo Mi
nuomi2021 at gmail.com
Fri Oct 4 17:31:12 EEST 2024
The deblock boundary strength stage uses ~5% of CPU resources for 8K clips,
so it is worth making it a standalone stage. The stage now runs right after
the parse stage, which allows us to reuse the CUs and TUs before they are released.
---
libavcodec/vvc/filter.c | 27 +++++++++++++++------------
libavcodec/vvc/filter.h | 9 +++++++++
libavcodec/vvc/thread.c | 24 +++++++++++++++++++++---
3 files changed, 45 insertions(+), 15 deletions(-)
diff --git a/libavcodec/vvc/filter.c b/libavcodec/vvc/filter.c
index 25bef45eed..707fc24203 100644
--- a/libavcodec/vvc/filter.c
+++ b/libavcodec/vvc/filter.c
@@ -678,12 +678,14 @@ static void vvc_deblock_bs_chroma(const VVCLocalContext *lc,
typedef void (*deblock_bs_fn)(const VVCLocalContext *lc, const int x0, const int y0,
const int width, const int height, const int rs, const int vertical);
-static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0, const int rs, const int vertical)
+void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs)
{
const VVCFrameContext *fc = lc->fc;
const VVCSPS *sps = fc->ps.sps;
const VVCPPS *pps = fc->ps.pps;
const int ctb_size = sps->ctb_size_y;
+ const int x0 = rx << sps->ctb_log2_size_y;
+ const int y0 = ry << sps->ctb_log2_size_y;
const int x_end = FFMIN(x0 + ctb_size, pps->width) >> MIN_TU_LOG2;
const int y_end = FFMIN(y0 + ctb_size, pps->height) >> MIN_TU_LOG2;
const int has_chroma = !!sps->r->sps_chroma_format_idc;
@@ -691,15 +693,18 @@ static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0
vvc_deblock_bs_luma, vvc_deblock_bs_chroma
};
- for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
- const int hs = sps->hshift[is_chroma];
- const int vs = sps->vshift[is_chroma];
- for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
- for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
- const int off = y * fc->ps.pps->min_tu_width + x;
- if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
- deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
- fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical);
+ ff_vvc_decode_neighbour(lc, x0, y0, rx, ry, rs);
+ for (int vertical = 0; vertical <= 1; vertical++) {
+ for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
+ const int hs = sps->hshift[is_chroma];
+ const int vs = sps->vshift[is_chroma];
+ for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
+ for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
+ const int off = y * fc->ps.pps->min_tu_width + x;
+ if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
+ deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
+ fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical);
+ }
}
}
}
@@ -795,8 +800,6 @@ static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs,
const uint8_t no_p[4] = { 0 };
const uint8_t no_q[4] = { 0 } ;
- vvc_deblock_bs(lc, x0, y0, rs, vertical);
-
if (!vertical) {
FFSWAP(int, x_end, y_end);
FFSWAP(int, x0, y0);
diff --git a/libavcodec/vvc/filter.h b/libavcodec/vvc/filter.h
index 03cc74e071..29abbd98ce 100644
--- a/libavcodec/vvc/filter.h
+++ b/libavcodec/vvc/filter.h
@@ -33,6 +33,15 @@
*/
void ff_vvc_lmcs_filter(const VVCLocalContext *lc, const int x0, const int y0);
+/**
+ * derive boundary strength for the CTU
+ * @param lc local context for CTU
+ * @param rx raster x position for the CTU
+ * @param ry raster y position for the CTU
+ * @param rs raster position for the CTU
+ */
+void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs);
+
/**
* vertical deblock filter for the CTU
* @param lc local context for CTU
diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c
index d75784e242..82c00dd4c9 100644
--- a/libavcodec/vvc/thread.c
+++ b/libavcodec/vvc/thread.c
@@ -42,6 +42,7 @@ typedef struct ProgressListener {
typedef enum VVCTaskStage {
VVC_TASK_STAGE_INIT, // for CTU(0, 0) only
VVC_TASK_STAGE_PARSE,
+ VVC_TASK_STAGE_DEBLOCK_BS,
VVC_TASK_STAGE_INTER,
VVC_TASK_STAGE_RECON,
VVC_TASK_STAGE_LMCS,
@@ -111,6 +112,7 @@ static void add_task(VVCContext *s, VVCTask *t)
const int priorities[] = {
0, // VVC_TASK_STAGE_INIT,
0, // VVC_TASK_STAGE_PARSE,
+ 1, // VVC_TASK_STAGE_DEBLOCK_BS
// For an 8K clip, a CTU line completed in the reference frame may trigger 64 and more inter tasks.
// We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks.
PRIORITY_LOWEST, // VVC_TASK_STAGE_INTER
@@ -181,6 +183,8 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
// l:left, r:right, t: top, b: bottom
static const uint8_t target_score[] =
{
+ 2, //VVC_TASK_STAGE_DEBLOCK_BS,need l + t parse
+ 0, //VVC_TASK_STAGE_INTER, not used
2, //VVC_TASK_STAGE_RECON, need l + rt recon
3, //VVC_TASK_STAGE_LMCS, need r + b + rb recon
1, //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v
@@ -202,7 +206,7 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
} else if (stage == VVC_TASK_STAGE_INTER) {
target = atomic_load(&t->target_inter_score);
} else {
- target = target_score[stage - VVC_TASK_STAGE_RECON];
+ target = target_score[stage - VVC_TASK_STAGE_DEBLOCK_BS];
}
//+1 for previous stage
@@ -348,6 +352,10 @@ static void task_stage_done(const VVCTask *t, VVCContext *s)
//this is a reserve map of ready_score, ordered by zigzag
if (stage == VVC_TASK_STAGE_PARSE) {
+ ADD( 0, 1, VVC_TASK_STAGE_DEBLOCK_BS);
+ ADD( 1, 0, VVC_TASK_STAGE_DEBLOCK_BS);
+ if (t->rx < 0 || t->rx >= ft->ctu_width || t->ry < 0 || t->ry >= ft->ctu_height)
+ return;
parse_task_done(s, fc, t->rx, t->ry);
} else if (stage == VVC_TASK_STAGE_RECON) {
ADD(-1, 1, VVC_TASK_STAGE_RECON);
@@ -481,6 +489,14 @@ static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
return 0;
}
+static int run_deblock_bs(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
+{
+ if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag)
+ ff_vvc_deblock_bs(lc, t->rx, t->ry, t->rs);
+
+ return 0;
+}
+
static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
{
VVCFrameContext *fc = lc->fc;
@@ -590,6 +606,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
const static char* task_name[] = {
"INIT",
"P",
+ "B",
"I",
"R",
"L",
@@ -611,6 +628,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc)
static const run_func run[] = {
run_init,
run_parse,
+ run_deblock_bs,
run_inter,
run_recon,
run_lmcs,
@@ -701,9 +719,9 @@ static void frame_thread_init_score(VVCFrameContext *fc)
const VVCFrameThread *ft = fc->ft;
VVCTask task;
- task_init(&task, VVC_TASK_STAGE_RECON, fc, 0, 0);
+ task_init(&task, VVC_TASK_STAGE_PARSE, fc, 0, 0);
- for (int i = VVC_TASK_STAGE_RECON; i < VVC_TASK_STAGE_LAST; i++) {
+ for (int i = VVC_TASK_STAGE_PARSE; i < VVC_TASK_STAGE_LAST; i++) {
task.stage = i;
for (task.rx = -1; task.rx <= ft->ctu_width; task.rx++) {
--
2.34.1
More information about the ffmpeg-devel
mailing list