[FFmpeg-devel] [PATCH FFmpeg 8/15] libavfilter: add missing temperature application in apply_softmax, set default temperature to 1; refactor apply_softmax and improve error handling
m.kaindl0208 at gmail.com
Sat Mar 8 17:00:59 EET 2025
Try the new filters using my GitHub repo: https://github.com/MaximilianKaindl/DeepFFMPEGVideoClassification.
Any feedback is appreciated!
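For context on the core fix: the temperature option was parsed but never used, so the logits reached softmax unscaled. Below is a minimal, self-contained sketch of the behaviour apply_softmax now implements (illustrative only, not code from this patch; it assumes a 2-D logits tensor of shape [batch, labels]):

    // Illustrative sketch: temperature-scaled softmax over a 2-D logits tensor.
    // Dividing logits by a temperature > 1 flattens the distribution,
    // a temperature < 1 sharpens it, and temperature == 1 leaves it unchanged.
    #include <torch/torch.h>

    static torch::Tensor softmax_with_temperature(const torch::Tensor &logits, float temperature)
    {
        torch::Tensor scaled = (temperature > 0.0f && temperature != 1.0f)
                                   ? logits / temperature
                                   : logits;
        // Softmax along dimension 1 (across labels)
        return torch::nn::functional::softmax(scaled, torch::nn::functional::SoftmaxFuncOptions(1));
    }

Because a temperature of 1 is a no-op, the filter option now defaults to -1 (unset) and init_model_th falls back to 1 when no value is given.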
Signed-off-by: MaximilianKaindl <m.kaindl0208 at gmail.com>
---
libavfilter/avf_dnn_classify.c | 2 +-
libavfilter/dnn/dnn_backend_torch.cpp | 66 ++++++++++++++++-----------
2 files changed, 41 insertions(+), 27 deletions(-)
diff --git a/libavfilter/avf_dnn_classify.c b/libavfilter/avf_dnn_classify.c
index 5f294d1d9b..fa3a5ebf99 100644
--- a/libavfilter/avf_dnn_classify.c
+++ b/libavfilter/avf_dnn_classify.c
@@ -134,7 +134,7 @@ static const AVOption dnn_classify_options[] = {
#if (CONFIG_LIBTORCH == 1)
{ "torch", "torch backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TH }, 0, 0, FLAGS, .unit = "backend" },
{ "logit_scale", "logit scale for similarity calculation", OFFSET3(logit_scale), AV_OPT_TYPE_FLOAT, { .dbl = -1.0 }, -1.0, 100.0, FLAGS },
- { "temperature", "softmax temperature", OFFSET3(temperature), AV_OPT_TYPE_FLOAT, { .dbl = 1.0 }, 1, 100.0, FLAGS },
+ { "temperature", "softmax temperature", OFFSET3(temperature), AV_OPT_TYPE_FLOAT, { .dbl = -1.0 }, -1.0, 100.0, FLAGS },
{ "forward_order", "Order of forward output (0: media text, 1: text media) (CLIP/CLAP only)", OFFSET3(forward_order), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, FLAGS },
{ "normalize", "Normalize the input tensor (CLIP/CLAP only)", OFFSET3(normalize), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, FLAGS },
{ "input_res", "video processing model expected input size", OFFSET3(input_resolution), AV_OPT_TYPE_INT64, { .i64 = -1 }, -1, 10000, FLAGS },
diff --git a/libavfilter/dnn/dnn_backend_torch.cpp b/libavfilter/dnn/dnn_backend_torch.cpp
index dc68ad254f..c8804639d9 100644
--- a/libavfilter/dnn/dnn_backend_torch.cpp
+++ b/libavfilter/dnn/dnn_backend_torch.cpp
@@ -473,15 +473,12 @@ static torch::Tensor calculate_similarity(torch::Tensor &tensor1, torch::Tensor
torch::Tensor similarity = logit_scale * torch::matmul(tensor2, tensor1.transpose(0, 1));
return similarity.transpose(0, 1);
} catch (const c10::Error &e) {
- if (ctx) {
- av_log(ctx, AV_LOG_ERROR, "Similarity computation failed: %s\n", e.what());
- }
+ av_log(ctx, AV_LOG_ERROR, "Similarity computation failed: %s\n", e.what());
return torch::Tensor(); // Return empty tensor properly
}
}
-static torch::Tensor apply_softmax(torch::Tensor input_tensor, const int *softmax_units, int softmax_units_count,
- DnnContext *ctx)
+static torch::Tensor apply_softmax(torch::Tensor input_tensor, float temperature, const int *softmax_units, int softmax_units_count, DnnContext *ctx)
{
try {
// Check for empty or invalid input tensor
@@ -490,44 +487,53 @@ static torch::Tensor apply_softmax(torch::Tensor input_tensor, const int *softma
return input_tensor;
}
+ // Apply temperature if needed
+ torch::Tensor scaled_tensor;
+ if (temperature > 0.0f && temperature != 1.0f) {
+ scaled_tensor = input_tensor / temperature;
+ } else {
+ scaled_tensor = input_tensor;
+ }
+
// If no specific units are provided, apply softmax to the entire tensor
if (!softmax_units || softmax_units_count <= 0) {
- return torch::nn::functional::softmax(input_tensor, torch::nn::functional::SoftmaxFuncOptions(1));
+ return torch::nn::functional::softmax(scaled_tensor, torch::nn::functional::SoftmaxFuncOptions(1));
}
- torch::Tensor result = input_tensor.clone();
+ // Create a new output tensor with the same shape as the input
+ torch::Tensor result = torch::empty_like(scaled_tensor);
int offset = 0;
// Apply softmax to each specified segment
for (int i = 0; i < softmax_units_count; i++) {
int length = softmax_units[i];
- if (length <= 0 || offset + length > input_tensor.size(1)) {
- continue;
+ if (length <= 0 || offset + length > scaled_tensor.size(1)) {
+ av_log(ctx, AV_LOG_ERROR, "Invlid Softmax units were given to softmax. Index invalid or out of Bounds.\n");
+ return input_tensor;
}
- // Select the segment to apply softmax
- torch::Tensor segment = result.slice(1, offset, offset + length);
-
- // Apply softmax along dimension 1 (across labels in segment)
- torch::Tensor softmax_segment =
- torch::nn::functional::softmax(segment, torch::nn::functional::SoftmaxFuncOptions(1));
-
- // Put softmaxed segment back into result tensor
- result.slice(1, offset, offset + length) = softmax_segment;
+ // Apply softmax to the segment and directly place it in the result tensor
+ result.slice(1, offset, offset + length) = torch::nn::functional::softmax(
+ scaled_tensor.slice(1, offset, offset + length), torch::nn::functional::SoftmaxFuncOptions(1));
// Move offset forward
offset += length;
}
+
+ // Copy any remaining unprocessed parts if there are any
+ if (offset < scaled_tensor.size(1)) {
+ result.slice(1, offset, scaled_tensor.size(1)) = scaled_tensor.slice(1, offset, scaled_tensor.size(1));
+ // Warn that part of the tensor was left untouched by softmax
+ av_log(ctx, AV_LOG_WARNING, "Some tensor elements (%d to %ld) were not processed by softmax\n", offset,
+ (long)(scaled_tensor.size(1) - 1));
+ }
+
return result;
} catch (const c10::Error &e) {
- if (ctx) {
- av_log(ctx, AV_LOG_ERROR, "Error applying softmax: %s\n", e.what());
- }
+ av_log(ctx, AV_LOG_ERROR, "Error applying softmax: %s\n", e.what());
return input_tensor; // Return original tensor on error
} catch (const std::exception &e) {
- if (ctx) {
- av_log(ctx, AV_LOG_ERROR, "Error applying softmax: %s\n", e.what());
- }
+ av_log(ctx, AV_LOG_ERROR, "Error applying softmax: %s\n", e.what());
return input_tensor; // Return original tensor on error
}
}
@@ -833,8 +839,9 @@ static int th_start_inference(void *args)
*infer_request->output = calculate_similarity(media_embeddings, text_embeddings,
th_model->ctx->torch_option.normalize, logit_scale, ctx);
}
- *infer_request->output = apply_softmax(*infer_request->output, th_model->clxp_ctx->softmax_units,
- th_model->clxp_ctx->softmax_units_count, ctx);
+ *infer_request->output =
+ apply_softmax(*infer_request->output, th_model->ctx->torch_option.temperature,
+ th_model->clxp_ctx->softmax_units, th_model->clxp_ctx->softmax_units_count, ctx);
}
} else {
avpriv_report_missing_feature(ctx, "model function type %d", th_model->model.func_type);
@@ -1071,6 +1078,13 @@ static THModel *init_model_th(DnnContext *ctx, DNNFunctionType func_type, AVFilt
av_log(ctx, AV_LOG_INFO, "Using default logit_scale=%.4f for %s input\n", ctx->torch_option.logit_scale,
func_type == DFT_ANALYTICS_CLAP ? "audio" : "video");
}
+ if (ctx->torch_option.temperature <= 0) {
+ // set default value for temperature
+ ctx->torch_option.temperature = 1;
+ // Log the default value for temperature
+ av_log(ctx, AV_LOG_INFO, "Using default temperature=%.4f for %s input\n", ctx->torch_option.temperature,
+ func_type == DFT_ANALYTICS_CLAP ? "audio" : "video");
+ }
if (ctx->torch_option.normalize < 0) {
ctx->torch_option.normalize = func_type == DFT_ANALYTICS_CLAP ? 1 : 0;
// Log the default value for logit_scale
--
2.34.1