[FFmpeg-devel] [PATCH 1/3] sbc: implement SBC codec (low-complexity subband codec)
Rostislav Pehlivanov
atomnuker at gmail.com
Mon Nov 6 06:40:56 EET 2017
On 5 November 2017 at 23:35, Aurelien Jacobs <aurel at gnuage.org> wrote:
> This was originally based on libsbc, and was fully integrated into ffmpeg.
> ---
> doc/general.texi | 2 +
> libavcodec/Makefile | 4 +
> libavcodec/allcodecs.c | 2 +
> libavcodec/arm/Makefile | 3 +
> libavcodec/arm/sbcdsp_armv6.S | 245 ++++++++++++++
> libavcodec/arm/sbcdsp_init_arm.c | 105 ++++++
> libavcodec/arm/sbcdsp_neon.S | 714 ++++++++++++++++++++++++++++++
> +++++++++
> libavcodec/avcodec.h | 2 +
> libavcodec/codec_desc.c | 12 +
> libavcodec/sbc.c | 316 +++++++++++++++++
> libavcodec/sbc.h | 121 +++++++
> libavcodec/sbcdec.c | 469 +++++++++++++++++++++++++
> libavcodec/sbcdec_data.c | 127 +++++++
> libavcodec/sbcdec_data.h | 44 +++
> libavcodec/sbcdsp.c | 569 +++++++++++++++++++++++++++++++
> libavcodec/sbcdsp.h | 86 +++++
> libavcodec/sbcdsp_data.c | 335 ++++++++++++++++++
> libavcodec/sbcdsp_data.h | 57 ++++
> libavcodec/sbcenc.c | 461 +++++++++++++++++++++++++
> libavcodec/x86/Makefile | 2 +
> libavcodec/x86/sbcdsp.asm | 290 ++++++++++++++++
> libavcodec/x86/sbcdsp_init.c | 51 +++
> 22 files changed, 4017 insertions(+)
> create mode 100644 libavcodec/arm/sbcdsp_armv6.S
> create mode 100644 libavcodec/arm/sbcdsp_init_arm.c
> create mode 100644 libavcodec/arm/sbcdsp_neon.S
> create mode 100644 libavcodec/sbc.c
> create mode 100644 libavcodec/sbc.h
> create mode 100644 libavcodec/sbcdec.c
> create mode 100644 libavcodec/sbcdec_data.c
> create mode 100644 libavcodec/sbcdec_data.h
> create mode 100644 libavcodec/sbcdsp.c
> create mode 100644 libavcodec/sbcdsp.h
> create mode 100644 libavcodec/sbcdsp_data.c
> create mode 100644 libavcodec/sbcdsp_data.h
> create mode 100644 libavcodec/sbcenc.c
> create mode 100644 libavcodec/x86/sbcdsp.asm
> create mode 100644 libavcodec/x86/sbcdsp_init.c
>
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index c4134424f0..2d541bf64a 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -632,6 +632,8 @@ enum AVCodecID {
> AV_CODEC_ID_ATRAC3AL,
> AV_CODEC_ID_ATRAC3PAL,
> AV_CODEC_ID_DOLBY_E,
> + AV_CODEC_ID_SBC,
> + AV_CODEC_ID_MSBC,
>
>
See below.
> /* subtitle codecs */
> AV_CODEC_ID_FIRST_SUBTITLE = 0x17000, ///< A dummy ID
> pointing at the start of subtitle codecs.
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 92bf1d2681..8d613507e0 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -2859,6 +2859,18 @@ static const AVCodecDescriptor codec_descriptors[]
> = {
> .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"),
> .props = AV_CODEC_PROP_LOSSY,
> },
> + {
> + .id = AV_CODEC_ID_SBC,
> + .type = AVMEDIA_TYPE_AUDIO,
> + .name = "sbc",
> + .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband
> codec)"),
> + },
> + {
> + .id = AV_CODEC_ID_MSBC,
> + .type = AVMEDIA_TYPE_AUDIO,
> + .name = "msbc",
> + .long_name = NULL_IF_CONFIG_SMALL("mSBC (wideband speech mono
> SBC)"),
> + },
>
Is there a bitstream difference between the two? I don't think so, so you
should instead define FF_PROFILE_SBC_WB and use a single codec ID.
>
> /* subtitle codecs */
> {
> diff --git a/libavcodec/sbc.c b/libavcodec/sbc.c
> new file mode 100644
> index 0000000000..99d02cc56a
> --- /dev/null
> +++ b/libavcodec/sbc.c
> @@ -0,0 +1,316 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2012-2013 Intel Corporation
> + * Copyright (C) 2008-2010 Nokia Corporation
> + * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2008 Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * SBC common functions for the encoder and decoder
> + */
> +
> +#include "avcodec.h"
> +#include "sbc.h"
> +
> +/*
> + * Calculates the CRC-8 of the first len bits in data
> + */
> +static const uint8_t crc_table[256] = {
> + 0x00, 0x1D, 0x3A, 0x27, 0x74, 0x69, 0x4E, 0x53,
> + 0xE8, 0xF5, 0xD2, 0xCF, 0x9C, 0x81, 0xA6, 0xBB,
> + 0xCD, 0xD0, 0xF7, 0xEA, 0xB9, 0xA4, 0x83, 0x9E,
> + 0x25, 0x38, 0x1F, 0x02, 0x51, 0x4C, 0x6B, 0x76,
> + 0x87, 0x9A, 0xBD, 0xA0, 0xF3, 0xEE, 0xC9, 0xD4,
> + 0x6F, 0x72, 0x55, 0x48, 0x1B, 0x06, 0x21, 0x3C,
> + 0x4A, 0x57, 0x70, 0x6D, 0x3E, 0x23, 0x04, 0x19,
> + 0xA2, 0xBF, 0x98, 0x85, 0xD6, 0xCB, 0xEC, 0xF1,
> + 0x13, 0x0E, 0x29, 0x34, 0x67, 0x7A, 0x5D, 0x40,
> + 0xFB, 0xE6, 0xC1, 0xDC, 0x8F, 0x92, 0xB5, 0xA8,
> + 0xDE, 0xC3, 0xE4, 0xF9, 0xAA, 0xB7, 0x90, 0x8D,
> + 0x36, 0x2B, 0x0C, 0x11, 0x42, 0x5F, 0x78, 0x65,
> + 0x94, 0x89, 0xAE, 0xB3, 0xE0, 0xFD, 0xDA, 0xC7,
> + 0x7C, 0x61, 0x46, 0x5B, 0x08, 0x15, 0x32, 0x2F,
> + 0x59, 0x44, 0x63, 0x7E, 0x2D, 0x30, 0x17, 0x0A,
> + 0xB1, 0xAC, 0x8B, 0x96, 0xC5, 0xD8, 0xFF, 0xE2,
> + 0x26, 0x3B, 0x1C, 0x01, 0x52, 0x4F, 0x68, 0x75,
> + 0xCE, 0xD3, 0xF4, 0xE9, 0xBA, 0xA7, 0x80, 0x9D,
> + 0xEB, 0xF6, 0xD1, 0xCC, 0x9F, 0x82, 0xA5, 0xB8,
> + 0x03, 0x1E, 0x39, 0x24, 0x77, 0x6A, 0x4D, 0x50,
> + 0xA1, 0xBC, 0x9B, 0x86, 0xD5, 0xC8, 0xEF, 0xF2,
> + 0x49, 0x54, 0x73, 0x6E, 0x3D, 0x20, 0x07, 0x1A,
> + 0x6C, 0x71, 0x56, 0x4B, 0x18, 0x05, 0x22, 0x3F,
> + 0x84, 0x99, 0xBE, 0xA3, 0xF0, 0xED, 0xCA, 0xD7,
> + 0x35, 0x28, 0x0F, 0x12, 0x41, 0x5C, 0x7B, 0x66,
> + 0xDD, 0xC0, 0xE7, 0xFA, 0xA9, 0xB4, 0x93, 0x8E,
> + 0xF8, 0xE5, 0xC2, 0xDF, 0x8C, 0x91, 0xB6, 0xAB,
> + 0x10, 0x0D, 0x2A, 0x37, 0x64, 0x79, 0x5E, 0x43,
> + 0xB2, 0xAF, 0x88, 0x95, 0xC6, 0xDB, 0xFC, 0xE1,
> + 0x5A, 0x47, 0x60, 0x7D, 0x2E, 0x33, 0x14, 0x09,
> + 0x7F, 0x62, 0x45, 0x58, 0x0B, 0x16, 0x31, 0x2C,
> + 0x97, 0x8A, 0xAD, 0xB0, 0xE3, 0xFE, 0xD9, 0xC4
> +};
> +
> +uint8_t ff_sbc_crc8(const uint8_t *data, size_t len)
> +{
> + uint8_t crc = 0x0f;
> + size_t i;
> + uint8_t octet;
> +
> + for (i = 0; i < len / 8; i++)
> + crc = crc_table[crc ^ data[i]];
> +
> + octet = data[i];
> + for (i = 0; i < len % 8; i++) {
> + char bit = ((octet ^ crc) & 0x80) >> 7;
> +
> + crc = ((crc & 0x7f) << 1) ^ (bit ? 0x1d : 0);
> +
> + octet = octet << 1;
> + }
> +
> + return crc;
> +}
>
We have CRC functions already, look in libavutil/crc.h
> + if (subbands == 4)
> + loudness = frame->scale_factor[ch][sb] -
> sbc_offset4[sf][sb];
> + else
> + loudness = frame->scale_factor[ch][sb] -
> sbc_offset8[sf][sb];
> + if (loudness > 0)
> + bitneed[ch][sb] = loudness / 2;
> + else
> + bitneed[ch][sb] = loudness;
>
bitneed[ch][sb] = loudness >> (loudness > 0);
>
> +
> +static int sbc_decode_frame(AVCodecContext *avctx,
> + void *data, int *got_frame_ptr,
> + AVPacket *avpkt)
> +{
> + SBCDecContext *sbc = avctx->priv_data;
> + int i, ch, samples, ret;
> + AVFrame *frame = data;
> + int16_t *ptr;
> +
> + if (!sbc)
> + return AVERROR(EIO);
> +
> + sbc->frame.length = sbc->unpack_frame(avpkt->data, &sbc->frame,
> avpkt->size);
> + if (sbc->frame.length <= 0)
> + return sbc->frame.length;
> +
> + samples = sbc_synthesize_audio(&sbc->dsp, &sbc->frame);
> +
> + frame->nb_samples = samples;
> + if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
> + return ret;
> + ptr = (int16_t *)frame->data[0];
> +
> + for (i = 0; i < samples; i++)
> + for (ch = 0; ch < sbc->frame.channels; ch++)
> + *ptr++ = sbc->frame.pcm_sample[ch][i];
> +
>
Once again, use planar sample formats
> + *got_frame_ptr = 1;
> +
> + return sbc->frame.length;
> +}
> +
> +#if CONFIG_SBC_DECODER
> +AVCodec ff_sbc_decoder = {
> + .name = "sbc",
> + .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity
> subband codec)"),
> + .type = AVMEDIA_TYPE_AUDIO,
> + .id = AV_CODEC_ID_SBC,
> + .priv_data_size = sizeof(SBCDecContext),
> + .init = sbc_decode_init,
> + .decode = sbc_decode_frame,
> + .capabilities = AV_CODEC_CAP_DR1,
> + .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
> + AV_CH_LAYOUT_STEREO, 0},
> + .sample_fmts = (const enum AVSampleFormat[]) {
> AV_SAMPLE_FMT_S16,
>
AV_SAMPLE_FMT_S16P
> +
> AV_SAMPLE_FMT_NONE },
> + .supported_samplerates = (const int[]) { 16000, 32000, 44100, 48000,
> 0 },
> +};
> +#endif
> +
> +#if CONFIG_MSBC_DECODER
> +AVCodec ff_msbc_decoder = {
> + .name = "msbc",
> + .long_name = NULL_IF_CONFIG_SMALL("mSBC (wideband speech
> mono SBC)"),
> + .type = AVMEDIA_TYPE_AUDIO,
> + .id = AV_CODEC_ID_MSBC,
> + .priv_data_size = sizeof(SBCDecContext),
> + .init = msbc_decode_init,
> + .decode = sbc_decode_frame,
> + .capabilities = AV_CODEC_CAP_DR1,
> + .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0},
> + .sample_fmts = (const enum AVSampleFormat[]) {
> AV_SAMPLE_FMT_S16,
>
AV_SAMPLE_FMT_S16P
> +
> +/*
> + * A reference C code of analysis filter with SIMD-friendly tables
> + * reordering and code layout. This code can be used to develop platform
> + * specific SIMD optimizations. Also it may be used as some kind of test
> + * for compiler autovectorization capabilities (who knows, if the compiler
> + * is very good at this stuff, hand optimized assembly may be not strictly
> + * needed for some platform).
> + *
> + * Note: It is also possible to make a simple variant of analysis filter,
> + * which needs only a single constants table without taking care about
> + * even/odd cases. This simple variant of filter can be implemented
> without
> + * input data permutation. The only thing that would be lost is the
> + * possibility to use pairwise SIMD multiplications. But for some simple
> + * CPU cores without SIMD extensions it can be useful. If anybody is
> + * interested in implementing such variant of a filter, sourcecode from
> + * bluez versions 4.26/4.27 can be used as a reference and the history of
> + * the changes in git repository done around that time may be worth
> checking.
> + */
> +
> +static void sbc_analyze_4_simd(const int16_t *in, int32_t *out,
> + const int16_t *consts)
> +{
> + int32_t t1[4];
> + int16_t t2[4];
> + int hop = 0;
> +
> + /* rounding coefficient */
> + t1[0] = t1[1] = t1[2] = t1[3] =
> + (int32_t) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
> +
> + /* low pass polyphase filter */
> + for (hop = 0; hop < 40; hop += 8) {
> + t1[0] += (int32_t) in[hop] * consts[hop];
> + t1[0] += (int32_t) in[hop + 1] * consts[hop + 1];
> + t1[1] += (int32_t) in[hop + 2] * consts[hop + 2];
> + t1[1] += (int32_t) in[hop + 3] * consts[hop + 3];
> + t1[2] += (int32_t) in[hop + 4] * consts[hop + 4];
> + t1[2] += (int32_t) in[hop + 5] * consts[hop + 5];
> + t1[3] += (int32_t) in[hop + 6] * consts[hop + 6];
> + t1[3] += (int32_t) in[hop + 7] * consts[hop + 7];
> + }
> +
> + /* scaling */
> + t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
> + t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
> + t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
> + t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
> +
> + /* do the cos transform */
> + t1[0] = (int32_t) t2[0] * consts[40 + 0];
> + t1[0] += (int32_t) t2[1] * consts[40 + 1];
> + t1[1] = (int32_t) t2[0] * consts[40 + 2];
> + t1[1] += (int32_t) t2[1] * consts[40 + 3];
> + t1[2] = (int32_t) t2[0] * consts[40 + 4];
> + t1[2] += (int32_t) t2[1] * consts[40 + 5];
> + t1[3] = (int32_t) t2[0] * consts[40 + 6];
> + t1[3] += (int32_t) t2[1] * consts[40 + 7];
> +
> + t1[0] += (int32_t) t2[2] * consts[40 + 8];
> + t1[0] += (int32_t) t2[3] * consts[40 + 9];
> + t1[1] += (int32_t) t2[2] * consts[40 + 10];
> + t1[1] += (int32_t) t2[3] * consts[40 + 11];
> + t1[2] += (int32_t) t2[2] * consts[40 + 12];
> + t1[2] += (int32_t) t2[3] * consts[40 + 13];
> + t1[3] += (int32_t) t2[2] * consts[40 + 14];
> + t1[3] += (int32_t) t2[3] * consts[40 + 15];
> +
> + out[0] = t1[0] >>
> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> + out[1] = t1[1] >>
> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> + out[2] = t1[2] >>
> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> + out[3] = t1[3] >>
> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> +}
> +
> +static void sbc_analyze_8_simd(const int16_t *in, int32_t *out,
> + const int16_t *consts)
> +{
> + int32_t t1[8];
> + int16_t t2[8];
> + int i, hop;
> +
> + /* rounding coefficient */
> + t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
> + (int32_t) 1 << (SBC_PROTO_FIXED8_SCALE-1);
> +
> + /* low pass polyphase filter */
> + for (hop = 0; hop < 80; hop += 16) {
> + t1[0] += (int32_t) in[hop] * consts[hop];
> + t1[0] += (int32_t) in[hop + 1] * consts[hop + 1];
> + t1[1] += (int32_t) in[hop + 2] * consts[hop + 2];
> + t1[1] += (int32_t) in[hop + 3] * consts[hop + 3];
> + t1[2] += (int32_t) in[hop + 4] * consts[hop + 4];
> + t1[2] += (int32_t) in[hop + 5] * consts[hop + 5];
> + t1[3] += (int32_t) in[hop + 6] * consts[hop + 6];
> + t1[3] += (int32_t) in[hop + 7] * consts[hop + 7];
> + t1[4] += (int32_t) in[hop + 8] * consts[hop + 8];
> + t1[4] += (int32_t) in[hop + 9] * consts[hop + 9];
> + t1[5] += (int32_t) in[hop + 10] * consts[hop + 10];
> + t1[5] += (int32_t) in[hop + 11] * consts[hop + 11];
> + t1[6] += (int32_t) in[hop + 12] * consts[hop + 12];
> + t1[6] += (int32_t) in[hop + 13] * consts[hop + 13];
> + t1[7] += (int32_t) in[hop + 14] * consts[hop + 14];
> + t1[7] += (int32_t) in[hop + 15] * consts[hop + 15];
> + }
> +
> + /* scaling */
> + t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
> + t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
> + t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
> + t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
> + t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
> + t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
> + t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
> + t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
> +
> +
> + /* do the cos transform */
> + t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0;
> +
> + for (i = 0; i < 4; i++) {
> + t1[0] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 0];
> + t1[0] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 1];
> + t1[1] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 2];
> + t1[1] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 3];
> + t1[2] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 4];
> + t1[2] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 5];
> + t1[3] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 6];
> + t1[3] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 7];
> + t1[4] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 8];
> + t1[4] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 9];
> + t1[5] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 10];
> + t1[5] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 11];
> + t1[6] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 12];
> + t1[6] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 13];
> + t1[7] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 14];
> + t1[7] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 15];
> + }
> +
> + for (i = 0; i < 8; i++)
> + out[i] = t1[i] >>
> + (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
> +}
> +
>
What does it do here? A PQF into an FFT?
I might investigate using lavc's fixed point mdct for this maybe, I hate
custom fixed-point analysis transforms.
> +static inline void sbc_analyze_4b_4s_simd(SBCDSPContext *s,
> + int16_t *x, int32_t *out, int
> out_stride)
> +{
> + /* Analyze blocks */
> + s->sbc_analyze_4(x + 12, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_odd);
> + out += out_stride;
> + s->sbc_analyze_4(x + 8, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_even);
> + out += out_stride;
> + s->sbc_analyze_4(x + 4, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_odd);
> + out += out_stride;
> + s->sbc_analyze_4(x + 0, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_even);
> +
> + emms_c();
> +}
> +
> +static inline void sbc_analyze_4b_8s_simd(SBCDSPContext *s,
> + int16_t *x, int32_t *out, int
> out_stride)
> +{
> + /* Analyze blocks */
> + s->sbc_analyze_8(x + 24, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_odd);
> + out += out_stride;
> + s->sbc_analyze_8(x + 16, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_even);
> + out += out_stride;
> + s->sbc_analyze_8(x + 8, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_odd);
> + out += out_stride;
> + s->sbc_analyze_8(x + 0, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_even);
> +
> + emms_c();
> +}
> +
> +static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
> + int16_t *x, int32_t *out,
> + int out_stride);
> +
> +static inline void sbc_analyze_1b_8s_simd_odd(SBCDSPContext *s,
> + int16_t *x, int32_t *out,
> + int out_stride)
> +{
> + s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
> + s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_even;
> +
> + emms_c();
> +}
> +
> +static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
> + int16_t *x, int32_t *out,
> + int out_stride)
> +{
> + s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
> + s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd;
> +
> + emms_c();
> +}
> +
> +#define PCM(i) AV_RN16(pcm + 2*(i))
>
Don't use a define, just substitute it directly.
> +
> +/*
> + * Internal helper functions for input data processing. In order to get
> + * optimal performance, it is important to have "nsamples" and "nchannels"
> + * arguments used with this inline function as compile time constants.
> + */
> +
> +static av_always_inline int sbc_encoder_process_input_s4_internal(
> + int position, const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
> + int nsamples, int nchannels)
> +{
> + /* handle X buffer wraparound */
> + if (position < nsamples) {
> + if (nchannels > 0)
> + memcpy(&X[0][SBC_X_BUFFER_SIZE - 40], &X[0][position],
> + 36 * sizeof(int16_t));
> + if (nchannels > 1)
> + memcpy(&X[1][SBC_X_BUFFER_SIZE - 40], &X[1][position],
> + 36 * sizeof(int16_t));
> + position = SBC_X_BUFFER_SIZE - 40;
> + }
> +
> + /* copy/permutate audio samples */
> + while ((nsamples -= 8) >= 0) {
> + position -= 8;
> + if (nchannels > 0) {
> + int16_t *x = &X[0][position];
> + x[0] = PCM(0 + 7 * nchannels);
> + x[1] = PCM(0 + 3 * nchannels);
> + x[2] = PCM(0 + 6 * nchannels);
> + x[3] = PCM(0 + 4 * nchannels);
> + x[4] = PCM(0 + 0 * nchannels);
> + x[5] = PCM(0 + 2 * nchannels);
> + x[6] = PCM(0 + 1 * nchannels);
> + x[7] = PCM(0 + 5 * nchannels);
> + }
> + if (nchannels > 1) {
> + int16_t *x = &X[1][position];
> + x[0] = PCM(1 + 7 * nchannels);
> + x[1] = PCM(1 + 3 * nchannels);
> + x[2] = PCM(1 + 6 * nchannels);
> + x[3] = PCM(1 + 4 * nchannels);
> + x[4] = PCM(1 + 0 * nchannels);
> + x[5] = PCM(1 + 2 * nchannels);
> + x[6] = PCM(1 + 1 * nchannels);
> + x[7] = PCM(1 + 5 * nchannels);
> + }
> + pcm += 16 * nchannels;
> + }
> +
> + return position;
> +}
> +
> +static av_always_inline int sbc_encoder_process_input_s8_internal(
> + int position, const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
> + int nsamples, int nchannels)
> +{
> + /* handle X buffer wraparound */
> + if (position < nsamples) {
> + if (nchannels > 0)
> + memcpy(&X[0][SBC_X_BUFFER_SIZE - 72], &X[0][position],
> + 72 * sizeof(int16_t));
> + if (nchannels > 1)
> + memcpy(&X[1][SBC_X_BUFFER_SIZE - 72], &X[1][position],
> + 72 * sizeof(int16_t));
> + position = SBC_X_BUFFER_SIZE - 72;
> + }
> +
> + if (position % 16 == 8) {
> + position -= 8;
> + nsamples -= 8;
> + if (nchannels > 0) {
> + int16_t *x = &X[0][position];
> + x[0] = PCM(0 + (15-8) * nchannels);
> + x[2] = PCM(0 + (14-8) * nchannels);
> + x[3] = PCM(0 + (8-8) * nchannels);
> + x[4] = PCM(0 + (13-8) * nchannels);
> + x[5] = PCM(0 + (9-8) * nchannels);
> + x[6] = PCM(0 + (12-8) * nchannels);
> + x[7] = PCM(0 + (10-8) * nchannels);
> + x[8] = PCM(0 + (11-8) * nchannels);
> + }
> + if (nchannels > 1) {
> + int16_t *x = &X[1][position];
> + x[0] = PCM(1 + (15-8) * nchannels);
> + x[2] = PCM(1 + (14-8) * nchannels);
> + x[3] = PCM(1 + (8-8) * nchannels);
> + x[4] = PCM(1 + (13-8) * nchannels);
> + x[5] = PCM(1 + (9-8) * nchannels);
> + x[6] = PCM(1 + (12-8) * nchannels);
> + x[7] = PCM(1 + (10-8) * nchannels);
> + x[8] = PCM(1 + (11-8) * nchannels);
> + }
> +
> + pcm += 16 * nchannels;
> + }
> +
> + /* copy/permutate audio samples */
> + while (nsamples >= 16) {
> + position -= 16;
> + if (nchannels > 0) {
> + int16_t *x = &X[0][position];
> + x[0] = PCM(0 + 15 * nchannels);
> + x[1] = PCM(0 + 7 * nchannels);
> + x[2] = PCM(0 + 14 * nchannels);
> + x[3] = PCM(0 + 8 * nchannels);
> + x[4] = PCM(0 + 13 * nchannels);
> + x[5] = PCM(0 + 9 * nchannels);
> + x[6] = PCM(0 + 12 * nchannels);
> + x[7] = PCM(0 + 10 * nchannels);
> + x[8] = PCM(0 + 11 * nchannels);
> + x[9] = PCM(0 + 3 * nchannels);
> + x[10] = PCM(0 + 6 * nchannels);
> + x[11] = PCM(0 + 0 * nchannels);
> + x[12] = PCM(0 + 5 * nchannels);
> + x[13] = PCM(0 + 1 * nchannels);
> + x[14] = PCM(0 + 4 * nchannels);
> + x[15] = PCM(0 + 2 * nchannels);
> + }
> + if (nchannels > 1) {
> + int16_t *x = &X[1][position];
> + x[0] = PCM(1 + 15 * nchannels);
> + x[1] = PCM(1 + 7 * nchannels);
> + x[2] = PCM(1 + 14 * nchannels);
> + x[3] = PCM(1 + 8 * nchannels);
> + x[4] = PCM(1 + 13 * nchannels);
> + x[5] = PCM(1 + 9 * nchannels);
> + x[6] = PCM(1 + 12 * nchannels);
> + x[7] = PCM(1 + 10 * nchannels);
> + x[8] = PCM(1 + 11 * nchannels);
> + x[9] = PCM(1 + 3 * nchannels);
> + x[10] = PCM(1 + 6 * nchannels);
> + x[11] = PCM(1 + 0 * nchannels);
> + x[12] = PCM(1 + 5 * nchannels);
> + x[13] = PCM(1 + 1 * nchannels);
> + x[14] = PCM(1 + 4 * nchannels);
> + x[15] = PCM(1 + 2 * nchannels);
> + }
> + pcm += 32 * nchannels;
> + nsamples -= 16;
> + }
> +
> + if (nsamples == 8) {
> + position -= 8;
> + if (nchannels > 0) {
> + int16_t *x = &X[0][position];
> + x[-7] = PCM(0 + 7 * nchannels);
> + x[1] = PCM(0 + 3 * nchannels);
> + x[2] = PCM(0 + 6 * nchannels);
> + x[3] = PCM(0 + 0 * nchannels);
> + x[4] = PCM(0 + 5 * nchannels);
> + x[5] = PCM(0 + 1 * nchannels);
> + x[6] = PCM(0 + 4 * nchannels);
> + x[7] = PCM(0 + 2 * nchannels);
> + }
> + if (nchannels > 1) {
> + int16_t *x = &X[1][position];
> + x[-7] = PCM(1 + 7 * nchannels);
> + x[1] = PCM(1 + 3 * nchannels);
> + x[2] = PCM(1 + 6 * nchannels);
> + x[3] = PCM(1 + 0 * nchannels);
> + x[4] = PCM(1 + 5 * nchannels);
> + x[5] = PCM(1 + 1 * nchannels);
> + x[6] = PCM(1 + 4 * nchannels);
> + x[7] = PCM(1 + 2 * nchannels);
> + }
> + }
> +
> + return position;
> +}
> +
> +/*
> + * Input data processing functions. The data is endian converted if
> needed,
> + * channels are deintrleaved and audio samples are reordered for use in
> + * SIMD-friendly analysis filter function. The results are put into "X"
> + * array, getting appended to the previous data (or it is better to say
> + * prepended, as the buffer is filled from top to bottom). Old data is
> + * discarded when neededed, but availability of (10 * nrof_subbands)
> + * contiguous samples is always guaranteed for the input to the analysis
> + * filter. This is achieved by copying a sufficient part of old data
> + * to the top of the buffer on buffer wraparound.
> + */
> +
> +static int sbc_enc_process_input_4s(int position, const uint8_t *pcm,
> + int16_t X[2][SBC_X_BUFFER_SIZE],
> + int nsamples, int nchannels)
> +{
> + if (nchannels > 1)
> + return sbc_encoder_process_input_s4_internal(
> + position, pcm, X, nsamples, 2);
> + else
> + return sbc_encoder_process_input_s4_internal(
> + position, pcm, X, nsamples, 1);
> +}
>
That's just silly, do
return sbc_encoder_process_input_s4_internal(position, pcm, X, nsamples, 1
+ (nchannels > 1));
Or better yet remove the wrapper function.
> +
> +static int sbc_enc_process_input_8s(int position, const uint8_t *pcm,
> + int16_t X[2][SBC_X_BUFFER_SIZE],
> + int nsamples, int nchannels)
> +{
> + if (nchannels > 1)
> + return sbc_encoder_process_input_s8_internal(
> + position, pcm, X, nsamples, 2);
> + else
> + return sbc_encoder_process_input_s8_internal(
> + position, pcm, X, nsamples, 1);
> +}
> +
>
Same here.
>
> +++ b/libavcodec/sbcdsp_data.c
> @@ -0,0 +1,335 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2008-2010 Nokia Corporation
> + * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * miscellaneous SBC tables
> + */
> +
> +#include "sbcdsp_data.h"
> +
> +#define F_PROTO4(x) (int32_t) ((x * 2) * \
> + ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +#define F_COS4(x) (int32_t) ((x) * \
> + ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +#define F_PROTO8(x) (int32_t) ((x * 2) * \
> + ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +#define F_COS8(x) (int32_t) ((x) * \
> + ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +
>
We require 8 bit bytes, so s/CHAR_BIT/8/g throughout.
+++ b/libavcodec/sbcdsp_data.h
> @@ -0,0 +1,57 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2008-2010 Nokia Corporation
> + * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * miscellaneous SBC tables
> + */
> +
> +#ifndef AVCODEC_SBCDSP_DATA_H
> +#define AVCODEC_SBCDSP_DATA_H
> +
> +#include "sbc.h"
> +
> +#define SBC_PROTO_FIXED4_SCALE ((sizeof(int16_t) * CHAR_BIT - 1) + 1)
> +#define SBC_COS_TABLE_FIXED4_SCALE ((sizeof(int16_t) * CHAR_BIT - 1) )
> +#define SBC_PROTO_FIXED8_SCALE ((sizeof(int16_t) * CHAR_BIT - 1) + 1)
> +#define SBC_COS_TABLE_FIXED8_SCALE ((sizeof(int16_t) * CHAR_BIT - 1) )
> +
>
Same
>
> +
> + /* align the last crc byte */
> + if (crc_pos % 8)
> + crc_header[crc_pos >> 3] <<= 8 - (crc_pos % 8);
>
>
put_bits_align?
> + avpkt->data[3] = ff_sbc_crc8(crc_header, crc_pos);
> +
> + ff_sbc_calculate_bits(frame, bits);
> +
> + for (ch = 0; ch < frame_channels; ch++) {
> + for (sb = 0; sb < frame_subbands; sb++) {
> + levels[ch][sb] = ((1 << bits[ch][sb]) - 1) <<
> + (32 - (frame->scale_factor[ch][sb] +
> + SCALE_OUT_BITS + 2));
> + sb_sample_delta[ch][sb] = (uint32_t) 1 <<
> + (frame->scale_factor[ch][sb] +
> + SCALE_OUT_BITS + 1);
> + }
> + }
> +
> + for (blk = 0; blk < frame->blocks; blk++) {
> + for (ch = 0; ch < frame_channels; ch++) {
> + for (sb = 0; sb < frame_subbands; sb++) {
> +
> + if (bits[ch][sb] == 0)
> + continue;
> +
> + audio_sample = ((uint64_t) levels[ch][sb] *
> + (sb_sample_delta[ch][sb] +
> + frame->sb_sample_f[blk][ch][sb])) >> 32;
> +
> + put_bits(&pb, bits[ch][sb], audio_sample);
> + }
> + }
> + }
> +
> + flush_put_bits(&pb);
> +
> + return (put_bits_count(&pb) + 7) / 8;
> +}
> +
> +static size_t sbc_pack_frame(AVPacket *avpkt, struct sbc_frame *frame,
> int joint)
> +{
> + int frame_subbands = 4;
> +
> + avpkt->data[0] = SBC_SYNCWORD;
> +
> + avpkt->data[1] = (frame->frequency & 0x03) << 6;
> + avpkt->data[1] |= (frame->block_mode & 0x03) << 4;
> + avpkt->data[1] |= (frame->mode & 0x03) << 2;
> + avpkt->data[1] |= (frame->allocation & 0x01) << 1;
> +
>
Use put_bits?
> +
> + if (frame->subbands == 4) {
> + if (frame->channels == 1)
> + return sbc_pack_frame_internal(avpkt, frame, 4, 1, joint);
>
return sbc_pack_frame_internal(avpkt, frame, 4, 1 + (frame->channels == 1),
joint);
> + return sbc_pack_frame_internal(avpkt, frame, 8, 1, joint);
> + else
> + return sbc_pack_frame_internal(avpkt, frame, 8, 2, joint);
>
return sbc_pack_frame_internal(avpkt, frame, 8, 1 + (frame->channels == 1),
joint);
> + }
> +}
> +
> +static size_t msbc_pack_frame(AVPacket *avpkt, struct sbc_frame *frame,
> int joint)
> +{
> + avpkt->data[0] = MSBC_SYNCWORD;
> + avpkt->data[1] = 0;
> + avpkt->data[2] = 0;
> +
> + return sbc_pack_frame_internal(avpkt, frame, 8, 1, joint);
> +}
> +
> +static void sbc_encoder_init(bool msbc, SBCDSPContext *s,
> + const struct sbc_frame *frame)
> +{
> + memset(&s->X, 0, sizeof(s->X));
> + s->position = (SBC_X_BUFFER_SIZE - frame->subbands * 9) & ~7;
> + if (msbc)
> + s->increment = 1;
> + else
> + s->increment = 4;
+
>
Save a line, use a ternary.
> +
> + sbc->pack_frame = sbc_pack_frame;
> +
> + sbc->frequency = SBC_FREQ_44100;
>
Yet in the AVCodec structure the encoder specifies it supports 16khz, 32khz
and 48khz.
You should remove the SBC_FREQ macros and use avctx->sample_rate directly.
Also remove any unsupported samplerates.
> + sbc->mode = SBC_MODE_STEREO;
> + if (sbc->joint_stereo)
> + sbc->mode = SBC_MODE_JOINT_STEREO;
> + else if (sbc->dual_channel)
> + sbc->mode = SBC_MODE_DUAL_CHANNEL;
> + sbc->subbands >>= 3;
> + sbc->blocks = (sbc->blocks >> 2) - 1;
> +
> + if (!avctx->frame_size)
> + avctx->frame_size = 4*(sbc->subbands + 1) * 4*(sbc->blocks + 1);
> +
> + for (int i = 0; avctx->codec->supported_samplerates[i]; i++)
> + if (avctx->sample_rate == avctx->codec->supported_samplerates[i])
> + sbc->frequency = i;
> +
> + if (avctx->channels == 1)
> + sbc->mode = SBC_MODE_MONO;
> +
> + return 0;
> +}
> +
> +static int msbc_encode_init(AVCodecContext *avctx)
> +{
> + SBCEncContext *sbc = avctx->priv_data;
> +
> + sbc->msbc = true;
> + sbc->pack_frame = msbc_pack_frame;
> +
> + sbc->frequency = SBC_FREQ_16000;
> + sbc->blocks = MSBC_BLOCKS;
> + sbc->subbands = SBC_SB_8;
> + sbc->mode = SBC_MODE_MONO;
> + sbc->allocation = SBC_AM_LOUDNESS;
> + sbc->bitpool = 26;
> +
> + if (!avctx->frame_size)
> + avctx->frame_size = 8 * MSBC_BLOCKS;
> +
>
Does the encoder actually accept arbitrary custom frame sizes?
>
> +
> +#if CONFIG_SBC_ENCODER
> +AVCodec ff_sbc_encoder = {
> + .name = "sbc",
> + .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity
> subband codec)"),
> + .type = AVMEDIA_TYPE_AUDIO,
> + .id = AV_CODEC_ID_SBC,
> + .priv_data_size = sizeof(SBCEncContext),
> + .init = sbc_encode_init,
> + .encode2 = sbc_encode_frame,
> + .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
> + AV_CH_LAYOUT_STEREO, 0},
> + .sample_fmts = (const enum AVSampleFormat[]) {
> AV_SAMPLE_FMT_S16,
> +
> AV_SAMPLE_FMT_NONE },
>
Planar?
> + .supported_samplerates = (const int[]) { 16000, 32000, 44100, 48000,
> 0 },
>
Remove the samplerates the encoder doesn't support.
Also add the internal codec cap about threadsafe init since the encoder
doesn't init any global tables to both this and the aptX encoders.
>
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t
> *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
> + movq m0, [inq]
> + movq m1, [inq+8]
> + pmaddwd m0, [constsq]
> + pmaddwd m1, [constsq+8]
> + paddd m0, [scale_mask]
> + paddd m1, [scale_mask]
> +
> + movq m2, [inq+16]
> + movq m3, [inq+24]
> + pmaddwd m2, [constsq+16]
> + pmaddwd m3, [constsq+24]
> + paddd m0, m2
> + paddd m1, m3
> +
> + movq m2, [inq+32]
> + movq m3, [inq+40]
> + pmaddwd m2, [constsq+32]
> + pmaddwd m3, [constsq+40]
> + paddd m0, m2
> + paddd m1, m3
> +
> + movq m2, [inq+48]
> + movq m3, [inq+56]
> + pmaddwd m2, [constsq+48]
> + pmaddwd m3, [constsq+56]
> + paddd m0, m2
> + paddd m1, m3
> +
> + movq m2, [inq+64]
> + movq m3, [inq+72]
> + pmaddwd m2, [constsq+64]
> + pmaddwd m3, [constsq+72]
> + paddd m0, m2
> + paddd m1, m3
> +
>
Loops?
> + psrad m0, 16 ; SBC_PROTO_FIXED4_SCALE
> + psrad m1, 16 ; SBC_PROTO_FIXED4_SCALE
> + packssdw m0, m0
> + packssdw m1, m1
> +
> + movq m2, m0
> + pmaddwd m0, [constsq+80]
> + pmaddwd m2, [constsq+88]
> +
> + movq m3, m1
> + pmaddwd m1, [constsq+96]
> + pmaddwd m3, [constsq+104]
> + paddd m0, m1
> + paddd m2, m3
> +
> + movq [outq ], m0
> + movq [outq+8], m2
> +
> + RET
> +
> +
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t
> *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
> + movq m0, [inq]
> + movq m1, [inq+8]
> + movq m2, [inq+16]
> + movq m3, [inq+24]
> + pmaddwd m0, [constsq]
> + pmaddwd m1, [constsq+8]
> + pmaddwd m2, [constsq+16]
> + pmaddwd m3, [constsq+24]
> + paddd m0, [scale_mask]
> + paddd m1, [scale_mask]
> + paddd m2, [scale_mask]
> + paddd m3, [scale_mask]
> +
> + movq m4, [inq+32]
> + movq m5, [inq+40]
> + movq m6, [inq+48]
> + movq m7, [inq+56]
> + pmaddwd m4, [constsq+32]
> + pmaddwd m5, [constsq+40]
> + pmaddwd m6, [constsq+48]
> + pmaddwd m7, [constsq+56]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + movq m4, [inq+64]
> + movq m5, [inq+72]
> + movq m6, [inq+80]
> + movq m7, [inq+88]
> + pmaddwd m4, [constsq+64]
> + pmaddwd m5, [constsq+72]
> + pmaddwd m6, [constsq+80]
> + pmaddwd m7, [constsq+88]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + movq m4, [inq+96]
> + movq m5, [inq+104]
> + movq m6, [inq+112]
> + movq m7, [inq+120]
> + pmaddwd m4, [constsq+96]
> + pmaddwd m5, [constsq+104]
> + pmaddwd m6, [constsq+112]
> + pmaddwd m7, [constsq+120]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + movq m4, [inq+128]
> + movq m5, [inq+136]
> + movq m6, [inq+144]
> + movq m7, [inq+152]
> + pmaddwd m4, [constsq+128]
> + pmaddwd m5, [constsq+136]
> + pmaddwd m6, [constsq+144]
> + pmaddwd m7, [constsq+152]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + psrad m0, 16 ; SBC_PROTO_FIXED8_SCALE
> + psrad m1, 16 ; SBC_PROTO_FIXED8_SCALE
> + psrad m2, 16 ; SBC_PROTO_FIXED8_SCALE
> + psrad m3, 16 ; SBC_PROTO_FIXED8_SCALE
> +
> + packssdw m0, m0
> + packssdw m1, m1
> + packssdw m2, m2
> + packssdw m3, m3
> +
> + movq m4, m0
> + movq m5, m0
> + pmaddwd m4, [constsq+160]
> + pmaddwd m5, [constsq+168]
> +
> + movq m6, m1
> + movq m7, m1
> + pmaddwd m6, [constsq+192]
> + pmaddwd m7, [constsq+200]
> + paddd m4, m6
> + paddd m5, m7
> +
> + movq m6, m2
> + movq m7, m2
> + pmaddwd m6, [constsq+224]
> + pmaddwd m7, [constsq+232]
> + paddd m4, m6
> + paddd m5, m7
> +
> + movq m6, m3
> + movq m7, m3
> + pmaddwd m6, [constsq+256]
> + pmaddwd m7, [constsq+264]
> + paddd m4, m6
> + paddd m5, m7
> +
> + movq [outq ], m4
> + movq [outq+8], m5
> +
> + movq m5, m0
> + pmaddwd m0, [constsq+176]
> + pmaddwd m5, [constsq+184]
> +
> + movq m7, m1
> + pmaddwd m1, [constsq+208]
> + pmaddwd m7, [constsq+216]
> + paddd m0, m1
> + paddd m5, m7
> +
> + movq m7, m2
> + pmaddwd m2, [constsq+240]
> + pmaddwd m7, [constsq+248]
> + paddd m0, m2
> + paddd m5, m7
> +
> + movq m7, m3
> + pmaddwd m3, [constsq+272]
> + pmaddwd m7, [constsq+280]
> + paddd m0, m3
> + paddd m5, m7
> +
Has the person writing the SIMD seriously not heard of loops?
I see no reason for this to not work on larger registers if loops were used
here.
This seems trivial do to properly so if you can't be bothered to fix it
leave it to me or jamrial to do after the core of the encoder has been
merged.
> + movq [outq+16], m0
> + movq [outq+24], m5
> +
> + RET
> +
> +
> +;*******************************************************************
> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
> +; uint32_t scale_factor[2][8],
> +; int blocks, int channels, int subbands)
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_calc_scalefactors, 5, 9, 3, sb_sample_f, scale_factor,
> blocks, channels, subbands, ch, sb, sa, sf, blk
> + shl channelsd, 5
> + mov chq, 0
> +.loop_1:
> + lea saq, [sb_sample_fq + chq]
> + lea sfq, [scale_factorq + chq]
> +
> + mov sbd, 0
> +.loop_2:
> + ; blk = (blocks - 1) * 64;
> + lea blkq, [blocksq - 1]
> + shl blkd, 6
> +
> + movq m0, [scale_mask]
> +.loop_3:
> + movq m1, [saq+blkq]
> + pxor m2, m2
> + pcmpgtd m1, m2
> + paddd m1, [saq+blkq]
> + pcmpgtd m2, m1
> + pxor m1, m2
> +
> + por m0, m1
> +
> + sub blkd, 64
> + jns .loop_3
> +
> + movd blkd, m0
> + psrlq m0, 32
> + bsr blkd, blkd
> + sub blkd, 15 ; SCALE_OUT_BITS
> + mov [sfq], blkd
> +
> + movd blkd, m0
> + bsr blkd, blkd
> + sub blkd, 15 ; SCALE_OUT_BITS
> + mov [sfq+4], blkd
> +
> + add saq, 8
> + add sfq, 8
> +
> + add sbd, 2
> + cmp sbd, subbandsd
> + jl .loop_2
> +
> + add chd, 32
> + cmp chd, channelsd
> + jl .loop_1
> +
>
This function's hardly doing SIMD and I would like to see comparison to the
C version before accepting it. I somehow doubt it'll be faster.
+
> +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (EXTERNAL_MMX(cpu_flags)) {
> + s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
> + s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
> + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
> + }
> +}
>
MMX? In this day and age?
Anyway, its mostly not bad, will need some work before its cleaned of
libsbc's NIH.
More information about the ffmpeg-devel
mailing list