diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 05:12:29 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 05:12:29 +0000 |
commit | 2392ca434cdd13464fd35633462326c68711add7 (patch) | |
tree | b341d3f924dbdedcb41b77de3cff93833290881e | |
parent | d6ec3ccdf3c873ea461eb340f58d9e072e9bb3a4 (diff) | |
parent | fb61b35abb4b7d3f9d6efde2879fa0da61c14f3b (diff) | |
download | libopus-android14-mainline-uwb-release.tar.gz |
Snap for 10453563 from fb61b35abb4b7d3f9d6efde2879fa0da61c14f3b to mainline-uwb-releaseaml_uwb_341513070aml_uwb_341511050aml_uwb_341310300aml_uwb_341310030aml_uwb_341111010aml_uwb_341011000android14-mainline-uwb-release
Change-Id: If70323a4896a46f69ee1931b3faaf1f8e899a92a
87 files changed, 1639 insertions, 1028 deletions
@@ -383,7 +383,7 @@ cc_library { apex_available: [ "//apex_available:platform", // used by libstagefright_soft_opusdec "com.android.media.swcodec", - "com.android.bluetooth", + "com.android.btservices", ], min_sdk_version: "29", } diff --git a/CMakeLists.txt b/CMakeLists.txt index a28f441c..9d824cdc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -366,11 +366,23 @@ if(NOT OPUS_ENABLE_FLOAT_API) endif() if(NOT OPUS_DISABLE_INTRINSICS) - if((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR + if(((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR (OPUS_X86_MAY_HAVE_SSE2 AND NOT OPUS_X86_PRESUME_SSE2) OR (OPUS_X86_MAY_HAVE_SSE4_1 AND NOT OPUS_X86_PRESUME_SSE4_1) OR - (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX)) + (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX)) AND + RUNTIME_CPU_CAPABILITY_DETECTION) target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD) + if(NOT MSVC) + if(CPU_INFO_BY_ASM_SUPPORTED) + target_compile_definitions(opus PRIVATE CPU_INFO_BY_ASM) + elseif(CPU_INFO_BY_C_SUPPORTED) + target_compile_definitions(opus PRIVATE CPU_INFO_BY_C) + else() + message(ERROR "Runtime cpu capability detection is enabled while CPU_INFO is not supported") + endif() + endif() + add_sources_group(opus celt ${celt_sources_x86_rtcd}) + add_sources_group(opus silk ${silk_sources_x86_rtcd}) endif() if(SSE1_SUPPORTED) @@ -455,15 +467,13 @@ if(NOT OPUS_DISABLE_INTRINSICS) endif() endif() - if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)") - add_sources_group(opus celt ${celt_sources_arm}) - endif() - if(COMPILER_SUPPORT_NEON) if(OPUS_MAY_HAVE_NEON) if(RUNTIME_CPU_CAPABILITY_DETECTION) message(STATUS "OPUS_MAY_HAVE_NEON enabling runtime detection") target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD) + add_sources_group(opus celt ${celt_sources_arm_rtcd}) + add_sources_group(opus silk ${silk_sources_arm_rtcd}) else() message(ERROR "Runtime cpu capability detection needed for MAY_HAVE_NEON") endif() @@ -565,6 +575,7 @@ if(OPUS_BUILD_PROGRAMS) target_include_directories(opus_custom_demo PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(opus_custom_demo PRIVATE opus) + target_compile_definitions(opus_custom_demo PRIVATE OPUS_BUILD) endif() add_executable(opus_demo ${opus_demo_sources}) @@ -572,14 +583,19 @@ if(OPUS_BUILD_PROGRAMS) target_include_directories(opus_demo PRIVATE silk) # debug.h target_include_directories(opus_demo PRIVATE celt) # arch.h target_link_libraries(opus_demo PRIVATE opus ${OPUS_REQUIRED_LIBRARIES}) + target_compile_definitions(opus_demo PRIVATE OPUS_BUILD) # compare add_executable(opus_compare ${opus_compare_sources}) target_include_directories(opus_compare PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(opus_compare PRIVATE opus ${OPUS_REQUIRED_LIBRARIES}) + if(MSVC) + # move cosmetic warning to level 4 for opus_compare + target_compile_options(opus_compare PRIVATE /w44244) + endif() endif() -if(BUILD_TESTING) +if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS) enable_testing() # tests @@ -587,32 +603,44 @@ if(BUILD_TESTING) target_include_directories(test_opus_decode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(test_opus_decode PRIVATE opus) + target_compile_definitions(test_opus_decode PRIVATE OPUS_BUILD) if(OPUS_FIXED_POINT) target_compile_definitions(test_opus_decode PRIVATE DISABLE_FLOAT_API) endif() - add_test(NAME test_opus_decode COMMAND $<TARGET_FILE:test_opus_decode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test(NAME test_opus_decode COMMAND ${CMAKE_COMMAND} + -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_decode> + -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} + -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake") add_executable(test_opus_padding ${test_opus_padding_sources}) target_include_directories(test_opus_padding PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(test_opus_padding PRIVATE opus) - add_test(NAME test_opus_padding COMMAND $<TARGET_FILE:test_opus_padding> WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - - if(NOT BUILD_SHARED_LIBS) - # disable tests that depends on private API when building shared lib - add_executable(test_opus_api ${test_opus_api_sources}) - target_include_directories(test_opus_api - PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt) - target_link_libraries(test_opus_api PRIVATE opus) - if(OPUS_FIXED_POINT) - target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API) - endif() - add_test(NAME test_opus_api COMMAND $<TARGET_FILE:test_opus_api> WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - - add_executable(test_opus_encode ${test_opus_encode_sources}) - target_include_directories(test_opus_encode - PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt) - target_link_libraries(test_opus_encode PRIVATE opus) - add_test(NAME test_opus_encode COMMAND $<TARGET_FILE:test_opus_encode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test(NAME test_opus_padding COMMAND ${CMAKE_COMMAND} + -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_padding> + -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} + -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake") + + add_executable(test_opus_api ${test_opus_api_sources}) + target_include_directories(test_opus_api + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt) + target_link_libraries(test_opus_api PRIVATE opus) + target_compile_definitions(test_opus_api PRIVATE OPUS_BUILD) + if(OPUS_FIXED_POINT) + target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API) endif() + add_test(NAME test_opus_api COMMAND ${CMAKE_COMMAND} + -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_api> + -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} + -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake") + + add_executable(test_opus_encode ${test_opus_encode_sources}) + target_include_directories(test_opus_encode + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt) + target_link_libraries(test_opus_encode PRIVATE opus) + target_compile_definitions(test_opus_encode PRIVATE OPUS_BUILD) + add_test(NAME test_opus_encode COMMAND ${CMAKE_COMMAND} + -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_encode> + -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} + -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake") endif() @@ -1,3 +1,7 @@ +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update libopus +# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md + name: "libopus" description: "Android fork of the opus library." third_party { @@ -5,14 +9,14 @@ third_party { type: GIT value: "https://gitlab.xiph.org/xiph/opus.git" } - version: "d633f523e36e3b6d01cc6d57386458d770d618be" + version: "8cf872a186b96085b1bb3a547afd598354ebeb87" license_type: NOTICE - last_upgrade_date { - year: 2021 - month: 2 - day: 5 - } security { tag: "NVD-CPE2.3:cpe:/a:opus-codec:opus:1.0.3" } + last_upgrade_date { + year: 2023 + month: 1 + day: 18 + } } diff --git a/Makefile.am b/Makefile.am index 83beaa3f..492fc09d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -36,6 +36,11 @@ else OPUS_SOURCES += $(OPUS_SOURCES_FLOAT) endif +if CPU_X86 +if HAVE_RTCD +CELT_SOURCES += $(CELT_SOURCES_X86_RTCD) +SILK_SOURCES += $(SILK_SOURCES_X86_RTCD) +endif if HAVE_SSE CELT_SOURCES += $(CELT_SOURCES_SSE) endif @@ -45,10 +50,13 @@ endif if HAVE_SSE4_1 CELT_SOURCES += $(CELT_SOURCES_SSE4_1) endif +endif if CPU_ARM -CELT_SOURCES += $(CELT_SOURCES_ARM) -SILK_SOURCES += $(SILK_SOURCES_ARM) +if HAVE_RTCD +CELT_SOURCES += $(CELT_SOURCES_ARM_RTCD) +SILK_SOURCES += $(SILK_SOURCES_ARM_RTCD) +endif if HAVE_ARM_NEON_INTR CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR) @@ -222,8 +230,11 @@ EXTRA_DIST = opus.pc.in \ cmake/OpusFunctions.cmake \ cmake/OpusPackageVersion.cmake \ cmake/OpusSources.cmake \ + cmake/RunTest.cmake \ cmake/config.h.cmake.in \ cmake/vla.c \ + cmake/cpu_info_by_asm.c \ + cmake/cpu_info_by_c.c \ meson/get-version.py \ meson/read-sources-list.py \ meson.build \ diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c index cce3ae3a..c7d16e6d 100644 --- a/celt/arm/armcpu.c +++ b/celt/arm/armcpu.c @@ -156,7 +156,7 @@ opus_uint32 opus_cpu_capabilities(void) "your platform. Reconfigure with --disable-rtcd (or send patches)." #endif -int opus_select_arch(void) +static int opus_select_arch_impl(void) { opus_uint32 flags = opus_cpu_capabilities(); int arch = 0; @@ -184,4 +184,11 @@ int opus_select_arch(void) return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + arch = rand()%(arch+1); +#endif + return arch; +} #endif diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c index 1ac38c43..35cc46e2 100644 --- a/celt/arm/pitch_neon_intr.c +++ b/celt/arm/pitch_neon_intr.c @@ -137,22 +137,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */ /* operations of celt_inner_prod_neon(), and both functions should have bit */ /* exact output. */ -static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N) +static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N) { int i; + *err = 0; opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0; for (i = 0; i < N - 3; i += 4) { xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]); xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]); xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]); xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]); + *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3); } xy0 += xy2; xy1 += xy3; xy = xy0 + xy1; + *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy); for (; i < N; i++) { xy = MAC16_16(xy, x[i], y[i]); + *err += ABS32(xy); } + *err = *err*2e-7 + N*1e-37; return xy; } @@ -160,32 +165,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c /* operations of dual_inner_prod_neon(), and both functions should have bit */ /* exact output. */ static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) + int N, opus_val32 *xy1, opus_val32 *xy2, float *err) { - int i; - opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0; - for (i = 0; i < N - 3; i += 4) { - xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]); - xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]); - xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]); - xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]); - xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]); - xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]); - xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]); - xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]); - } - xy01_0 += xy01_2; - xy02_0 += xy02_2; - xy01_1 += xy01_3; - xy02_1 += xy02_3; - xy01 = xy01_0 + xy01_1; - xy02 = xy02_0 + xy02_1; - for (; i < N; i++) { - xy01 = MAC16_16(xy01, x[i], y01[i]); - xy02 = MAC16_16(xy02, x[i], y02[i]); - } - *xy1 = xy01; - *xy2 = xy02; + *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N); + *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N); } #endif /* OPUS_CHECK_ASM */ @@ -225,7 +208,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N) } #ifdef OPUS_CHECK_ASM - celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL); + { + float err, res; + res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N); + /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/ + celt_assert(ABS32(res - xy) <= err); + } #endif return xy; @@ -280,9 +268,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus #ifdef OPUS_CHECK_ASM { opus_val32 xy1_c, xy2_c; - dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c); - celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL); - celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL); + float err[2]; + dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err); + /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]); + if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/ + celt_assert(ABS32(xy1_c - *xy1) <= err[0]); + celt_assert(ABS32(xy2_c - *xy2) <= err[1]); } #endif } diff --git a/celt/bands.c b/celt/bands.c index 2702963c..5320ffab 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx, sctx->itheta = itheta; sctx->qalloc = qalloc; } -static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b, +static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, celt_norm *lowband_out) { int c; @@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, sign = ec_dec_bits(ec, 1); } ctx->remaining_bits -= 1<<BITRES; - b-=1<<BITRES; } if (ctx->resynth) x[0] = sign ? -NORM_SCALING : NORM_SCALING; @@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X, /* Special case for one sample */ if (N==1) { - return quant_band_n1(ctx, X, NULL, b, lowband_out); + return quant_band_n1(ctx, X, NULL, lowband_out); } if (tf_change>0) @@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm /* Special case for one sample */ if (N==1) { - return quant_band_n1(ctx, X, Y, b, lowband_out); + return quant_band_n1(ctx, X, Y, lowband_out); } orig_fill = fill; @@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm return cm; } +#ifndef DISABLE_UPDATE_DRAFT static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo) { int n1, n2; @@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm if (dual_stereo) OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1); } +#endif void quant_all_bands(int encode, const CELTMode *m, int start, int end, celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index 74ca3b74..883dae15 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -90,7 +90,7 @@ struct OpusCustomDecoder { opus_uint32 rng; int error; int last_pitch_index; - int loss_count; + int loss_duration; int skip_plc; int postfilter_period; int postfilter_period_old; @@ -512,7 +512,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) int nbEBands; int overlap; int start; - int loss_count; + int loss_duration; int noise_based; const opus_int16 *eBands; SAVE_STACK; @@ -532,9 +532,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; - loss_count = st->loss_count; + loss_duration = st->loss_duration; start = st->start; - noise_based = loss_count >= 5 || start != 0 || st->skip_plc; + noise_based = loss_duration >= 40 || start != 0 || st->skip_plc; if (noise_based) { /* Noise-based PLC/CNG */ @@ -557,9 +557,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) #else ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ #endif + c=0; do { + OPUS_MOVE(decode_mem[c], decode_mem[c]+N, + DECODE_BUFFER_SIZE-N+(overlap>>1)); + } while (++c<C); /* Energy decay */ - decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); + decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); c=0; do { for (i=start;i<end;i++) @@ -585,11 +589,6 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } st->rng = seed; - c=0; do { - OPUS_MOVE(decode_mem[c], decode_mem[c]+N, - DECODE_BUFFER_SIZE-N+(overlap>>1)); - } while (++c<C); - celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch); } else { int exc_length; @@ -602,7 +601,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) VARDECL(opus_val16, _exc); VARDECL(opus_val16, fir_tmp); - if (loss_count == 0) + if (loss_duration == 0) { st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch); } else { @@ -630,9 +629,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) buf = decode_mem[c]; for (i=0;i<MAX_PERIOD+LPC_ORDER;i++) - exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT); + exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT); - if (loss_count == 0) + if (loss_duration == 0) { opus_val32 ac[LPC_ORDER+1]; /* Compute LPC coefficients for the last MAX_PERIOD samples before @@ -732,7 +731,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) exc[extrapolation_offset+j])), SIG_SHIFT); /* Compute the energy of the previously decoded signal whose excitation we're copying. */ - tmp = ROUND16( + tmp = SROUND16( buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j], SIG_SHIFT); S1 += SHR32(MULT16_16(tmp, tmp), 10); @@ -742,7 +741,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) /* Copy the last decoded samples (prior to the overlap region) to synthesis filter memory so we can have a continuous signal. */ for (i=0;i<LPC_ORDER;i++) - lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT); + lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT); /* Apply the synthesis filter to convert the excitation back into the signal domain. */ celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER, @@ -761,7 +760,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val32 S2=0; for (i=0;i<extrapolation_len;i++) { - opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT); + opus_val16 tmp = SROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT); S2 += SHR32(MULT16_16(tmp, tmp), 10); } /* This checks for an "explosion" in the synthesis. */ @@ -812,7 +811,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } while (++c<C); } - st->loss_count = loss_count+1; + /* Saturate to soemthing large to avoid wrap-around. */ + st->loss_duration = IMIN(10000, loss_duration+(1<<LM)); RESTORE_STACK; } @@ -868,6 +868,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat int nbEBands; int overlap; const opus_int16 *eBands; + opus_val16 max_background_increase; ALLOC_STACK; VALIDATE_CELT_DECODER(st); @@ -942,7 +943,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat /* Check if there are at least two packets received consecutively before * turning on the pitch-based PLC */ - st->skip_plc = st->loss_count != 0; + st->skip_plc = st->loss_duration != 0; if (dec == NULL) { @@ -1140,25 +1141,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (C==1) OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands); - /* In case start or end were to change */ if (!isTransient) { - opus_val16 max_background_increase; OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands); OPUS_COPY(oldLogE, oldBandE, 2*nbEBands); - /* In normal circumstances, we only allow the noise floor to increase by - up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB - increase for each update.*/ - if (st->loss_count < 10) - max_background_increase = M*QCONST16(0.001f,DB_SHIFT); - else - max_background_increase = QCONST16(1.f,DB_SHIFT); - for (i=0;i<2*nbEBands;i++) - backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); } else { for (i=0;i<2*nbEBands;i++) oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]); } + /* In normal circumstances, we only allow the noise floor to increase by + up to 2.4 dB/second, but when we're in DTX we give the weight of + all missing packets to the update packet. */ + max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT); + for (i=0;i<2*nbEBands;i++) + backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); + /* In case start or end were to change */ c=0; do { for (i=0;i<start;i++) @@ -1175,7 +1172,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat st->rng = dec->rng; deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); - st->loss_count = 0; + st->loss_duration = 0; RESTORE_STACK; if (ec_tell(dec) > 8*len) return OPUS_INTERNAL_ERROR; diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index d6f8afc2..637d442c 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1719,8 +1719,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch); compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE2, C); - for (i=0;i<C*nbEBands;i++) - bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT)); + for (c=0;c<C;c++) + { + for (i=0;i<end;i++) + bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT)); + } } compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch); @@ -1856,8 +1859,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE, C); /* Compensate for the scaling of short vs long mdcts */ - for (i=0;i<C*nbEBands;i++) - bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT)); + for (c=0;c<C;c++) + { + for (i=0;i<end;i++) + bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT)); + } tf_estimate = QCONST16(.2f,14); } } diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index 8ecb693e..f91721bc 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -50,17 +50,21 @@ int p #endif OPUS_CLEAR(lpc, p); +#ifdef FIXED_POINT if (ac[0] != 0) +#else + if (ac[0] > 1e-10f) +#endif { for (i = 0; i < p; i++) { /* Sum up this iteration's reflection coefficient */ opus_val32 rr = 0; for (j = 0; j < i; j++) rr += MULT32_32_Q31(lpc[j],ac[i - j]); - rr += SHR32(ac[i + 1],3); - r = -frac_div32(SHL32(rr,3), error); + rr += SHR32(ac[i + 1],6); + r = -frac_div32(SHL32(rr,6), error); /* Update LPC coefficients and total error */ - lpc[i] = SHR32(r,3); + lpc[i] = SHR32(r,6); for (j = 0; j < (i+1)>>1; j++) { opus_val32 tmp1, tmp2; @@ -73,17 +77,61 @@ int p error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); /* Bail out once we get 30 dB gain */ #ifdef FIXED_POINT - if (error<SHR32(ac[0],10)) + if (error<=SHR32(ac[0],10)) break; #else - if (error<.001f*ac[0]) + if (error<=.001f*ac[0]) break; #endif } } #ifdef FIXED_POINT - for (i=0;i<p;i++) - _lpc[i] = ROUND16(lpc[i],16); + { + /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds. + This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug + fixes should also be applied there. */ + int iter, idx = 0; + opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16; + + for (iter = 0; iter < 10; iter++) { + maxabs = 0; + for (i = 0; i < p; i++) { + absval = ABS32(lpc[i]); + if (absval > maxabs) { + maxabs = absval; + idx = i; + } + } + maxabs = PSHR32(maxabs, 13); /* Q25->Q12 */ + + if (maxabs > 32767) { + maxabs = MIN32(maxabs, 163838); + chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14), + SHR32(MULT32_32_32(maxabs, idx + 1), 2)); + chirp_minus_one_Q16 = chirp_Q16 - 65536; + + /* Apply bandwidth expansion. */ + for (i = 0; i < p - 1; i++) { + lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]); + chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16); + } + lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]); + } else { + break; + } + } + + if (iter == 10) { + /* If the coeffs still do not fit into the 16 bit range after 10 iterations, + fall back to the A(z)=1 filter. */ + OPUS_CLEAR(lpc, p); + _lpc[0] = 4096; /* Q12 */ + } else { + for (i = 0; i < p; i++) { + _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13)); /* Q25->Q12 */ + } + } + } #endif } @@ -111,17 +159,17 @@ void celt_fir_c( sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT); sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT); xcorr_kernel(rnum, x+i-ord, sum, ord, arch); - y[i ] = ROUND16(sum[0], SIG_SHIFT); - y[i+1] = ROUND16(sum[1], SIG_SHIFT); - y[i+2] = ROUND16(sum[2], SIG_SHIFT); - y[i+3] = ROUND16(sum[3], SIG_SHIFT); + y[i ] = SROUND16(sum[0], SIG_SHIFT); + y[i+1] = SROUND16(sum[1], SIG_SHIFT); + y[i+2] = SROUND16(sum[2], SIG_SHIFT); + y[i+3] = SROUND16(sum[3], SIG_SHIFT); } for (;i<N;i++) { opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); for (j=0;j<ord;j++) sum = MAC16_16(sum,rnum[j],x[i+j-ord]); - y[i] = ROUND16(sum, SIG_SHIFT); + y[i] = SROUND16(sum, SIG_SHIFT); } RESTORE_STACK; } diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 68fc6067..7b5c56ca 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -43,10 +43,11 @@ */ #define OPUS_ARCHMASK 3 -#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ +#elif defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) #include "x86/x86cpu.h" /* We currently support 5 x86 variants: diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h index f4352952..ef2e5d02 100644 --- a/celt/fixed_debug.h +++ b/celt/fixed_debug.h @@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line) #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__) static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) { - int res; + opus_int32 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift)) { fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line); @@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) celt_assert(0); #endif } - res = a<<shift; + res = (opus_int32)((opus_uint32)a<<shift); if (!VERIFY_SHORT(res)) { fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line); @@ -214,15 +214,15 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line) opus_int64 res; if (!VERIFY_INT(a) || !VERIFY_SHORT(shift)) { - fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line); + fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif } - res = a<<shift; + res = (opus_int64)((opus_uint64)a<<shift); if (!VERIFY_INT(res)) { - fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line); + fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file opus_uint64 res; if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) { - fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line); + fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file res = a+b; if (!VERIFY_UINT(res)) { - fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line); + fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file opus_uint64 res; if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) { - fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line); + fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif } if (a<b) { - fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line); + fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file res = a-b; if (!VERIFY_UINT(res)) { - fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line); + fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b) return res; } +/* result fits in 32 bits */ +static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b) +{ + opus_int64 res; + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a*b; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=5; + return res; +} + +static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b) +{ + opus_int64 res; + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)(a)*(opus_int64)(b)) >> 16; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=5; + return res; +} + #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__) static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line) { @@ -446,7 +491,7 @@ static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int celt_assert(0); #endif } - if (ABS32(b)>=((opus_val32)(1)<<(15+Q))) + if (ABS32(b)>=((opus_int64)(1)<<(16+Q))) { fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -479,7 +524,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int celt_assert(0); #endif } - if (ABS32(b)>=((opus_int64)(1)<<(15+Q))) + if (ABS32(b)>=((opus_int64)(1)<<(16+Q))) { fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line); #ifdef FIXED_DEBUG_ASSERT @@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x) #undef PRINT_MIPS -#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0); +#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0); #endif diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index 0ecbb899..8f29d46b 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -57,6 +57,13 @@ #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15)) #endif +/** 32x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16)) +#else +#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16))) +#endif + /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */ #if OPUS_FAST_INT64 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31)) @@ -131,6 +138,9 @@ /** 16x16 multiplication where the result fits in 16 bits */ #define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b)))) +/** 32x32 multiplication where the result fits in 32 bits */ +#define MULT32_32_32(a,b) ((((opus_val32)(a))*((opus_val32)(b)))) + /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */ /** 16x16 multiplication where the result fits in 32 bits */ #define MULT16_16(a,b) (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b))) diff --git a/celt/float_cast.h b/celt/float_cast.h index 9d34976e..8915a5fd 100644 --- a/celt/float_cast.h +++ b/celt/float_cast.h @@ -99,7 +99,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s return intgr ; } -#elif defined(HAVE_LRINTF) +#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* These defines enable functionality introduced with the 1999 ISO C ** standard. They must be defined before the inclusion of math.h to @@ -117,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s #include <math.h> #define float2int(x) lrintf(x) -#elif (defined(HAVE_LRINT)) +#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define _ISOC9X_SOURCE 1 #define _ISOC99_SOURCE 1 diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h index bffa2bfa..267f72f9 100644 --- a/celt/kiss_fft.h +++ b/celt/kiss_fft.h @@ -52,6 +52,10 @@ extern "C" { # define kiss_fft_scalar opus_int32 # define kiss_twiddle_scalar opus_int16 +/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory + * access, and could benefit from additional alignment. + */ +# define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32)) #else # ifndef kiss_fft_scalar @@ -62,6 +66,12 @@ extern "C" { # endif #endif +#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT) +#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT))) +#else +#define KISS_TWIDDLE_CPX_ALIGNED +#endif + typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; @@ -70,7 +80,7 @@ typedef struct { typedef struct { kiss_twiddle_scalar r; kiss_twiddle_scalar i; -}kiss_twiddle_cpx; +} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx; #define MAXFACTORS 8 /* e.g. an fft of length 128 has 4 factors diff --git a/celt/mathops.h b/celt/mathops.h index fe29dac1..478ac918 100644 --- a/celt/mathops.h +++ b/celt/mathops.h @@ -153,7 +153,7 @@ static OPUS_INLINE float celt_exp2(float x) float f; opus_uint32 i; } res; - integer = floor(x); + integer = (int)floor(x); if (integer < -50) return 0; frac = x-integer; diff --git a/celt/meson.build b/celt/meson.build index 370ea1fe..ad95d949 100644 --- a/celt/meson.build +++ b/celt/meson.build @@ -10,6 +10,10 @@ celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR'] celt_static_libs = [] +if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD') + celt_sources += sources['CELT_SOURCES_X86_RTCD'] +endif + foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr'] have_intr = get_variable('have_' + intr_name) if not have_intr @@ -30,7 +34,9 @@ if (intrinsics_support.length() + asm_optimization.length() + inline_optimizatio endif if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm - celt_sources += sources['CELT_SOURCES_ARM'] + if opus_conf.has('OPUS_HAVE_RTCD') + celt_sources += sources['CELT_SOURCES_ARM_RTCD'] + endif if have_arm_ne10 celt_sources += sources['CELT_SOURCES_ARM_NE10'] endif diff --git a/celt/modes.c b/celt/modes.c index 390c5e8a..23f7cde6 100644 --- a/celt/modes.c +++ b/celt/modes.c @@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode) mode->nbAllocVectors = BITALLOC_SIZE; allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands)); if (allocVectors==NULL) + { + mode->allocVectors = NULL; return; + } /* Check for standard mode */ if (mode->Fs == 400*(opus_int32)mode->shortMdctSize) diff --git a/celt/pitch.c b/celt/pitch.c index 872582a4..7998db41 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x shift=0; if (C==2) shift++; -#endif for (i=1;i<len>>1;i++) - x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift); - x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift); + x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1); + x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1); if (C==2) { for (i=1;i<len>>1;i++) - x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift); - x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift); + x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1); + x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1); } - +#else + for (i=1;i<len>>1;i++) + x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i]; + x_lp[0] = .25f*x[0][1] + .5f*x[0][0]; + if (C==2) + { + for (i=1;i<len>>1;i++) + x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i]; + x_lp[0] += .25f*x[1][1] + .5f*x[1][0]; + } +#endif _celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1, arch); @@ -249,7 +258,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 maxcorr=1; #endif celt_assert(max_pitch>0); - celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); + celt_sig_assert(((size_t)_x&3)==0); for (i=0;i<max_pitch-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; diff --git a/celt/rate.c b/celt/rate.c index 465e1ba2..7f7ad3fa 100644 --- a/celt/rate.c +++ b/celt/rate.c @@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, else depth_threshold = 0; #ifdef FUZZING + (void)signalBandwidth; + (void)depth_threshold; if ((rand()&0x1) == 0) #else if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth)) diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c index 70f8f493..ae9a7b56 100644 --- a/celt/tests/test_unit_dft.c +++ b/celt/tests/test_unit_dft.c @@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch) int main(int argc,char ** argv) { + int arch; ALLOC_STACK; - int arch = opus_select_arch(); + arch = opus_select_arch(); if (argc>1) { int k; diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c index 7f674529..b1619b74 100644 --- a/celt/tests/test_unit_entropy.c +++ b/celt/tests/test_unit_entropy.c @@ -104,7 +104,7 @@ int main(int _argc,char **_argv){ nbits=ec_tell_frac(&enc); ec_enc_done(&enc); fprintf(stderr, - "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n", + "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n", entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits); fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc)); ec_dec_init(&dec,ptr,DATA_SIZE); @@ -129,7 +129,7 @@ int main(int _argc,char **_argv){ nbits2=ec_tell_frac(&dec); if(nbits!=nbits2){ fprintf(stderr, - "Reported number of bits used was %0.2lf, should be %0.2lf.\n", + "Reported number of bits used was %0.2f, should be %0.2f.\n", ldexp(nbits2,-3),ldexp(nbits,-3)); ret=-1; } diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c index 4a563ccf..844c5b48 100644 --- a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch) int main(int argc,char ** argv) { + int arch; ALLOC_STACK; - int arch = opus_select_arch(); + arch = opus_select_arch(); if (argc>1) { int k; diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index 7d1ecf75..90e69ecf 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -33,7 +33,6 @@ #endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) -#define OVERRIDE_CELT_FIR void celt_fir_sse4_1( const opus_val16 *x, @@ -44,10 +43,11 @@ void celt_fir_sse4_1( int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) +#define OVERRIDE_CELT_FIR #define celt_fir(x, num, y, N, ord, arch) \ ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) -#else +#elif defined(OPUS_HAVE_RTCD) extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( int ord, int arch); +#define OVERRIDE_CELT_FIR # define celt_fir(x, num, y, N, ord, arch) \ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index f7a014b6..964aef50 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -63,7 +63,7 @@ void xcorr_kernel_sse( #define xcorr_kernel(x, y, sum, len, arch) \ ((void)arch, xcorr_kernel_sse(x, y, sum, len)) -#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) +#elif defined(OPUS_HAVE_RTCD) && ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))) extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse( ((void)arch, celt_inner_prod_sse(x, y, N)) -#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ - (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) +#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))) extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c index a092c68b..2bc57830 100644 --- a/celt/x86/pitch_sse4_1.c +++ b/celt/x86/pitch_sse4_1.c @@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 __m128i sum0, sum1, sum2, sum3, vecSum; __m128i initSum; +#ifdef OPUS_CHECK_ASM + opus_val32 sum_c[4]; + for (j=0;j<4;j++) { + sum_c[j] = sum[j]; + } + xcorr_kernel_c(x, y, sum_c, len); +#endif + celt_assert(len >= 3); sum0 = _mm_setzero_si128(); @@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 vecSum = _mm_add_epi32(vecSum, sum2); } - for (;j<len;j++) + vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]); + if (len - j == 3) { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX0 = _mm_shuffle_epi32(vecX, 0x55); + vecX1 = _mm_shuffle_epi32(vecX, 0xaa); + vecX2 = _mm_shuffle_epi32(vecX, 0xff); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum1); + vecSum = _mm_add_epi32(vecSum, sum2); + } + else if (len - j == 2) + { + vecX0 = _mm_shuffle_epi32(vecX, 0xaa); + vecX1 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum1); + } + else if (len - j == 1) + { + vecX0 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); } initSum = _mm_loadu_si128((__m128i *)(&sum[0])); initSum = _mm_add_epi32(initSum, vecSum); _mm_storeu_si128((__m128i *)sum, initSum); + +#ifdef OPUS_CHECK_ASM + celt_assert(!memcmp(sum_c, sum, sizeof(sum_c))); +#endif } #endif diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c index 080eb25e..6a1914de 100644 --- a/celt/x86/x86cpu.c +++ b/celt/x86/x86cpu.c @@ -35,11 +35,11 @@ #include "pitch.h" #include "x86cpu.h" -#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ +#if defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) - + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) #if defined(_MSC_VER) @@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "0" (InfoType) + /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */ + "0" (InfoType), "2" (0) ); #else __asm__ __volatile__ ( @@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "0" (InfoType) + /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */ + "0" (InfoType), "2" (0) ); #endif #elif defined(CPU_INFO_BY_C) - __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); + /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive + prior to v3.17.0.*/ + if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) { + /* Our function cannot fail, but __get_cpuid{_count} can. + Returning all zeroes will effectively disable all SIMD, which is + what we want on CPUs that don't support CPUID. */ + CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0; + } +#else +# error "Configured to use x86 RTCD, but no CPU detection method available. " \ + "Reconfigure with --disable-rtcd (or send patches)." #endif } @@ -98,7 +110,7 @@ typedef struct CPU_Feature{ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) { - unsigned int info[4] = {0}; + unsigned int info[4]; unsigned int nIds = 0; cpuid(info, 0); @@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) } } -int opus_select_arch(void) +static int opus_select_arch_impl(void) { CPU_Feature cpu_feature; int arch; @@ -154,4 +166,13 @@ int opus_select_arch(void) return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + /* Randomly downgrade the architecture. */ + arch = rand()%(arch+1); +#endif + return arch; +} + #endif diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h index 1e2bf17b..04e80489 100644 --- a/celt/x86/x86cpu.h +++ b/celt/x86/x86cpu.h @@ -56,40 +56,18 @@ int opus_select_arch(void); # endif -/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32() - or _mm_cvtepi16_epi32() when optimizations are disabled, even though the - actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory - reference, these require 16-byte alignment and load a full 16 bytes (instead - of 4 or 8), possibly reading out of bounds. - - We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or - _mm_loadl_epi64(), which should have the same semantics as an m32 or m64 - reference in the PMOVSXWD instruction itself, but gcc is not smart enough to - optimize this out when optimizations ARE enabled. - - Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 - (which is fair, since technically the compiler is always allowed to do the - dereference before invoking the function implementing the intrinsic). - However, it is smart enough to eliminate the extra MOVD instruction. - For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out - the extra MOVQ if it's specified explicitly */ - -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) -# else -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(*(__m128i *)(x))) -#endif - -/* similar reasoning about the instruction sequence as in the 32-bit macro above, - */ -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI16_EPI32_M64(x) \ +/*MOVD should not impose any alignment restrictions, but the C standard does, + and UBSan will report errors if we actually make unaligned accesses. + Use this to work around those restrictions (which should hopefully all get + optimized to a single MOVD instruction).*/ +#define OP_LOADU_EPI32(x) \ + (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\ + *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U)) + +#define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x)))) + +#define OP_CVTEPI16_EPI32_M64(x) \ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) -# else -# define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(*(__m128i *)(x))) -# endif #endif diff --git a/celt_sources.mk b/celt_sources.mk index c9dab06e..d6b6765b 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -18,9 +18,11 @@ celt/quant_bands.c \ celt/rate.c \ celt/vq.c -CELT_SOURCES_SSE = \ +CELT_SOURCES_X86_RTCD = \ celt/x86/x86cpu.c \ -celt/x86/x86_celt_map.c \ +celt/x86/x86_celt_map.c + +CELT_SOURCES_SSE = \ celt/x86/pitch_sse.c CELT_SOURCES_SSE2 = \ @@ -31,7 +33,7 @@ CELT_SOURCES_SSE4_1 = \ celt/x86/celt_lpc_sse4_1.c \ celt/x86/pitch_sse4_1.c -CELT_SOURCES_ARM = \ +CELT_SOURCES_ARM_RTCD = \ celt/arm/armcpu.c \ celt/arm/arm_celt_map.c diff --git a/cmake/OpusConfig.cmake b/cmake/OpusConfig.cmake index 8d19a535..b82307a1 100644 --- a/cmake/OpusConfig.cmake +++ b/cmake/OpusConfig.cmake @@ -9,16 +9,18 @@ configure_file(cmake/config.h.cmake.in config.h @ONLY) add_definitions(-DHAVE_CONFIG_H) set_property(GLOBAL PROPERTY USE_FOLDERS ON) -set_property(GLOBAL PROPERTY C_STANDARD 99) if(MSVC) - add_definitions(-D_CRT_SECURE_NO_WARNINGS) + # For compilers that have no notion of a C standard level, + # such as Microsoft Visual C++ before VS 16.7, + # this property has no effect. + set(CMAKE_C_STANDARD 11) +else() + set(CMAKE_C_STANDARD 99) endif() -include(CheckLibraryExists) -check_library_exists(m floor "" HAVE_LIBM) -if(HAVE_LIBM) - list(APPEND OPUS_REQUIRED_LIBRARIES m) +if(MSVC) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif() include(CFeatureCheck) @@ -35,9 +37,18 @@ else() check_symbol_exists(alloca "stdlib.h;malloc.h" USE_ALLOCA_SUPPORTED) endif() -include(CheckFunctionExists) -check_function_exists(lrintf HAVE_LRINTF) -check_function_exists(lrint HAVE_LRINT) +include(CMakePushCheckState) +cmake_push_check_state(RESET) +include(CheckLibraryExists) +check_library_exists(m floor "" HAVE_LIBM) +if(HAVE_LIBM) + list(APPEND OPUS_REQUIRED_LIBRARIES m) + set(CMAKE_REQUIRED_LIBRARIES m) +endif() + +check_symbol_exists(lrintf "math.h" HAVE_LRINTF) +check_symbol_exists(lrint "math.h" HAVE_LRINT) +cmake_pop_check_state() if(CMAKE_SYSTEM_PROCESSOR MATCHES "(i[0-9]86|x86|X86|amd64|AMD64|x86_64)") if(CMAKE_SIZEOF_VOID_P EQUAL 8) diff --git a/cmake/OpusFunctions.cmake b/cmake/OpusFunctions.cmake index fcf3351f..3f22ad81 100644 --- a/cmake/OpusFunctions.cmake +++ b/cmake/OpusFunctions.cmake @@ -142,14 +142,28 @@ function(opus_detect_neon COMPILER_SUPPORT_NEON) endfunction() function(opus_supports_cpu_detection RUNTIME_CPU_CAPABILITY_DETECTION) - if(MSVC) - check_include_file(intrin.h HAVE_INTRIN_H) - else() - check_include_file(cpuid.h HAVE_CPUID_H) - endif() - if(HAVE_INTRIN_H OR HAVE_CPUID_H) - set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)") + set(RUNTIME_CPU_CAPABILITY_DETECTION 0 PARENT_SCOPE) + if(OPUS_CPU_X86 OR OPUS_CPU_X64) + if(MSVC) + check_include_file(intrin.h HAVE_INTRIN_H) + if(HAVE_INTRIN_H) + # if intrin.h is available we assume __cpuid is there + set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE) + endif() + else() + include(CFeatureCheck) + c_feature_check(CPU_INFO_BY_ASM) + set(CPU_INFO_BY_ASM_SUPPORTED ${CPU_INFO_BY_ASM_SUPPORTED} PARENT_SCOPE) + check_include_file(cpuid.h HAVE_CPUID_H) + if(HAVE_CPUID_H) + c_feature_check(CPU_INFO_BY_C) + set(CPU_INFO_BY_C_SUPPORTED ${CPU_INFO_BY_C_SUPPORTED} PARENT_SCOPE) + endif() + if(CPU_INFO_BY_ASM_SUPPORTED OR CPU_INFO_BY_C_SUPPORTED) + set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE) + endif() + endif() + elseif(OPUS_CPU_ARM) # ARM cpu detection is implemented for Windows and anything # using a Linux kernel (such as Android). if (CMAKE_SYSTEM_NAME MATCHES "(Windows|Linux|Android)") diff --git a/cmake/OpusSources.cmake b/cmake/OpusSources.cmake index 01e75d1a..b47f8c69 100644 --- a/cmake/OpusSources.cmake +++ b/cmake/OpusSources.cmake @@ -9,9 +9,11 @@ get_opus_sources(SILK_HEAD silk_headers.mk silk_headers) get_opus_sources(SILK_SOURCES silk_sources.mk silk_sources) get_opus_sources(SILK_SOURCES_FLOAT silk_sources.mk silk_sources_float) get_opus_sources(SILK_SOURCES_FIXED silk_sources.mk silk_sources_fixed) +get_opus_sources(SILK_SOURCES_X86_RTCD silk_sources.mk silk_sources_x86_rtcd) get_opus_sources(SILK_SOURCES_SSE4_1 silk_sources.mk silk_sources_sse4_1) get_opus_sources(SILK_SOURCES_FIXED_SSE4_1 silk_sources.mk silk_sources_fixed_sse4_1) +get_opus_sources(SILK_SOURCES_ARM_RTCD silk_sources.mk silk_sources_arm_rtcd) get_opus_sources(SILK_SOURCES_ARM_NEON_INTR silk_sources.mk silk_sources_arm_neon_intr) get_opus_sources(SILK_SOURCES_FIXED_ARM_NEON_INTR silk_sources.mk @@ -23,10 +25,11 @@ get_opus_sources(OPUS_SOURCES_FLOAT opus_sources.mk opus_sources_float) get_opus_sources(CELT_HEAD celt_headers.mk celt_headers) get_opus_sources(CELT_SOURCES celt_sources.mk celt_sources) +get_opus_sources(CELT_SOURCES_X86_RTCD celt_sources.mk celt_sources_x86_rtcd) get_opus_sources(CELT_SOURCES_SSE celt_sources.mk celt_sources_sse) get_opus_sources(CELT_SOURCES_SSE2 celt_sources.mk celt_sources_sse2) get_opus_sources(CELT_SOURCES_SSE4_1 celt_sources.mk celt_sources_sse4_1) -get_opus_sources(CELT_SOURCES_ARM celt_sources.mk celt_sources_arm) +get_opus_sources(CELT_SOURCES_ARM_RTCD celt_sources.mk celt_sources_arm_rtcd) get_opus_sources(CELT_SOURCES_ARM_ASM celt_sources.mk celt_sources_arm_asm) get_opus_sources(CELT_AM_SOURCES_ARM_ASM celt_sources.mk celt_am_sources_arm_asm) diff --git a/cmake/RunTest.cmake b/cmake/RunTest.cmake new file mode 100644 index 00000000..f6f8b4a2 --- /dev/null +++ b/cmake/RunTest.cmake @@ -0,0 +1,61 @@ +if(NOT EXISTS ${TEST_EXECUTABLE}) + message(FATAL_ERROR "Error could not find ${TEST_EXECUTABLE}, ensure that you built the test binary") +endif() + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + + # support to run plain old binary on android devices + # requires android debug bridge to be installed + + find_program(adb_executable adb) + if(NOT adb_executable) + message(FATAL_ERROR "Error could not find adb") + endif() + + # check if any device emulator is attached + execute_process(COMMAND ${adb_executable} shell echo RESULT_VARIABLE CMD_RESULT) + if(CMD_RESULT) + message(FATAL_ERROR "Error adb: no devices/emulators found") + endif() + + # push binary + set(android_path /data/local/tmp) + execute_process(COMMAND ${adb_executable} push ${TEST_EXECUTABLE} ${android_path} RESULT_VARIABLE CMD_RESULT) + if(CMD_RESULT) + message(FATAL_ERROR "Error running ${adb_executable} push ${TEST_EXECUTABLE} ${android_path} failed with result ${CMD_RESULT}") + endif() + + # set permissions + get_filename_component(test_executable ${TEST_EXECUTABLE} NAME) + set(test_executable_on_android /data/local/tmp/${test_executable}) + execute_process(COMMAND ${adb_executable} shell chmod 555 ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT) + if(CMD_RESULT) + message(FATAL_ERROR "Error running ${adb_executable} shell chmod 555 ${test_executable_on_android} failed with result ${CMD_RESULT}") + endif() + + # run executable + execute_process(COMMAND ${adb_executable} shell ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT) + if(CMD_RESULT) + message(FATAL_ERROR "Error running ${adb_executable} shell ${test_executable_on_android} failed with result ${CMD_RESULT}") + endif() + + # clean up binary + execute_process(COMMAND ${adb_executable} shell rm ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT) + if(CMD_RESULT) + message(FATAL_ERROR "Error running ${adb_executable} shell rm ${test_executable_on_android} failed with result ${CMD_RESULT}") + endif() + +elseif(CMAKE_SYSTEM_NAME STREQUAL "iOS") + # CTest doesn't support iOS + + message(FATAL_ERROR "Error CTest is not supported on iOS") + +else() + # for other platforms just execute test binary on host + + execute_process(COMMAND ${TEST_EXECUTABLE} RESULT_VARIABLE CMD_RESULT) + if(CMD_RESULT) + message(FATAL_ERROR "Error running ${TEST_EXECUTABLE} failed with result ${CMD_RESULT}") + endif() + +endif()
\ No newline at end of file diff --git a/cmake/cpu_info_by_asm.c b/cmake/cpu_info_by_asm.c new file mode 100644 index 00000000..1a70a815 --- /dev/null +++ b/cmake/cpu_info_by_asm.c @@ -0,0 +1,31 @@ +#include <stdio.h> +int main() { + unsigned int CPUInfo0; + unsigned int CPUInfo1; + unsigned int CPUInfo2; + unsigned int CPUInfo3; + unsigned int InfoType; +#if defined(__i386__) && defined(__PIC__) +/* %ebx is PIC register in 32-bit, so mustn't clobber it. */ + __asm__ __volatile__ ( + "xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n": + "=a" (CPUInfo0), + "=r" (CPUInfo1), + "=c" (CPUInfo2), + "=d" (CPUInfo3) : + "0" (InfoType), "2" (0) + ); +#else + __asm__ __volatile__ ( + "cpuid": + "=a" (CPUInfo0), + "=b" (CPUInfo1), + "=c" (CPUInfo2), + "=d" (CPUInfo3) : + "0" (InfoType), "2" (0) + ); +#endif + return 0; +} diff --git a/cmake/cpu_info_by_c.c b/cmake/cpu_info_by_c.c new file mode 100644 index 00000000..117084eb --- /dev/null +++ b/cmake/cpu_info_by_c.c @@ -0,0 +1,9 @@ +#include <cpuid.h> +int main() { + unsigned int CPUInfo0; + unsigned int CPUInfo1; + unsigned int CPUInfo2; + unsigned int CPUInfo3; + unsigned int InfoType; + return __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3); +} diff --git a/configure.ac b/configure.ac index f12f0aa9..1d426f27 100644 --- a/configure.ac +++ b/configure.ac @@ -195,6 +195,7 @@ AC_ARG_ENABLE([intrinsics], rtcd_support=no cpu_arm=no +cpu_x86=no AS_IF([test x"${enable_asm}" = x"yes"],[ inline_optimization="No inline ASM for your platform, please send patches" @@ -535,6 +536,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ ], [i?86|x86_64], [ + cpu_x86=yes OPUS_CHECK_INTRINSICS( [SSE], [$X86_SSE_CFLAGS], @@ -724,7 +726,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ unsigned int CPUInfo2; unsigned int CPUInfo3; unsigned int InfoType; - __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3); + __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3); ]])], [AC_MSG_RESULT([C method]) AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])], @@ -744,6 +746,7 @@ AM_CONDITIONAL([HAVE_ARM_NEON_INTR], [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"]) AM_CONDITIONAL([HAVE_ARM_NE10], [test x"$HAVE_ARM_NE10" = x"1"]) +AM_CONDITIONAL([CPU_X86], [test "$cpu_x86" = "yes"]) AM_CONDITIONAL([HAVE_SSE], [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"]) AM_CONDITIONAL([HAVE_SSE2], @@ -753,6 +756,8 @@ AM_CONDITIONAL([HAVE_SSE4_1], AM_CONDITIONAL([HAVE_AVX], [test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"]) +AM_CONDITIONAL([HAVE_RTCD], + [test x"$enable_rtcd" = x"yes" -a x"$rtcd_support" != x"no"]) AS_IF([test x"$enable_rtcd" = x"yes"],[ AS_IF([test x"$rtcd_support" != x"no"],[ AC_DEFINE([OPUS_HAVE_RTCD], [1], diff --git a/fuzzer/Android.bp b/fuzzer/Android.bp index 45ce6ab4..be47f44a 100644 --- a/fuzzer/Android.bp +++ b/fuzzer/Android.bp @@ -39,6 +39,14 @@ cc_defaults { "android-media-fuzzing-reports@google.com", ], componentid: 155276, + hotlists: [ + "4593311", + ], + description: "The fuzzer targets the APIs of libopus", + vector: "remote", + service_privilege: "constrained", + users: "multi_user", + fuzzed_code_usage: "shipped", }, } diff --git a/include/opus.h b/include/opus.h index d282f21d..0c69c627 100644 --- a/include/opus.h +++ b/include/opus.h @@ -198,7 +198,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels); * This must be one of 8000, 12000, 16000, * 24000, or 48000. * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal - * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY) + * @param [in] application <tt>int</tt>: Coding mode (one of @ref OPUS_APPLICATION_VOIP, @ref OPUS_APPLICATION_AUDIO, or @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY) * @param [out] error <tt>int*</tt>: @ref opus_errorcodes * @note Regardless of the sampling rate and number channels selected, the Opus encoder * can switch to a lower audio bandwidth or number of channels if the bitrate @@ -222,7 +222,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusEncoder *opus_encoder_create( * This must be one of 8000, 12000, 16000, * 24000, or 48000. * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal - * @param [in] application <tt>int</tt>: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY) + * @param [in] application <tt>int</tt>: Coding mode (one of OPUS_APPLICATION_VOIP, OPUS_APPLICATION_AUDIO, or OPUS_APPLICATION_RESTRICTED_LOWDELAY) * @retval #OPUS_OK Success or @ref opus_errorcodes */ OPUS_EXPORT int opus_encoder_init( diff --git a/include/opus_custom.h b/include/opus_custom.h index 2227be01..2f22d4b3 100644 --- a/include/opus_custom.h +++ b/include/opus_custom.h @@ -104,7 +104,8 @@ typedef struct OpusCustomDecoder OpusCustomDecoder; /** The mode contains all the information necessary to create an encoder. Both the encoder and decoder need to be initialized with exactly the same mode, otherwise the output will be - corrupted. + corrupted. The mode MUST NOT BE DESTROYED until the encoders and + decoders that use it are destroyed as well. @brief Mode configuration */ typedef struct OpusCustomMode OpusCustomMode; diff --git a/include/opus_defines.h b/include/opus_defines.h index ceee5b84..94b9e0d9 100644 --- a/include/opus_defines.h +++ b/include/opus_defines.h @@ -482,7 +482,8 @@ extern "C" { * @param[in] x <tt>opus_int32</tt>: Allowed values: * <dl> * <dt>0</dt><dd>Disable inband FEC (default).</dd> - * <dt>1</dt><dd>Enable inband FEC.</dd> + * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd> + * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd> * </dl> * @hideinitializer */ #define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x) @@ -491,7 +492,8 @@ extern "C" { * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values: * <dl> * <dt>0</dt><dd>Inband FEC disabled (default).</dd> - * <dt>1</dt><dd>Inband FEC enabled.</dd> + * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd> + * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd> * </dl> * @hideinitializer */ #define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x) diff --git a/libopus_blocklist.txt b/libopus_blocklist.txt index 51db6112..84c19ee4 100644 --- a/libopus_blocklist.txt +++ b/libopus_blocklist.txt @@ -24,7 +24,13 @@ fun:ec_decode_bin fun:silk_noise_shape_quantizer_del_dec # silk/NSQ.c:265:25: 1318152552 + 1068143768 cannot be represented in type 'int' fun:silk_noise_shape_quantizer - +# silk/x86/NSQ_del_dec_sse4_1.c:571:28: 1162446838 - -1165932966 cannot be represented in type 'int' +fun:silk_noise_shape_quantizer_del_dec_sse4_1 +# silk/fixed/x86/burg_modified_FIX_sse4_1.c:277: 1940085720 + 252655088 cannot be represented +# in type 'int' +fun:silk_burg_modified_sse4_1 +# silk/fixed/burg_modified_FIX.c:181 1940085720 + 252655088 cannot be represented in type 'int' +fun:silk_burg_modified_c src:*/celt/kiss_fft.c # assembly optimizations that know what they are doing diff --git a/meson.build b/meson.build index 41f69353..ed66d380 100644 --- a/meson.build +++ b/meson.build @@ -532,9 +532,9 @@ if not opt_intrinsics.disabled() endif # opt_rtcd else if opt_intrinsics.enabled() - error('intrinsics option enabled, but no intrinsics support for ' + host_machine.get_cpu()) + error('intrinsics option enabled, but no intrinsics support for ' + host_cpu_family) endif - warning('No intrinsics support for ' + host_machine.get_cpu()) + warning('No intrinsics support for ' + host_cpu_family) endif endif diff --git a/meson/get-version.py b/meson/get-version.py index 0e8b8623..d3835f13 100755 --- a/meson/get-version.py +++ b/meson/get-version.py @@ -31,7 +31,7 @@ if __name__ == '__main__': # check if git checkout git_dir = os.path.join(srcroot, '.git') - is_git = os.path.isdir(git_dir) + is_git = os.path.isdir(git_dir) or os.path.isfile(git_dir) have_git = shutil.which('git') is not None if is_git and have_git: @@ -63,7 +63,7 @@ dnl #include <string.h> #include <opus.h> -int main () +int main (void) { system("touch conf.opustest"); return 0; diff --git a/silk/LPC_fit.c b/silk/LPC_fit.c index cdea4f3a..c0690a1f 100644 --- a/silk/LPC_fit.c +++ b/silk/LPC_fit.c @@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" -/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */ +/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around. + This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */ void silk_LPC_fit( opus_int16 *a_QOUT, /* O Output signal */ opus_int32 *a_QIN, /* I/O Input signal */ diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h index 8dd4ce2e..3110da9a 100644 --- a/silk/MacroDebug.h +++ b/silk/MacroDebug.h @@ -55,7 +55,7 @@ static OPUS_INLINE opus_int16 silk_ADD16_(opus_int16 a, opus_int16 b, char *file static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){ opus_int32 ret; - ret = a + b; + ret = (opus_int32)((opus_uint32)a + (opus_uint32)b); if ( ret != silk_ADD_SAT32( a, b ) ) { fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line); @@ -101,9 +101,9 @@ static OPUS_INLINE opus_int16 silk_SUB16_(opus_int16 a, opus_int16 b, char *file #undef silk_SUB32 #define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){ - opus_int32 ret; + opus_int64 ret; - ret = a - b; + ret = a - (opus_int64)b; if ( ret != silk_SUB_SAT32( a, b ) ) { fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line); @@ -257,7 +257,7 @@ static OPUS_INLINE opus_int64 silk_SUB_SAT64_( opus_int64 a64, opus_int64 b64, c static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){ opus_int32 ret; opus_int64 ret64; - ret = a32 * b32; + ret = (opus_int32)((opus_uint32)a32 * (opus_uint32)b32); ret64 = (opus_int64)a32 * (opus_int64)b32; if ( (opus_int64)ret != ret64 ) { @@ -333,8 +333,8 @@ static OPUS_INLINE opus_int32 silk_SMULWB_(opus_int32 a32, opus_int32 b32, char #define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ opus_int32 ret; - ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) ); - if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) ) + ret = silk_ADD32_ovflw( a32, silk_SMULWB( b32, c32 ) ); + if ( ret != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) ) { fprintf (stderr, "silk_SMLAWB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -465,7 +465,7 @@ static OPUS_INLINE opus_int32 silk_SMULWW_(opus_int32 a32, opus_int32 b32, char if ( fail ) { - fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line); + fprintf (stderr, "silk_SMULWW(%d, %d) in %s: line %d\n", a32, b32, file, line); #ifdef FIXED_DEBUG_ASSERT silk_assert( 0 ); #endif @@ -491,12 +491,6 @@ static OPUS_INLINE opus_int32 silk_SMLAWW_(opus_int32 a32, opus_int32 b32, opus_ return ret; } -/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */ -#undef silk_MLA_ovflw -#define silk_MLA_ovflw(a32, b32, c32) ((a32) + ((b32) * (c32))) -#undef silk_SMLABB_ovflw -#define silk_SMLABB_ovflw(a32, b32, c32) ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32))) - /* no checking needed for silk_SMULL no checking needed for silk_SMLAL no checking needed for silk_SMLALBB @@ -546,10 +540,10 @@ static OPUS_INLINE opus_int32 silk_DIV32_16_(opus_int32 a32, opus_int32 b32, cha static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){ opus_int8 ret; int fail = 0; - ret = a << shift; + ret = (opus_int8)((opus_uint8)a << shift); fail |= shift < 0; fail |= shift >= 8; - fail |= (opus_int64)ret != ((opus_int64)a) << shift; + fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift); if ( fail ) { fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -565,10 +559,10 @@ static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char * static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){ opus_int16 ret; int fail = 0; - ret = a << shift; + ret = (opus_int16)((opus_uint16)a << shift); fail |= shift < 0; fail |= shift >= 16; - fail |= (opus_int64)ret != ((opus_int64)a) << shift; + fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift); if ( fail ) { fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -584,10 +578,10 @@ static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, cha static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){ opus_int32 ret; int fail = 0; - ret = a << shift; + ret = (opus_int32)((opus_uint32)a << shift); fail |= shift < 0; fail |= shift >= 32; - fail |= (opus_int64)ret != ((opus_int64)a) << shift; + fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift); if ( fail ) { fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -603,7 +597,7 @@ static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, cha static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){ opus_int64 ret; int fail = 0; - ret = a << shift; + ret = (opus_int64)((opus_uint64)a << shift); fail |= shift < 0; fail |= shift >= 64; fail |= (ret>>shift) != ((opus_int64)a); @@ -714,8 +708,8 @@ static OPUS_INLINE opus_uint32 silk_RSHIFT_uint_(opus_uint32 a, opus_int32 shift #define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){ opus_int16 ret; - ret = a + (b << shift); - if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + ret = a + (opus_int16)((opus_uint16)b << shift); + if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) ) { fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -729,8 +723,8 @@ static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int #define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a + (b << shift); - if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + ret = silk_ADD32_ovflw(a, (opus_int32)((opus_uint32)b << shift)); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) ) { fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -774,7 +768,7 @@ static OPUS_INLINE int silk_ADD_RSHIFT_(int a, int b, int shift, char *file, int #define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a + (b >> shift); + ret = silk_ADD32_ovflw(a, (b >> shift)); if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) ) { fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); @@ -804,8 +798,8 @@ static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32 #define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a - (b << shift); - if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) ) + ret = silk_SUB32_ovflw(a, (opus_int32)((opus_uint32)b << shift)); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) ) { fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -819,7 +813,7 @@ static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opu #define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a - (b >> shift); + ret = silk_SUB32_ovflw(a, (b >> shift)); if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) ) { fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); @@ -835,7 +829,7 @@ static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opu static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){ opus_int32 ret; ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1; - /* the marco definition can't handle a shift of zero */ + /* the macro definition can't handle a shift of zero */ if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) ) { fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -850,7 +844,7 @@ static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, #define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__) static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){ opus_int64 ret; - /* the marco definition can't handle a shift of zero */ + /* the macro definition can't handle a shift of zero */ if ( (shift <= 0) || (shift>=64) ) { fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line); @@ -75,21 +75,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer( void silk_NSQ_c ( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int k, lag, start_idx, LSF_interpolation_flag; @@ -173,9 +173,9 @@ void silk_NSQ_c RESTORE_STACK; } -/***********************************/ -/* silk_noise_shape_quantizer */ -/***********************************/ +/******************************/ +/* silk_noise_shape_quantizer */ +/******************************/ #if !defined(OPUS_X86_MAY_HAVE_SSE4_1) static OPUS_INLINE @@ -262,7 +262,7 @@ void silk_noise_shape_quantizer( tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); shp_lag_ptr++; diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c index 00e749c3..41f3fc93 100644 --- a/silk/NSQ_del_dec.c +++ b/silk/NSQ_del_dec.c @@ -115,21 +115,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( ); void silk_NSQ_del_dec_c( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index a9068908..65fe6a07 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale( const opus_int len /* I vector lengths */ ); -opus_int64 silk_inner_prod16_aligned_64_c( +opus_int64 silk_inner_prod16_c( const opus_int16 *inVec1, /* I input vector 1 */ const opus_int16 *inVec2, /* I input vector 2 */ const opus_int len /* I vector lengths */ @@ -630,12 +630,14 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) /* the following seems faster on x86 */ #define silk_SMMUL(a32, b32) (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32) -#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if !defined(OVERRIDE_silk_burg_modified) #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +#endif -#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ - ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len)) +#if !defined(OVERRIDE_silk_inner_prod16) +#define silk_inner_prod16(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len)) #endif #include "Inlines.h" diff --git a/silk/VQ_WMat_EC.c b/silk/VQ_WMat_EC.c index 0f3d545c..245a7e4b 100644 --- a/silk/VQ_WMat_EC.c +++ b/silk/VQ_WMat_EC.c @@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c( *rate_dist_Q8 = silk_int32_MAX; *res_nrg_Q15 = silk_int32_MAX; cb_row_Q7 = cb_Q7; - /* In things go really bad, at least *ind is set to something safe. */ + /* If things go really bad, at least *ind is set to something safe. */ *ind = 0; for( k = 0; k < L; k++ ) { opus_int32 penalty; @@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c( if( sum1_Q15 >= 0 ) { /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */ bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) ); - /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */ + /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */ bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 ); if( bits_tot_Q8 <= *rate_dist_Q8 ) { *rate_dist_Q8 = bits_tot_Q8; diff --git a/silk/bwexpander_32.c b/silk/bwexpander_32.c index d0010f73..0f32b9df 100644 --- a/silk/bwexpander_32.c +++ b/silk/bwexpander_32.c @@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" -/* Chirp (bandwidth expand) LP AR filter */ +/* Chirp (bandwidth expand) LP AR filter. + This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */ void silk_bwexpander_32( opus_int32 *ar, /* I/O AR filter to be expanded (without leading 1) */ const opus_int d, /* I Length of ar */ diff --git a/silk/control_codec.c b/silk/control_codec.c index 52aa8fde..784ffe66 100644 --- a/silk/control_codec.c +++ b/silk/control_codec.c @@ -415,7 +415,7 @@ static OPUS_INLINE opus_int silk_setup_LBRR( /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */ psEncC->LBRR_GainIncreases = 7; } else { - psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 ); + psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.2, 16 ) ), 3 ); } } diff --git a/silk/enc_API.c b/silk/enc_API.c index 55a33f37..548e0736 100644 --- a/silk/enc_API.c +++ b/silk/enc_API.c @@ -270,6 +270,7 @@ opus_int silk_Encode( /* O Returns error co psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 ); ALLOC( buf, nSamplesFromInputMax, opus_int16 ); while( 1 ) { + int curr_nBitsUsedLBRR = 0; nSamplesToBuffer = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx; nSamplesToBuffer = silk_min( nSamplesToBuffer, nSamplesToBufferMax ); nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 ); @@ -342,6 +343,7 @@ opus_int silk_Encode( /* O Returns error co opus_uint8 iCDF[ 2 ] = { 0, 0 }; iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal ); ec_enc_icdf( psRangeEnc, 0, iCDF, 8 ); + curr_nBitsUsedLBRR = ec_tell( psRangeEnc ); /* Encode any LBRR data from previous packet */ /* Encode LBRR flags */ @@ -386,8 +388,7 @@ opus_int silk_Encode( /* O Returns error co for( n = 0; n < encControl->nChannelsInternal; n++ ) { silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) ); } - - psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc ); + curr_nBitsUsedLBRR = ec_tell( psRangeEnc ) - curr_nBitsUsedLBRR; } silk_HP_variable_cutoff( psEnc->state_Fxx ); @@ -396,6 +397,16 @@ opus_int silk_Encode( /* O Returns error co nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 ); /* Subtract bits used for LBRR */ if( !prefillFlag ) { + /* psEnc->nBitsUsedLBRR is an exponential moving average of the LBRR usage, + except that for the first LBRR frame it does no averaging and for the first + frame after after LBRR, it goes back to zero immediately. */ + if ( curr_nBitsUsedLBRR < 10 ) { + psEnc->nBitsUsedLBRR = 0; + } else if ( psEnc->nBitsUsedLBRR < 10) { + psEnc->nBitsUsedLBRR = curr_nBitsUsedLBRR; + } else { + psEnc->nBitsUsedLBRR = ( psEnc->nBitsUsedLBRR + curr_nBitsUsedLBRR ) / 2; + } nBits -= psEnc->nBitsUsedLBRR; } /* Divide by number of uncoded frames left in packet */ diff --git a/silk/fixed/LTP_scale_ctrl_FIX.c b/silk/fixed/LTP_scale_ctrl_FIX.c index 3dcedef8..db1016e0 100644 --- a/silk/fixed/LTP_scale_ctrl_FIX.c +++ b/silk/fixed/LTP_scale_ctrl_FIX.c @@ -42,9 +42,14 @@ void silk_LTP_scale_ctrl_FIX( if( condCoding == CODE_INDEPENDENTLY ) { /* Only scale if first frame in packet */ - round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket; - psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( - silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 ); + round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket; + if ( psEnc->sCmn.LBRR_flag ) { + /* LBRR reduces the effective loss. In practice, it does not square the loss because + losses aren't independent, but that still seems to work best. We also never go below 2%. */ + round_loss = 2 + silk_SMULBB( round_loss, round_loss ) / 100; + } + psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 2900-psEnc->sCmn.SNR_dB_Q7 ); + psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 3900-psEnc->sCmn.SNR_dB_Q7 ); } else { /* Default is minimum scaling */ psEnc->sCmn.indices.LTP_scaleIndex = 0; diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c index 274d4b28..185a12b1 100644 --- a/silk/fixed/burg_modified_FIX.c +++ b/silk/fixed/burg_modified_FIX.c @@ -68,7 +68,7 @@ void silk_burg_modified_c( celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); /* Compute autocorrelations, added over subframes */ - C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch ); + C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch ); lz = silk_CLZ64(C0_64); rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz; if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS; @@ -87,7 +87,7 @@ void silk_burg_modified_c( x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( - silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); + silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); } } } else { @@ -150,7 +150,7 @@ void silk_burg_modified_c( C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ - /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32), + /* We sometimes get overflows in the multiplications (even beyond +/- 2^32), but they cancel each other and the real result seems to always fit in a 32-bit signed integer. This was determined experimentally, not theoretically (unfortunately). */ tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ @@ -253,7 +253,7 @@ void silk_burg_modified_c( if( rshifts > 0 ) { for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; - C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts ); + C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts ); } } else { for( s = 0; s < nb_subfr; s++ ) { diff --git a/silk/fixed/find_pred_coefs_FIX.c b/silk/fixed/find_pred_coefs_FIX.c index 606d8633..ad363fb7 100644 --- a/silk/fixed/find_pred_coefs_FIX.c +++ b/silk/fixed/find_pred_coefs_FIX.c @@ -42,7 +42,8 @@ void silk_find_pred_coefs_FIX( { opus_int i; opus_int32 invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ]; - opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]; + /* Set to NLSF_Q15 to zero so we don't copy junk to the state. */ + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]={0}; const opus_int16 *x_ptr; opus_int16 *x_pre_ptr; VARDECL( opus_int16, LPC_in_pre ); diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c index d9498001..dcf84070 100644 --- a/silk/fixed/vector_ops_FIX.c +++ b/silk/fixed/vector_ops_FIX.c @@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned( #endif } -opus_int64 silk_inner_prod16_aligned_64_c( +opus_int64 silk_inner_prod16_c( const opus_int16 *inVec1, /* I input vector 1 */ const opus_int16 *inVec2, /* I input vector 2 */ const opus_int len /* I vector lengths */ diff --git a/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/silk/fixed/x86/burg_modified_FIX_sse4_1.c index bbb1ce0f..e58bf079 100644 --- a/silk/fixed/x86/burg_modified_FIX_sse4_1.c +++ b/silk/fixed/x86/burg_modified_FIX_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -42,7 +42,7 @@ #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ #define QA 25 -#define N_BITS_HEAD_ROOM 2 +#define N_BITS_HEAD_ROOM 3 #define MIN_RSHIFTS -16 #define MAX_RSHIFTS (32 - QA) @@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ) { - opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; + opus_int k, n, s, lz, rshifts, reached_max_gain; opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; const opus_int16 *x_ptr; opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; @@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1( opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; + opus_int64 C0_64; __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; __m128i CONST1 = _mm_set1_epi32(1); @@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1( celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); /* Compute autocorrelations, added over subframes */ - silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length ); - if( rshifts > MAX_RSHIFTS ) { - C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS ); - silk_assert( C0 > 0 ); - rshifts = MAX_RSHIFTS; + C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch ); + lz = silk_CLZ64(C0_64); + rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz; + if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS; + if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS; + + if (rshifts > 0) { + C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts ); } else { - lz = silk_CLZ32( C0 ) - 1; - rshifts_extra = N_BITS_HEAD_ROOM - lz; - if( rshifts_extra > 0 ) { - rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts ); - C0 = silk_RSHIFT32( C0, rshifts_extra ); - } else { - rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts ); - C0 = silk_LSHIFT32( C0, -rshifts_extra ); - } - rshifts += rshifts_extra; + C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts ); } + CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); if( rshifts > 0 ) { @@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1( x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( - silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); + silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); } } } else { @@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1( C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ - tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ - tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ + /* We sometimes get overflows in the multiplications (even beyond +/- 2^32), + but they cancel each other and the real result seems to always fit in a 32-bit + signed integer. This was determined experimentally, not theoretically (unfortunately). */ + tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ + tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ } tmp1 = -tmp1; /* Q17 */ @@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1( if( rshifts > 0 ) { for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; - C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts ); + C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts ); } } else { for( s = 0; s < nb_subfr; s++ ) { @@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1( *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */ *res_nrg_Q = -rshifts; } + +#ifdef OPUS_CHECK_ASM + { + opus_int32 res_nrg_c = 0; + opus_int res_nrg_Q_c = 0; + opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0}; + + silk_burg_modified_c( + &res_nrg_c, + &res_nrg_Q_c, + A_Q16_c, + x, + minInvGain_Q30, + subfr_length, + nb_subfr, + D, + 0 + ); + + silk_assert( *res_nrg == res_nrg_c ); + silk_assert( *res_nrg_Q == res_nrg_Q_c ); + silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) ); + } +#endif } diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c deleted file mode 100644 index 555432cd..00000000 --- a/silk/fixed/x86/prefilter_FIX_sse.c +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include <xmmintrin.h> -#include <emmintrin.h> -#include <smmintrin.h> -#include "main.h" -#include "celt/x86/x86cpu.h" - -void silk_warped_LPC_analysis_filter_FIX_sse4_1( - opus_int32 state[], /* I/O State [order + 1] */ - opus_int32 res_Q2[], /* O Residual signal [length] */ - const opus_int16 coef_Q13[], /* I Coefficients [order] */ - const opus_int16 input[], /* I Input signal [length] */ - const opus_int16 lambda_Q16, /* I Warping factor */ - const opus_int length, /* I Length of input signal */ - const opus_int order /* I Filter order (even) */ -) -{ - opus_int n, i; - opus_int32 acc_Q11, tmp1, tmp2; - - /* Order must be even */ - celt_assert( ( order & 1 ) == 0 ); - - if (order == 10) - { - if (0 == lambda_Q16) - { - __m128i coef_Q13_3210, coef_Q13_7654; - __m128i coef_Q13_0123, coef_Q13_4567; - __m128i state_0123, state_4567; - __m128i xmm_product1, xmm_product2; - __m128i xmm_tempa, xmm_tempb; - - register opus_int32 sum; - register opus_int32 state_8, state_9, state_a; - register opus_int64 coef_Q13_8, coef_Q13_9; - - celt_assert( length > 0 ); - - coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] ); - coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] ); - - coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) ); - coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) ); - - coef_Q13_8 = (opus_int64) coef_Q13[ 8 ]; - coef_Q13_9 = (opus_int64) coef_Q13[ 9 ]; - - state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) ); - state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) ); - - state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ); - state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ); - - state_8 = state[ 8 ]; - state_9 = state[ 9 ]; - state_a = 0; - - for( n = 0; n < length; n++ ) - { - xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */ - xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 ); - - xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ); - xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ); - - xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */ - xmm_product2 = _mm_srli_epi64( xmm_product2, 16 ); - - xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa ); - xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb ); - - xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 ); - xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 ); - - xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 ); - xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 ); - xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb ); - - sum = (opus_int32)((coef_Q13_8 * state_8) >> 16); - sum += (opus_int32)((coef_Q13_9 * state_9) >> 16); - - xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) ); - sum += _mm_cvtsi128_si32( xmm_tempa); - res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9); - - /* move right */ - state_a = state_9; - state_9 = state_8; - state_8 = _mm_cvtsi128_si32( state_4567 ); - state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 ); - - state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 ); - } - - _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) ); - _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) ); - state[ 8 ] = state_8; - state[ 9 ] = state_9; - state[ 10 ] = state_a; - - return; - } - } - - for( n = 0; n < length; n++ ) { - /* Output of lowpass section */ - tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 ); - state[ 0 ] = silk_LSHIFT( input[ n ], 14 ); - /* Output of allpass section */ - tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 ); - state[ 1 ] = tmp2; - acc_Q11 = silk_RSHIFT( order, 1 ); - acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] ); - /* Loop over allpass sections */ - for( i = 2; i < order; i += 2 ) { - /* Output of allpass section */ - tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 ); - state[ i ] = tmp1; - acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] ); - /* Output of allpass section */ - tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 ); - state[ i + 1 ] = tmp2; - acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] ); - } - state[ order ] = tmp1; - acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] ); - res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 ); - } -} diff --git a/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/silk/fixed/x86/vector_ops_FIX_sse4_1.c index c1e90564..a46289bb 100644 --- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c +++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c @@ -36,40 +36,38 @@ #include "SigProc_FIX.h" #include "pitch.h" +#include "celt/x86/x86cpu.h" -opus_int64 silk_inner_prod16_aligned_64_sse4_1( +opus_int64 silk_inner_prod16_sse4_1( const opus_int16 *inVec1, /* I input vector 1 */ const opus_int16 *inVec2, /* I input vector 2 */ const opus_int len /* I vector lengths */ ) { - opus_int i, dataSize8; + opus_int i, dataSize4; opus_int64 sum; - __m128i xmm_tempa; - __m128i inVec1_76543210, acc1; - __m128i inVec2_76543210, acc2; + __m128i xmm_prod_20, xmm_prod_31; + __m128i inVec1_3210, acc1; + __m128i inVec2_3210, acc2; sum = 0; - dataSize8 = len & ~7; + dataSize4 = len & ~3; acc1 = _mm_setzero_si128(); acc2 = _mm_setzero_si128(); - for( i = 0; i < dataSize8; i += 8 ) { - inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) ); - inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) ); + for( i = 0; i < dataSize4; i += 4 ) { + inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] ); + inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] ); + xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 ); - /* only when all 4 operands are -32768 (0x8000), this results in wrap around */ - inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 ); + inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 ); - xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 ); - /* equal shift right 8 bytes */ - inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) ); - inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 ); - - acc1 = _mm_add_epi64( acc1, xmm_tempa ); - acc2 = _mm_add_epi64( acc2, inVec1_76543210 ); + acc1 = _mm_add_epi64( acc1, xmm_prod_20 ); + acc2 = _mm_add_epi64( acc2, xmm_prod_31 ); } acc1 = _mm_add_epi64( acc1, acc2 ); @@ -81,8 +79,15 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1( _mm_storel_epi64( (__m128i *)&sum, acc1 ); for( ; i < len; i++ ) { - sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] ); + sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] ); + } + +#ifdef OPUS_CHECK_ASM + { + opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len ); + silk_assert( sum == sum_c ); } +#endif return sum; } diff --git a/silk/float/LTP_scale_ctrl_FLP.c b/silk/float/LTP_scale_ctrl_FLP.c index 8dbe29d0..6f30ff09 100644 --- a/silk/float/LTP_scale_ctrl_FLP.c +++ b/silk/float/LTP_scale_ctrl_FLP.c @@ -41,8 +41,14 @@ void silk_LTP_scale_ctrl_FLP( if( condCoding == CODE_INDEPENDENTLY ) { /* Only scale if first frame in packet */ - round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket; - psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f ); + round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket; + if ( psEnc->sCmn.LBRR_flag ) { + /* LBRR reduces the effective loss. In practice, it does not square the loss because + losses aren't independent, but that still seems to work best. We also never go below 2%. */ + round_loss = 2 + silk_SMULBB( round_loss, round_loss) / 100; + } + psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 2900 - psEnc->sCmn.SNR_dB_Q7 ); + psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 3900 - psEnc->sCmn.SNR_dB_Q7 ); } else { /* Default is minimum scaling */ psEnc->sCmn.indices.LTP_scaleIndex = 0; diff --git a/silk/float/find_pred_coefs_FLP.c b/silk/float/find_pred_coefs_FLP.c index dcf7c520..6f790788 100644 --- a/silk/float/find_pred_coefs_FLP.c +++ b/silk/float/find_pred_coefs_FLP.c @@ -44,7 +44,8 @@ void silk_find_pred_coefs_FLP( silk_float XXLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ]; silk_float xXLTP[ MAX_NB_SUBFR * LTP_ORDER ]; silk_float invGains[ MAX_NB_SUBFR ]; - opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]; + /* Set to NLSF_Q15 to zero so we don't copy junk to the state. */ + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]={0}; const silk_float *x_ptr; silk_float *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ]; silk_float minInvGain; diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c index ad90b874..c0c183e3 100644 --- a/silk/float/wrappers_FLP.c +++ b/silk/float/wrappers_FLP.c @@ -190,12 +190,14 @@ void silk_quant_LTP_gains_FLP( opus_int32 XX_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ]; opus_int32 xX_Q17[ MAX_NB_SUBFR * LTP_ORDER ]; - for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) { + i = 0; + do { XX_Q17[ i ] = (opus_int32)silk_float2int( XX[ i ] * 131072.0f ); - } - for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) { + } while ( ++i < nb_subfr * LTP_ORDER * LTP_ORDER ); + i = 0; + do { xX_Q17[ i ] = (opus_int32)silk_float2int( xX[ i ] * 131072.0f ); - } + } while ( ++i < nb_subfr * LTP_ORDER ); silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, &pred_gain_dB_Q7, XX_Q17, xX_Q17, subfr_len, nb_subfr, arch ); diff --git a/silk/main.h b/silk/main.h index 1a33eed5..a5f56875 100644 --- a/silk/main.h +++ b/silk/main.h @@ -247,21 +247,21 @@ void silk_VQ_WMat_EC_c( /************************************/ void silk_NSQ_c( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); #if !defined(OVERRIDE_silk_NSQ) @@ -273,21 +273,21 @@ void silk_NSQ_c( /* Noise shaping using delayed decision */ void silk_NSQ_del_dec_c( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); #if !defined(OVERRIDE_silk_NSQ_del_dec) diff --git a/silk/meson.build b/silk/meson.build index 70692372..917048b2 100644 --- a/silk/meson.build +++ b/silk/meson.build @@ -21,6 +21,16 @@ endif silk_includes = [opus_includes, include_directories('float', 'fixed')] silk_static_libs = [] +if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD') + silk_sources += sources['SILK_SOURCES_X86_RTCD'] +endif + +if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm + if opus_conf.has('OPUS_HAVE_RTCD') + silk_sources += sources['SILK_SOURCES_ARM_RTCD'] + endif +endif + foreach intr_name : ['sse4_1', 'neon_intr'] have_intr = get_variable('have_' + intr_name) if not have_intr diff --git a/silk/stereo_LR_to_MS.c b/silk/stereo_LR_to_MS.c index c8226663..751452cb 100644 --- a/silk/stereo_LR_to_MS.c +++ b/silk/stereo_LR_to_MS.c @@ -77,7 +77,7 @@ void silk_stereo_LR_to_MS( ALLOC( LP_mid, frame_length, opus_int16 ); ALLOC( HP_mid, frame_length, opus_int16 ); for( n = 0; n < frame_length; n++ ) { - sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 ); + sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 ); LP_mid[ n ] = sum; HP_mid[ n ] = mid[ n + 1 ] - sum; } @@ -86,7 +86,7 @@ void silk_stereo_LR_to_MS( ALLOC( LP_side, frame_length, opus_int16 ); ALLOC( HP_side, frame_length, opus_int16 ); for( n = 0; n < frame_length; n++ ) { - sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 ); + sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 ); LP_side[ n ] = sum; HP_side[ n ] = side[ n + 1 ] - sum; } @@ -207,7 +207,7 @@ void silk_stereo_LR_to_MS( pred0_Q13 += delta0_Q13; pred1_Q13 += delta1_Q13; w_Q24 += deltaw_Q24; - sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); @@ -217,7 +217,7 @@ void silk_stereo_LR_to_MS( pred1_Q13 = -pred_Q13[ 1 ]; w_Q24 = silk_LSHIFT( width_Q14, 10 ); for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) { - sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); diff --git a/silk/stereo_MS_to_LR.c b/silk/stereo_MS_to_LR.c index 62521a4f..1e01bb6e 100644 --- a/silk/stereo_MS_to_LR.c +++ b/silk/stereo_MS_to_LR.c @@ -59,7 +59,7 @@ void silk_stereo_MS_to_LR( for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) { pred0_Q13 += delta0_Q13; pred1_Q13 += delta1_Q13; - sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); @@ -67,7 +67,7 @@ void silk_stereo_MS_to_LR( pred0_Q13 = pred_Q13[ 0 ]; pred1_Q13 = pred_Q13[ 1 ]; for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) { - sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); diff --git a/silk/tests/test_unit_LPC_inv_pred_gain.c b/silk/tests/test_unit_LPC_inv_pred_gain.c index 67067cea..7ca902ad 100644 --- a/silk/tests/test_unit_LPC_inv_pred_gain.c +++ b/silk/tests/test_unit_LPC_inv_pred_gain.c @@ -43,6 +43,7 @@ int check_stability(opus_int16 *A_Q12, int order) { int i; int j; int sum_a, sum_abs_a; + double y[SILK_MAX_ORDER_LPC] = {0}; sum_a = sum_abs_a = 0; for( j = 0; j < order; j++ ) { sum_a += A_Q12[ j ]; @@ -57,7 +58,6 @@ int check_stability(opus_int16 *A_Q12, int order) { if( sum_abs_a < 4096 ) { return 1; } - double y[SILK_MAX_ORDER_LPC] = {0}; y[0] = 1; for( i = 0; i < 10000; i++ ) { double sum = 0; diff --git a/silk/x86/NSQ_del_dec_sse4_1.c b/silk/x86/NSQ_del_dec_sse4_1.c index 2c75ede2..a58a76cd 100644 --- a/silk/x86/NSQ_del_dec_sse4_1.c +++ b/silk/x86/NSQ_del_dec_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -46,6 +46,7 @@ typedef struct { opus_int32 Shape_Q14[ DECISION_DELAY ]; opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; opus_int32 LF_AR_Q14; + opus_int32 Diff_Q14; opus_int32 Seed; opus_int32 SeedInit; opus_int32 RD_Q10; @@ -56,6 +57,7 @@ typedef struct { opus_int32 RD_Q10; opus_int32 xq_Q14; opus_int32 LF_AR_Q14; + opus_int32 Diff_Q14; opus_int32 sLTP_shp_Q14; opus_int32 LPC_exc_Q14; } NSQ_sample_struct; @@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ - const opus_int32 x_Q3[], /* I Input in Q3 */ + const opus_int16 x16[], /* I Input */ opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ @@ -112,21 +114,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( ); void silk_NSQ_del_dec_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; @@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1( VARDECL( opus_int32, delayedGain_Q10 ); VARDECL( NSQ_del_dec_struct, psDelDec ); NSQ_del_dec_struct *psDD; +#ifdef OPUS_CHECK_ASM + silk_nsq_state NSQ_c; + SideInfoIndices psIndices_c; + opus_int8 pulses_c[ MAX_FRAME_LENGTH ]; + const opus_int8 *const pulses_a = pulses; +#endif SAVE_STACK; +#ifdef OPUS_CHECK_ASM + ( void )pulses_a; + silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) ); + silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) ); + silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH ); + silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ); + silk_NSQ_del_dec_c( + psEncC, + &NSQ_c, + &psIndices_c, + x16, + pulses_c, + PredCoef_Q12, + LTPCoef_Q14, + AR_Q13, + HarmShapeGain_Q14, + Tilt_Q14, + LF_shp_Q14, + Gains_Q16, + pitchL, + Lambda_Q10, + LTP_scale_Q14 + ); +#endif + /* Set unvoiced lag to the previous one, overwrite later for voiced */ lag = NSQ->lagPrev; @@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1( psDD->SeedInit = psDD->Seed; psDD->RD_Q10 = 0; psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14; + psDD->Diff_Q14 = NSQ->sDiff_shp_Q14; psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ]; silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) ); @@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1( LSF_interpolation_flag = 1; } - ALLOC( sLTP_Q15, - psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); @@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1( for( k = 0; k < psEncC->nb_subfr; k++ ) { A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ]; B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; - AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; /* Noise shape parameters */ silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); @@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1( } } - silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, + silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, @@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1( Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay ); - x_Q3 += psEncC->subfr_length; + x16 += psEncC->subfr_length; pulses += psEncC->subfr_length; pxq += psEncC->subfr_length; } @@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1( for( i = 0; i < decisionDelay; i++ ) { last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY; if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY; + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) ); @@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1( /* Update states */ NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14; + NSQ->sDiff_shp_Q14 = psDD->Diff_Q14; NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; /* Save quantized speech signal */ silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); + +#ifdef OPUS_CHECK_ASM + silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); + silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); + silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) ); +#endif + RESTORE_STACK; } @@ -345,6 +387,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14; + int rdo_offset; + VARDECL( NSQ_sample_pair, psSampleState ); NSQ_del_dec_struct *psDD; NSQ_sample_struct *psSS; @@ -356,6 +400,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( celt_assert( nStatesDelayedDecision > 0 ); ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair ); + rdo_offset = (Lambda_Q10 >> 1) - 512; + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); @@ -407,8 +453,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Long-term shaping */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); - n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ shp_lag_ptr++; } else { @@ -478,7 +524,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); - /* setp 4 */ + /* step 4 */ psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF ); @@ -511,9 +557,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ /* Noise shape feedback */ - silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ + celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ /* Output of lowpass section */ - tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 ); + tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 ); /* Output of allpass section */ tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 ); psDD->sAR2_Q14[ 0 ] = tmp2; @@ -543,9 +589,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Input minus prediction plus noise feedback */ /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ - tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ + tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ - tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */ + tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */ tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ @@ -559,6 +605,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Find two quantization level candidates and measure their rate-distortion */ q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if (Lambda_Q10 > 2048) { + /* For aggressive RDO, the bias becomes more than one pulse. */ + if (q1_Q10 > rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 ); + } else if (q1_Q10 < -rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 ); + } else if (q1_Q10 < 0) { + q1_Q0 = -1; + } else { + q1_Q0 = 0; + } + } if( q1_Q0 > 0 ) { q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); @@ -612,8 +670,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); /* Update states */ - sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); - psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 ); + sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 ); + psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; psSS[ 0 ].xq_Q14 = xq_Q14; @@ -626,14 +685,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( exc_Q14 = -exc_Q14; } - /* Add predictions */ LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); /* Update states */ - sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); - psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 ); + sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 ); + psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; psSS[ 1 ].xq_Q14 = xq_Q14; @@ -705,6 +764,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( psDD = &psDelDec[ k ]; psSS = &psSampleState[ k ][ 0 ]; psDD->LF_AR_Q14 = psSS->LF_AR_Q14; + psDD->Diff_Q14 = psSS->Diff_Q14; psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; @@ -728,7 +788,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ - const opus_int32 x_Q3[], /* I Input in Q3 */ + const opus_int16 x16[], /* I Input */ opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ @@ -742,51 +802,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( ) { opus_int i, k, lag; - opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; NSQ_del_dec_struct *psDD; - __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; + __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1; lag = pitchL[ subfr ]; inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); - silk_assert( inv_gain_Q31 != 0 ); - /* Calculate gain adjustment factor */ - if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { - gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); - } else { - gain_adj_Q16 = (opus_int32)1 << 16; - } - /* Scale input */ - inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); + inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); - /* prepare inv_gain_Q23 in packed 4 32-bits */ - xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); + /* prepare inv_gain_Q26 in packed 4 32-bits */ + xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26); for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { - xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); + xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) ); + /* equal shift right 4 bytes*/ - xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); - xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); - xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); + xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 ); + xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 ); - xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); - xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); + xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 ); + xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 ); - xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); + xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 ); + _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); } for( ; i < psEncC->subfr_length; i++ ) { - x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); + x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 ); } - /* Save inverse gain */ - NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; - /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ if( NSQ->rewhite_flag ) { if( subfr == 0 ) { @@ -800,7 +850,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( } /* Adjust for changing gain */ - if( gain_adj_Q16 != (opus_int32)1 << 16 ) { + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + /* Scale long-term shaping state */ { __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; @@ -841,6 +893,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( /* Scale scalar states */ psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 ); + psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 ); /* Scale short-term prediction and shaping states */ for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { @@ -855,5 +908,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( } } } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; } } diff --git a/silk/x86/NSQ_sse4_1.c b/silk/x86/NSQ_sse4_1.c index b0315e35..d5ae1d3b 100644 --- a/silk/x86/NSQ_sse4_1.c +++ b/silk/x86/NSQ_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -37,17 +37,17 @@ #include "stack_alloc.h" static OPUS_INLINE void silk_nsq_scale_states_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - const opus_int32 x_Q3[], /* I input in Q3 */ - opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ - const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ - opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ - opus_int subfr, /* I subframe number */ - const opus_int LTP_scale_Q14, /* I */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ - const opus_int signal_type /* I Signal type */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + const opus_int16 x16[], /* I input */ + opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ + const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I subframe number */ + const opus_int LTP_scale_Q14, /* I */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type /* I Signal type */ ); static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( @@ -65,27 +65,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( opus_int Tilt_Q14, /* I Spectral tilt */ opus_int32 LF_shp_Q14, /* I */ opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ opus_int offset_Q10, /* I */ opus_int length, /* I Input length */ opus_int32 table[][4] /* I */ ); void silk_NSQ_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int k, lag, start_idx, LSF_interpolation_flag; @@ -101,8 +102,41 @@ void silk_NSQ_sse4_1( opus_int32 tmp1; opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20; +#ifdef OPUS_CHECK_ASM + silk_nsq_state NSQ_c; + SideInfoIndices psIndices_c; + opus_int8 pulses_c[ MAX_FRAME_LENGTH ]; + const opus_int8 *const pulses_a = pulses; +#endif + SAVE_STACK; +#ifdef OPUS_CHECK_ASM + ( void )pulses_a; + silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) ); + silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) ); + silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH ); + silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ); + + silk_NSQ_c( + psEncC, + &NSQ_c, + &psIndices_c, + x16, + pulses_c, + PredCoef_Q12, + LTPCoef_Q14, + AR_Q13, + HarmShapeGain_Q14, + Tilt_Q14, + LF_shp_Q14, + Gains_Q16, + pitchL, + Lambda_Q10, + LTP_scale_Q14 + ); +#endif + NSQ->rand_seed = psIndices->Seed; /* Set unvoiced lag to the previous one, overwrite later for voiced */ @@ -172,8 +206,7 @@ void silk_NSQ_sse4_1( LSF_interpolation_flag = 1; } - ALLOC( sLTP_Q15, - psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); /* Set up pointers to start of sub frame */ @@ -183,7 +216,7 @@ void silk_NSQ_sse4_1( for( k = 0; k < psEncC->nb_subfr; k++ ) { A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; - AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; /* Noise shape parameters */ silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); @@ -209,12 +242,12 @@ void silk_NSQ_sse4_1( } } - silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); + silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) ) { silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, - AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], + AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, &(table[32]) ); } else @@ -224,7 +257,7 @@ void silk_NSQ_sse4_1( offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch ); } - x_Q3 += psEncC->subfr_length; + x16 += psEncC->subfr_length; pulses += psEncC->subfr_length; pxq += psEncC->subfr_length; } @@ -235,12 +268,19 @@ void silk_NSQ_sse4_1( /* Save quantized speech and noise shaping signals */ silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); + +#ifdef OPUS_CHECK_ASM + silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); + silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); + silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) ); +#endif + RESTORE_STACK; } -/***********************************/ -/* silk_noise_shape_quantizer_10_16 */ -/***********************************/ +/************************************/ +/* silk_noise_shape_quantizer_10_16 */ +/************************************/ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( silk_nsq_state *NSQ, /* I/O NSQ state */ opus_int signalType, /* I Signal type */ @@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( opus_int Tilt_Q14, /* I Spectral tilt */ opus_int32 LF_shp_Q14, /* I */ opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ opus_int offset_Q10, /* I */ opus_int length, /* I Input length */ opus_int32 table[][4] /* I */ @@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( opus_int i; opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13; opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10; - opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; + opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14; opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr; @@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210; __m128i AR_shp_Q13_76543210; + int rdo_offset = (Lambda_Q10 >> 1) - 512; + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); @@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14; xq_Q14 = psLPC_Q14[ 0 ]; + sDiff_shp_Q14 = NSQ->sDiff_shp_Q14; LTP_pred_Q13 = 0; /* load a_Q12 */ @@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 ); sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 ); - sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 ); - sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 ); + sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 ); + sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 ); /* high part, use pmaddwd, results in 4 32-bit */ xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 ); @@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 ); n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 ); - silk_assert( lag > 0 || signalType != TYPE_VOICED ); + celt_assert( lag > 0 || signalType != TYPE_VOICED ); /* Combine prediction and noise shaping signals */ tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); shp_lag_ptr++; @@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* Find two quantization level candidates and measure their rate-distortion */ q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if (Lambda_Q10 > 2048) { + /* For aggressive RDO, the bias becomes more than one pulse. */ + if (q1_Q10 > rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 ); + } else if (q1_Q10 < -rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 ); + } else if (q1_Q10 < 0) { + q1_Q0 = -1; + } else { + q1_Q0 = 0; + } + } q1_Q10 = table[q1_Q0][0]; q2_Q10 = table[q1_Q0][1]; @@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* Update states */ psLPC_Q14++; *psLPC_Q14 = xq_Q14; - sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 ); + NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 ); + sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 ); NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); @@ -600,64 +657,54 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( } static OPUS_INLINE void silk_nsq_scale_states_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - const opus_int32 x_Q3[], /* I input in Q3 */ - opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ - const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ - opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ - opus_int subfr, /* I subframe number */ - const opus_int LTP_scale_Q14, /* I */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ - const opus_int signal_type /* I Signal type */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + const opus_int16 x16[], /* I input */ + opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ + const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I subframe number */ + const opus_int LTP_scale_Q14, /* I */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type /* I Signal type */ ) { opus_int i, lag; - opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; - __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; + __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1; lag = pitchL[ subfr ]; inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); silk_assert( inv_gain_Q31 != 0 ); - /* Calculate gain adjustment factor */ - if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { - gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); - } else { - gain_adj_Q16 = (opus_int32)1 << 16; - } - /* Scale input */ - inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); + inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); - /* prepare inv_gain_Q23 in packed 4 32-bits */ - xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); + /* prepare inv_gain_Q26 in packed 4 32-bits */ + xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26); for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { - xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); + xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) ); /* equal shift right 4 bytes*/ - xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); - xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); - xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); + xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 ); + xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 ); - xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); - xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); + xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 ); + xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 ); - xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); + xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 ); + _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); } for( ; i < psEncC->subfr_length; i++ ) { - x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); + x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 ); } - /* Save inverse gain */ - NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; - /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ if( NSQ->rewhite_flag ) { if( subfr == 0 ) { @@ -671,9 +718,11 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( } /* Adjust for changing gain */ - if( gain_adj_Q16 != (opus_int32)1 << 16 ) { - /* Scale long-term shaping state */ + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + + /* Scale long-term shaping state */ /* prepare gain_adj_Q16 in packed 4 32-bits */ xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16); @@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( } NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 ); + NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 ); /* Scale short-term prediction and shaping states */ for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { @@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] ); } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; } } diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h index 61efa8da..89a5ec88 100644 --- a/silk/x86/SigProc_FIX_sse.h +++ b/silk/x86/SigProc_FIX_sse.h @@ -26,13 +26,13 @@ */ #ifndef SIGPROC_FIX_SSE_H -#define SIGPROC_FIX_SSE_H +# define SIGPROC_FIX_SSE_H -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif +# ifdef HAVE_CONFIG_H +# include "config.h" +# endif -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +# if defined(OPUS_X86_MAY_HAVE_SSE4_1) void silk_burg_modified_sse4_1( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ @@ -45,11 +45,13 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ); -#if defined(OPUS_X86_PRESUME_SSE4_1) -#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ - ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +# if defined(OPUS_X86_PRESUME_SSE4_1) + +# define OVERRIDE_silk_burg_modified +# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( opus_int32 *res_nrg, /* O Residual energy */ @@ -62,33 +64,36 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( const opus_int D, /* I Order */ int arch /* I Run-time architecture */); -# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ - ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +# define OVERRIDE_silk_burg_modified +# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) -#endif +# endif -opus_int64 silk_inner_prod16_aligned_64_sse4_1( +opus_int64 silk_inner_prod16_sse4_1( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ); -#if defined(OPUS_X86_PRESUME_SSE4_1) +# if defined(OPUS_X86_PRESUME_SSE4_1) -#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ - ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len)) +# define OVERRIDE_silk_inner_prod16 +# define silk_inner_prod16(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len)) -#else +# elif defined(OPUS_HAVE_RTCD) -extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( +extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len); -# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ - ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len)) +# define OVERRIDE_silk_inner_prod16 +# define silk_inner_prod16(inVec1, inVec2, len, arch) \ + ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len)) -#endif -#endif +# endif +# endif #endif diff --git a/silk/x86/VAD_sse4_1.c b/silk/x86/VAD_sse4_1.c index d02ddf4a..e7eaf971 100644 --- a/silk/x86/VAD_sse4_1.c +++ b/silk/x86/VAD_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s SAVE_STACK; +#ifdef OPUS_CHECK_ASM + silk_encoder_state psEncC_c; + opus_int ret_c; + + silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) ); + ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn ); +#endif + /* Safety checks */ silk_assert( VAD_N_BANDS == 4 ); celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length ); @@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 ); } + if( psEncC->frame_length == 20 * psEncC->fs_kHz ) { + speech_nrg = silk_RSHIFT32( speech_nrg, 1 ); + } /* Power scaling */ if( speech_nrg <= 0 ) { SA_Q15 = silk_RSHIFT( SA_Q15, 1 ); - } else if( speech_nrg < 32768 ) { - if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { - speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 ); - } else { - speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 ); - } + } else if( speech_nrg < 16384 ) { + speech_nrg = silk_LSHIFT32( speech_nrg, 16 ); /* square-root */ speech_nrg = silk_SQRT_APPROX( speech_nrg ); @@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) ); } +#ifdef OPUS_CHECK_ASM + silk_assert( ret == ret_c ); + silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) ); +#endif + RESTORE_STACK; return( ret ); } diff --git a/silk/x86/VQ_WMat_EC_sse4_1.c b/silk/x86/VQ_WMat_EC_sse4_1.c index 74d6c6d0..2c7d18d0 100644 --- a/silk/x86/VQ_WMat_EC_sse4_1.c +++ b/silk/x86/VQ_WMat_EC_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -38,105 +38,136 @@ /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */ void silk_VQ_WMat_EC_sse4_1( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ) { opus_int k, gain_tmp_Q7; const opus_int8 *cb_row_Q7; - opus_int16 diff_Q14[ 5 ]; - opus_int32 sum1_Q14, sum2_Q16; + opus_int32 neg_xX_Q24[ 5 ]; + opus_int32 sum1_Q15, sum2_Q24; + opus_int32 bits_res_Q8, bits_tot_Q8; + __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24; + + /* Negate and convert to new Q domain */ + neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 ); + neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 ); + neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 ); + neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 ); + neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 ); + + v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) ); + v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) ); - __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5; /* Loop over codebook */ - *rate_dist_Q14 = silk_int32_MAX; + *rate_dist_Q8 = silk_int32_MAX; + *res_nrg_Q15 = silk_int32_MAX; cb_row_Q7 = cb_Q7; + /* If things go really bad, at least *ind is set to something safe. */ + *ind = 0; for( k = 0; k < L; k++ ) { + opus_int32 penalty; gain_tmp_Q7 = cb_gain_Q7[k]; - - diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 ); - - C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] ); - C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] ); - C_tmp2 = _mm_slli_epi32( C_tmp2, 7 ); - C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 ); - - diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 ); - diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 ); - diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 ); - diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 ); - /* Weighted rate */ - sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] ); + /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */ + sum1_Q15 = SILK_FIX_CONST( 1.001, 15 ); /* Penalty for too large gain */ - sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 ); - - silk_assert( sum1_Q14 >= 0 ); - - /* first row of W_Q18 */ - C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) ); - C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 ); - C_tmp4 = _mm_srli_si128( C_tmp4, 2 ); - - C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */ - C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */ - - C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 ); - C_tmp5 = _mm_srli_si128( C_tmp5, 2 ); - - C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 ); - C_tmp5 = _mm_slli_epi32( C_tmp5, 1 ); - - C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) ); - sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 ); - - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] ); - - /* second row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] ); - sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] ); - - /* third row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] ); - sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] ); - - /* fourth row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] ); - sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] ); - - /* last row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] ); - - silk_assert( sum1_Q14 >= 0 ); + penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 ); + + /* first row of XX_Q17 */ + v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] ); + v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 ); + v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 ); + v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7); + v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) ); + v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24); + sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 ); + sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] ); + + /* second row of XX_Q17 */ + sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] ); + + /* third row of XX_Q17 */ + sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] ); + + /* fourth row of XX_Q17 */ + sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] ); + + /* last row of XX_Q17 */ + sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] ); /* find best */ - if( sum1_Q14 < *rate_dist_Q14 ) { - *rate_dist_Q14 = sum1_Q14; - *ind = (opus_int8)k; - *gain_Q7 = gain_tmp_Q7; + if( sum1_Q15 >= 0 ) { + /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */ + bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) ); + /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */ + bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 ); + if( bits_tot_Q8 <= *rate_dist_Q8 ) { + *rate_dist_Q8 = bits_tot_Q8; + *res_nrg_Q15 = sum1_Q15 + penalty; + *ind = (opus_int8)k; + *gain_Q7 = gain_tmp_Q7; + } } /* Go to next cbk vector */ cb_row_Q7 += LTP_ORDER; } + +#ifdef OPUS_CHECK_ASM + { + opus_int8 ind_c = 0; + opus_int32 res_nrg_Q15_c = 0; + opus_int32 rate_dist_Q8_c = 0; + opus_int gain_Q7_c = 0; + + silk_VQ_WMat_EC_c( + &ind_c, + &res_nrg_Q15_c, + &rate_dist_Q8_c, + &gain_Q7_c, + XX_Q17, + xX_Q17, + cb_Q7, + cb_gain_Q7, + cl_Q5, + subfr_len, + max_gain_Q7, + L + ); + + silk_assert( *ind == ind_c ); + silk_assert( *res_nrg_Q15 == res_nrg_Q15_c ); + silk_assert( *rate_dist_Q8 == rate_dist_Q8_c ); + silk_assert( *gain_Q7 == gain_Q7_c ); + } +#endif } diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h index 2f15d448..a01d7f6c 100644 --- a/silk/x86/main_sse.h +++ b/silk/x86/main_sse.h @@ -26,171 +26,169 @@ */ #ifndef MAIN_SSE_H -#define MAIN_SSE_H +# define MAIN_SSE_H -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif +# ifdef HAVE_CONFIG_H +# include "config.h" +# endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) -#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */ -# define OVERRIDE_silk_VQ_WMat_EC - void silk_VQ_WMat_EC_sse4_1( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ); -#if defined OPUS_X86_PRESUME_SSE4_1 +# if defined OPUS_X86_PRESUME_SSE4_1 -#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L, arch) \ - ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L)) +# define OVERRIDE_silk_VQ_WMat_EC +# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L, arch) \ + ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ); -# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L, arch) \ - ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L)) +# define OVERRIDE_silk_VQ_WMat_EC +# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L, arch) \ + ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L)) -#endif -#endif - -#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ -# define OVERRIDE_silk_NSQ +# endif void silk_NSQ_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -#if defined OPUS_X86_PRESUME_SSE4_1 +# if defined OPUS_X86_PRESUME_SSE4_1 -#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ +# define OVERRIDE_silk_NSQ +# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ +# define OVERRIDE_silk_NSQ +# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#endif - -# define OVERRIDE_silk_NSQ_del_dec +# endif void silk_NSQ_del_dec_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -#if defined OPUS_X86_PRESUME_SSE4_1 +# if defined OPUS_X86_PRESUME_SSE4_1 -#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ - ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ +# define OVERRIDE_silk_NSQ_del_dec +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ - ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ +# define OVERRIDE_silk_NSQ_del_dec +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#endif -#endif +# endif void silk_noise_shape_quantizer( silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -223,26 +221,27 @@ void silk_VAD_GetNoiseLevels( silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ ); -# define OVERRIDE_silk_VAD_GetSA_Q8 - opus_int silk_VAD_GetSA_Q8_sse4_1( silk_encoder_state *psEnC, const opus_int16 pIn[] ); -#if defined(OPUS_X86_PRESUME_SSE4_1) -#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) +# if defined(OPUS_X86_PRESUME_SSE4_1) -#else +# define OVERRIDE_silk_VAD_GetSA_Q8 +# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) -# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ - ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) +# elif defined(OPUS_HAVE_RTCD) extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( silk_encoder_state *psEnC, const opus_int16 pIn[]); -#endif +# define OVERRIDE_silk_VAD_GetSA_Q8 +# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ + ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) + +# endif # endif #endif diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c index 32dcc3ca..70f60078 100644 --- a/silk/x86/x86_silk_map.c +++ b/silk/x86/x86_silk_map.c @@ -35,22 +35,22 @@ #include "pitch.h" #include "main.h" -#if !defined(OPUS_X86_PRESUME_SSE4_1) +#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_SSE4_1) #if defined(FIXED_POINT) #include "fixed/main_FIX.h" -opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( +opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ) = { - silk_inner_prod16_aligned_64_c, /* non-sse */ - silk_inner_prod16_aligned_64_c, - silk_inner_prod16_aligned_64_c, - MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */ - MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */ + silk_inner_prod16_c, /* non-sse */ + silk_inner_prod16_c, + silk_inner_prod16_c, + MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */ + MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */ }; #endif @@ -66,23 +66,22 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */ }; -#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) = { silk_NSQ_c, /* non-sse */ silk_NSQ_c, @@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */ MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */ }; -#endif -#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ) = { silk_VQ_WMat_EC_c, /* non-sse */ silk_VQ_WMat_EC_c, @@ -112,25 +110,23 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */ MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */ }; -#endif -#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) = { silk_NSQ_del_dec_c, /* non-sse */ silk_NSQ_del_dec_c, @@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */ }; -#endif #if defined(FIXED_POINT) diff --git a/silk_sources.mk b/silk_sources.mk index d2666e66..3df24816 100644 --- a/silk_sources.mk +++ b/silk_sources.mk @@ -77,15 +77,19 @@ silk/stereo_find_predictor.c \ silk/stereo_quant_pred.c \ silk/LPC_fit.c -SILK_SOURCES_SSE4_1 = \ +SILK_SOURCES_X86_RTCD = \ +silk/x86/x86_silk_map.c + +SILK_SOURCES_SSE4_1 = \ silk/x86/NSQ_sse4_1.c \ silk/x86/NSQ_del_dec_sse4_1.c \ -silk/x86/x86_silk_map.c \ silk/x86/VAD_sse4_1.c \ silk/x86/VQ_WMat_EC_sse4_1.c +SILK_SOURCES_ARM_RTCD = \ +silk/arm/arm_silk_map.c + SILK_SOURCES_ARM_NEON_INTR = \ -silk/arm/arm_silk_map.c \ silk/arm/biquad_alt_neon_intr.c \ silk/arm/LPC_inv_pred_gain_neon_intr.c \ silk/arm/NSQ_del_dec_neon_intr.c \ diff --git a/src/opus_decoder.c b/src/opus_decoder.c index 9113638a..6520e748 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -278,7 +278,8 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, ec_dec_init(&dec,(unsigned char*)data,len); } else { audiosize = frame_size; - mode = st->prev_mode; + /* Run PLC using last used mode (CELT if we ended with CELT redundancy) */ + mode = st->prev_redundancy ? MODE_CELT_ONLY : st->prev_mode; bandwidth = 0; if (mode == 0) @@ -419,7 +420,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, start_band = 0; if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL - && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len) + && ec_tell(&dec)+17+20*(mode == MODE_HYBRID) <= 8*len) { /* Check if we have a redundant 0-8 kHz band */ if (mode == MODE_HYBRID) @@ -499,6 +500,11 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, /* 5 ms redundant frame for CELT->SILK*/ if (redundancy && celt_to_silk) { + /* If the previous frame did not use CELT (the first redundancy frame in + a transition from SILK may have been lost) then the CELT decoder is + stale at this point and the redundancy audio is not useful, however + the final range is still needed (for testing), so the redundancy is + always decoded but the decoded audio may not be used */ MUST_SUCCEED(celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0))); celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL, 0); @@ -561,7 +567,10 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5, pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs); } - if (redundancy && celt_to_silk) + /* 5ms redundant frame for CELT->SILK; ignore if the previous frame did not + use CELT (the first redundancy frame in a transition from SILK may have + been lost) */ + if (redundancy && celt_to_silk && (st->prev_mode != MODE_SILK_ONLY || st->prev_redundancy)) { for (c=0;c<st->channels;c++) { diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 7b5f0abf..8c8db5a5 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -87,6 +87,7 @@ struct OpusEncoder { int lfe; int arch; int use_dtx; /* general DTX for both SILK and CELT */ + int fec_config; #ifndef DISABLE_FLOAT_API TonalityAnalysisState analysis; #endif @@ -112,7 +113,7 @@ struct OpusEncoder { opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; #ifndef DISABLE_FLOAT_API int detected_bandwidth; - int nb_no_activity_frames; + int nb_no_activity_ms_Q1; opus_val32 peak_signal_energy; #endif int nonfinal_frame; /* current frame is not the final in a packet */ @@ -893,24 +894,28 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in /* Decides if DTX should be turned on (=1) or off (=0) */ static int decide_dtx_mode(opus_int activity, /* indicates if this frame contains speech/music */ - int *nb_no_activity_frames /* number of consecutive frames with no activity */ + int *nb_no_activity_ms_Q1, /* number of consecutive milliseconds with no activity, in Q1 */ + int frame_size_ms_Q1 /* number of miliseconds in this update, in Q1 */ ) { if (!activity) { - /* The number of consecutive DTX frames should be within the allowed bounds */ - (*nb_no_activity_frames)++; - if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX) + /* The number of consecutive DTX frames should be within the allowed bounds. + Note that the allowed bound is defined in the SILK headers and assumes 20 ms + frames. As this function can be called with any frame length, a conversion to + milliseconds is done before the comparisons. */ + (*nb_no_activity_ms_Q1) += frame_size_ms_Q1; + if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2) { - if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)) + if (*nb_no_activity_ms_Q1 <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)*20*2) /* Valid frame for DTX! */ return 1; else - (*nb_no_activity_frames) = NB_SPEECH_FRAMES_BEFORE_DTX; + (*nb_no_activity_ms_Q1) = NB_SPEECH_FRAMES_BEFORE_DTX*20*2; } } else - (*nb_no_activity_frames) = 0; + (*nb_no_activity_ms_Q1) = 0; return 0; } @@ -1310,6 +1315,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ st->stream_channels = st->force_channels; } else { #ifdef FUZZING + (void)stereo_music_threshold; + (void)stereo_voice_threshold; /* Random mono/stereo decision */ if (st->channels == 2 && (rand()&0x1F)==0) st->stream_channels = 3-st->stream_channels; @@ -1348,6 +1355,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ } else if (st->user_forced_mode == OPUS_AUTO) { #ifdef FUZZING + (void)stereo_width; + (void)mode_thresholds; /* Random mode switching */ if ((rand()&0xF)==0) { @@ -1385,8 +1394,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY; - /* When FEC is enabled and there's enough packet loss, use SILK */ - if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4) + /* When FEC is enabled and there's enough packet loss, use SILK. + Unless the FEC is set to 2, in which case we don't switch to SILK if we're confident we have music. */ + if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4 && (st->fec_config != 2 || voice_est > 25)) st->mode = MODE_SILK_ONLY; /* When encoding voice and DTX is enabled but the generalized DTX cannot be used, use SILK in order to make use of its DTX. */ @@ -2132,7 +2142,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ #ifndef DISABLE_FLOAT_API if (st->use_dtx && (analysis_info.valid || is_silence)) { - if (decide_dtx_mode(activity, &st->nb_no_activity_frames)) + if (decide_dtx_mode(activity, &st->nb_no_activity_ms_Q1, 2*1000*frame_size/st->Fs)) { st->rangeFinal = 0; data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels); @@ -2140,7 +2150,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ return 1; } } else { - st->nb_no_activity_frames = 0; + st->nb_no_activity_ms_Q1 = 0; } #endif @@ -2435,11 +2445,12 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) case OPUS_SET_INBAND_FEC_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); - if(value<0 || value>1) + if(value<0 || value>2) { goto bad_arg; } - st->silk_mode.useInBandFEC = value; + st->fec_config = value; + st->silk_mode.useInBandFEC = (value != 0); } break; case OPUS_GET_INBAND_FEC_REQUEST: @@ -2449,7 +2460,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) { goto bad_arg; } - *value = st->silk_mode.useInBandFEC; + *value = st->fec_config; } break; case OPUS_SET_PACKET_LOSS_PERC_REQUEST: @@ -2733,7 +2744,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) #ifndef DISABLE_FLOAT_API else if (st->use_dtx) { /* DTX determined by Opus. */ - *value = st->nb_no_activity_frames >= NB_SPEECH_FRAMES_BEFORE_DTX; + *value = st->nb_no_activity_ms_Q1 >= NB_SPEECH_FRAMES_BEFORE_DTX*20*2; } #endif else { diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index 93204a14..213e3eb2 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -443,7 +443,8 @@ static int opus_multistream_encoder_init_impl( char *ptr; if ((channels>255) || (channels<1) || (coupled_streams>streams) || - (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) || + (streams+coupled_streams>channels)) return OPUS_BAD_ARG; st->arch = opus_select_arch(); @@ -459,8 +460,7 @@ static int opus_multistream_encoder_init_impl( st->layout.mapping[i] = mapping[i]; if (!validate_layout(&st->layout)) return OPUS_BAD_ARG; - if (mapping_type == MAPPING_TYPE_SURROUND && - !validate_encoder_layout(&st->layout)) + if (!validate_encoder_layout(&st->layout)) return OPUS_BAD_ARG; if (mapping_type == MAPPING_TYPE_AMBISONICS && !validate_ambisonics(st->layout.nb_channels, NULL, NULL)) @@ -595,7 +595,8 @@ OpusMSEncoder *opus_multistream_encoder_create( int ret; OpusMSEncoder *st; if ((channels>255) || (channels<1) || (coupled_streams>streams) || - (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) || + (streams+coupled_streams>channels)) { if (error) *error = OPUS_BAD_ARG; diff --git a/tests/opus_build_test.sh b/tests/opus_build_test.sh new file mode 100755 index 00000000..573f4473 --- /dev/null +++ b/tests/opus_build_test.sh @@ -0,0 +1,29 @@ +#!/bin/sh + +tarball=`realpath "$1"` +nb_tests="$2" +oldvectors=`realpath "$3"` +newvectors=`realpath "$4"` +base=`basename "$tarball" .tar.gz` + +tar xvf "$tarball" > /dev/null 2>&1 +cd "$base" + +if [ $? -ne 0 ] +then + echo cannot go to "$base" + exit 1 +fi + +mkdir build_tests + +configure_dir=`pwd` +seq -w "$nb_tests" | parallel --halt now,fail=10 -j +2 -q ../random_config.sh "build_tests/run_{}" "$configure_dir" "$oldvectors" "$newvectors" + +if [ $? -ne 0 ] +then + echo Check found errors + exit 1 +else + echo No error found +fi diff --git a/tests/opus_encode_regressions.c b/tests/opus_encode_regressions.c index 29234730..4d506eb6 100644 --- a/tests/opus_encode_regressions.c +++ b/tests/opus_encode_regressions.c @@ -35,7 +35,6 @@ #include <stdint.h> #include <math.h> #include <string.h> -#include <assert.h> #include "opus_multistream.h" #include "opus.h" #include "test_opus_common.h" @@ -106,7 +105,7 @@ static int celt_ec_internal_error(void) 1799, 1799, 1799, 1799, -9721 }; err = opus_multistream_encode(enc, pcm, 320, data, 2460); - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0)); @@ -144,7 +143,7 @@ static int celt_ec_internal_error(void) -9510, -9510, -9510, -9510, -9510, -9510, -9510 }; err = opus_multistream_encode(enc, pcm, 160, data, 2460); - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0)); @@ -182,7 +181,7 @@ static int celt_ec_internal_error(void) -9510, -9510, -9510, -9510, -9510, -9510, -9510 }; err = opus_multistream_encode(enc, pcm, 160, data, 2460); - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0)); @@ -220,7 +219,7 @@ static int celt_ec_internal_error(void) -9510, -9510, -9510, -9510, -9510, -9510, -9510 }; err = opus_multistream_encode(enc, pcm, 160, data, 2460); - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0)); @@ -256,7 +255,7 @@ static int celt_ec_internal_error(void) 5632 }; err = opus_multistream_encode(enc, pcm, 160, data, 2460); - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE)); opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0)); @@ -281,7 +280,7 @@ static int celt_ec_internal_error(void) 0, 0, -256, 226 }; err = opus_multistream_encode(enc, pcm, 40, data, 2460); - assert(err > 0); + opus_test_assert(err > 0); /* returns -3 */ } opus_multistream_encoder_destroy(enc); @@ -334,7 +333,7 @@ static int mscbr_encode_fail10(void) 0 }; err = opus_multistream_encode(enc, pcm, 20, data, 627300); - assert(err > 0); + opus_test_assert(err > 0); /* returns -1 */ } opus_multistream_encoder_destroy(enc); @@ -384,7 +383,7 @@ static int mscbr_encode_fail(void) 0 }; err = opus_multistream_encode(enc, pcm, 20, data, 472320); - assert(err > 0); + opus_test_assert(err > 0); /* returns -1 */ } opus_multistream_encoder_destroy(enc); @@ -740,7 +739,7 @@ static int surround_analysis_uninit(void) -20992, 25859, 5372, 12040, 13307, -4355,-30213, -9, -6019 }; err = opus_multistream_encode(enc, pcm, 960, data, 7380); - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(1)); @@ -885,7 +884,7 @@ static int surround_analysis_uninit(void) }; err = opus_multistream_encode(enc, pcm, 1440, data, 7380); /* reads uninitialized data at src/opus_multistream_encoder.c:293 */ - assert(err > 0); + opus_test_assert(err > 0); } opus_multistream_encoder_destroy(enc); return 0; @@ -935,7 +934,7 @@ static int ec_enc_shrink_assert(void) opus_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC(6)); opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000)); data_len = opus_encode(enc, pcm1, 960, data, 2000); - assert(data_len > 0); + opus_test_assert(data_len > 0); opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE)); opus_encoder_ctl(enc, OPUS_SET_PREDICTION_DISABLED(1)); @@ -943,12 +942,12 @@ static int ec_enc_shrink_assert(void) opus_encoder_ctl(enc, OPUS_SET_INBAND_FEC(1)); opus_encoder_ctl(enc, OPUS_SET_BITRATE(15600)); data_len = opus_encode(enc, pcm2, 2880, data, 122); - assert(data_len > 0); + opus_test_assert(data_len > 0); opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); opus_encoder_ctl(enc, OPUS_SET_BITRATE(27000)); data_len = opus_encode(enc, pcm3, 2880, data, 122); /* assertion failure */ - assert(data_len > 0); + opus_test_assert(data_len > 0); opus_encoder_destroy(enc); return 0; @@ -970,7 +969,7 @@ static int ec_enc_shrink_assert2(void) { static const short pcm[960] = { 0 }; data_len = opus_encode(enc, pcm, 960, data, 2000); - assert(data_len > 0); + opus_test_assert(data_len > 0); } opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC)); { @@ -980,7 +979,7 @@ static int ec_enc_shrink_assert2(void) -32768, -32768, 0, 0, -32768, -32768, 0, 0, -32768, -32768 }; data_len = opus_encode(enc, pcm, 480, data, 19); - assert(data_len > 0); + opus_test_assert(data_len > 0); } opus_encoder_destroy(enc); return 0; @@ -1009,14 +1008,14 @@ static int silk_gain_assert(void) opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_NARROWBAND)); opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000)); data_len = opus_encode(enc, pcm1, 160, data, 1000); - assert(data_len > 0); + opus_test_assert(data_len > 0); opus_encoder_ctl(enc, OPUS_SET_VBR(0)); opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(0)); opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_MEDIUMBAND)); opus_encoder_ctl(enc, OPUS_SET_BITRATE(2867)); data_len = opus_encode(enc, pcm2, 960, data, 1000); - assert(data_len > 0); + opus_test_assert(data_len > 0); opus_encoder_destroy(enc); return 0; diff --git a/tests/random_config.sh b/tests/random_config.sh new file mode 100755 index 00000000..0cdd855f --- /dev/null +++ b/tests/random_config.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +dir="$1" +mkdir "$dir" +if [ $? -ne 0 ] +then + exit 1 +fi + +cd "$dir" +if [ $? -ne 0 ] +then + exit 1 +fi + + +configure_path="$2" +config="random_config.txt" + +case `seq 3 | shuf -n1` in +1) +approx=--enable-float-approx +math=-ffast-math +;; +2) +approx=--enable-float-approx +;; +*) +approx= +math= +;; +esac + +CFLAGS='-g' + +opt=`echo -e "-O1\n-O2\n-O3" | shuf -n1` + +#arch=-march=`echo -e "core2\nsandybridge\nbroadwell\nskylake" | shuf -n1` +arch=`echo -e "\n-march=core2\n-march=sandybridge\n-march=broadwell\n-march=skylake\n-march=native" | shuf -n1` + +footprint=`echo -e "\n-DSMALL_FOOTPRINT" | shuf -n1` +std=`echo -e "\n-std=c90\n-std=c99\n-std=c11\n-std=c17" | shuf -n1` +sanitize=`echo -e "\n-fsanitize=address -fno-sanitize-recover=all\n-fsanitize=undefined -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow" | shuf -n1` + + +CFLAGS="$CFLAGS $std $opt $arch $footprint $math $sanitize" + +echo "CFLAGS=$CFLAGS" > "$config" + +lib=`echo -e "\n--disable-static\n--disable-shared" | shuf -n1` + +arithmetic=`echo -e "\n--enable-fixed-point\n--enable-fixed-point --enable-fixed-point-debug\n--enable-fixed-point --disable-float-api\n--enable-fixed-point --enable-fixed-point-debug --disable-float-api" | shuf -n1` + +custom=`echo -e "\n--enable-custom-modes" | shuf -n1` + +asm=`echo -e "\n--disable-asm\n--disable-rtcd\n--disable-intrinsics" | shuf -n1` +#asm=`echo -e "\n--disable-asm\n--disable-intrinsics" | shuf -n1` + +assert=`echo -e "\n--enable-assertions" | shuf -n1` +harden=`echo -e "\n--enable-hardening" | shuf -n1` +fuzz=`echo -e "\n--enable-fuzzing" | shuf -n1` +checkasm=`echo -e "\n--enable-check-asm" | shuf -n1` +rfc8251=`echo -e "\n--disable-rfc8251" | shuf -n1` + +if [ "$rfc8251" = --disable-rfc8251 ] +then + vectors="$3" +else + vectors="$4" +fi +echo using testvectors at "$vectors" >> "$config" + + +config_opt="$lib $arithmetic $custom $asm $assert $harden $fuzz $checkasm $rfc8251 $approx" + +echo configure $config_opt >> "$config" + +export CFLAGS +"$configure_path/configure" $config_opt > configure_output.txt 2>&1 + +if [ $? -ne 0 ] +then + echo configure FAIL >> "$config" + exit 1 +fi + +make > make_output.txt 2>&1 + +if [ $? -ne 0 ] +then + echo make FAIL >> "$config" + exit 1 +fi + +#Run valgrind 5% of the time (minus the asan cases) +if [ "`seq 20 | shuf -n1`" -ne 1 -o "$sanitize" = "-fsanitize=address -fno-sanitize-recover=all" ] +then + make check > makecheck_output.txt 2>&1 +else + echo valgrind enabled >> "$config" + valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1 +fi + +if [ $? -ne 0 ] +then + echo check FAIL >> "$config" + exit 1 +fi + + +rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1` +echo testvectors for "$rate" Hz > testvectors_output.txt +../../../run_vectors.sh . "$vectors" "$rate" >> testvectors_output.txt 2>&1 + +if [ $? -ne 0 ] +then + echo testvectors FAIL >> "$config" + exit 1 +fi + +echo all tests PASS >> "$config" + +#When everything's good, do some cleaning up to save space +make distclean > /dev/null 2>&1 +rm -f tmp.out +gzip make_output.txt diff --git a/tests/test_opus_api.c b/tests/test_opus_api.c index fb385c63..0e7ed2cc 100644 --- a/tests/test_opus_api.c +++ b/tests/test_opus_api.c @@ -1298,7 +1298,7 @@ opus_int32 test_enc_api(void) err=opus_encoder_ctl(enc,OPUS_GET_INBAND_FEC(null_int_ptr)); if(err!=OPUS_BAD_ARG)test_failed(); cfgs++; - CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,2, + CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,3, 1,0, " OPUS_SET_INBAND_FEC .......................... OK.\n", " OPUS_GET_INBAND_FEC .......................... OK.\n") diff --git a/tests/test_opus_common.h b/tests/test_opus_common.h index d96c7d84..5fb924f4 100644 --- a/tests/test_opus_common.h +++ b/tests/test_opus_common.h @@ -81,5 +81,5 @@ static OPUS_INLINE void _test_failed(const char *file, int line) abort(); } #define test_failed() _test_failed(__FILE__, __LINE__); - +#define opus_test_assert(cond) {if (!(cond)) {test_failed();}} void regression_test(void); diff --git a/tests/test_opus_encode.c b/tests/test_opus_encode.c index 00795a1e..d6e8e2d3 100644 --- a/tests/test_opus_encode.c +++ b/tests/test_opus_encode.c @@ -297,6 +297,7 @@ int run_test1(int no_fuzz) /*FIXME: encoder api tests, fs!=48k, mono, VBR*/ fprintf(stdout," Encode+Decode tests.\n"); + fflush(stdout); enc = opus_encoder_create(48000, 2, OPUS_APPLICATION_VOIP, &err); if(err != OPUS_OK || enc==NULL)test_failed(); @@ -466,6 +467,7 @@ int run_test1(int no_fuzz) count++; }while(i<(SSAMPLES-MAX_FRAME_SAMP)); fprintf(stdout," Mode %s FB encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate); + fflush(stdout); } } @@ -543,6 +545,7 @@ int run_test1(int no_fuzz) count++; }while(i<(SSAMPLES/12-MAX_FRAME_SAMP)); fprintf(stdout," Mode %s NB dual-mono MS encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate); + fflush(stdout); } } @@ -612,6 +615,7 @@ int run_test1(int no_fuzz) i+=frame_size; }while(i<SAMPLES*4); fprintf(stdout," All framesize pairs switching encode, %d frames OK.\n",count); + fflush(stdout); if(opus_encoder_ctl(enc, OPUS_RESET_STATE)!=OPUS_OK)test_failed(); opus_encoder_destroy(enc); diff --git a/tests/test_opus_padding.c b/tests/test_opus_padding.c index c22e8f0d..c9ef7375 100644 --- a/tests/test_opus_padding.c +++ b/tests/test_opus_padding.c @@ -39,7 +39,7 @@ #define CHANNELS 2 #define FRAMESIZE 5760 -int test_overflow(void) +void test_overflow(void) { OpusDecoder *decoder; int result; @@ -51,7 +51,7 @@ int test_overflow(void) fprintf(stderr, " Checking for padding overflow... "); if (!in || !out) { fprintf(stderr, "FAIL (out of memory)\n"); - return -1; + test_failed(); } in[0] = 0xff; in[1] = 0x41; @@ -71,21 +71,18 @@ int test_overflow(void) } fprintf(stderr, "OK.\n"); - - return 1; } int main(void) { const char *oversion; - int tests = 0;; iseed = 0; oversion = opus_get_version_string(); if (!oversion) test_failed(); fprintf(stderr, "Testing %s padding.\n", oversion); - tests += test_overflow(); + test_overflow(); fprintf(stderr, "All padding tests passed.\n"); diff --git a/tests/test_opus_projection.c b/tests/test_opus_projection.c index 5f0d672c..4e06613e 100644 --- a/tests/test_opus_projection.c +++ b/tests/test_opus_projection.c @@ -29,7 +29,6 @@ #include "config.h" #endif -#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <stdint.h> |