From 16286a25fdd865c66a837a73b65fbaa7b25bf484 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20de=20Vicente=20Pe=C3=B1a?=
 <devicentepena@webrtc.org>
Date: Fri, 12 Feb 2021 13:51:43 +0100
Subject: Sending refresh DTX packets every 400 ms independently of the encoded
 frame size.

Signed-off-by: Felicia Lim <flim@google.com>
---
 src/opus_encoder.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 7b5f0abf..321bb2bb 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -112,7 +112,7 @@ struct OpusEncoder {
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef DISABLE_FLOAT_API
     int          detected_bandwidth;
-    int          nb_no_activity_frames;
+    int          nb_no_activity_ms_Q1;
     opus_val32   peak_signal_energy;
 #endif
     int          nonfinal_frame; /* current frame is not the final in a packet */
@@ -893,24 +893,28 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in
 
 /* Decides if DTX should be turned on (=1) or off (=0) */
 static int decide_dtx_mode(opus_int activity,            /* indicates if this frame contains speech/music */
-                           int *nb_no_activity_frames    /* number of consecutive frames with no activity */
+                           int *nb_no_activity_ms_Q1,    /* number of consecutive milliseconds with no activity, in Q1 */
+                           int frame_size_ms_Q1          /* number of milliseconds in this update, in Q1 */
                            )
 
 {
    if (!activity)
    {
-      /* The number of consecutive DTX frames should be within the allowed bounds */
-      (*nb_no_activity_frames)++;
-      if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX)
+      /* The number of consecutive DTX frames should be within the allowed bounds. 
+      Note that the allowed bound is defined in the Silk headers and assumes 20 ms
+      frames. As this function can be called with any frame length, a conversion to
+      miliseconds is done before the comparisons. */
+      (*nb_no_activity_ms_Q1) += frame_size_ms_Q1;
+      if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2)
       {
-         if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX))
+         if (*nb_no_activity_ms_Q1 <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)*20*2)
             /* Valid frame for DTX! */
             return 1;
          else
-            (*nb_no_activity_frames) = NB_SPEECH_FRAMES_BEFORE_DTX;
+            (*nb_no_activity_ms_Q1) = NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
       }
    } else
-      (*nb_no_activity_frames) = 0;
+      (*nb_no_activity_ms_Q1) = 0;
 
    return 0;
 }
@@ -2132,7 +2136,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 #ifndef DISABLE_FLOAT_API
     if (st->use_dtx && (analysis_info.valid || is_silence))
     {
-       if (decide_dtx_mode(activity, &st->nb_no_activity_frames))
+       if (decide_dtx_mode(activity, &st->nb_no_activity_ms_Q1, 2*1000*frame_size/st->Fs))
        {
           st->rangeFinal = 0;
           data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);
@@ -2140,7 +2144,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
           return 1;
        }
     } else {
-       st->nb_no_activity_frames = 0;
+       st->nb_no_activity_ms_Q1 = 0;
     }
 #endif
 
@@ -2733,7 +2737,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
 #ifndef DISABLE_FLOAT_API
             else if (st->use_dtx) {
                 /* DTX determined by Opus. */
-                *value = st->nb_no_activity_frames >= NB_SPEECH_FRAMES_BEFORE_DTX;
+                *value = st->nb_no_activity_ms_Q1 >= NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
             }
 #endif
             else {
-- 
cgit v1.2.3


From 7b05f44f4baadf34d8d1073f4ff69f1806d5cdb4 Mon Sep 17 00:00:00 2001
From: Felicia Lim <flim@google.com>
Date: Mon, 22 Feb 2021 22:29:14 -0800
Subject: celt_lpc: avoid overflows when computing lpcs in fixed point

The LPCs are computed in 32-bit, so increase the allowed range from +/-8
to +/-64 to avoid overflows caught during fuzzing. Before downshifting
back down to the +/-8 range in the final 16-bit output, perform bandwidth
extension to avoid any additional overflow issues.
---
 celt/celt_lpc.c      | 54 +++++++++++++++++++++++++++++++++++++++++++++++-----
 celt/fixed_debug.h   | 45 +++++++++++++++++++++++++++++++++++++++++++
 celt/fixed_generic.h | 10 ++++++++++
 silk/LPC_fit.c       |  3 ++-
 silk/bwexpander_32.c |  3 ++-
 5 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 8ecb693e..457e7ed0 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -57,10 +57,10 @@ int          p
          opus_val32 rr = 0;
          for (j = 0; j < i; j++)
             rr += MULT32_32_Q31(lpc[j],ac[i - j]);
-         rr += SHR32(ac[i + 1],3);
-         r = -frac_div32(SHL32(rr,3), error);
+         rr += SHR32(ac[i + 1],6);
+         r = -frac_div32(SHL32(rr,6), error);
          /*  Update LPC coefficients and total error */
-         lpc[i] = SHR32(r,3);
+         lpc[i] = SHR32(r,6);
          for (j = 0; j < (i+1)>>1; j++)
          {
             opus_val32 tmp1, tmp2;
@@ -82,8 +82,52 @@ int          p
       }
    }
 #ifdef FIXED_POINT
-   for (i=0;i<p;i++)
-      _lpc[i] = ROUND16(lpc[i],16);
+   {
+      /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+         This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+         fixes should also be applied there. */
+      int iter, idx = 0;
+      opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+      for (iter = 0; iter < 10; iter++) {
+         maxabs = 0;
+         for (i = 0; i < p; i++) {
+            absval = ABS32(lpc[i]);
+            if (absval > maxabs) {
+               maxabs = absval;
+               idx = i;
+            }
+         }
+         maxabs = PSHR32(maxabs, 13);  /* Q25->Q12 */
+
+         if (maxabs > 32767) {
+            maxabs = MIN32(maxabs, 163838);
+            chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+                                                    SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+            chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+            /* Apply bandwidth expansion. */
+            for (i = 0; i < p - 1; i++) {
+               lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+               chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+            }
+            lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+         } else {
+            break;
+         }
+      }
+
+      if (iter == 10) {
+         /* If the coeffs still do not fit into the 16 bit range after 10 iterations,
+            fall back to the A(z)=1 filter. */
+         OPUS_CLEAR(lpc, p);
+         _lpc[0] = 4096;  /* Q12 */
+      } else {
+         for (i = 0; i < p; i++) {
+            _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13));  /* Q25->Q12 */
+         }
+      }
+   }
 #endif
 }
 
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index f4352952..3765baa6 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b)
    return res;
 }
 
+/* Checked 32x32 multiply; inputs and product must fit in 32 bits, violations are reported on stderr. */
+static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_32: inputs are not int: %d %d\n", (int)a, (int)b); /* cast: %d requires int, a/b are 64-bit */
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a*b; /* computed in 64 bits so a 32-bit overflow can be detected below */
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_32: output is not int: %d\n", (int)res); /* cast: %d requires int, res is 64-bit */
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5; /* NOTE(review): presumably the debug build's operation-cost counter -- confirm */
+   return res;
+}
+
+static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b) /* checked (a*b)>>16; inputs and result must fit in 32 bits */
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_Q16: inputs are not int: %d %d\n", (int)a, (int)b); /* cast: %d requires int, a/b are 64-bit */
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)(a)*(opus_int64)(b)) >> 16; /* full 64-bit product, then drop the 16 low bits */
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_Q16: output is not int: %d*%d=%d\n", (int)a, (int)b, (int)res); /* all three casts match %d */
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5; /* NOTE(review): presumably the debug build's operation-cost counter -- confirm */
+   return res;
+}
+
 #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
 static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
 {
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 0ecbb899..8f29d46b 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -57,6 +57,13 @@
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
 #endif
 
+/** 32x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16))
+#else
+#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16)))
+#endif
+
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
 #if OPUS_FAST_INT64
 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
@@ -131,6 +138,9 @@
 /** 16x16 multiplication where the result fits in 16 bits */
 #define MULT16_16_16(a,b)     ((((opus_val16)(a))*((opus_val16)(b))))
 
+/** 32x32 multiplication where the result fits in 32 bits */
+#define MULT32_32_32(a,b)     ((((opus_val32)(a))*((opus_val32)(b))))
+
 /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
 /** 16x16 multiplication where the result fits in 32 bits */
 #define MULT16_16(a,b)     (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
diff --git a/silk/LPC_fit.c b/silk/LPC_fit.c
index cdea4f3a..c0690a1f 100644
--- a/silk/LPC_fit.c
+++ b/silk/LPC_fit.c
@@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 
-/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */
+/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around.
+   This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
 void silk_LPC_fit(
     opus_int16                  *a_QOUT,            /* O    Output signal                                               */
     opus_int32                    *a_QIN,             /* I/O  Input signal                                                */
diff --git a/silk/bwexpander_32.c b/silk/bwexpander_32.c
index d0010f73..0f32b9df 100644
--- a/silk/bwexpander_32.c
+++ b/silk/bwexpander_32.c
@@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 
-/* Chirp (bandwidth expand) LP AR filter */
+/* Chirp (bandwidth expand) LP AR filter.
+   This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
 void silk_bwexpander_32(
     opus_int32                  *ar,                /* I/O  AR filter to be expanded (without leading 1)                */
     const opus_int              d,                  /* I    Length of ar                                                */
-- 
cgit v1.2.3


From 2985a40afee560dbbbc8dcf63c9eea09b3e2b733 Mon Sep 17 00:00:00 2001
From: Ralph Giles <giles@thaumas.net>
Date: Tue, 11 May 2021 10:19:53 -0700
Subject: Fix trailing whitespace.

This was introduced in February, and fails the corresponding
check in gitlab ci runs.

Also indent the subsequent lines to match and correct typos.

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 src/opus_encoder.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 321bb2bb..253fe9e8 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -900,10 +900,10 @@ static int decide_dtx_mode(opus_int activity,            /* indicates if this fr
 {
    if (!activity)
    {
-      /* The number of consecutive DTX frames should be within the allowed bounds. 
-      Note that the allowed bound is defined in the Silk headers and assumes 20 ms
-      frames. As this function can be called with any frame length, a conversion to
-      miliseconds is done before the comparisons. */
+      /* The number of consecutive DTX frames should be within the allowed bounds.
+         Note that the allowed bound is defined in the SILK headers and assumes 20 ms
+         frames. As this function can be called with any frame length, a conversion to
+         milliseconds is done before the comparisons. */
       (*nb_no_activity_ms_Q1) += frame_size_ms_Q1;
       if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2)
       {
-- 
cgit v1.2.3


From dfd6c88aaa54a03a61434c413e30c217eb98f1d5 Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Tue, 22 Dec 2020 00:33:56 -0800
Subject: cmake - add support to run ctest on android #2347

Signed-off-by: Ralph Giles <giles@thaumas.net>
---
 CMakeLists.txt      | 49 ++++++++++++++++++++++++------------------
 Makefile.am         |  1 +
 cmake/RunTest.cmake | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 20 deletions(-)
 create mode 100644 cmake/RunTest.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a28f441c..02de9b74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -579,7 +579,7 @@ if(OPUS_BUILD_PROGRAMS)
   target_link_libraries(opus_compare PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
 endif()
 
-if(BUILD_TESTING)
+if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
   enable_testing()
 
   # tests
@@ -590,29 +590,38 @@ if(BUILD_TESTING)
   if(OPUS_FIXED_POINT)
     target_compile_definitions(test_opus_decode PRIVATE DISABLE_FLOAT_API)
   endif()
-  add_test(NAME test_opus_decode COMMAND $<TARGET_FILE:test_opus_decode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_test(NAME test_opus_decode COMMAND ${CMAKE_COMMAND}
+           -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_decode>
+           -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+           -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
 
   add_executable(test_opus_padding ${test_opus_padding_sources})
   target_include_directories(test_opus_padding
                              PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_link_libraries(test_opus_padding PRIVATE opus)
-  add_test(NAME test_opus_padding COMMAND $<TARGET_FILE:test_opus_padding> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-
-  if(NOT BUILD_SHARED_LIBS)
-    # disable tests that depends on private API when building shared lib
-    add_executable(test_opus_api ${test_opus_api_sources})
-    target_include_directories(test_opus_api
-                               PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
-    target_link_libraries(test_opus_api PRIVATE opus)
-    if(OPUS_FIXED_POINT)
-      target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
-    endif()
-    add_test(NAME test_opus_api COMMAND $<TARGET_FILE:test_opus_api> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-
-    add_executable(test_opus_encode ${test_opus_encode_sources})
-    target_include_directories(test_opus_encode
-                               PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
-    target_link_libraries(test_opus_encode PRIVATE opus)
-    add_test(NAME test_opus_encode COMMAND $<TARGET_FILE:test_opus_encode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_test(NAME test_opus_padding COMMAND ${CMAKE_COMMAND}
+           -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_padding>
+           -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+           -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
+
+  add_executable(test_opus_api ${test_opus_api_sources})
+  target_include_directories(test_opus_api
+                            PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
+  target_link_libraries(test_opus_api PRIVATE opus)
+  if(OPUS_FIXED_POINT)
+    target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
   endif()
+  add_test(NAME test_opus_api COMMAND ${CMAKE_COMMAND}
+        -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_api>
+        -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+        -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
+
+  add_executable(test_opus_encode ${test_opus_encode_sources})
+  target_include_directories(test_opus_encode
+                            PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
+  target_link_libraries(test_opus_encode PRIVATE opus)
+  add_test(NAME test_opus_encode COMMAND ${CMAKE_COMMAND}
+        -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_encode>
+        -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+        -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
 endif()
diff --git a/Makefile.am b/Makefile.am
index 83beaa3f..70a2ebfa 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -222,6 +222,7 @@ EXTRA_DIST = opus.pc.in \
              cmake/OpusFunctions.cmake \
              cmake/OpusPackageVersion.cmake \
              cmake/OpusSources.cmake \
+             cmake/RunTest.cmake \
              cmake/config.h.cmake.in \
              cmake/vla.c \
              meson/get-version.py \
diff --git a/cmake/RunTest.cmake b/cmake/RunTest.cmake
new file mode 100644
index 00000000..f6f8b4a2
--- /dev/null
+++ b/cmake/RunTest.cmake
@@ -0,0 +1,61 @@
+if(NOT EXISTS ${TEST_EXECUTABLE})
+    message(FATAL_ERROR "Error could not find ${TEST_EXECUTABLE}, ensure that you built the test binary")
+endif()
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+
+  # support to run plain old binary on android devices
+  # requires android debug bridge to be installed
+
+  find_program(adb_executable adb)
+  if(NOT adb_executable)
+    message(FATAL_ERROR "Error could not find adb")
+  endif()
+
+  # check if any device emulator is attached
+  execute_process(COMMAND ${adb_executable} shell echo RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error adb: no devices/emulators found")
+  endif()
+
+  # push binary
+  set(android_path /data/local/tmp)
+  execute_process(COMMAND ${adb_executable} push ${TEST_EXECUTABLE} ${android_path} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} push ${TEST_EXECUTABLE} ${android_path} failed with result ${CMD_RESULT}")
+  endif()
+
+  # set permissions
+  get_filename_component(test_executable ${TEST_EXECUTABLE} NAME)
+  set(test_executable_on_android /data/local/tmp/${test_executable})
+  execute_process(COMMAND ${adb_executable} shell chmod 555 ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell chmod 555 ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+  # run executable
+  execute_process(COMMAND ${adb_executable} shell ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+  # clean up binary
+  execute_process(COMMAND ${adb_executable} shell rm ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell rm ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+elseif(CMAKE_SYSTEM_NAME STREQUAL "iOS")
+  # CTest doesn't support iOS
+
+  message(FATAL_ERROR "Error CTest is not supported on iOS")
+
+else()
+  # for other platforms just execute test binary on host
+
+  execute_process(COMMAND ${TEST_EXECUTABLE} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${TEST_EXECUTABLE} failed with result ${CMD_RESULT}")
+  endif()
+
+endif()
\ No newline at end of file
-- 
cgit v1.2.3


From 4b21ff9c5421ac563b57275b99665d721a0b5ed3 Mon Sep 17 00:00:00 2001
From: Felicia Lim <flim@google.com>
Date: Mon, 7 Jun 2021 16:35:27 -0700
Subject: Relax comparison to 0 to avoid a floating point divide-by-zero error.

---
 celt/celt_lpc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 457e7ed0..5ac54b27 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -50,7 +50,11 @@ int          p
 #endif
 
    OPUS_CLEAR(lpc, p);
-   if (ac[0] != 0)
+#ifdef FIXED_POINT
+   if (ac[0] > QCONST32(0.001f, 31))
+#else
+   if (ac[0] > 1e-10f)
+#endif
    {
       for (i = 0; i < p; i++) {
          /* Sum up this iteration's reflection coefficient */
@@ -73,10 +77,10 @@ int          p
          error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
          /* Bail out once we get 30 dB gain */
 #ifdef FIXED_POINT
-         if (error<SHR32(ac[0],10))
+         if (error<=SHR32(ac[0],10))
             break;
 #else
-         if (error<.001f*ac[0])
+         if (error<=.001f*ac[0])
             break;
 #endif
       }
-- 
cgit v1.2.3


From 1639592368fc2dadc82d63f3be6f17ed0b554d71 Mon Sep 17 00:00:00 2001
From: Felicia Lim <flim@google.com>
Date: Thu, 10 Jun 2021 11:54:55 -0700
Subject: Revert relaxing comparison to 0 for fixed point only

---
 celt/celt_lpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 5ac54b27..242e6df5 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -51,7 +51,7 @@ int          p
 
    OPUS_CLEAR(lpc, p);
 #ifdef FIXED_POINT
-   if (ac[0] > QCONST32(0.001f, 31))
+   if (ac[0] != 0)
 #else
    if (ac[0] > 1e-10f)
 #endif
-- 
cgit v1.2.3


From 61747bc6ec728de69d54db6ece90ad4617f059b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim-Philipp=20M=C3=BCller?= <tim@centricular.com>
Date: Tue, 13 Apr 2021 12:41:18 +0100
Subject: meson: fix get-version script for git worktrees

For git worktree directories .git is not a directory
but a file that points to the real .git dir.

The `update_version` script used by other builds
works correctly with git worktrees.

Signed-off-by: Ralph Giles <giles@thaumas.net>
---
 meson/get-version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/meson/get-version.py b/meson/get-version.py
index 0e8b8623..d3835f13 100755
--- a/meson/get-version.py
+++ b/meson/get-version.py
@@ -31,7 +31,7 @@ if __name__ == '__main__':
 
         # check if git checkout
         git_dir = os.path.join(srcroot, '.git')
-        is_git = os.path.isdir(git_dir)
+        is_git = os.path.isdir(git_dir) or os.path.isfile(git_dir)
         have_git = shutil.which('git') is not None
 
         if is_git and have_git:
-- 
cgit v1.2.3


From 6b6035ae4a29abbd237463d84a45fbeb0d92bc18 Mon Sep 17 00:00:00 2001
From: Felicia Lim <flim@google.com>
Date: Wed, 7 Jul 2021 12:21:20 -0700
Subject: Remove an unused parameter

---
 celt/bands.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/celt/bands.c b/celt/bands.c
index 2702963c..bd54036a 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
    sctx->itheta = itheta;
    sctx->qalloc = qalloc;
 }
-static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
       celt_norm *lowband_out)
 {
    int c;
@@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
             sign = ec_dec_bits(ec, 1);
          }
          ctx->remaining_bits -= 1<<BITRES;
-         b-=1<<BITRES;
       }
       if (ctx->resynth)
          x[0] = sign ? -NORM_SCALING : NORM_SCALING;
@@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, NULL, b, lowband_out);
+      return quant_band_n1(ctx, X, NULL, lowband_out);
    }
 
    if (tf_change>0)
@@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, Y, b, lowband_out);
+      return quant_band_n1(ctx, X, Y, lowband_out);
    }
 
    orig_fill = fill;
-- 
cgit v1.2.3


From a8e6a77c5fe0c37aa6788f939f24f8cd22ae2652 Mon Sep 17 00:00:00 2001
From: Felicia Lim <flim@google.com>
Date: Mon, 27 Sep 2021 21:52:22 -0700
Subject: Check channels/stream counts and mapping when creating the
 multistream encoder

---
 src/opus_multistream_encoder.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index 93204a14..213e3eb2 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -443,7 +443,8 @@ static int opus_multistream_encoder_init_impl(
    char *ptr;
 
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+       (streams+coupled_streams>channels))
       return OPUS_BAD_ARG;
 
    st->arch = opus_select_arch();
@@ -459,8 +460,7 @@ static int opus_multistream_encoder_init_impl(
       st->layout.mapping[i] = mapping[i];
    if (!validate_layout(&st->layout))
       return OPUS_BAD_ARG;
-   if (mapping_type == MAPPING_TYPE_SURROUND &&
-       !validate_encoder_layout(&st->layout))
+   if (!validate_encoder_layout(&st->layout))
       return OPUS_BAD_ARG;
    if (mapping_type == MAPPING_TYPE_AMBISONICS &&
        !validate_ambisonics(st->layout.nb_channels, NULL, NULL))
@@ -595,7 +595,8 @@ OpusMSEncoder *opus_multistream_encoder_create(
    int ret;
    OpusMSEncoder *st;
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+       (streams+coupled_streams>channels))
    {
       if (error)
          *error = OPUS_BAD_ARG;
-- 
cgit v1.2.3


From ec64b3c5b7abd621dfddee6b4cc115298e5d6803 Mon Sep 17 00:00:00 2001
From: Felicia Lim <flim@google.com>
Date: Thu, 9 Dec 2021 12:54:43 -0800
Subject: Fix buffer overflow in xcorr_kernel_sse4_1

Before, an overflow can occur in the last loop if `len` is not a
multiple of 4 as OP_CVTEPI16_EPI32_M64 tries to load 64 bits, but there
are insufficient bits allocated in `x`.
---
 celt/x86/pitch_sse4_1.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68b..58db6c7f 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,11 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
     __m128i sum0, sum1, sum2, sum3, vecSum;
     __m128i initSum;
 
+#ifdef OPUS_CHECK_ASM
+    opus_val32 sum_c[4]={0,0,0,0};
+    xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
     celt_assert(len >= 3);
 
     sum0 = _mm_setzero_si128();
@@ -177,19 +182,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
         vecSum = _mm_add_epi32(vecSum, sum2);
     }
 
-    for (;j<len;j++)
+    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+    if (len - j == 3)
     {
-        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
-        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+    else if (len - j == 2)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xff);
 
         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
 
         sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+    }
+    else if (len - j == 1)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
         vecSum = _mm_add_epi32(vecSum, sum0);
     }
 
     initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
     initSum = _mm_add_epi32(initSum, vecSum);
     _mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
 }
 #endif
-- 
cgit v1.2.3


From 12a356e431d1b2d3531d3d73de330bf9ee9be48b Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Sat, 18 Dec 2021 18:07:59 -0500
Subject: Disable dangerous SSE 4.1 intrinsic optimizations

These could result in 16-byte-aligned loads on unaligned data, causing
a segfault.
---
 celt/x86/x86cpu.h | 34 ++--------------------------------
 1 file changed, 2 insertions(+), 32 deletions(-)

diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 1e2bf17b..0de8df35 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,40 +56,10 @@
 int opus_select_arch(void);
 # endif
 
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
-  or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
-  actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
-  reference, these require 16-byte alignment and load a full 16 bytes (instead
-  of 4 or 8), possibly reading out of bounds.
-
-  We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
-  _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
-  reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
-  optimize this out when optimizations ARE enabled.
-
-  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
-  (which is fair, since technically the compiler is always allowed to do the
-  dereference before invoking the function implementing the intrinsic).
-  However, it is smart enough to eliminate the extra MOVD instruction.
-  For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
-  the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI8_EPI32_M32(x) \
+#define OP_CVTEPI8_EPI32_M32(x) \
  (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
-#  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
 
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI16_EPI32_M64(x) \
+#define OP_CVTEPI16_EPI32_M64(x) \
  (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
-#  define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
-# endif
 
 #endif
-- 
cgit v1.2.3


From 66d060c7734c9b780d0183f6565d474fc246b84b Mon Sep 17 00:00:00 2001
From: Tom Denton <tomdenton@google.com>
Date: Thu, 6 Jan 2022 09:35:12 -0800
Subject: Initialize non-zero test arrays.

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
---
 celt/x86/pitch_sse4_1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index 58db6c7f..2bc57830 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -118,7 +118,10 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
     __m128i initSum;
 
 #ifdef OPUS_CHECK_ASM
-    opus_val32 sum_c[4]={0,0,0,0};
+    opus_val32 sum_c[4];
+    for (j=0;j<4;j++) {
+      sum_c[j] = sum[j];
+    }
     xcorr_kernel_c(x, y, sum_c, len);
 #endif
 
-- 
cgit v1.2.3


From 37aba6e9b382f7dbdb7916adbc335704cf2992e8 Mon Sep 17 00:00:00 2001
From: Tom Denton <tomdenton@google.com>
Date: Mon, 7 Feb 2022 16:34:41 -0800
Subject: Prevent int32 overflow when applying HARM FIR filter in NSQ.c by
 using a saturating sum. This matches behavior in NSQ_del_dec.c.

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
---
 silk/NSQ.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/silk/NSQ.c b/silk/NSQ.c
index 1d64d8e2..ab8f9b6c 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -262,7 +262,7 @@ void silk_noise_shape_quantizer(
         tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
             shp_lag_ptr++;
-- 
cgit v1.2.3


From c6f98577716d39907264d5388b74b5be5dea3d6c Mon Sep 17 00:00:00 2001
From: Francis Quiers <fquiers@cisco.com>
Date: Mon, 18 May 2020 17:15:05 +0100
Subject: Update and re-enable SILK SSE4.1 optimisations

---
 silk/NSQ.c                                |  34 ++---
 silk/NSQ_del_dec.c                        |  28 ++--
 silk/SigProc_FIX.h                        |   6 +-
 silk/VQ_WMat_EC.c                         |   4 +-
 silk/fixed/burg_modified_FIX.c            |   8 +-
 silk/fixed/vector_ops_FIX.c               |   2 +-
 silk/fixed/x86/burg_modified_FIX_sse4_1.c |  69 ++++++----
 silk/fixed/x86/prefilter_FIX_sse.c        | 160 ----------------------
 silk/fixed/x86/vector_ops_FIX_sse4_1.c    |  40 +++---
 silk/main.h                               |  60 ++++-----
 silk/x86/NSQ_del_dec_sse4_1.c             | 179 ++++++++++++++++---------
 silk/x86/NSQ_sse4_1.c                     | 211 +++++++++++++++++++-----------
 silk/x86/SigProc_FIX_sse.h                |  12 +-
 silk/x86/VAD_sse4_1.c                     |  28 ++--
 silk/x86/VQ_WMat_EC_sse4_1.c              | 189 +++++++++++++++-----------
 silk/x86/main_sse.h                       | 170 ++++++++++++------------
 silk/x86/x86_silk_map.c                   |  89 ++++++-------
 17 files changed, 650 insertions(+), 639 deletions(-)
 delete mode 100644 silk/fixed/x86/prefilter_FIX_sse.c

diff --git a/silk/NSQ.c b/silk/NSQ.c
index ab8f9b6c..45dd45ce 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -75,21 +75,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
 
 void silk_NSQ_c
 (
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            k, lag, start_idx, LSF_interpolation_flag;
@@ -173,9 +173,9 @@ void silk_NSQ_c
     RESTORE_STACK;
 }
 
-/***********************************/
-/* silk_noise_shape_quantizer  */
-/***********************************/
+/******************************/
+/* silk_noise_shape_quantizer */
+/******************************/
 
 #if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 static OPUS_INLINE
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index 00e749c3..41f3fc93 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -115,21 +115,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
 );
 
 void silk_NSQ_del_dec_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index f9ae3263..1d9bf2f1 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
     const opus_int              len                 /*    I vector lengths                                              */
 );
 
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
@@ -613,8 +613,8 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
     ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
 #endif
 
 #include "Inlines.h"
diff --git a/silk/VQ_WMat_EC.c b/silk/VQ_WMat_EC.c
index 0f3d545c..245a7e4b 100644
--- a/silk/VQ_WMat_EC.c
+++ b/silk/VQ_WMat_EC.c
@@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
     *rate_dist_Q8 = silk_int32_MAX;
     *res_nrg_Q15 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
-    /* In things go really bad, at least *ind is set to something safe. */
+    /* If things go really bad, at least *ind is set to something safe. */
     *ind = 0;
     for( k = 0; k < L; k++ ) {
         opus_int32 penalty;
@@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
         if( sum1_Q15 >= 0 ) {
             /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
             bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
-            /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
             bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
             if( bits_tot_Q8 <= *rate_dist_Q8 ) {
                 *rate_dist_Q8 = bits_tot_Q8;
diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c
index 274d4b28..185a12b1 100644
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -68,7 +68,7 @@ void silk_burg_modified_c(
     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
     lz = silk_CLZ64(C0_64);
     rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
     if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@@ -87,7 +87,7 @@ void silk_burg_modified_c(
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -150,7 +150,7 @@ void silk_burg_modified_c(
                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
-                    /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
                        but they cancel each other and the real result seems to always fit in a 32-bit
                        signed integer. This was determined experimentally, not theoretically (unfortunately). */
                     tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
@@ -253,7 +253,7 @@ void silk_burg_modified_c(
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c
index d9498001..dcf84070 100644
--- a/silk/fixed/vector_ops_FIX.c
+++ b/silk/fixed/vector_ops_FIX.c
@@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
 #endif
 }
 
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
diff --git a/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
index bbb1ce0f..e58bf079 100644
--- a/silk/fixed/x86/burg_modified_FIX_sse4_1.c
+++ b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -42,7 +42,7 @@
 #define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
 
 #define QA                          25
-#define N_BITS_HEAD_ROOM            2
+#define N_BITS_HEAD_ROOM            3
 #define MIN_RSHIFTS                 -16
 #define MAX_RSHIFTS                 (32 - QA)
 
@@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
     int                         arch                /* I    Run-time architecture                                       */
 )
 {
-    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+    opus_int         k, n, s, lz, rshifts, reached_max_gain;
     opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
     const opus_int16 *x_ptr;
     opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
@@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
     opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
+    opus_int64       C0_64;
 
     __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
     __m128i CONST1 = _mm_set1_epi32(1);
@@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
-    if( rshifts > MAX_RSHIFTS ) {
-        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
-        silk_assert( C0 > 0 );
-        rshifts = MAX_RSHIFTS;
+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
+    lz = silk_CLZ64(C0_64);
+    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
+    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
+    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
+
+    if (rshifts > 0) {
+        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
     } else {
-        lz = silk_CLZ32( C0 ) - 1;
-        rshifts_extra = N_BITS_HEAD_ROOM - lz;
-        if( rshifts_extra > 0 ) {
-            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
-            C0 = silk_RSHIFT32( C0, rshifts_extra );
-        } else {
-            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
-            C0 = silk_LSHIFT32( C0, -rshifts_extra );
-        }
-        rshifts += rshifts_extra;
+        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
     }
+
     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
     silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
     if( rshifts > 0 ) {
@@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
-                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
-                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
+                       but they cancel each other and the real result seems to always fit in a 32-bit
+                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
+                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
+                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
                 }
 
                 tmp1 = -tmp1;                /* Q17 */
@@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
@@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
         *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
         *res_nrg_Q = -rshifts;
     }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int32 res_nrg_c = 0;
+        opus_int res_nrg_Q_c = 0;
+        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
+
+        silk_burg_modified_c(
+            &res_nrg_c,
+            &res_nrg_Q_c,
+            A_Q16_c,
+            x,
+            minInvGain_Q30,
+            subfr_length,
+            nb_subfr,
+            D,
+            0
+        );
+
+        silk_assert( *res_nrg == res_nrg_c );
+        silk_assert( *res_nrg_Q == res_nrg_Q_c );
+        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
+    }
+#endif
 }
diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c
deleted file mode 100644
index 555432cd..00000000
--- a/silk/fixed/x86/prefilter_FIX_sse.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions
-   are met:
-
-   - Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
-#include "main.h"
-#include "celt/x86/x86cpu.h"
-
-void silk_warped_LPC_analysis_filter_FIX_sse4_1(
-    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
-    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
-    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
-    const opus_int16            input[],                    /* I    Input signal [length]               */
-    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
-    const opus_int              length,                     /* I    Length of input signal              */
-    const opus_int              order                       /* I    Filter order (even)                 */
-)
-{
-    opus_int     n, i;
-    opus_int32   acc_Q11, tmp1, tmp2;
-
-    /* Order must be even */
-    celt_assert( ( order & 1 ) == 0 );
-
-    if (order == 10)
-    {
-        if (0 == lambda_Q16)
-        {
-            __m128i coef_Q13_3210, coef_Q13_7654;
-            __m128i coef_Q13_0123, coef_Q13_4567;
-            __m128i state_0123, state_4567;
-            __m128i xmm_product1, xmm_product2;
-            __m128i xmm_tempa, xmm_tempb;
-
-            register opus_int32 sum;
-            register opus_int32 state_8, state_9, state_a;
-            register opus_int64 coef_Q13_8, coef_Q13_9;
-
-            celt_assert( length > 0 );
-
-            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
-            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
-
-            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
-            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
-            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
-
-            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
-            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
-
-            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
-            state_8 = state[ 8 ];
-            state_9 = state[ 9 ];
-            state_a = 0;
-
-            for( n = 0; n < length; n++ )
-            {
-                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
-                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
-
-                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
-                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
-                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
-
-                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
-                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
-
-                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
-                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
-
-                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
-                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
-                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
-
-                sum  = (opus_int32)((coef_Q13_8 * state_8) >> 16);
-                sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
-
-                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
-                sum += _mm_cvtsi128_si32( xmm_tempa);
-                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
-
-                /* move right */
-                state_a = state_9;
-                state_9 = state_8;
-                state_8 = _mm_cvtsi128_si32( state_4567 );
-                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
-
-                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
-            }
-
-            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
-            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
-            state[ 8 ] = state_8;
-            state[ 9 ] = state_9;
-            state[ 10 ] = state_a;
-
-            return;
-        }
-    }
-
-    for( n = 0; n < length; n++ ) {
-        /* Output of lowpass section */
-        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
-        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
-        /* Output of allpass section */
-        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
-        state[ 1 ] = tmp2;
-        acc_Q11 = silk_RSHIFT( order, 1 );
-        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
-        /* Loop over allpass sections */
-        for( i = 2; i < order; i += 2 ) {
-            /* Output of allpass section */
-            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
-            state[ i ] = tmp1;
-            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
-            /* Output of allpass section */
-            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
-            state[ i + 1 ] = tmp2;
-            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
-        }
-        state[ order ] = tmp1;
-        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
-        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
-    }
-}
diff --git a/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
index c1e90564..0cfb08d9 100644
--- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -37,39 +37,36 @@
 #include "SigProc_FIX.h"
 #include "pitch.h"
 
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
 )
 {
-    opus_int  i, dataSize8;
+    opus_int  i, dataSize4;
     opus_int64 sum;
 
-    __m128i xmm_tempa;
-    __m128i inVec1_76543210, acc1;
-    __m128i inVec2_76543210, acc2;
+    __m128i xmm_prod_20, xmm_prod_31;
+    __m128i inVec1_3210, acc1;
+    __m128i inVec2_3210, acc2;
 
     sum = 0;
-    dataSize8 = len & ~7;
+    dataSize4 = len & ~3;
 
     acc1 = _mm_setzero_si128();
     acc2 = _mm_setzero_si128();
 
-    for( i = 0; i < dataSize8; i += 8 ) {
-        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
-        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+    for( i = 0; i < dataSize4; i += 4 ) {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
+        xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
 
-        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
-        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+        inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
 
-        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
-        /* equal shift right 8 bytes */
-        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
-        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
-
-        acc1 = _mm_add_epi64( acc1, xmm_tempa );
-        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+        acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
+        acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
     }
 
     acc1 = _mm_add_epi64( acc1, acc2 );
@@ -84,5 +81,12 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
         sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
     }
 
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
+        silk_assert( sum == sum_c );
+    }
+#endif
+
     return sum;
 }
diff --git a/silk/main.h b/silk/main.h
index 1a33eed5..a5f56875 100644
--- a/silk/main.h
+++ b/silk/main.h
@@ -247,21 +247,21 @@ void silk_VQ_WMat_EC_c(
 /************************************/
 
 void silk_NSQ_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int16            x16[],                                      /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if !defined(OVERRIDE_silk_NSQ)
@@ -273,21 +273,21 @@ void silk_NSQ_c(
 
 /* Noise shaping using delayed decision */
 void silk_NSQ_del_dec_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int16            x16[],                                      /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if !defined(OVERRIDE_silk_NSQ_del_dec)
diff --git a/silk/x86/NSQ_del_dec_sse4_1.c b/silk/x86/NSQ_del_dec_sse4_1.c
index 2c75ede2..42735c52 100644
--- a/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/silk/x86/NSQ_del_dec_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -46,6 +46,7 @@ typedef struct {
     opus_int32 Shape_Q14[ DECISION_DELAY ];
     opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
     opus_int32 LF_AR_Q14;
+    opus_int32 Diff_Q14;
     opus_int32 Seed;
     opus_int32 SeedInit;
     opus_int32 RD_Q10;
@@ -56,6 +57,7 @@ typedef struct {
     opus_int32 RD_Q10;
     opus_int32 xq_Q14;
     opus_int32 LF_AR_Q14;
+    opus_int32 Diff_Q14;
     opus_int32 sLTP_shp_Q14;
     opus_int32 LPC_exc_Q14;
 } NSQ_sample_struct;
@@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    const opus_int16    x16[],                      /* I    Input                               */
     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
@@ -112,21 +114,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
 );
 
 void silk_NSQ_del_dec_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
@@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
     VARDECL( opus_int32, delayedGain_Q10 );
     VARDECL( NSQ_del_dec_struct, psDelDec );
     NSQ_del_dec_struct  *psDD;
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+    const opus_int8 *const pulses_a = pulses;
+#endif
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    ( void )pulses_a;
+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+    silk_NSQ_del_dec_c(
+        psEncC,
+        &NSQ_c,
+        &psIndices_c,
+        x16,
+        pulses_c,
+        PredCoef_Q12,
+        LTPCoef_Q14,
+        AR_Q13,
+        HarmShapeGain_Q14,
+        Tilt_Q14,
+        LF_shp_Q14,
+        Gains_Q16,
+        pitchL,
+        Lambda_Q10,
+        LTP_scale_Q14
+    );
+#endif
+
     /* Set unvoiced lag to the previous one, overwrite later for voiced */
     lag = NSQ->lagPrev;
 
@@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
         psDD->SeedInit       = psDD->Seed;
         psDD->RD_Q10         = 0;
         psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
+        psDD->Diff_Q14       = NSQ->sDiff_shp_Q14;
         psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
         silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
         silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
         LSF_interpolation_flag = 1;
     }
 
-    ALLOC( sLTP_Q15,
-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
     ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
     for( k = 0; k < psEncC->nb_subfr; k++ ) {
         A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+        AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];
 
         /* Noise shape parameters */
         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
             }
         }
 
-        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
             psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
 
         silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
             psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
 
-        x_Q3   += psEncC->subfr_length;
+        x16    += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
         pxq    += psEncC->subfr_length;
     }
@@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
     for( i = 0; i < decisionDelay; i++ ) {
         last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
         if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+
         pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
         pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
             silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
 
     /* Update states */
     NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+    NSQ->sDiff_shp_Q14  = psDD->Diff_Q14;
     NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
 
     /* Save quantized speech signal */
     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
     RESTORE_STACK;
 }
 
@@ -345,6 +387,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    int          rdo_offset;   /* quantization bias for aggressive RDO, derived from Lambda_Q10 */
     VARDECL( NSQ_sample_pair, psSampleState );
     NSQ_del_dec_struct *psDD;
     NSQ_sample_struct  *psSS;
@@ -356,6 +399,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     celt_assert( nStatesDelayedDecision > 0 );
     ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
 
+    rdo_offset = (Lambda_Q10 >> 1) - 512;   /* assigned after ALLOC; declared with the other locals to stay C90-clean */
+
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
@@ -407,8 +452,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
         /* Long-term shaping */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
-            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
             n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
             shp_lag_ptr++;
         } else {
@@ -478,7 +523,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                     psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
 
-                    /* setp 4 */
+                    /* step 4 */
                     psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
@@ -511,9 +556,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
 
                 /* Noise shape feedback */
-                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+                celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
                 /* Output of lowpass section */
-                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+                tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
                 /* Output of allpass section */
                 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
                 psDD->sAR2_Q14[ 0 ] = tmp2;
@@ -543,9 +588,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
 
                 /* Input minus prediction plus noise feedback                       */
                 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
-                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
+                tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 );                                /* Q14 */
                 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
-                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+                tmp1 = silk_SUB_SAT32( tmp2, tmp1 );                                        /* Q13 */
                 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
 
                 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
@@ -559,6 +604,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 /* Find two quantization level candidates and measure their rate-distortion */
                 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
                 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+                if (Lambda_Q10 > 2048) {
+                    /* For aggressive RDO, the bias becomes more than one pulse. */
+                    if (q1_Q10 > rdo_offset) {
+                        q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+                    } else if (q1_Q10 < -rdo_offset) {
+                        q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+                    } else if (q1_Q10 < 0) {
+                        q1_Q0 = -1;
+                    } else {
+                        q1_Q0 = 0;
+                    }
+                }
                 if( q1_Q0 > 0 ) {
                     q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
                     q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
@@ -612,8 +669,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
 
                 /* Update states */
-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
-                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 0 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+                sLF_AR_shp_Q14         = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
+                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
                 psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                 psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
                 psSS[ 0 ].xq_Q14       = xq_Q14;
@@ -626,14 +684,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                     exc_Q14 = -exc_Q14;
                 }
 
-
                 /* Add predictions */
                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
 
                 /* Update states */
-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
-                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 1 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+                sLF_AR_shp_Q14         = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
+                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
                 psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                 psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
                 psSS[ 1 ].xq_Q14       = xq_Q14;
@@ -705,6 +763,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
             psDD                                     = &psDelDec[ k ];
             psSS                                     = &psSampleState[ k ][ 0 ];
             psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
+            psDD->Diff_Q14                           = psSS->Diff_Q14;
             psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
             psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
             psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
@@ -728,7 +787,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    const opus_int16    x16[],                      /* I    Input                               */
     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
@@ -742,51 +801,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 )
 {
     opus_int            i, k, lag;
-    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
     NSQ_del_dec_struct  *psDD;
-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
 
     lag          = pitchL[ subfr ];
     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
-
     silk_assert( inv_gain_Q31 != 0 );
 
-    /* Calculate gain adjustment factor */
-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
-    } else {
-        gain_adj_Q16 = (opus_int32)1 << 16;
-    }
-
     /* Scale input */
-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
 
-    /* prepare inv_gain_Q23 in packed 4 32-bits */
-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+    /* broadcast inv_gain_Q26 to all four 32-bit lanes */
+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
 
     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
+
         /* equal shift right 4 bytes*/
-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
 
-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
 
-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
     }
 
-    /* Save inverse gain */
-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
     if( NSQ->rewhite_flag ) {
         if( subfr == 0 ) {
@@ -800,7 +849,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     }
 
     /* Adjust for changing gain */
-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
         /* Scale long-term shaping state */
         {
             __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@@ -841,6 +892,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
                 /* Scale scalar states */
                 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+                psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
 
                 /* Scale short-term prediction and shaping states */
                 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -855,5 +907,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
                 }
             }
         }
+
+        /* Save gain so the next subframe can detect a gain change */
+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
     }
 }
diff --git a/silk/x86/NSQ_sse4_1.c b/silk/x86/NSQ_sse4_1.c
index b0315e35..a2a74659 100644
--- a/silk/x86/NSQ_sse4_1.c
+++ b/silk/x86/NSQ_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -37,17 +37,17 @@
 #include "stack_alloc.h"
 
 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
-    opus_int            subfr,                  /* I    subframe number                 */
-    const opus_int      LTP_scale_Q14,          /* I                                    */
-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
-    const opus_int      signal_type             /* I    Signal type                     */
+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
+    const opus_int16    x16[],                     /* I    input                           */
+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
+    opus_int            subfr,                     /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,             /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
+    const opus_int      signal_type                /* I    Signal type                     */
 );
 
 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
@@ -65,27 +65,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
     opus_int32          LF_shp_Q14,             /* I                                    */
     opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I                                    */
     opus_int            offset_Q10,             /* I                                    */
     opus_int            length,                 /* I    Input length                    */
     opus_int32          table[][4]              /* I                                    */
 );
 
 void silk_NSQ_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            k, lag, start_idx, LSF_interpolation_flag;
@@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
     opus_int32   tmp1;
     opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
 
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+    const opus_int8 *const pulses_a = pulses;
+#endif
+
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    ( void )pulses_a;
+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+
+    silk_NSQ_c(
+        psEncC,
+        &NSQ_c,
+        &psIndices_c,
+        x16,
+        pulses_c,
+        PredCoef_Q12,
+        LTPCoef_Q14,
+        AR_Q13,
+        HarmShapeGain_Q14,
+        Tilt_Q14,
+        LF_shp_Q14,
+        Gains_Q16,
+        pitchL,
+        Lambda_Q10,
+        LTP_scale_Q14
+    );
+#endif
+
     NSQ->rand_seed = psIndices->Seed;
 
     /* Set unvoiced lag to the previous one, overwrite later for voiced */
@@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
         LSF_interpolation_flag = 1;
     }
 
-    ALLOC( sLTP_Q15,
-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
     /* Set up pointers to start of sub frame */
@@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
     for( k = 0; k < psEncC->nb_subfr; k++ ) {
         A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
 
         /* Noise shape parameters */
         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
             }
         }
 
-        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
 
         if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
         {
             silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
-                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                 offset_Q10, psEncC->subfr_length, &(table[32]) );
         }
         else
@@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
                 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
         }
 
-        x_Q3   += psEncC->subfr_length;
+        x16    += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
         pxq    += psEncC->subfr_length;
     }
@@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
     /* Save quantized speech and noise shaping signals */
     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
     RESTORE_STACK;
 }
 
-/***********************************/
-/* silk_noise_shape_quantizer_10_16  */
-/***********************************/
+/************************************/
+/* silk_noise_shape_quantizer_10_16 */
+/************************************/
 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
     opus_int            signalType,             /* I    Signal type                     */
@@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
     opus_int32          LF_shp_Q14,             /* I                                    */
     opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I                                    */
     opus_int            offset_Q10,             /* I                                    */
     opus_int            length,                 /* I    Input length                    */
     opus_int32          table[][4]              /* I                                    */
@@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     opus_int     i;
     opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
     opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
-    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
 
@@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
     __m128i AR_shp_Q13_76543210;
 
+    int rdo_offset = (Lambda_Q10 >> 1) - 512;
+
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
@@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
 
     sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
     xq_Q14         = psLPC_Q14[ 0 ];
+    sDiff_shp_Q14  = NSQ->sDiff_shp_Q14;
     LTP_pred_Q13   = 0;
 
     /* load a_Q12 */
@@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
         sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
 
-        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
-        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );
+        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
+        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14),       0 );
 
         /* high part, use pmaddwd, results in 4 32-bit */
         xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
         n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
 
-        silk_assert( lag > 0 || signalType != TYPE_VOICED );
+        celt_assert( lag > 0 || signalType != TYPE_VOICED );
 
         /* Combine prediction and noise shaping signals */
         tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
         tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
             shp_lag_ptr++;
@@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         /* Find two quantization level candidates and measure their rate-distortion */
         q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
         q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+        if (Lambda_Q10 > 2048) {
+            /* For aggressive RDO, the bias becomes more than one pulse. */
+            if (q1_Q10 > rdo_offset) {
+                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+            } else if (q1_Q10 < -rdo_offset) {
+                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+            } else if (q1_Q10 < 0) {
+                q1_Q0 = -1;
+            } else {
+                q1_Q0 = 0;
+            }
+        }
 
         q1_Q10 = table[q1_Q0][0];
         q2_Q10 = table[q1_Q0][1];
@@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         /* Update states */
         psLPC_Q14++;
         *psLPC_Q14 = xq_Q14;
-        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
+        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
 
         NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
         sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@@ -600,64 +657,54 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
 }
 
 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
-    opus_int            subfr,                  /* I    subframe number                 */
-    const opus_int      LTP_scale_Q14,          /* I                                    */
-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
-    const opus_int      signal_type             /* I    Signal type                     */
+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
+    const opus_int16    x16[],                     /* I    input                           */
+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
+    opus_int            subfr,                     /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,             /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
+    const opus_int      signal_type                /* I    Signal type                     */
 )
 {
     opus_int   i, lag;
-    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
 
     lag          = pitchL[ subfr ];
     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
     silk_assert( inv_gain_Q31 != 0 );
 
-    /* Calculate gain adjustment factor */
-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
-    } else {
-        gain_adj_Q16 = (opus_int32)1 << 16;
-    }
-
     /* Scale input */
-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
 
-    /* prepare inv_gain_Q23 in packed 4 32-bits */
-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+    /* prepare inv_gain_Q26 in packed 4 32-bits */
+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
 
     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
 
         /* equal shift right 4 bytes*/
-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
 
-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
 
-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
     }
 
-    /* Save inverse gain */
-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
     if( NSQ->rewhite_flag ) {
         if( subfr == 0 ) {
@@ -671,7 +718,9 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
     }
 
     /* Adjust for changing gain */
-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
         /* Scale long-term shaping state */
         __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
 
@@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
         }
 
         NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
 
         /* Scale short-term prediction and shaping states */
         for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
         for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
             NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
         }
+
+        /* Save inverse gain */
+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
     }
 }
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 61efa8da..e49d5d4e 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -67,7 +67,7 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
 
 #endif
 
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
@@ -76,18 +76,18 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
 
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
 
 #else
 
-extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
+extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_int16 *inVec1,
                     const opus_int16 *inVec2,
                     const opus_int   len);
 
-#  define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+#  define silk_inner_prod16(inVec1, inVec2, len, arch) \
+    ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
 
 #endif
 #endif
diff --git a/silk/x86/VAD_sse4_1.c b/silk/x86/VAD_sse4_1.c
index d02ddf4a..e7eaf971 100644
--- a/silk/x86/VAD_sse4_1.c
+++ b/silk/x86/VAD_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
 
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    silk_encoder_state psEncC_c;
+    opus_int ret_c;
+
+    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
+    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
+#endif
+
     /* Safety checks */
     silk_assert( VAD_N_BANDS == 4 );
     celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
     }
 
+    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
+        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
+    }
     /* Power scaling */
     if( speech_nrg <= 0 ) {
         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
-    } else if( speech_nrg < 32768 ) {
-        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
-        } else {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
-        }
+    } else if( speech_nrg < 16384 ) {
+        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
 
         /* square-root */
         speech_nrg = silk_SQRT_APPROX( speech_nrg );
@@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
     }
 
+#ifdef OPUS_CHECK_ASM
+    silk_assert( ret == ret_c );
+    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
+#endif
+
     RESTORE_STACK;
     return( ret );
 }
diff --git a/silk/x86/VQ_WMat_EC_sse4_1.c b/silk/x86/VQ_WMat_EC_sse4_1.c
index 74d6c6d0..2c7d18d0 100644
--- a/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -38,105 +38,136 @@
 /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 )
 {
     opus_int   k, gain_tmp_Q7;
     const opus_int8 *cb_row_Q7;
-    opus_int16 diff_Q14[ 5 ];
-    opus_int32 sum1_Q14, sum2_Q16;
+    opus_int32 neg_xX_Q24[ 5 ];
+    opus_int32 sum1_Q15, sum2_Q24;
+    opus_int32 bits_res_Q8, bits_tot_Q8;
+    __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
+
+    /* Negate and convert to new Q domain */
+    neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
+    neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
+    neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
+    neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
+    neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
+
+    v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
+    v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
     /* Loop over codebook */
-    *rate_dist_Q14 = silk_int32_MAX;
+    *rate_dist_Q8 = silk_int32_MAX;
+    *res_nrg_Q15 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
+    /* Default *ind to 0 so it holds a valid index even if no codebook vector is accepted below. */
+    *ind = 0;
     for( k = 0; k < L; k++ ) {
+        opus_int32 penalty;
         gain_tmp_Q7 = cb_gain_Q7[k];
-
-        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
-
-        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
-        C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
-        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
-        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
-
-        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
-        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
-        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
-        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
-
         /* Weighted rate */
-        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+        /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
+        sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
 
         /* Penalty for too large gain */
-        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
-
-        silk_assert( sum1_Q14 >= 0 );
-
-        /* first row of W_Q18 */
-        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
-        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
-        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
-
-        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-
-        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
-        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
-
-        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
-        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
-
-        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
-        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
-
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  0 ], diff_Q14[ 0 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 0 ] );
-
-        /* second row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );
-
-        /* third row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );
-
-        /* fourth row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );
-
-        /* last row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );
-
-        silk_assert( sum1_Q14 >= 0 );
+        penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
+
+        /* first row of XX_Q17 */
+        v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+        v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
+        v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
+        v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
+        v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+        v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
+        sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
+        sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  0 ], cb_row_Q7[ 0 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 0 ] );
+
+        /* second row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[  7 ], cb_row_Q7[ 2 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  8 ], cb_row_Q7[ 3 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  9 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  6 ], cb_row_Q7[ 1 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 1 ] );
+
+        /* third row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 2 ] );
+
+        /* fourth row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 3 ] );
+
+        /* last row of XX_Q17 */
+        sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 4 ] );
 
         /* find best */
-        if( sum1_Q14 < *rate_dist_Q14 ) {
-            *rate_dist_Q14 = sum1_Q14;
-            *ind = (opus_int8)k;
-            *gain_Q7 = gain_tmp_Q7;
+        if( sum1_Q15 >= 0 ) {
+            /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
+            bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
+            /* cl_Q5 would need a shift of 3 to reach Q8; shifting by 3-1 halves the codelength component, which seems to slightly improve quality */
+            bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
+            if( bits_tot_Q8 <= *rate_dist_Q8 ) {
+                *rate_dist_Q8 = bits_tot_Q8;
+                *res_nrg_Q15 = sum1_Q15 + penalty;
+                *ind = (opus_int8)k;
+                *gain_Q7 = gain_tmp_Q7;
+            }
         }
 
         /* Go to next cbk vector */
         cb_row_Q7 += LTP_ORDER;
     }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int8  ind_c = 0;
+        opus_int32 res_nrg_Q15_c = 0;
+        opus_int32 rate_dist_Q8_c = 0;
+        opus_int   gain_Q7_c = 0;
+
+        silk_VQ_WMat_EC_c(
+            &ind_c,
+            &res_nrg_Q15_c,
+            &rate_dist_Q8_c,
+            &gain_Q7_c,
+            XX_Q17,
+            xX_Q17,
+            cb_Q7,
+            cb_gain_Q7,
+            cl_Q5,
+            subfr_len,
+            max_gain_Q7,
+            L
+        );
+
+        silk_assert( *ind == ind_c );
+        silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
+        silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
+        silk_assert( *gain_Q7 == gain_Q7_c );
+    }
+#endif
 }
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index 2f15d448..0a0391a2 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -34,73 +34,72 @@
 
 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
 #  define OVERRIDE_silk_VQ_WMat_EC
 
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
 #if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L, arch) \
-    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L))
+#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L, arch) \
+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L))
 
 #else
 
 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#  define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L, arch) \
-    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L))
+#  define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L, arch) \
+    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L))
 
 #endif
-#endif
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 #  define OVERRIDE_silk_NSQ
 
 void silk_NSQ_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if defined OPUS_X86_PRESUME_SSE4_1
@@ -113,21 +112,21 @@ void silk_NSQ_sse4_1(
 #else
 
 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #  define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
@@ -140,57 +139,56 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
 #  define OVERRIDE_silk_NSQ_del_dec
 
 void silk_NSQ_del_dec_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
 #else
 
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
 #endif
-#endif
 
 void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index 32dcc3ca..ca13cde9 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -41,16 +41,16 @@
 
 #include "fixed/main_FIX.h"
 
-opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
+opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
 ) = {
-  silk_inner_prod16_aligned_64_c,                  /* non-sse */
-  silk_inner_prod16_aligned_64_c,
-  silk_inner_prod16_aligned_64_c,
-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 )  /* avx */
+  silk_inner_prod16_c,                  /* non-sse */
+  silk_inner_prod16_c,
+  silk_inner_prod16_c,
+  MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_inner_prod16 )  /* avx */
 };
 
 #endif
@@ -66,23 +66,22 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 )  /* avx */
 };
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 ) = {
   silk_NSQ_c,                  /* non-sse */
   silk_NSQ_c,
@@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_NSQ )  /* avx */
 };
-#endif
 
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 ) = {
   silk_VQ_WMat_EC_c,                  /* non-sse */
   silk_VQ_WMat_EC_c,
@@ -112,25 +110,23 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC )  /* avx */
 };
-#endif
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 ) = {
   silk_NSQ_del_dec_c,                  /* non-sse */
   silk_NSQ_del_dec_c,
@@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */
 };
-#endif
 
 #if defined(FIXED_POINT)
 
-- 
cgit v1.2.3


From f3377959820511fa012edc57a697b9dca427ef87 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 31 Mar 2022 22:13:52 -0400
Subject: build test scripts

---
 tests/opus_build_test.sh |  30 ++++++++++++
 tests/random_config.sh   | 117 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100755 tests/opus_build_test.sh
 create mode 100755 tests/random_config.sh

diff --git a/tests/opus_build_test.sh b/tests/opus_build_test.sh
new file mode 100755
index 00000000..6de50481
--- /dev/null
+++ b/tests/opus_build_test.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+tarball=`realpath $1`
+nb_tests=$2
+oldvectors=`realpath $3`
+newvectors=`realpath $4`
+base=`basename $tarball .tar.gz`
+
+tar xvf $tarball > /dev/null 2>&1
+cd $base
+
+if [ $? -ne 0 ]
+then
+	echo cannot go to $base
+        exit 1
+fi
+
+mkdir build_tests
+
+configure_dir=`pwd`
+seq -w $nb_tests | parallel -j +2 "../random_config.sh build_tests/run_{} $configure_dir $oldvectors $newvectors"
+
+if [ $? -ne 0 ]
+then
+        echo Check found errors
+        exit 1
+else
+	echo No error found
+fi
+
diff --git a/tests/random_config.sh b/tests/random_config.sh
new file mode 100755
index 00000000..41a2f276
--- /dev/null
+++ b/tests/random_config.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+
+dir=$1
+mkdir $dir
+if [ $? -ne 0 ]
+then
+        exit 1
+fi
+
+cd $dir
+if [ $? -ne 0 ]
+then
+        exit 1
+fi
+
+
+configure_path=$2
+config="random_config.txt"
+
+case `seq 3 | shuf -n1` in
+1)
+approx=--enable-float-approx
+math=-ffast-math
+;;
+2)
+approx=--enable-float-approx
+;;
+*)
+approx=
+math=
+;;
+esac
+
+CFLAGS='-g'
+
+opt=`echo -e "-O1\n-O2\n-O3" | shuf -n1`
+
+#arch=-march=`echo -e "core2\nsandybridge\nbroadwell\nskylake" | shuf -n1`
+arch=`echo -e "\n-march=core2\n-march=sandybridge\n-march=broadwell\n-march=skylake\n-march=native" | shuf -n1`
+
+footprint=`echo -e "\n-DSMALL_FOOTPRINT" | shuf -n1`
+std=`echo -e "\n-std=c90\n-std=c99\n-std=c11\n-std=c17" | shuf -n1`
+
+CFLAGS="$CFLAGS $std $opt $arch $footprint $math"
+
+echo CFLAGS=$CFLAGS > $config
+
+lib=`echo -e "\n--disable-static\n--disable-shared" | shuf -n1`
+
+arithmetic=`echo -e "\n--enable-fixed-point\n--enable-fixed-point --enable-fixed-point-debug\n--enable-fixed-point --disable-float-api\n--enable-fixed-point --enable-fixed-point-debug --disable-float-api" | shuf -n1`
+
+custom=`echo -e "\n--enable-custom-modes" | shuf -n1`
+
+#asm=`echo -e "\n--disable-asm\n--disable-rtcd\n--disable-intrinsics" | shuf -n1`
+asm=`echo -e "\n--disable-asm\n--disable-intrinsics" | shuf -n1`
+
+assert=`echo -e "\n--enable-assertions" | shuf -n1`
+harden=`echo -e "\n--enable-hardening" | shuf -n1`
+fuzz=`echo -e "\n--enable-fuzzing" | shuf -n1`
+checkasm=`echo -e "\n--enable-check-asm" | shuf -n1`
+rfc8251=`echo -e "\n--disable-rfc8251" | shuf -n1`
+
+if [ "$rfc8251" = --disable-rfc8251 ]
+then
+	vectors=$3
+else
+	vectors=$4
+fi
+echo using testvectors at $vectors >> $config
+
+
+config_opt="$lib $arithmetic $custom $asm $assert $harden $fuzz $checkasm $rfc8251 $approx"
+
+echo configure $config_opt >> $config
+
+export CFLAGS
+$configure_path/configure $config_opt > configure_output.txt 2>&1
+
+if [ $? -ne 0 ]
+then
+	echo configure error >> $config
+	exit 1
+fi
+
+make > make_output.txt 2>&1
+
+if [ $? -ne 0 ]
+then
+        echo make error >> $config
+	exit 1
+fi
+
+#Run valgrind about 3% of the time (1 run in 30)
+if [ `seq 30 | shuf -n1` -ne 1 ]
+then
+	make check > makecheck_output.txt 2>&1
+else
+	valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1
+fi
+
+if [ $? -ne 0 ]
+then
+        echo check error >> $config
+	exit 1
+fi
+
+
+rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1`
+../../../run_vectors.sh . $vectors $rate > testvectors_output.txt 2>&1
+
+if [ $? -ne 0 ]
+then
+        echo testvectors error >> $config
+        exit 1
+fi
+
+echo all pass >> $config
-- 
cgit v1.2.3


From 2654707e86cc94413998976d179b2ab4a2aa3114 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Fri, 1 Apr 2022 14:32:38 -0400
Subject: Cleanup testing directories to save space

---
 tests/opus_build_test.sh |  2 +-
 tests/random_config.sh   | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/tests/opus_build_test.sh b/tests/opus_build_test.sh
index 6de50481..b334ec16 100755
--- a/tests/opus_build_test.sh
+++ b/tests/opus_build_test.sh
@@ -18,7 +18,7 @@ fi
 mkdir build_tests
 
 configure_dir=`pwd`
-seq -w $nb_tests | parallel -j +2 "../random_config.sh build_tests/run_{} $configure_dir $oldvectors $newvectors"
+seq -w $nb_tests | parallel --halt now,fail=10 -j +2 "../random_config.sh build_tests/run_{} $configure_dir $oldvectors $newvectors"
 
 if [ $? -ne 0 ]
 then
diff --git a/tests/random_config.sh b/tests/random_config.sh
index 41a2f276..073c83e7 100755
--- a/tests/random_config.sh
+++ b/tests/random_config.sh
@@ -78,7 +78,7 @@ $configure_path/configure $config_opt > configure_output.txt 2>&1
 
 if [ $? -ne 0 ]
 then
-	echo configure error >> $config
+	echo configure FAIL >> $config
 	exit 1
 fi
 
@@ -86,7 +86,7 @@ make > make_output.txt 2>&1
 
 if [ $? -ne 0 ]
 then
-        echo make error >> $config
+        echo make FAIL >> $config
 	exit 1
 fi
 
@@ -100,7 +100,7 @@ fi
 
 if [ $? -ne 0 ]
 then
-        echo check error >> $config
+        echo check FAIL >> $config
 	exit 1
 fi
 
@@ -110,8 +110,13 @@ rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1`
 
 if [ $? -ne 0 ]
 then
-        echo testvectors error >> $config
+        echo testvectors FAIL >> $config
         exit 1
 fi
 
-echo all pass >> $config
+echo all tests PASS >> $config
+
+#When everything's good, do some cleaning up to save space
+make distclean > /dev/null 2>&1
+rm -f tmp.out
+gzip make_output.txt
-- 
cgit v1.2.3


From 6ba284f22feeedb394697d112cf1da80c77bb5a4 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sat, 2 Apr 2022 15:18:07 -0400
Subject: Fix lrint/lrintf detection

Prevents using lrint/lrintf when compiling with -std=c90 even though the
functions are in libm. This was causing tests to fail, likely due to
incorrect prototypes.
---
 celt/float_cast.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/celt/float_cast.h b/celt/float_cast.h
index 9d34976e..8915a5fd 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -99,7 +99,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
                 return intgr ;
         }
 
-#elif defined(HAVE_LRINTF)
+#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 /*      These defines enable functionality introduced with the 1999 ISO C
 **      standard. They must be defined before the inclusion of math.h to
@@ -117,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
 #include <math.h>
 #define float2int(x) lrintf(x)
 
-#elif (defined(HAVE_LRINT))
+#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 #define _ISOC9X_SOURCE 1
 #define _ISOC99_SOURCE 1
-- 
cgit v1.2.3


From ccaaffa9a3ee427e9401c4dcf6462e378d9a4694 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sat, 2 Apr 2022 15:21:48 -0400
Subject: print rate used for testvectors

---
 tests/random_config.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/random_config.sh b/tests/random_config.sh
index 073c83e7..fe68f5e7 100755
--- a/tests/random_config.sh
+++ b/tests/random_config.sh
@@ -106,7 +106,8 @@ fi
 
 
 rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1`
-../../../run_vectors.sh . $vectors $rate > testvectors_output.txt 2>&1
+echo testvectors for $rate Hz > testvectors_output.txt
+../../../run_vectors.sh . $vectors $rate >> testvectors_output.txt 2>&1
 
 if [ $? -ne 0 ]
 then
-- 
cgit v1.2.3


From 3c94f3a55d27b70b3438ceb60cd2b119ba0c895e Mon Sep 17 00:00:00 2001
From: Omer Osman <omerosman@google.com>
Date: Fri, 22 Apr 2022 02:02:40 +0000
Subject: Add libopus to Bluetooth mainline module Apex

This CL includes libopus as a static object within Bluetooth Apex to
enable support for Opus over A2DP.

Bug: 226441860
Test: Sink device using bds-dev

Change-Id: Ie3864f47b1be11d0f34eaf63e5cc82824d4f1e9f
---
 Android.bp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Android.bp b/Android.bp
index 270d3271..f2f09f8d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -383,6 +383,7 @@ cc_library {
     apex_available: [
         "//apex_available:platform", // used by libstagefright_soft_opusdec
         "com.android.media.swcodec",
+        "com.android.bluetooth",
     ],
     min_sdk_version: "29",
 }
-- 
cgit v1.2.3


From 8101b33e6c59b51b22aeeeaf39045e34a402b01f Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Mon, 20 Jun 2022 22:23:26 -0700
Subject: Correct redundancy handling with lost/DTX frames

In https://github.com/xiph/opus/issues/253, the encoder generates a
Hybrid frame with redundancy, to switch to CELT-only mode, and then
activates DTX immediately afterwards.  The decoder ran Hybrid PLC,
which isn't right.  Use CELT PLC instead if there was already a
transition to CELT via redundancy at the end of the previous frame.

Also do not use a stale CELT decoder to decode a second redundancy
frame when the first redundancy frame for a transition from SILK-only
mode was lost.  Instead of mixing in old audio from the last time
that CELT was used, ignore the second redundancy frame in this case.
Alternatively the CELT decoder could be reset before decoding, but
it would not be ready until after the 2.5 ms of audio that is needed.

Reviewed by Jean-Marc Valin.
---
 src/opus_decoder.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 9113638a..0be87dc0 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -278,7 +278,8 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
       ec_dec_init(&dec,(unsigned char*)data,len);
    } else {
       audiosize = frame_size;
-      mode = st->prev_mode;
+      /* Run PLC using last used mode (CELT if we ended with CELT redundancy) */
+      mode = st->prev_redundancy ? MODE_CELT_ONLY : st->prev_mode;
       bandwidth = 0;
 
       if (mode == 0)
@@ -419,7 +420,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
 
    start_band = 0;
    if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL
-    && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len)
+    && ec_tell(&dec)+17+20*(mode == MODE_HYBRID) <= 8*len)
    {
       /* Check if we have a redundant 0-8 kHz band */
       if (mode == MODE_HYBRID)
@@ -454,6 +455,10 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
    {
       transition = 0;
       pcm_transition_silk_size=ALLOC_NONE;
+      /* don't use stale CELT decoder to decode second redundancy frame if
+         the first redundancy frame for a transition from SILK was lost */
+      if (celt_to_silk && st->prev_mode == MODE_SILK_ONLY && !st->prev_redundancy)
+         redundancy = 0;
    }
 
    ALLOC(pcm_transition_silk, pcm_transition_silk_size, opus_val16);
-- 
cgit v1.2.3


From 63855aff731dcf58875c159dc2fa7463a444d617 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Sun, 26 Jun 2022 12:48:44 -0400
Subject: Improve background noise estimation for CELT DTX

We now update the background noise estimate even in frames classified
as transient. It shouldn't be a problem because we're using min
statistics. Also, it avoids problems when update frames get
misclassified as transient.

In addition, we now use the duration of losses rather than the
number of lost packets to make decisions. That should make
PLC/DTX behaviour more consistent across frame sizes.
---
 celt/celt_decoder.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 74ca3b74..77eb44f4 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -90,7 +90,7 @@ struct OpusCustomDecoder {
    opus_uint32 rng;
    int error;
    int last_pitch_index;
-   int loss_count;
+   int loss_duration;
    int skip_plc;
    int postfilter_period;
    int postfilter_period_old;
@@ -512,7 +512,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    int nbEBands;
    int overlap;
    int start;
-   int loss_count;
+   int loss_duration;
    int noise_based;
    const opus_int16 *eBands;
    SAVE_STACK;
@@ -532,9 +532,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
 
-   loss_count = st->loss_count;
+   loss_duration = st->loss_duration;
    start = st->start;
-   noise_based = loss_count >= 5 || start != 0 || st->skip_plc;
+   noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
    if (noise_based)
    {
       /* Noise-based PLC/CNG */
@@ -559,7 +559,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
 #endif
 
       /* Energy decay */
-      decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+      decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
       c=0; do
       {
          for (i=start;i<end;i++)
@@ -602,7 +602,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       VARDECL(opus_val16, _exc);
       VARDECL(opus_val16, fir_tmp);
 
-      if (loss_count == 0)
+      if (loss_duration == 0)
       {
          st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
       } else {
@@ -632,7 +632,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
             exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
 
-         if (loss_count == 0)
+         if (loss_duration == 0)
          {
             opus_val32 ac[LPC_ORDER+1];
             /* Compute LPC coefficients for the last MAX_PERIOD samples before
@@ -812,7 +812,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       } while (++c<C);
    }
 
-   st->loss_count = loss_count+1;
+   /* Saturate to something large to avoid wrap-around. */
+   st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
 
    RESTORE_STACK;
 }
@@ -868,6 +869,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    int nbEBands;
    int overlap;
    const opus_int16 *eBands;
+   opus_val16 max_background_increase;
    ALLOC_STACK;
 
    VALIDATE_CELT_DECODER(st);
@@ -942,7 +944,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    /* Check if there are at least two packets received consecutively before
     * turning on the pitch-based PLC */
-   st->skip_plc = st->loss_count != 0;
+   st->skip_plc = st->loss_duration != 0;
 
    if (dec == NULL)
    {
@@ -1140,25 +1142,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    if (C==1)
       OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
 
-   /* In case start or end were to change */
    if (!isTransient)
    {
-      opus_val16 max_background_increase;
       OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
       OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
-      /* In normal circumstances, we only allow the noise floor to increase by
-         up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
-         increase for each update.*/
-      if (st->loss_count < 10)
-         max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
-      else
-         max_background_increase = QCONST16(1.f,DB_SHIFT);
-      for (i=0;i<2*nbEBands;i++)
-         backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
    } else {
       for (i=0;i<2*nbEBands;i++)
          oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
    }
+   /* In normal circumstances, we only allow the noise floor to increase by
+      up to 2.4 dB/second, but when we're in DTX we give the weight of
+      all missing packets to the update packet. */
+   max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT);
+   for (i=0;i<2*nbEBands;i++)
+      backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+   /* In case start or end were to change */
    c=0; do
    {
       for (i=0;i<start;i++)
@@ -1175,7 +1173,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    st->rng = dec->rng;
 
    deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
-   st->loss_count = 0;
+   st->loss_duration = 0;
    RESTORE_STACK;
    if (ec_tell(dec) > 8*len)
       return OPUS_INTERNAL_ERROR;
-- 
cgit v1.2.3


From 57ddf37c83eff80a5064c38fde57b6cc6ad1e739 Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Sun, 26 Jun 2022 13:40:45 -0700
Subject: Fix 8101b33 to decode ignored redundancy

Even if the redundancy is ignored, the final range from the decoder is
needed for testing.

Reviewed by Timothy B. Terriberry.
---
 src/opus_decoder.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 0be87dc0..6520e748 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -455,10 +455,6 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
    {
       transition = 0;
       pcm_transition_silk_size=ALLOC_NONE;
-      /* don't use stale CELT decoder to decode second redundancy frame if
-         the first redundancy frame for a transition from SILK was lost */
-      if (celt_to_silk && st->prev_mode == MODE_SILK_ONLY && !st->prev_redundancy)
-         redundancy = 0;
    }
 
    ALLOC(pcm_transition_silk, pcm_transition_silk_size, opus_val16);
@@ -504,6 +500,11 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
    /* 5 ms redundant frame for CELT->SILK*/
    if (redundancy && celt_to_silk)
    {
+      /* If the previous frame did not use CELT (the first redundancy frame in
+         a transition from SILK may have been lost) then the CELT decoder is
+         stale at this point and the redundancy audio is not useful, however
+         the final range is still needed (for testing), so the redundancy is
+         always decoded but the decoded audio may not be used */
       MUST_SUCCEED(celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)));
       celt_decode_with_ec(celt_dec, data+len, redundancy_bytes,
                           redundant_audio, F5, NULL, 0);
@@ -566,7 +567,10 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
       smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5,
                   pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs);
    }
-   if (redundancy && celt_to_silk)
+   /* 5ms redundant frame for CELT->SILK; ignore if the previous frame did not
+      use CELT (the first redundancy frame in a transition from SILK may have
+      been lost) */
+   if (redundancy && celt_to_silk && (st->prev_mode != MODE_SILK_ONLY || st->prev_redundancy))
    {
       for (c=0;c<st->channels;c++)
       {
-- 
cgit v1.2.3


From be67ea8c7ca22d4803b3828c183af1d47a3932cb Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 27 Jun 2022 23:27:48 -0400
Subject: Fixes wrap-around in silk_inner_prod16_sse4_1()

Thanks Tim
---
 silk/fixed/x86/vector_ops_FIX_sse4_1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
index 0cfb08d9..a46289bb 100644
--- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -36,6 +36,7 @@
 
 #include "SigProc_FIX.h"
 #include "pitch.h"
+#include "celt/x86/x86cpu.h"
 
 opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
@@ -78,7 +79,7 @@ opus_int64 silk_inner_prod16_sse4_1(
     _mm_storel_epi64( (__m128i *)&sum, acc1 );
 
     for( ; i < len; i++ ) {
-        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
+        sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] );
     }
 
 #ifdef OPUS_CHECK_ASM
-- 
cgit v1.2.3


From 31b922e79cbaad7b5143c044ce58b01b4a9be7f8 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Tue, 28 Jun 2022 01:02:37 -0400
Subject: Silence some warnings for fixed-point debug builds

Reviewed by Timothy B. Terriberry.
---
 celt/fixed_debug.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index 3765baa6..92f0f47a 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -214,7 +214,7 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
    opus_int64  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -222,7 +222,7 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
    res = a<<shift;
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
    res = a+b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
    if (a<b)
    {
-      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
    res = a-b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -416,7 +416,7 @@ static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
    opus_int64 res;
    if (!VERIFY_INT(a) || !VERIFY_INT(b))
    {
-      fprintf (stderr, "MULT32_32_32: inputs are not int: %d %d\n", a, b);
+      fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -424,7 +424,7 @@ static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
    res = a*b;
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "MULT32_32_32: output is not int: %d\n", res);
+      fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -438,7 +438,7 @@ static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)
    opus_int64 res;
    if (!VERIFY_INT(a) || !VERIFY_INT(b))
    {
-      fprintf (stderr, "MULT32_32_Q16: inputs are not int: %d %d\n", a, b);
+      fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -446,7 +446,7 @@ static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)
    res = ((opus_int64)(a)*(opus_int64)(b)) >> 16;
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "MULT32_32_Q16: output is not int: %d*%d=%d\n", a, b, (int)res);
+      fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -831,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
 
 
 #undef PRINT_MIPS
-#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0);
 
 #endif
-- 
cgit v1.2.3


From d7d4b3487f9b352942d868d056cba520fc4346c7 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Tue, 28 Jun 2022 01:13:16 -0400
Subject: Fixes --disable-rtcd

Make sure we don't try to use the rtcd table when rtcd is disabled.
That code still needs a lot more cleaning up.
---
 celt/x86/celt_lpc_sse.h | 5 +++--
 celt/x86/pitch_sse.h    | 6 +++---
 tests/random_config.sh  | 4 ++--
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index 7d1ecf75..90e69ecf 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
 
 void celt_fir_sse4_1(
          const opus_val16 *x,
@@ -44,10 +43,11 @@ void celt_fir_sse4_1(
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
 #define celt_fir(x, num, y, N, ord, arch) \
     ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
@@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          int ord,
          int arch);
 
+#define OVERRIDE_CELT_FIR
 #  define celt_fir(x, num, y, N, ord, arch) \
     ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index f7a014b6..964aef50 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@ void xcorr_kernel_sse(
 #define xcorr_kernel(x, y, sum, len, arch) \
     ((void)arch, xcorr_kernel_sse(x, y, sum, len))
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) &&  ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
@@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse(
     ((void)arch, celt_inner_prod_sse(x, y, N))
 
 
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
-    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
diff --git a/tests/random_config.sh b/tests/random_config.sh
index fe68f5e7..e5d72833 100755
--- a/tests/random_config.sh
+++ b/tests/random_config.sh
@@ -51,8 +51,8 @@ arithmetic=`echo -e "\n--enable-fixed-point\n--enable-fixed-point --enable-fixed
 
 custom=`echo -e "\n--enable-custom-modes" | shuf -n1`
 
-#asm=`echo -e "\n--disable-asm\n--disable-rtcd\n--disable-intrinsics" | shuf -n1`
-asm=`echo -e "\n--disable-asm\n--disable-intrinsics" | shuf -n1`
+asm=`echo -e "\n--disable-asm\n--disable-rtcd\n--disable-intrinsics" | shuf -n1`
+#asm=`echo -e "\n--disable-asm\n--disable-intrinsics" | shuf -n1`
 
 assert=`echo -e "\n--enable-assertions" | shuf -n1`
 harden=`echo -e "\n--enable-hardening" | shuf -n1`
-- 
cgit v1.2.3


From ccb42e05cc6eb98a15874b9695361025b3ee17ab Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Tue, 28 Jun 2022 02:07:08 -0400
Subject: Fixes valgrind failure caused by silk_find_pred_coefs_*()

The function copies NLSFs from the stack to the state, which for
order 10 means we were copying uninitialized values. That in turn
breaks check-asm when comparing the state under valgrind.

Reviewed by Timothy B. Terriberry.
---
 silk/fixed/find_pred_coefs_FIX.c | 3 ++-
 silk/float/find_pred_coefs_FLP.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/silk/fixed/find_pred_coefs_FIX.c b/silk/fixed/find_pred_coefs_FIX.c
index 606d8633..ad363fb7 100644
--- a/silk/fixed/find_pred_coefs_FIX.c
+++ b/silk/fixed/find_pred_coefs_FIX.c
@@ -42,7 +42,8 @@ void silk_find_pred_coefs_FIX(
 {
     opus_int         i;
     opus_int32       invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ];
-    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ];
+    /* Set to NLSF_Q15 to zero so we don't copy junk to the state. */
+    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ]={0};
     const opus_int16 *x_ptr;
     opus_int16       *x_pre_ptr;
     VARDECL( opus_int16, LPC_in_pre );
diff --git a/silk/float/find_pred_coefs_FLP.c b/silk/float/find_pred_coefs_FLP.c
index dcf7c520..6f790788 100644
--- a/silk/float/find_pred_coefs_FLP.c
+++ b/silk/float/find_pred_coefs_FLP.c
@@ -44,7 +44,8 @@ void silk_find_pred_coefs_FLP(
     silk_float       XXLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
     silk_float       xXLTP[ MAX_NB_SUBFR * LTP_ORDER ];
     silk_float       invGains[ MAX_NB_SUBFR ];
-    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ];
+    /* Set to NLSF_Q15 to zero so we don't copy junk to the state. */
+    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ]={0};
     const silk_float *x_ptr;
     silk_float       *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ];
     silk_float       minInvGain;
-- 
cgit v1.2.3


From 03889ae76dea9c5e63e64df495fd77d613a03a80 Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <tterribe@xiph.org>
Date: Tue, 28 Jun 2022 22:33:43 -0700
Subject: Check the return value of __get_cpuid().

This function can fail if CPUID is not supported or the maximum
 supported value of EAX is less than the requested one.
Check the return value and explicitly disable all SIMD if it does
 fail.
This was happening before implicitly because of the initialization
 of info[] to zero, but being explicit about it makes it less likely
 someone will break this behavior because they did not realize what
 was going on.
---
 celt/x86/x86cpu.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 080eb25e..accf0676 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -81,7 +81,12 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
     );
 #endif
 #elif defined(CPU_INFO_BY_C)
-    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+    if !(__get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+        /* Our function cannot fail, but __get_cpuid can.
+           Returning all zeroes will effectively disable all SIMD, which is
+            what we want on CPUs that don't support CPUID. */
+        CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+    }
 #endif
 }
 
@@ -98,7 +103,7 @@ typedef struct CPU_Feature{
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
 {
-    unsigned int info[4] = {0};
+    unsigned int info[4];
     unsigned int nIds = 0;
 
     cpuid(info, 0);
-- 
cgit v1.2.3


From 6577534a80c833bd310276f1e2bd3254271bb86d Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <tterribe@xiph.org>
Date: Tue, 28 Jun 2022 22:42:01 -0700
Subject: Work around a valgrind false-positive in CPUID.

Valgrind versions prior to 3.17.0 assume that an uninitialized value
 in ECX causes the whole output of CPUID to be uninitialized, even
 though ECX is only "read" by CPUID for certain values of EAX.
Work around that by guaranteeing that ECX is initialized.
---
 celt/x86/x86cpu.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index accf0676..d95a9b94 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
         "=r" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #else
     __asm__ __volatile__ (
@@ -77,12 +78,15 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
         "=b" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #endif
 #elif defined(CPU_INFO_BY_C)
-    if !(__get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
-        /* Our function cannot fail, but __get_cpuid can.
+    /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+        prior to v3.17.0.*/
+    if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+        /* Our function cannot fail, but __get_cpuid{_count} can.
            Returning all zeroes will effectively disable all SIMD, which is
             what we want on CPUs that don't support CPUID. */
         CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
-- 
cgit v1.2.3


From e0ca05b1ec5ef4abbfed5f70623ed3e5ea77dd6b Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Thu, 30 Jun 2022 16:25:03 -0400
Subject: Adds fuzzing to CPU detection

Makes it possible to randomize (with --enable-fuzzing) the CPU flags
so we can better test all the intrinsics implementations.
---
 celt/arm/armcpu.c |  9 ++++++++-
 celt/x86/x86cpu.c | 11 ++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index cce3ae3a..c7d16e6d 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -156,7 +156,7 @@ opus_uint32 opus_cpu_capabilities(void)
    "your platform.  Reconfigure with --disable-rtcd (or send patches)."
 #endif
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
   opus_uint32 flags = opus_cpu_capabilities();
   int arch = 0;
@@ -184,4 +184,11 @@ int opus_select_arch(void)
   return arch;
 }
 
+int opus_select_arch(void) {
+  int arch = opus_select_arch_impl();
+#ifdef FUZZING
+  arch = rand()%(arch+1);
+#endif
+  return arch;
+}
 #endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index d95a9b94..7cfc8db5 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -128,7 +128,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
     }
 }
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
     CPU_Feature cpu_feature;
     int arch;
@@ -163,4 +163,13 @@ int opus_select_arch(void)
     return arch;
 }
 
+int opus_select_arch(void) {
+    int arch = opus_select_arch_impl();
+#ifdef FUZZING
+    /* Randomly downgrade the architecture. */
+    arch = rand()%(arch+1);
+#endif
+    return arch;
+}
+
 #endif
-- 
cgit v1.2.3


From 60c48ade0a9d192b8535023bf7c7db40341ece1e Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Thu, 30 Jun 2022 16:44:34 -0400
Subject: Estimate the inner product accuracy to fix check-asm

Estimate the rounding error so that we can have a useful margin of
error when checking the asm against the C code even when the float
operations get reordered due to -ffast-math.
---
 celt/arm/pitch_neon_intr.c | 51 +++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index 1ac38c43..35cc46e2 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -137,22 +137,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
 /* operations of celt_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
-static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
 {
    int i;
+   *err = 0;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+      *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
    }
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
+   *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
    for (; i < N; i++) {
       xy = MAC16_16(xy, x[i], y[i]);
+      *err += ABS32(xy);
    }
+   *err = *err*2e-7 + N*1e-37;
    return xy;
 }
 
@@ -160,32 +165,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c
 /* operations of dual_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
-      int N, opus_val32 *xy1, opus_val32 *xy2)
+      int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
 {
-   int i;
-   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
-   for (i = 0; i < N - 3; i += 4) {
-      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
-      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
-      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
-      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
-      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
-      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
-      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
-      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
-   }
-   xy01_0 += xy01_2;
-   xy02_0 += xy02_2;
-   xy01_1 += xy01_3;
-   xy02_1 += xy02_3;
-   xy01 = xy01_0 + xy01_1;
-   xy02 = xy02_0 + xy02_1;
-   for (; i < N; i++) {
-      xy01 = MAC16_16(xy01, x[i], y01[i]);
-      xy02 = MAC16_16(xy02, x[i], y02[i]);
-   }
-   *xy1 = xy01;
-   *xy2 = xy02;
+   *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+   *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
 }
 
 #endif /* OPUS_CHECK_ASM */
@@ -225,7 +208,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
     }
 
 #ifdef OPUS_CHECK_ASM
-    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+    {
+        float err, res;
+        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+        celt_assert(ABS32(res - xy) <= err);
+    }
 #endif
 
     return xy;
@@ -280,9 +268,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 #ifdef OPUS_CHECK_ASM
     {
         opus_val32 xy1_c, xy2_c;
-        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
-        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
-        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+        float err[2];
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
     }
 #endif
 }
-- 
cgit v1.2.3


From 3acaa70965c5570ef1711fee9b3a15eac3e74ffe Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Thu, 30 Jun 2022 20:11:26 -0400
Subject: Adds OPUS_SET_INBAND_FEC(2) option

Unlike OPUS_SET_INBAND_FEC(1), the encoder does not necessarily
switch to SILK if we have music.
---
 include/opus_defines.h |  6 ++++--
 src/opus_encoder.c     | 13 ++++++++-----
 tests/test_opus_api.c  |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/opus_defines.h b/include/opus_defines.h
index ceee5b84..94b9e0d9 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -482,7 +482,8 @@ extern "C" {
   * @param[in] x <tt>opus_int32</tt>: Allowed values:
   * <dl>
   * <dt>0</dt><dd>Disable inband FEC (default).</dd>
-  * <dt>1</dt><dd>Enable inband FEC.</dd>
+  * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+  * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
   * </dl>
   * @hideinitializer */
 #define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x)
@@ -491,7 +492,8 @@ extern "C" {
   * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
   * <dl>
   * <dt>0</dt><dd>Inband FEC disabled (default).</dd>
-  * <dt>1</dt><dd>Inband FEC enabled.</dd>
+  * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+  * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
   * </dl>
   * @hideinitializer */
 #define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x)
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 253fe9e8..64b6726c 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -87,6 +87,7 @@ struct OpusEncoder {
     int          lfe;
     int          arch;
     int          use_dtx;                 /* general DTX for both SILK and CELT */
+    int          fec_config;
 #ifndef DISABLE_FLOAT_API
     TonalityAnalysisState analysis;
 #endif
@@ -1389,8 +1390,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
        st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY;
 
-       /* When FEC is enabled and there's enough packet loss, use SILK */
-       if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4)
+       /* When FEC is enabled and there's enough packet loss, use SILK.
+          Unless the FEC is set to 2, in which case we don't switch to SILK if we're confident we have music. */
+       if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4 && (st->fec_config != 2 || voice_est > 25))
           st->mode = MODE_SILK_ONLY;
        /* When encoding voice and DTX is enabled but the generalized DTX cannot be used,
           use SILK in order to make use of its DTX. */
@@ -2439,11 +2441,12 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
         case OPUS_SET_INBAND_FEC_REQUEST:
         {
             opus_int32 value = va_arg(ap, opus_int32);
-            if(value<0 || value>1)
+            if(value<0 || value>2)
             {
                goto bad_arg;
             }
-            st->silk_mode.useInBandFEC = value;
+            st->fec_config = value;
+            st->silk_mode.useInBandFEC = (value != 0);
         }
         break;
         case OPUS_GET_INBAND_FEC_REQUEST:
@@ -2453,7 +2456,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             {
                goto bad_arg;
             }
-            *value = st->silk_mode.useInBandFEC;
+            *value = st->fec_config;
         }
         break;
         case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
diff --git a/tests/test_opus_api.c b/tests/test_opus_api.c
index fb385c63..0e7ed2cc 100644
--- a/tests/test_opus_api.c
+++ b/tests/test_opus_api.c
@@ -1298,7 +1298,7 @@ opus_int32 test_enc_api(void)
    err=opus_encoder_ctl(enc,OPUS_GET_INBAND_FEC(null_int_ptr));
    if(err!=OPUS_BAD_ARG)test_failed();
    cfgs++;
-   CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,2,
+   CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,3,
      1,0,
      "    OPUS_SET_INBAND_FEC .......................... OK.\n",
      "    OPUS_GET_INBAND_FEC .......................... OK.\n")
-- 
cgit v1.2.3


From 7b6cede819a673cc21e896283668b2d3ddd9e623 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sat, 2 Jul 2022 22:41:15 -0400
Subject: Add asan/ubsan support in random tests

---
 tests/random_config.sh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/random_config.sh b/tests/random_config.sh
index e5d72833..aea0c389 100755
--- a/tests/random_config.sh
+++ b/tests/random_config.sh
@@ -40,8 +40,10 @@ arch=`echo -e "\n-march=core2\n-march=sandybridge\n-march=broadwell\n-march=skyl
 
 footprint=`echo -e "\n-DSMALL_FOOTPRINT" | shuf -n1`
 std=`echo -e "\n-std=c90\n-std=c99\n-std=c11\n-std=c17" | shuf -n1`
+sanitize=`echo -e "\n-fsanitize=address -fno-sanitize-recover=all\n-fsanitize=undefined -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow" | shuf -n1`
 
-CFLAGS="$CFLAGS $std $opt $arch $footprint $math"
+
+CFLAGS="$CFLAGS $std $opt $arch $footprint $math $sanitize"
 
 echo CFLAGS=$CFLAGS > $config
 
@@ -90,11 +92,12 @@ then
 	exit 1
 fi
 
-#Run valgrind 10% of the time
-if [ `seq 30 | shuf -n1` -ne 1 ]
+#Run valgrind 5% of the time (minus the asan cases)
+if [ `seq 20 | shuf -n1` -ne 1 -o "$sanitize" = "-fsanitize=address -fno-sanitize-recover=all" ]
 then
 	make check > makecheck_output.txt 2>&1
 else
+	echo valgrind enabled >> $config
 	valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1
 fi
 
-- 
cgit v1.2.3


From e535e894196387b12d9fcbc271a98b21572a630b Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <tterribe@xiph.org>
Date: Sat, 2 Jul 2022 12:14:06 -0700
Subject: Work around UBSan unaligned access errors.

The underlying objects are all 8-bit integers.
Verified that the generated assembly still just uses MOVD.

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
---
 celt/x86/x86cpu.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 0de8df35..04e80489 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,8 +56,16 @@
 int opus_select_arch(void);
 # endif
 
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+   and UBSan will report errors if we actually make unaligned accesses.
+  Use this to work around those restrictions (which should hopefully all get
+   optimized to a single MOVD instruction).*/
+#define OP_LOADU_EPI32(x) \
+  (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
+   *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+
 #define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
 
 #define OP_CVTEPI16_EPI32_M64(x) \
  (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-- 
cgit v1.2.3


From 3cc09dee34a22f89385447fb7bd230140316281f Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sat, 2 Jul 2022 22:46:14 -0400
Subject: Avoid left shifts of negative values in debug macros

Reviewed by Mark Harris
---
 celt/fixed_debug.h |  6 +++---
 silk/MacroDebug.h  | 32 +++++++++++++-------------------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index 92f0f47a..c2cf5a83 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
 #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
 static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
 {
-   int res;
+   opus_int32 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
    {
       fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
@@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int32)((opus_uint32)a<<shift);
    if (!VERIFY_SHORT(res))
    {
       fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
@@ -219,7 +219,7 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int64)((opus_uint64)a<<shift);
    if (!VERIFY_INT(res))
    {
       fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h
index 8dd4ce2e..bf42d5f0 100644
--- a/silk/MacroDebug.h
+++ b/silk/MacroDebug.h
@@ -491,12 +491,6 @@ static OPUS_INLINE opus_int32 silk_SMLAWW_(opus_int32 a32, opus_int32 b32, opus_
     return ret;
 }
 
-/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */
-#undef  silk_MLA_ovflw
-#define silk_MLA_ovflw(a32, b32, c32)    ((a32) + ((b32) * (c32)))
-#undef  silk_SMLABB_ovflw
-#define silk_SMLABB_ovflw(a32, b32, c32)    ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32)))
-
 /* no checking needed for silk_SMULL
    no checking needed for silk_SMLAL
    no checking needed for silk_SMLALBB
@@ -546,10 +540,10 @@ static OPUS_INLINE opus_int32 silk_DIV32_16_(opus_int32 a32, opus_int32 b32, cha
 static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){
     opus_int8 ret;
     int       fail = 0;
-    ret = a << shift;
+    ret = (opus_int8)((opus_uint8)a << shift);
     fail |= shift < 0;
     fail |= shift >= 8;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -565,10 +559,10 @@ static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *
 static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){
     opus_int16 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int16)((opus_uint16)a << shift);
     fail |= shift < 0;
     fail |= shift >= 16;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -584,10 +578,10 @@ static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, cha
 static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){
     opus_int32 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int32)((opus_uint32)a << shift);
     fail |= shift < 0;
     fail |= shift >= 32;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -603,7 +597,7 @@ static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, cha
 static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){
     opus_int64 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int64)((opus_uint64)a << shift);
     fail |= shift < 0;
     fail |= shift >= 64;
     fail |= (ret>>shift) != ((opus_int64)a);
@@ -714,8 +708,8 @@ static OPUS_INLINE opus_uint32 silk_RSHIFT_uint_(opus_uint32 a, opus_int32 shift
 #define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){
     opus_int16 ret;
-    ret = a + (b << shift);
-    if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+    ret = a + (opus_int16)((opus_uint16)b << shift);
+    if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -729,8 +723,8 @@ static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int
 #define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (b << shift);
-    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+    ret = a + (opus_int32)((opus_uint32)b << shift);
+    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -804,8 +798,8 @@ static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32
 #define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (b << shift);
-    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) )
+    ret = a - (opus_int32)((opus_uint32)b << shift);
+    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
-- 
cgit v1.2.3


From 78fe48adfb5b849f263629d391878c7d34596f11 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sun, 3 Jul 2022 01:42:14 -0400
Subject: Fix some 16-bit overflows (using 32-bit macros)

Reviewed by Mark Harris
---
 silk/stereo_LR_to_MS.c | 8 ++++----
 silk/stereo_MS_to_LR.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/silk/stereo_LR_to_MS.c b/silk/stereo_LR_to_MS.c
index c8226663..751452cb 100644
--- a/silk/stereo_LR_to_MS.c
+++ b/silk/stereo_LR_to_MS.c
@@ -77,7 +77,7 @@ void silk_stereo_LR_to_MS(
     ALLOC( LP_mid, frame_length, opus_int16 );
     ALLOC( HP_mid, frame_length, opus_int16 );
     for( n = 0; n < frame_length; n++ ) {
-        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
         LP_mid[ n ] = sum;
         HP_mid[ n ] = mid[ n + 1 ] - sum;
     }
@@ -86,7 +86,7 @@ void silk_stereo_LR_to_MS(
     ALLOC( LP_side, frame_length, opus_int16 );
     ALLOC( HP_side, frame_length, opus_int16 );
     for( n = 0; n < frame_length; n++ ) {
-        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
         LP_side[ n ] = sum;
         HP_side[ n ] = side[ n + 1 ] - sum;
     }
@@ -207,7 +207,7 @@ void silk_stereo_LR_to_MS(
         pred0_Q13 += delta0_Q13;
         pred1_Q13 += delta1_Q13;
         w_Q24   += deltaw_Q24;
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
         sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 );               /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 );       /* Q8  */
         x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -217,7 +217,7 @@ void silk_stereo_LR_to_MS(
     pred1_Q13 = -pred_Q13[ 1 ];
     w_Q24     =  silk_LSHIFT( width_Q14, 10 );
     for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
         sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 );               /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 );       /* Q8  */
         x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/silk/stereo_MS_to_LR.c b/silk/stereo_MS_to_LR.c
index 62521a4f..1e01bb6e 100644
--- a/silk/stereo_MS_to_LR.c
+++ b/silk/stereo_MS_to_LR.c
@@ -59,7 +59,7 @@ void silk_stereo_MS_to_LR(
     for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) {
         pred0_Q13 += delta0_Q13;
         pred1_Q13 += delta1_Q13;
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
         sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 );         /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 );        /* Q8  */
         x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -67,7 +67,7 @@ void silk_stereo_MS_to_LR(
     pred0_Q13 = pred_Q13[ 0 ];
     pred1_Q13 = pred_Q13[ 1 ];
     for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
         sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 );         /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 );        /* Q8  */
         x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
-- 
cgit v1.2.3


From 68d21fb5b0c1f38ef9fc82344094cf02103282c3 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sun, 3 Jul 2022 03:07:00 -0400
Subject: Fix fixed-point overflow in pitch downsampling

Reviewed by Mark Harris
---
 celt/pitch.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/celt/pitch.c b/celt/pitch.c
index 872582a4..586ca8c3 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
       shift=0;
    if (C==2)
       shift++;
-#endif
    for (i=1;i<len>>1;i++)
-      x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
-   x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+      x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+   x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
    if (C==2)
    {
       for (i=1;i<len>>1;i++)
-         x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
-      x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+         x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+      x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
    }
-
+#else
+   for (i=1;i<len>>1;i++)
+      x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+   x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+   if (C==2)
+   {
+      for (i=1;i<len>>1;i++)
+         x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+      x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+   }
+#endif
    _celt_autocorr(x_lp, ac, NULL, 0,
                   4, len>>1, arch);
 
-- 
cgit v1.2.3


From 8489ff3ffa6930ee32a9616ca10c7dceceb05f8d Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sun, 3 Jul 2022 03:25:59 -0400
Subject: Avoid undefined behaviour within the debug macros

Even when the macro itself would overflow.

Reviewed by Mark Harris
---
 silk/MacroDebug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h
index bf42d5f0..e505d02a 100644
--- a/silk/MacroDebug.h
+++ b/silk/MacroDebug.h
@@ -55,7 +55,7 @@ static OPUS_INLINE opus_int16 silk_ADD16_(opus_int16 a, opus_int16 b, char *file
 static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){
     opus_int32 ret;
 
-    ret = a + b;
+    ret = (opus_int32)((opus_uint32)a + (opus_uint32)b);
     if ( ret != silk_ADD_SAT32( a, b ) )
     {
         fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -257,7 +257,7 @@ static OPUS_INLINE opus_int64 silk_SUB_SAT64_( opus_int64 a64, opus_int64 b64, c
 static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){
     opus_int32 ret;
     opus_int64 ret64;
-    ret = a32 * b32;
+    ret = (opus_int32)((opus_uint32)a32 * (opus_uint32)b32);
     ret64 = (opus_int64)a32 * (opus_int64)b32;
     if ( (opus_int64)ret != ret64 )
     {
-- 
cgit v1.2.3


From 98a6042bb67bbe0ddd9d4291de4292c6882bb095 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Tue, 5 Jul 2022 00:16:44 -0400
Subject: Avoids incrementing uninitialized values

The values were never used, but ubsan + valgrind would complain.

Reviewed by Mark Harris
---
 celt/celt_encoder.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index d6f8afc2..637d442c 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1719,8 +1719,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
       compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
       amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
-      for (i=0;i<C*nbEBands;i++)
-         bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+      for (c=0;c<C;c++)
+      {
+         for (i=0;i<end;i++)
+            bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+      }
    }
 
    compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
@@ -1856,8 +1859,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
          compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
          amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
          /* Compensate for the scaling of short vs long mdcts */
-         for (i=0;i<C*nbEBands;i++)
-            bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+         for (c=0;c<C;c++)
+         {
+            for (i=0;i<end;i++)
+               bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+         }
          tf_estimate = QCONST16(.2f,14);
       }
    }
-- 
cgit v1.2.3


From ef10bf56c34e0da31c4ec5572f4e6f23e5b66306 Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Sat, 2 Jul 2022 11:34:46 -0700
Subject: Silence GCC 11+ -Wmaybe-uninitialized warnings

Reviewed by Timothy B. Terriberry.
---
 silk/float/wrappers_FLP.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index ad90b874..c0c183e3 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -190,12 +190,14 @@ void silk_quant_LTP_gains_FLP(
     opus_int32 XX_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
     opus_int32 xX_Q17[ MAX_NB_SUBFR * LTP_ORDER ];
 
-    for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) {
+    i = 0;
+    do {
         XX_Q17[ i ] = (opus_int32)silk_float2int( XX[ i ] * 131072.0f );
-    }
-    for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
+    } while ( ++i < nb_subfr * LTP_ORDER * LTP_ORDER );
+    i = 0;
+    do {
         xX_Q17[ i ] = (opus_int32)silk_float2int( xX[ i ] * 131072.0f );
-    }
+    } while ( ++i < nb_subfr * LTP_ORDER );
 
     silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, &pred_gain_dB_Q7, XX_Q17, xX_Q17, subfr_len, nb_subfr, arch );
 
-- 
cgit v1.2.3


From bfebf824addf8027ae60dee6ef271980b283a9c1 Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Tue, 5 Jul 2022 15:11:02 -0700
Subject: Fix quoting and whitespace errors in build test

Reviewed by Jean-Marc Valin.
---
 tests/opus_build_test.sh | 21 ++++++++++-----------
 tests/random_config.sh   | 48 ++++++++++++++++++++++++------------------------
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/tests/opus_build_test.sh b/tests/opus_build_test.sh
index b334ec16..573f4473 100755
--- a/tests/opus_build_test.sh
+++ b/tests/opus_build_test.sh
@@ -1,30 +1,29 @@
 #!/bin/sh
 
-tarball=`realpath $1`
-nb_tests=$2
-oldvectors=`realpath $3`
-newvectors=`realpath $4`
-base=`basename $tarball .tar.gz`
+tarball=`realpath "$1"`
+nb_tests="$2"
+oldvectors=`realpath "$3"`
+newvectors=`realpath "$4"`
+base=`basename "$tarball" .tar.gz`
 
-tar xvf $tarball > /dev/null 2>&1
-cd $base
+tar xvf "$tarball" > /dev/null 2>&1
+cd "$base"
 
 if [ $? -ne 0 ]
 then
-	echo cannot go to $base
+        echo cannot go to "$base"
         exit 1
 fi
 
 mkdir build_tests
 
 configure_dir=`pwd`
-seq -w $nb_tests | parallel --halt now,fail=10 -j +2 "../random_config.sh build_tests/run_{} $configure_dir $oldvectors $newvectors"
+seq -w "$nb_tests" | parallel --halt now,fail=10 -j +2 -q ../random_config.sh "build_tests/run_{}" "$configure_dir" "$oldvectors" "$newvectors"
 
 if [ $? -ne 0 ]
 then
         echo Check found errors
         exit 1
 else
-	echo No error found
+        echo No error found
 fi
-
diff --git a/tests/random_config.sh b/tests/random_config.sh
index aea0c389..0cdd855f 100755
--- a/tests/random_config.sh
+++ b/tests/random_config.sh
@@ -1,20 +1,20 @@
 #!/bin/bash
 
-dir=$1
-mkdir $dir
+dir="$1"
+mkdir "$dir"
 if [ $? -ne 0 ]
 then
         exit 1
 fi
 
-cd $dir
+cd "$dir"
 if [ $? -ne 0 ]
 then
         exit 1
 fi
 
 
-configure_path=$2
+configure_path="$2"
 config="random_config.txt"
 
 case `seq 3 | shuf -n1` in
@@ -45,7 +45,7 @@ sanitize=`echo -e "\n-fsanitize=address -fno-sanitize-recover=all\n-fsanitize=un
 
 CFLAGS="$CFLAGS $std $opt $arch $footprint $math $sanitize"
 
-echo CFLAGS=$CFLAGS > $config
+echo "CFLAGS=$CFLAGS" > "$config"
 
 lib=`echo -e "\n--disable-static\n--disable-shared" | shuf -n1`
 
@@ -64,61 +64,61 @@ rfc8251=`echo -e "\n--disable-rfc8251" | shuf -n1`
 
 if [ "$rfc8251" = --disable-rfc8251 ]
 then
-	vectors=$3
+        vectors="$3"
 else
-	vectors=$4
+        vectors="$4"
 fi
-echo using testvectors at $vectors >> $config
+echo using testvectors at "$vectors" >> "$config"
 
 
 config_opt="$lib $arithmetic $custom $asm $assert $harden $fuzz $checkasm $rfc8251 $approx"
 
-echo configure $config_opt >> $config
+echo configure $config_opt >> "$config"
 
 export CFLAGS
-$configure_path/configure $config_opt > configure_output.txt 2>&1
+"$configure_path/configure" $config_opt > configure_output.txt 2>&1
 
 if [ $? -ne 0 ]
 then
-	echo configure FAIL >> $config
-	exit 1
+        echo configure FAIL >> "$config"
+        exit 1
 fi
 
 make > make_output.txt 2>&1
 
 if [ $? -ne 0 ]
 then
-        echo make FAIL >> $config
-	exit 1
+        echo make FAIL >> "$config"
+        exit 1
 fi
 
 #Run valgrind 5% of the time (minus the asan cases)
-if [ `seq 20 | shuf -n1` -ne 1 -o "$sanitize" = "-fsanitize=address -fno-sanitize-recover=all" ]
+if [ "`seq 20 | shuf -n1`" -ne 1 -o "$sanitize" = "-fsanitize=address -fno-sanitize-recover=all" ]
 then
-	make check > makecheck_output.txt 2>&1
+        make check > makecheck_output.txt 2>&1
 else
-	echo valgrind enabled >> $config
-	valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1
+        echo valgrind enabled >> "$config"
+        valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1
 fi
 
 if [ $? -ne 0 ]
 then
-        echo check FAIL >> $config
-	exit 1
+        echo check FAIL >> "$config"
+        exit 1
 fi
 
 
 rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1`
-echo testvectors for $rate Hz > testvectors_output.txt
-../../../run_vectors.sh . $vectors $rate >> testvectors_output.txt 2>&1
+echo testvectors for "$rate" Hz > testvectors_output.txt
+../../../run_vectors.sh . "$vectors" "$rate" >> testvectors_output.txt 2>&1
 
 if [ $? -ne 0 ]
 then
-        echo testvectors FAIL >> $config
+        echo testvectors FAIL >> "$config"
         exit 1
 fi
 
-echo all tests PASS >> $config
+echo all tests PASS >> "$config"
 
 #When everything's good, do some cleaning up to save space
 make distclean > /dev/null 2>&1
-- 
cgit v1.2.3


From 271d48814912a23c7f9443bc81055cbf58ff355b Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Tue, 5 Jul 2022 21:58:02 -0400
Subject: Fix warning with --disable-rfc8251

---
 celt/bands.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/celt/bands.c b/celt/bands.c
index bd54036a..5320ffab 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -1380,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
    return cm;
 }
 
+#ifndef DISABLE_UPDATE_DRAFT
 static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
 {
    int n1, n2;
@@ -1392,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm
    if (dual_stereo)
       OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
 }
+#endif
 
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
-- 
cgit v1.2.3


From 683592180f64f5723274a9105be2f3af6208a901 Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Thu, 30 Jun 2022 18:57:08 -0700
Subject: Replace assert with test_failed function in test

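This fixes the -Wunused-but-set-variable warnings in gcc 9.3 release
builds, where assert() is compiled out and the checked result variables
are otherwise never read. Also remove the unused assert.h includes.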

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 tests/opus_encode_regressions.c | 35 +++++++++++++++++------------------
 tests/test_opus_common.h        |  2 +-
 tests/test_opus_projection.c    |  1 -
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/tests/opus_encode_regressions.c b/tests/opus_encode_regressions.c
index 29234730..4d506eb6 100644
--- a/tests/opus_encode_regressions.c
+++ b/tests/opus_encode_regressions.c
@@ -35,7 +35,6 @@
 #include <stdint.h>
 #include <math.h>
 #include <string.h>
-#include <assert.h>
 #include "opus_multistream.h"
 #include "opus.h"
 #include "test_opus_common.h"
@@ -106,7 +105,7 @@ static int celt_ec_internal_error(void)
               1799,  1799,  1799,  1799, -9721
         };
         err = opus_multistream_encode(enc, pcm, 320, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -144,7 +143,7 @@ static int celt_ec_internal_error(void)
              -9510, -9510, -9510, -9510, -9510, -9510, -9510
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -182,7 +181,7 @@ static int celt_ec_internal_error(void)
              -9510, -9510, -9510, -9510, -9510, -9510, -9510
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -220,7 +219,7 @@ static int celt_ec_internal_error(void)
              -9510, -9510, -9510, -9510, -9510, -9510, -9510
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -256,7 +255,7 @@ static int celt_ec_internal_error(void)
               5632
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -281,7 +280,7 @@ static int celt_ec_internal_error(void)
                  0,     0,  -256,   226
         };
         err = opus_multistream_encode(enc, pcm, 40, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
         /* returns -3 */
     }
     opus_multistream_encoder_destroy(enc);
@@ -334,7 +333,7 @@ static int mscbr_encode_fail10(void)
                  0
         };
         err = opus_multistream_encode(enc, pcm, 20, data, 627300);
-        assert(err > 0);
+        opus_test_assert(err > 0);
         /* returns -1 */
     }
     opus_multistream_encoder_destroy(enc);
@@ -384,7 +383,7 @@ static int mscbr_encode_fail(void)
                  0
         };
         err = opus_multistream_encode(enc, pcm, 20, data, 472320);
-        assert(err > 0);
+        opus_test_assert(err > 0);
         /* returns -1 */
     }
     opus_multistream_encoder_destroy(enc);
@@ -740,7 +739,7 @@ static int surround_analysis_uninit(void)
             -20992, 25859,  5372, 12040, 13307, -4355,-30213,    -9, -6019
         };
         err = opus_multistream_encode(enc, pcm, 960, data, 7380);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(1));
@@ -885,7 +884,7 @@ static int surround_analysis_uninit(void)
         };
         err = opus_multistream_encode(enc, pcm, 1440, data, 7380);
         /* reads uninitialized data at src/opus_multistream_encoder.c:293 */
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_destroy(enc);
     return 0;
@@ -935,7 +934,7 @@ static int ec_enc_shrink_assert(void)
     opus_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC(6));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
     data_len = opus_encode(enc, pcm1, 960, data, 2000);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
     opus_encoder_ctl(enc, OPUS_SET_PREDICTION_DISABLED(1));
@@ -943,12 +942,12 @@ static int ec_enc_shrink_assert(void)
     opus_encoder_ctl(enc, OPUS_SET_INBAND_FEC(1));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(15600));
     data_len = opus_encode(enc, pcm2, 2880, data, 122);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(27000));
     data_len = opus_encode(enc, pcm3, 2880, data, 122); /* assertion failure */
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_destroy(enc);
     return 0;
@@ -970,7 +969,7 @@ static int ec_enc_shrink_assert2(void)
     {
         static const short pcm[960] = { 0 };
         data_len = opus_encode(enc, pcm, 960, data, 2000);
-        assert(data_len > 0);
+        opus_test_assert(data_len > 0);
     }
     opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     {
@@ -980,7 +979,7 @@ static int ec_enc_shrink_assert2(void)
             -32768, -32768, 0, 0, -32768, -32768, 0, 0, -32768, -32768
         };
         data_len = opus_encode(enc, pcm, 480, data, 19);
-        assert(data_len > 0);
+        opus_test_assert(data_len > 0);
     }
     opus_encoder_destroy(enc);
     return 0;
@@ -1009,14 +1008,14 @@ static int silk_gain_assert(void)
     opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_NARROWBAND));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
     data_len = opus_encode(enc, pcm1, 160, data, 1000);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_ctl(enc, OPUS_SET_VBR(0));
     opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(0));
     opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_MEDIUMBAND));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(2867));
     data_len = opus_encode(enc, pcm2, 960, data, 1000);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_destroy(enc);
     return 0;
diff --git a/tests/test_opus_common.h b/tests/test_opus_common.h
index d96c7d84..5fb924f4 100644
--- a/tests/test_opus_common.h
+++ b/tests/test_opus_common.h
@@ -81,5 +81,5 @@ static OPUS_INLINE void _test_failed(const char *file, int line)
   abort();
 }
 #define test_failed() _test_failed(__FILE__, __LINE__);
-
+#define opus_test_assert(cond) {if (!(cond)) {test_failed();}}
 void regression_test(void);
diff --git a/tests/test_opus_projection.c b/tests/test_opus_projection.c
index 5f0d672c..4e06613e 100644
--- a/tests/test_opus_projection.c
+++ b/tests/test_opus_projection.c
@@ -29,7 +29,6 @@
 #include "config.h"
 #endif
 
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
-- 
cgit v1.2.3


From 4782baf0afd506da58e5275a254d34a248a63bf9 Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Fri, 1 Jul 2022 06:41:27 -0700
Subject: Remove unused variable in tests

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 tests/test_opus_padding.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tests/test_opus_padding.c b/tests/test_opus_padding.c
index c22e8f0d..c9ef7375 100644
--- a/tests/test_opus_padding.c
+++ b/tests/test_opus_padding.c
@@ -39,7 +39,7 @@
 #define CHANNELS 2
 #define FRAMESIZE 5760
 
-int test_overflow(void)
+void test_overflow(void)
 {
   OpusDecoder *decoder;
   int result;
@@ -51,7 +51,7 @@ int test_overflow(void)
   fprintf(stderr, "  Checking for padding overflow... ");
   if (!in || !out) {
     fprintf(stderr, "FAIL (out of memory)\n");
-    return -1;
+    test_failed();
   }
   in[0] = 0xff;
   in[1] = 0x41;
@@ -71,21 +71,18 @@ int test_overflow(void)
   }
 
   fprintf(stderr, "OK.\n");
-
-  return 1;
 }
 
 int main(void)
 {
   const char *oversion;
-  int tests = 0;;
 
   iseed = 0;
   oversion = opus_get_version_string();
   if (!oversion) test_failed();
   fprintf(stderr, "Testing %s padding.\n", oversion);
 
-  tests += test_overflow();
+  test_overflow();
 
   fprintf(stderr, "All padding tests passed.\n");
 
-- 
cgit v1.2.3


From 99afa9c48e934711f5b07a917b68411353453338 Mon Sep 17 00:00:00 2001
From: Alexander Traud <pabstraud@compuserve.com>
Date: Fri, 22 Oct 2021 12:19:42 +0000
Subject: Fix opus.h for doxygen when referencing alternative values

Doxygen was not able to resolve the references because it looked
for OPUS_APPLICATION_VOIP/@ref.

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 include/opus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/opus.h b/include/opus.h
index d282f21d..0a4e508a 100644
--- a/include/opus.h
+++ b/include/opus.h
@@ -198,7 +198,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels);
  *                                     This must be one of 8000, 12000, 16000,
  *                                     24000, or 48000.
  * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
- * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP|@ref OPUS_APPLICATION_AUDIO|@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
  * @param [out] error <tt>int*</tt>: @ref opus_errorcodes
  * @note Regardless of the sampling rate and number channels selected, the Opus encoder
  * can switch to a lower audio bandwidth or number of channels if the bitrate
-- 
cgit v1.2.3


From 51fb8955e44ee9ccefd0212f67c7dd6fc7184be0 Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Tue, 5 Jul 2022 20:12:43 -0700
Subject: doc: Use consistent alternative notation

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 include/opus.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/opus.h b/include/opus.h
index 0a4e508a..0c69c627 100644
--- a/include/opus.h
+++ b/include/opus.h
@@ -198,7 +198,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels);
  *                                     This must be one of 8000, 12000, 16000,
  *                                     24000, or 48000.
  * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
- * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP|@ref OPUS_APPLICATION_AUDIO|@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [in] application <tt>int</tt>: Coding mode (one of @ref OPUS_APPLICATION_VOIP, @ref OPUS_APPLICATION_AUDIO, or @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
  * @param [out] error <tt>int*</tt>: @ref opus_errorcodes
  * @note Regardless of the sampling rate and number channels selected, the Opus encoder
  * can switch to a lower audio bandwidth or number of channels if the bitrate
@@ -222,7 +222,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusEncoder *opus_encoder_create(
  *                                      This must be one of 8000, 12000, 16000,
  *                                      24000, or 48000.
   * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
-  * @param [in] application <tt>int</tt>: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+  * @param [in] application <tt>int</tt>: Coding mode (one of OPUS_APPLICATION_VOIP, OPUS_APPLICATION_AUDIO, or OPUS_APPLICATION_RESTRICTED_LOWDELAY)
   * @retval #OPUS_OK Success or @ref opus_errorcodes
   */
 OPUS_EXPORT int opus_encoder_init(
-- 
cgit v1.2.3


From 4b8becdfc5df002701ea1dd97b37d424b12b519d Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Tue, 5 Jul 2022 19:53:18 -0700
Subject: cmake - Add OPUS_BUILD to test targets

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02de9b74..589d1794 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -565,6 +565,7 @@ if(OPUS_BUILD_PROGRAMS)
     target_include_directories(opus_custom_demo
                                PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
     target_link_libraries(opus_custom_demo PRIVATE opus)
+    target_compile_definitions(opus_custom_demo PRIVATE OPUS_BUILD)
   endif()
 
   add_executable(opus_demo ${opus_demo_sources})
@@ -572,6 +573,7 @@ if(OPUS_BUILD_PROGRAMS)
   target_include_directories(opus_demo PRIVATE silk) # debug.h
   target_include_directories(opus_demo PRIVATE celt) # arch.h
   target_link_libraries(opus_demo PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
+  target_compile_definitions(opus_demo PRIVATE OPUS_BUILD)
 
   # compare
   add_executable(opus_compare ${opus_compare_sources})
@@ -587,6 +589,7 @@ if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
   target_include_directories(test_opus_decode
                              PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_link_libraries(test_opus_decode PRIVATE opus)
+  target_compile_definitions(test_opus_decode PRIVATE OPUS_BUILD)
   if(OPUS_FIXED_POINT)
     target_compile_definitions(test_opus_decode PRIVATE DISABLE_FLOAT_API)
   endif()
@@ -608,6 +611,7 @@ if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
   target_include_directories(test_opus_api
                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
   target_link_libraries(test_opus_api PRIVATE opus)
+  target_compile_definitions(test_opus_api PRIVATE OPUS_BUILD)
   if(OPUS_FIXED_POINT)
     target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
   endif()
@@ -620,6 +624,7 @@ if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
   target_include_directories(test_opus_encode
                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
   target_link_libraries(test_opus_encode PRIVATE opus)
+  target_compile_definitions(test_opus_encode PRIVATE OPUS_BUILD)
   add_test(NAME test_opus_encode COMMAND ${CMAKE_COMMAND}
         -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_encode>
         -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
-- 
cgit v1.2.3


From 50407983f2f785adf55a16797e1d42741dc1295b Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Wed, 6 Jul 2022 00:50:54 -0400
Subject: Fix warnings when compiling with FUZZING enabled

---
 celt/rate.c        | 2 ++
 src/opus_encoder.c | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/celt/rate.c b/celt/rate.c
index 465e1ba2..7f7ad3fa 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
             else
                depth_threshold = 0;
 #ifdef FUZZING
+            (void)signalBandwidth;
+            (void)depth_threshold;
             if ((rand()&0x1) == 0)
 #else
             if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 64b6726c..8c8db5a5 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1315,6 +1315,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         st->stream_channels = st->force_channels;
     } else {
 #ifdef FUZZING
+        (void)stereo_music_threshold;
+        (void)stereo_voice_threshold;
        /* Random mono/stereo decision */
        if (st->channels == 2 && (rand()&0x1F)==0)
           st->stream_channels = 3-st->stream_channels;
@@ -1353,6 +1355,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     } else if (st->user_forced_mode == OPUS_AUTO)
     {
 #ifdef FUZZING
+        (void)stereo_width;
+        (void)mode_thresholds;
        /* Random mode switching */
        if ((rand()&0xF)==0)
        {
-- 
cgit v1.2.3


From 918a09a344fde1eb28e0f8475fa84133691b01b1 Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <territim@amazon.com>
Date: Sat, 2 Jul 2022 15:13:10 -0700
Subject: Update x86 CPU detection configure check.

Commit 6577534a80c8 switched from using __get_cpuid() to
 __get_cpuid_count(), but the corresponding configure check was not
 updated.
Since __get_cpuid_count() was introduced much later, make sure we
 check for the function we actually use.

Thanks to Mark Harris for the report.
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index f12f0aa9..e2c88ed1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -724,7 +724,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
                  unsigned int CPUInfo2;
                  unsigned int CPUInfo3;
                  unsigned int InfoType;
-                 __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+                 __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
             ]])],
             [AC_MSG_RESULT([C method])
                  AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])],
-- 
cgit v1.2.3


From 4ad7d2108133d070605b25b1eb91e32c279bf81e Mon Sep 17 00:00:00 2001
From: Doug Nazar <nazard@nazar.ca>
Date: Fri, 26 Mar 2021 15:49:02 -0400
Subject: meson: Fix reporting of cpu family if intrinsics not supported

Signed-off-by: Doug Nazar <nazard@nazar.ca>
---
 meson.build | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/meson.build b/meson.build
index 41f69353..ed66d380 100644
--- a/meson.build
+++ b/meson.build
@@ -532,9 +532,9 @@ if not opt_intrinsics.disabled()
     endif # opt_rtcd
   else
     if opt_intrinsics.enabled()
-      error('intrinsics option enabled, but no intrinsics support for ' + host_machine.get_cpu())
+      error('intrinsics option enabled, but no intrinsics support for ' + host_cpu_family)
     endif
-    warning('No intrinsics support for ' + host_machine.get_cpu())
+    warning('No intrinsics support for ' + host_cpu_family)
   endif
 endif
 
-- 
cgit v1.2.3


From f1b088001ecbc523ffb07109f301d8773bec44b8 Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Wed, 6 Jul 2022 22:45:41 -0700
Subject: Fix uninitialized field on custom mode malloc fail

---
 celt/modes.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/celt/modes.c b/celt/modes.c
index 390c5e8a..23f7cde6 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode)
    mode->nbAllocVectors = BITALLOC_SIZE;
    allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
    if (allocVectors==NULL)
+   {
+      mode->allocVectors = NULL;
       return;
+   }
 
    /* Check for standard mode */
    if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
-- 
cgit v1.2.3


From a80e9e9533d4edeaae282b82f77b8bd8a4903eca Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Tue, 5 Jul 2022 22:48:35 -0700
Subject: cmake - fix lrintf, lrint detection

This commit fixes the failure to detect lrintf and lrint. We switch to
check_symbol_exists instead, per the CMake documentation, and make
sure the math library is linked during detection on *nix.

For MSVC, the issue for non-x86 builds was that the C standard was left
at its default, which is 199409L. As a result, lrintf was not used even
though it was found. To address this, we set the C standard to C11; this
only applies to newer versions of MSVC, where the /std flag is supported.

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 cmake/OpusConfig.cmake | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/cmake/OpusConfig.cmake b/cmake/OpusConfig.cmake
index 8d19a535..b82307a1 100644
--- a/cmake/OpusConfig.cmake
+++ b/cmake/OpusConfig.cmake
@@ -9,16 +9,18 @@ configure_file(cmake/config.h.cmake.in config.h @ONLY)
 add_definitions(-DHAVE_CONFIG_H)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-set_property(GLOBAL PROPERTY C_STANDARD 99)
 
 if(MSVC)
-  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+  # For compilers that have no notion of a C standard level,
+  # such as Microsoft Visual C++ before VS 16.7,
+  # this property has no effect.
+  set(CMAKE_C_STANDARD 11)
+else()
+  set(CMAKE_C_STANDARD 99)
 endif()
 
-include(CheckLibraryExists)
-check_library_exists(m floor "" HAVE_LIBM)
-if(HAVE_LIBM)
-  list(APPEND OPUS_REQUIRED_LIBRARIES m)
+if(MSVC)
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 endif()
 
 include(CFeatureCheck)
@@ -35,9 +37,18 @@ else()
   check_symbol_exists(alloca "stdlib.h;malloc.h" USE_ALLOCA_SUPPORTED)
 endif()
 
-include(CheckFunctionExists)
-check_function_exists(lrintf HAVE_LRINTF)
-check_function_exists(lrint HAVE_LRINT)
+include(CMakePushCheckState)
+cmake_push_check_state(RESET)
+include(CheckLibraryExists)
+check_library_exists(m floor "" HAVE_LIBM)
+if(HAVE_LIBM)
+  list(APPEND OPUS_REQUIRED_LIBRARIES m)
+  set(CMAKE_REQUIRED_LIBRARIES m)
+endif()
+
+check_symbol_exists(lrintf "math.h" HAVE_LRINTF)
+check_symbol_exists(lrint "math.h" HAVE_LRINT)
+cmake_pop_check_state()
 
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "(i[0-9]86|x86|X86|amd64|AMD64|x86_64)")
   if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-- 
cgit v1.2.3


From 510e1029b45aa01a7047e5b3b82169be40911b96 Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Wed, 6 Jul 2022 21:35:16 -0700
Subject: cmake - fix rtcd detection on x86 non windows

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 CMakeLists.txt            | 14 ++++++++++++--
 Makefile.am               |  2 ++
 cmake/OpusFunctions.cmake | 30 ++++++++++++++++++++++--------
 cmake/cpu_info_by_asm.c   | 31 +++++++++++++++++++++++++++++++
 cmake/cpu_info_by_c.c     |  9 +++++++++
 5 files changed, 76 insertions(+), 10 deletions(-)
 create mode 100644 cmake/cpu_info_by_asm.c
 create mode 100644 cmake/cpu_info_by_c.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 589d1794..75362ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -366,11 +366,21 @@ if(NOT OPUS_ENABLE_FLOAT_API)
 endif()
 
 if(NOT OPUS_DISABLE_INTRINSICS)
-  if((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR
+  if(((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR
      (OPUS_X86_MAY_HAVE_SSE2 AND NOT OPUS_X86_PRESUME_SSE2) OR
      (OPUS_X86_MAY_HAVE_SSE4_1 AND NOT OPUS_X86_PRESUME_SSE4_1) OR
-     (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX))
+     (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX)) AND
+      RUNTIME_CPU_CAPABILITY_DETECTION)
     target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD)
+    if(NOT MSVC)  # non-MSVC builds must pick one of the probed cpuid access methods
+      if(CPU_INFO_BY_ASM_SUPPORTED)
+        target_compile_definitions(opus PRIVATE CPU_INFO_BY_ASM)
+      elseif(CPU_INFO_BY_C_SUPPORTED)
+        target_compile_definitions(opus PRIVATE CPU_INFO_BY_C)
+      else()  # "ERROR" is not a message() mode; FATAL_ERROR is needed to actually stop configuration
+        message(FATAL_ERROR "Runtime cpu capability detection is enabled while CPU_INFO is not supported")
+      endif()
+    endif()
   endif()
 
   if(SSE1_SUPPORTED)
diff --git a/Makefile.am b/Makefile.am
index 70a2ebfa..e1f8c2c4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -225,6 +225,8 @@ EXTRA_DIST = opus.pc.in \
              cmake/RunTest.cmake \
              cmake/config.h.cmake.in \
              cmake/vla.c \
+             cmake/cpu_info_by_asm.c \
+             cmake/cpu_info_by_c.c \
              meson/get-version.py \
              meson/read-sources-list.py \
              meson.build \
diff --git a/cmake/OpusFunctions.cmake b/cmake/OpusFunctions.cmake
index fcf3351f..3f22ad81 100644
--- a/cmake/OpusFunctions.cmake
+++ b/cmake/OpusFunctions.cmake
@@ -142,14 +142,28 @@ function(opus_detect_neon COMPILER_SUPPORT_NEON)
 endfunction()
 
 function(opus_supports_cpu_detection RUNTIME_CPU_CAPABILITY_DETECTION)
-  if(MSVC)
-    check_include_file(intrin.h HAVE_INTRIN_H)
-  else()
-    check_include_file(cpuid.h HAVE_CPUID_H)
-  endif()
-  if(HAVE_INTRIN_H OR HAVE_CPUID_H)
-    set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
-  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
+  set(RUNTIME_CPU_CAPABILITY_DETECTION 0 PARENT_SCOPE)
+  if(OPUS_CPU_X86 OR OPUS_CPU_X64)
+    if(MSVC)
+      check_include_file(intrin.h HAVE_INTRIN_H)
+      if(HAVE_INTRIN_H)
+        # if intrin.h is available we assume __cpuid is there
+        set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
+      endif()
+    else()
+      include(CFeatureCheck)
+      c_feature_check(CPU_INFO_BY_ASM)
+      set(CPU_INFO_BY_ASM_SUPPORTED ${CPU_INFO_BY_ASM_SUPPORTED} PARENT_SCOPE)
+      check_include_file(cpuid.h HAVE_CPUID_H)
+      if(HAVE_CPUID_H)
+        c_feature_check(CPU_INFO_BY_C)
+        set(CPU_INFO_BY_C_SUPPORTED ${CPU_INFO_BY_C_SUPPORTED} PARENT_SCOPE)
+      endif()
+      if(CPU_INFO_BY_ASM_SUPPORTED OR CPU_INFO_BY_C_SUPPORTED)
+        set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
+      endif()
+    endif()
+  elseif(OPUS_CPU_ARM)
     # ARM cpu detection is implemented for Windows and anything
     # using a Linux kernel (such as Android).
     if (CMAKE_SYSTEM_NAME MATCHES "(Windows|Linux|Android)")
diff --git a/cmake/cpu_info_by_asm.c b/cmake/cpu_info_by_asm.c
new file mode 100644
index 00000000..1a70a815
--- /dev/null
+++ b/cmake/cpu_info_by_asm.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+int main() {
+    unsigned int CPUInfo0;
+    unsigned int CPUInfo1;
+    unsigned int CPUInfo2;
+    unsigned int CPUInfo3;
+    unsigned int InfoType;
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
+    __asm__ __volatile__ (
+        "xchg %%ebx, %1\n"
+        "cpuid\n"
+        "xchg %%ebx, %1\n":
+        "=a" (CPUInfo0),
+        "=r" (CPUInfo1),
+        "=c" (CPUInfo2),
+        "=d" (CPUInfo3) :
+        "0" (InfoType), "2" (0)
+    );
+#else
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (CPUInfo0),
+        "=b" (CPUInfo1),
+        "=c" (CPUInfo2),
+        "=d" (CPUInfo3) :
+        "0" (InfoType), "2" (0)
+    );
+#endif
+    return 0;
+}
diff --git a/cmake/cpu_info_by_c.c b/cmake/cpu_info_by_c.c
new file mode 100644
index 00000000..117084eb
--- /dev/null
+++ b/cmake/cpu_info_by_c.c
@@ -0,0 +1,9 @@
+#include <cpuid.h>
+int main() {
+    unsigned int CPUInfo0;
+    unsigned int CPUInfo1;
+    unsigned int CPUInfo2;
+    unsigned int CPUInfo3;
+    unsigned int InfoType;
+    return __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+}
-- 
cgit v1.2.3


From 1f891e3616b4e8b17276830930c296c9f88332e3 Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Thu, 7 Jul 2022 20:14:37 -0700
Subject: cmake - move warning C4244 to level 4

opus_compare is used to generate test vectors, so no cosmetic changes
are made to it. Hence we move this warning to level 4 for opus_compare.

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75362ccf..21c7ae53 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -589,6 +589,10 @@ if(OPUS_BUILD_PROGRAMS)
   add_executable(opus_compare ${opus_compare_sources})
   target_include_directories(opus_compare PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_link_libraries(opus_compare PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
+  if(MSVC)
+    # move cosmetic warning to level 4 for opus_compare
+    target_compile_options(opus_compare PRIVATE /w44244)
+  endif()
 endif()
 
 if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
-- 
cgit v1.2.3


From caf56aab41c53b129491c986844de029e619ce27 Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Fri, 8 Jul 2022 18:25:46 -0700
Subject: update doc on custom mode

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
---
 include/opus_custom.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/opus_custom.h b/include/opus_custom.h
index 2227be01..2f22d4b3 100644
--- a/include/opus_custom.h
+++ b/include/opus_custom.h
@@ -104,7 +104,8 @@ typedef struct OpusCustomDecoder OpusCustomDecoder;
 /** The mode contains all the information necessary to create an
     encoder. Both the encoder and decoder need to be initialized
     with exactly the same mode, otherwise the output will be
-    corrupted.
+    corrupted. The mode MUST NOT BE DESTROYED until the encoders and
+    decoders that use it are destroyed as well.
    @brief Mode configuration
  */
 typedef struct OpusCustomMode OpusCustomMode;
-- 
cgit v1.2.3


From 1504d2d4aaa8ec3a2d298a5bc226cde427f82b26 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Sat, 9 Jul 2022 03:15:18 -0400
Subject: Fix C90-related warnings

---
 celt/tests/test_unit_dft.c               | 3 ++-
 celt/tests/test_unit_entropy.c           | 4 ++--
 celt/tests/test_unit_mdct.c              | 3 ++-
 silk/tests/test_unit_LPC_inv_pred_gain.c | 2 +-
 silk/x86/NSQ_del_dec_sse4_1.c            | 3 ++-
 silk/x86/NSQ_sse4_1.c                    | 2 +-
 6 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 70f8f493..ae9a7b56 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch)
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c
index 7f674529..b1619b74 100644
--- a/celt/tests/test_unit_entropy.c
+++ b/celt/tests/test_unit_entropy.c
@@ -104,7 +104,7 @@ int main(int _argc,char **_argv){
   nbits=ec_tell_frac(&enc);
   ec_enc_done(&enc);
   fprintf(stderr,
-   "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+   "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n",
    entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
   fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
   ec_dec_init(&dec,ptr,DATA_SIZE);
@@ -129,7 +129,7 @@ int main(int _argc,char **_argv){
   nbits2=ec_tell_frac(&dec);
   if(nbits!=nbits2){
     fprintf(stderr,
-     "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+     "Reported number of bits used was %0.2f, should be %0.2f.\n",
      ldexp(nbits2,-3),ldexp(nbits,-3));
     ret=-1;
   }
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 4a563ccf..844c5b48 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch)
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
diff --git a/silk/tests/test_unit_LPC_inv_pred_gain.c b/silk/tests/test_unit_LPC_inv_pred_gain.c
index 67067cea..7ca902ad 100644
--- a/silk/tests/test_unit_LPC_inv_pred_gain.c
+++ b/silk/tests/test_unit_LPC_inv_pred_gain.c
@@ -43,6 +43,7 @@ int check_stability(opus_int16 *A_Q12, int order) {
     int i;
     int j;
     int sum_a, sum_abs_a;
+    double y[SILK_MAX_ORDER_LPC] = {0};
     sum_a = sum_abs_a = 0;
     for( j = 0; j < order; j++ ) {
         sum_a += A_Q12[ j ];
@@ -57,7 +58,6 @@ int check_stability(opus_int16 *A_Q12, int order) {
     if( sum_abs_a < 4096 ) {
         return 1;
     }
-    double y[SILK_MAX_ORDER_LPC] = {0};
     y[0] = 1;
     for( i = 0; i < 10000; i++ ) {
         double sum = 0;
diff --git a/silk/x86/NSQ_del_dec_sse4_1.c b/silk/x86/NSQ_del_dec_sse4_1.c
index 42735c52..a58a76cd 100644
--- a/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/silk/x86/NSQ_del_dec_sse4_1.c
@@ -387,6 +387,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    int rdo_offset;
 
     VARDECL( NSQ_sample_pair, psSampleState );
     NSQ_del_dec_struct *psDD;
@@ -399,7 +400,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     celt_assert( nStatesDelayedDecision > 0 );
     ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
 
-    int rdo_offset = (Lambda_Q10 >> 1) - 512;
+    rdo_offset = (Lambda_Q10 >> 1) - 512;
 
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
diff --git a/silk/x86/NSQ_sse4_1.c b/silk/x86/NSQ_sse4_1.c
index a2a74659..d5ae1d3b 100644
--- a/silk/x86/NSQ_sse4_1.c
+++ b/silk/x86/NSQ_sse4_1.c
@@ -719,10 +719,10 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
 
     /* Adjust for changing gain */
     if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
         gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
 
         /* Scale long-term shaping state */
-        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
 
         /* prepare gain_adj_Q16 in packed 4 32-bits */
         xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
-- 
cgit v1.2.3


From affb551e47052d5b9a0e37c681c816a6cf4159a7 Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <territim@amazon.com>
Date: Wed, 6 Jul 2022 16:12:11 -0700
Subject: Make silk/x86 header indentation consistent.

The indentation for nested #ifs was all over the place.
---
 silk/x86/SigProc_FIX_sse.h | 40 +++++++++++++++---------------
 silk/x86/main_sse.h        | 62 +++++++++++++++++++++++-----------------------
 2 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index e49d5d4e..9bcaa805 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -26,13 +26,13 @@
 */
 
 #ifndef SIGPROC_FIX_SSE_H
-#define SIGPROC_FIX_SSE_H
+# define SIGPROC_FIX_SSE_H
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+#  include "config.h"
+# endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 void silk_burg_modified_sse4_1(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
@@ -45,11 +45,11 @@ void silk_burg_modified_sse4_1(
     int                         arch                /* I    Run-time architecture                                       */
 );
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
-    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
+#   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+       ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#else
+#  else
 
 extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
@@ -62,10 +62,10 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              D,                  /* I    Order                                                       */
     int                         arch                /* I    Run-time architecture                                       */);
 
-#  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
-    ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+     ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#endif
+#  endif
 
 opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16 *inVec1,
@@ -74,21 +74,21 @@ opus_int64 silk_inner_prod16_sse4_1(
 );
 
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
 
-#define silk_inner_prod16(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
+#   define silk_inner_prod16(inVec1, inVec2, len, arch) \
+       ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
 
-#else
+#  else
 
 extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_int16 *inVec1,
                     const opus_int16 *inVec2,
                     const opus_int   len);
 
-#  define silk_inner_prod16(inVec1, inVec2, len, arch) \
-    ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+#   define silk_inner_prod16(inVec1, inVec2, len, arch) \
+     ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
 
-#endif
-#endif
+#  endif
+# endif
 #endif
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index 0a0391a2..9ed436bb 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -26,11 +26,11 @@
 */
 
 #ifndef MAIN_SSE_H
-#define MAIN_SSE_H
+# define MAIN_SSE_H
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+#  include "config.h"
+# endif
 
 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
@@ -51,14 +51,14 @@ void silk_VQ_WMat_EC_sse4_1(
     const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          subfr_len, max_gain_Q7, L, arch) \
+#   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                           subfr_len, max_gain_Q7, L, arch) \
     ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
                           subfr_len, max_gain_Q7, L))
 
-#else
+#  else
 
 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
@@ -75,12 +75,12 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#  define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          subfr_len, max_gain_Q7, L, arch) \
+#   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                           subfr_len, max_gain_Q7, L, arch) \
     ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
                           subfr_len, max_gain_Q7, L))
 
-#endif
+#  endif
 
 #  define OVERRIDE_silk_NSQ
 
@@ -102,14 +102,14 @@ void silk_NSQ_sse4_1(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+#   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#else
+#  else
 
 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
@@ -129,12 +129,12 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+#   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#endif
+#  endif
 
 #  define OVERRIDE_silk_NSQ_del_dec
 
@@ -156,14 +156,14 @@ void silk_NSQ_del_dec_sse4_1(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
-                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#else
+#  else
 
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
@@ -183,12 +183,12 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
-                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#endif
+#  endif
 
 void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
@@ -228,19 +228,19 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(
     const opus_int16   pIn[]
 );
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
 
-#else
+#  else
 
-#  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
-     ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
+      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
 
 extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
      silk_encoder_state *psEnC,
      const opus_int16   pIn[]);
 
-#endif
+#  endif
 
 # endif
 #endif
-- 
cgit v1.2.3


From 71fb707875b95672f0cd1cb153c890eff4219720 Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <territim@amazon.com>
Date: Wed, 6 Jul 2022 15:21:16 -0700
Subject: Don't compile x86 cpu detection without RTCD.

Also #error if RTCD is enabled without a detection method, as the Arm
code already does.
A number of SILK functions also still used the lookup tables, even
when RTCD was disabled.
Fix those, too.
---
 celt/cpu_support.h         |  5 +++--
 celt/x86/x86cpu.c          |  9 ++++++---
 silk/SigProc_FIX.h         |  4 +++-
 silk/x86/SigProc_FIX_sse.h |  9 +++++++--
 silk/x86/main_sse.h        | 31 ++++++++++++++++---------------
 silk/x86/x86_silk_map.c    |  2 +-
 6 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 68fc6067..7b5c56ca 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,10 +43,11 @@
  */
 #define OPUS_ARCHMASK 3
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#elif defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
 
 #include "x86/x86cpu.h"
 /* We currently support 5 x86 variants:
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 7cfc8db5..6a1914de 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
 #include "pitch.h"
 #include "x86cpu.h"
 
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
 
 #if defined(_MSC_VER)
 
@@ -91,6 +91,9 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
             what we want on CPUs that don't support CPUID. */
         CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
     }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
 #endif
 }
 
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index 1d9bf2f1..fbdfa82e 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -609,10 +609,12 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 /* the following seems faster on x86 */
 #define silk_SMMUL(a32, b32)                (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
 
-#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if !defined(OVERRIDE_silk_burg_modified)
 #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
     ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#endif
 
+#if !defined(OVERRIDE_silk_inner_prod16)
 #define silk_inner_prod16(inVec1, inVec2, len, arch) \
     ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
 #endif
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 9bcaa805..89a5ec88 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -46,10 +46,12 @@ void silk_burg_modified_sse4_1(
 );
 
 #  if defined(OPUS_X86_PRESUME_SSE4_1)
+
+#   define OVERRIDE_silk_burg_modified
 #   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
        ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#  else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
@@ -62,6 +64,7 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              D,                  /* I    Order                                                       */
     int                         arch                /* I    Run-time architecture                                       */);
 
+#   define OVERRIDE_silk_burg_modified
 #   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
      ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
@@ -76,16 +79,18 @@ opus_int64 silk_inner_prod16_sse4_1(
 
 #  if defined(OPUS_X86_PRESUME_SSE4_1)
 
+#   define OVERRIDE_silk_inner_prod16
 #   define silk_inner_prod16(inVec1, inVec2, len, arch) \
        ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
 
-#  else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_int16 *inVec1,
                     const opus_int16 *inVec2,
                     const opus_int   len);
 
+#   define OVERRIDE_silk_inner_prod16
 #   define silk_inner_prod16(inVec1, inVec2, len, arch) \
      ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
 
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index 9ed436bb..a01d7f6c 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -34,8 +34,6 @@
 
 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
-#  define OVERRIDE_silk_VQ_WMat_EC
-
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
     opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
@@ -53,12 +51,13 @@ void silk_VQ_WMat_EC_sse4_1(
 
 #  if defined OPUS_X86_PRESUME_SSE4_1
 
+#   define OVERRIDE_silk_VQ_WMat_EC
 #   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
                            subfr_len, max_gain_Q7, L, arch) \
     ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
                           subfr_len, max_gain_Q7, L))
 
-#  else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
@@ -75,6 +74,7 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
+#   define OVERRIDE_silk_VQ_WMat_EC
 #   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
                            subfr_len, max_gain_Q7, L, arch) \
     ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
@@ -82,8 +82,6 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
 
 #  endif
 
-#  define OVERRIDE_silk_NSQ
-
 void silk_NSQ_sse4_1(
     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
     silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
@@ -104,12 +102,13 @@ void silk_NSQ_sse4_1(
 
 #  if defined OPUS_X86_PRESUME_SSE4_1
 
+#   define OVERRIDE_silk_NSQ
 #   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                     HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#  else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
@@ -129,6 +128,7 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
+#   define OVERRIDE_silk_NSQ
 #   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                     HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
@@ -136,8 +136,6 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
 
 #  endif
 
-#  define OVERRIDE_silk_NSQ_del_dec
-
 void silk_NSQ_del_dec_sse4_1(
     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
     silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
@@ -158,12 +156,13 @@ void silk_NSQ_del_dec_sse4_1(
 
 #  if defined OPUS_X86_PRESUME_SSE4_1
 
+#   define OVERRIDE_silk_NSQ_del_dec
 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                             HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#  else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
@@ -183,6 +182,7 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
+#   define OVERRIDE_silk_NSQ_del_dec
 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                             HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
@@ -221,25 +221,26 @@ void silk_VAD_GetNoiseLevels(
     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
 );
 
-#  define OVERRIDE_silk_VAD_GetSA_Q8
-
 opus_int silk_VAD_GetSA_Q8_sse4_1(
     silk_encoder_state *psEnC,
     const opus_int16   pIn[]
 );
 
 #  if defined(OPUS_X86_PRESUME_SSE4_1)
+
+#   define OVERRIDE_silk_VAD_GetSA_Q8
 #   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
 
-#  else
-
-#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
-      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
      silk_encoder_state *psEnC,
      const opus_int16   pIn[]);
 
+#   define OVERRIDE_silk_VAD_GetSA_Q8
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
+      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+
 #  endif
 
 # endif
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index ca13cde9..70f60078 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -35,7 +35,7 @@
 #include "pitch.h"
 #include "main.h"
 
-#if !defined(OPUS_X86_PRESUME_SSE4_1)
+#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_SSE4_1)
 
 #if defined(FIXED_POINT)
 
-- 
cgit v1.2.3


From 08088411259056f63774befb2d00951fdd5c46ba Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <territim@amazon.com>
Date: Wed, 6 Jul 2022 15:23:12 -0700
Subject: Only build platform RTCD sources when enabled.

To avoid issues with empty compilation units.
---
 CMakeLists.txt          |  8 ++++----
 Makefile.am             | 12 ++++++++++--
 celt/meson.build        |  8 +++++++-
 celt_sources.mk         |  8 +++++---
 cmake/OpusSources.cmake |  5 ++++-
 configure.ac            |  5 +++++
 silk/meson.build        | 10 ++++++++++
 silk_sources.mk         | 10 +++++++---
 8 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21c7ae53..9d824cdc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -381,6 +381,8 @@ if(NOT OPUS_DISABLE_INTRINSICS)
         message(ERROR "Runtime cpu capability detection is enabled while CPU_INFO is not supported")
       endif()
     endif()
+    add_sources_group(opus celt ${celt_sources_x86_rtcd})
+    add_sources_group(opus silk ${silk_sources_x86_rtcd})
   endif()
 
   if(SSE1_SUPPORTED)
@@ -465,15 +467,13 @@ if(NOT OPUS_DISABLE_INTRINSICS)
     endif()
   endif()
 
-  if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
-    add_sources_group(opus celt ${celt_sources_arm})
-  endif()
-
   if(COMPILER_SUPPORT_NEON)
     if(OPUS_MAY_HAVE_NEON)
       if(RUNTIME_CPU_CAPABILITY_DETECTION)
         message(STATUS "OPUS_MAY_HAVE_NEON enabling runtime detection")
         target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD)
+        add_sources_group(opus celt ${celt_sources_arm_rtcd})
+        add_sources_group(opus silk ${silk_sources_arm_rtcd})
       else()
         message(ERROR "Runtime cpu capability detection needed for MAY_HAVE_NEON")
       endif()
diff --git a/Makefile.am b/Makefile.am
index e1f8c2c4..492fc09d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -36,6 +36,11 @@ else
 OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
 endif
 
+if CPU_X86
+if HAVE_RTCD
+CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
+SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+endif
 if HAVE_SSE
 CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
@@ -45,10 +50,13 @@ endif
 if HAVE_SSE4_1
 CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
 endif
+endif
 
 if CPU_ARM
-CELT_SOURCES += $(CELT_SOURCES_ARM)
-SILK_SOURCES += $(SILK_SOURCES_ARM)
+if HAVE_RTCD
+CELT_SOURCES += $(CELT_SOURCES_ARM_RTCD)
+SILK_SOURCES += $(SILK_SOURCES_ARM_RTCD)
+endif
 
 if HAVE_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
diff --git a/celt/meson.build b/celt/meson.build
index 370ea1fe..ad95d949 100644
--- a/celt/meson.build
+++ b/celt/meson.build
@@ -10,6 +10,10 @@ celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR']
 
 celt_static_libs = []
 
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+  celt_sources +=  sources['CELT_SOURCES_X86_RTCD']
+endif
+
 foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr']
   have_intr = get_variable('have_' + intr_name)
   if not have_intr
@@ -30,7 +34,9 @@ if (intrinsics_support.length() + asm_optimization.length() + inline_optimizatio
 endif
 
 if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
-  celt_sources +=  sources['CELT_SOURCES_ARM']
+  if opus_conf.has('OPUS_HAVE_RTCD')
+    celt_sources +=  sources['CELT_SOURCES_ARM_RTCD']
+  endif
   if have_arm_ne10
     celt_sources += sources['CELT_SOURCES_ARM_NE10']
   endif
diff --git a/celt_sources.mk b/celt_sources.mk
index c9dab06e..d6b6765b 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -18,9 +18,11 @@ celt/quant_bands.c \
 celt/rate.c \
 celt/vq.c
 
-CELT_SOURCES_SSE = \
+CELT_SOURCES_X86_RTCD = \
 celt/x86/x86cpu.c \
-celt/x86/x86_celt_map.c \
+celt/x86/x86_celt_map.c
+
+CELT_SOURCES_SSE = \
 celt/x86/pitch_sse.c
 
 CELT_SOURCES_SSE2 = \
@@ -31,7 +33,7 @@ CELT_SOURCES_SSE4_1 = \
 celt/x86/celt_lpc_sse4_1.c \
 celt/x86/pitch_sse4_1.c
 
-CELT_SOURCES_ARM = \
+CELT_SOURCES_ARM_RTCD = \
 celt/arm/armcpu.c \
 celt/arm/arm_celt_map.c
 
diff --git a/cmake/OpusSources.cmake b/cmake/OpusSources.cmake
index 01e75d1a..b47f8c69 100644
--- a/cmake/OpusSources.cmake
+++ b/cmake/OpusSources.cmake
@@ -9,9 +9,11 @@ get_opus_sources(SILK_HEAD silk_headers.mk silk_headers)
 get_opus_sources(SILK_SOURCES silk_sources.mk silk_sources)
 get_opus_sources(SILK_SOURCES_FLOAT silk_sources.mk silk_sources_float)
 get_opus_sources(SILK_SOURCES_FIXED silk_sources.mk silk_sources_fixed)
+get_opus_sources(SILK_SOURCES_X86_RTCD silk_sources.mk silk_sources_x86_rtcd)
 get_opus_sources(SILK_SOURCES_SSE4_1 silk_sources.mk silk_sources_sse4_1)
 get_opus_sources(SILK_SOURCES_FIXED_SSE4_1 silk_sources.mk
                  silk_sources_fixed_sse4_1)
+get_opus_sources(SILK_SOURCES_ARM_RTCD silk_sources.mk silk_sources_arm_rtcd)
 get_opus_sources(SILK_SOURCES_ARM_NEON_INTR silk_sources.mk
                  silk_sources_arm_neon_intr)
 get_opus_sources(SILK_SOURCES_FIXED_ARM_NEON_INTR silk_sources.mk
@@ -23,10 +25,11 @@ get_opus_sources(OPUS_SOURCES_FLOAT opus_sources.mk opus_sources_float)
 
 get_opus_sources(CELT_HEAD celt_headers.mk celt_headers)
 get_opus_sources(CELT_SOURCES celt_sources.mk celt_sources)
+get_opus_sources(CELT_SOURCES_X86_RTCD celt_sources.mk celt_sources_x86_rtcd)
 get_opus_sources(CELT_SOURCES_SSE celt_sources.mk celt_sources_sse)
 get_opus_sources(CELT_SOURCES_SSE2 celt_sources.mk celt_sources_sse2)
 get_opus_sources(CELT_SOURCES_SSE4_1 celt_sources.mk celt_sources_sse4_1)
-get_opus_sources(CELT_SOURCES_ARM celt_sources.mk celt_sources_arm)
+get_opus_sources(CELT_SOURCES_ARM_RTCD celt_sources.mk celt_sources_arm_rtcd)
 get_opus_sources(CELT_SOURCES_ARM_ASM celt_sources.mk celt_sources_arm_asm)
 get_opus_sources(CELT_AM_SOURCES_ARM_ASM celt_sources.mk
                  celt_am_sources_arm_asm)
diff --git a/configure.ac b/configure.ac
index e2c88ed1..1d426f27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -195,6 +195,7 @@ AC_ARG_ENABLE([intrinsics],
 
 rtcd_support=no
 cpu_arm=no
+cpu_x86=no
 
 AS_IF([test x"${enable_asm}" = x"yes"],[
     inline_optimization="No inline ASM for your platform, please send patches"
@@ -535,6 +536,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
    ],
    [i?86|x86_64],
    [
+      cpu_x86=yes
       OPUS_CHECK_INTRINSICS(
          [SSE],
          [$X86_SSE_CFLAGS],
@@ -744,6 +746,7 @@ AM_CONDITIONAL([HAVE_ARM_NEON_INTR],
     [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
 AM_CONDITIONAL([HAVE_ARM_NE10],
     [test x"$HAVE_ARM_NE10" = x"1"])
+AM_CONDITIONAL([CPU_X86], [test "$cpu_x86" = "yes"])
 AM_CONDITIONAL([HAVE_SSE],
     [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"])
 AM_CONDITIONAL([HAVE_SSE2],
@@ -753,6 +756,8 @@ AM_CONDITIONAL([HAVE_SSE4_1],
 AM_CONDITIONAL([HAVE_AVX],
     [test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"])
 
+AM_CONDITIONAL([HAVE_RTCD],
+ [test x"$enable_rtcd" = x"yes" -a x"$rtcd_support" != x"no"])
 AS_IF([test x"$enable_rtcd" = x"yes"],[
     AS_IF([test x"$rtcd_support" != x"no"],[
         AC_DEFINE([OPUS_HAVE_RTCD], [1],
diff --git a/silk/meson.build b/silk/meson.build
index 70692372..917048b2 100644
--- a/silk/meson.build
+++ b/silk/meson.build
@@ -21,6 +21,16 @@ endif
 silk_includes = [opus_includes, include_directories('float', 'fixed')]
 silk_static_libs = []
 
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+  silk_sources +=  sources['SILK_SOURCES_X86_RTCD']
+endif
+
+if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
+  if opus_conf.has('OPUS_HAVE_RTCD')
+    silk_sources +=  sources['SILK_SOURCES_ARM_RTCD']
+  endif
+endif
+
 foreach intr_name : ['sse4_1', 'neon_intr']
   have_intr = get_variable('have_' + intr_name)
   if not have_intr
diff --git a/silk_sources.mk b/silk_sources.mk
index d2666e66..3df24816 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -77,15 +77,19 @@ silk/stereo_find_predictor.c \
 silk/stereo_quant_pred.c \
 silk/LPC_fit.c
 
-SILK_SOURCES_SSE4_1 =  \
+SILK_SOURCES_X86_RTCD = \
+silk/x86/x86_silk_map.c
+
+SILK_SOURCES_SSE4_1 = \
 silk/x86/NSQ_sse4_1.c \
 silk/x86/NSQ_del_dec_sse4_1.c \
-silk/x86/x86_silk_map.c \
 silk/x86/VAD_sse4_1.c \
 silk/x86/VQ_WMat_EC_sse4_1.c
 
+SILK_SOURCES_ARM_RTCD = \
+silk/arm/arm_silk_map.c
+
 SILK_SOURCES_ARM_NEON_INTR = \
-silk/arm/arm_silk_map.c \
 silk/arm/biquad_alt_neon_intr.c \
 silk/arm/LPC_inv_pred_gain_neon_intr.c \
 silk/arm/NSQ_del_dec_neon_intr.c \
-- 
cgit v1.2.3


From 243987518a65218ffe5cf260756cbf66583a9bb4 Mon Sep 17 00:00:00 2001
From: Mark Harris <mark.hsj@gmail.com>
Date: Sat, 9 Jul 2022 22:14:13 -0700
Subject: Silence Clang 13+ null-pointer-subtraction warning

---
 celt/pitch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/celt/pitch.c b/celt/pitch.c
index 586ca8c3..7998db41 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -258,7 +258,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
    opus_val32 maxcorr=1;
 #endif
    celt_assert(max_pitch>0);
-   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert(((size_t)_x&3)==0);
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-- 
cgit v1.2.3


From e4a74ddeb9788eab84d10660e958fe706619892f Mon Sep 17 00:00:00 2001
From: Marcus Asteborg <maastebo@microsoft.com>
Date: Tue, 12 Jul 2022 10:12:21 +0200
Subject: Silence MSVC C4244 warning

When building with FLOAT_APPROX.

Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 celt/mathops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/celt/mathops.h b/celt/mathops.h
index fe29dac1..478ac918 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -153,7 +153,7 @@ static OPUS_INLINE float celt_exp2(float x)
       float f;
       opus_uint32 i;
    } res;
-   integer = floor(x);
+   integer = (int)floor(x);
    if (integer < -50)
       return 0;
    frac = x-integer;
-- 
cgit v1.2.3


From c9d5bea13e3cb7381bfa897a45d8bab4e7b767a7 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Wed, 13 Jul 2022 19:20:06 -0400
Subject: Fix NORM_ALIASING_HACK

We need to move the history out of the way before we write to the
shape array X, or else we get corruption of the audio.

Signed-off-by: Jean-Marc Valin <jmvalin@amazon.com>
---
 celt/celt_decoder.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 77eb44f4..35a2073a 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -557,6 +557,10 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
 #else
       ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
 #endif
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+(overlap>>1));
+      } while (++c<C);
 
       /* Energy decay */
       decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
@@ -585,11 +589,6 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       }
       st->rng = seed;
 
-      c=0; do {
-         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
-               DECODE_BUFFER_SIZE-N+(overlap>>1));
-      } while (++c<C);
-
       celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
    } else {
       int exc_length;
-- 
cgit v1.2.3


From fbed746cb2d0fee57eae090e67d148d89923f6ff Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Fri, 22 Jul 2022 02:27:34 -0400
Subject: Relaxing checks for MULT16_32_QX()

MULT16_32_QX() is now implemented using a signed-unsigned multiply,
so the second argument can now have one extra bit compared to the
old signed-signed implementation.

Reviewed by Mark Harris
---
 celt/fixed_debug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index c2cf5a83..ef2e5d02 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -491,7 +491,7 @@ static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int
       celt_assert(0);
 #endif
    }
-   if (ABS32(b)>=((opus_val32)(1)<<(15+Q)))
+   if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
    {
       fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -524,7 +524,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int
       celt_assert(0);
 #endif
    }
-   if (ABS32(b)>=((opus_int64)(1)<<(15+Q)))
+   if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
    {
       fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line);
 #ifdef FIXED_DEBUG_ASSERT
-- 
cgit v1.2.3


From e05aea9785f709f5aebb696ee5b4460681676e10 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Fri, 22 Jul 2022 02:29:05 -0400
Subject: Using saturating round to fix some wrap-arounds

Reviewed by Mark Harris
---
 celt/celt_decoder.c |  8 ++++----
 celt/celt_lpc.c     | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 35a2073a..883dae15 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -629,7 +629,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
 
          buf = decode_mem[c];
          for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
-            exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
+            exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
 
          if (loss_duration == 0)
          {
@@ -731,7 +731,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
                         exc[extrapolation_offset+j])), SIG_SHIFT);
             /* Compute the energy of the previously decoded signal whose
                excitation we're copying. */
-            tmp = ROUND16(
+            tmp = SROUND16(
                   buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
                   SIG_SHIFT);
             S1 += SHR32(MULT16_16(tmp, tmp), 10);
@@ -741,7 +741,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             /* Copy the last decoded samples (prior to the overlap region) to
                synthesis filter memory so we can have a continuous signal. */
             for (i=0;i<LPC_ORDER;i++)
-               lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+               lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
             /* Apply the synthesis filter to convert the excitation back into
                the signal domain. */
             celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
@@ -760,7 +760,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             opus_val32 S2=0;
             for (i=0;i<extrapolation_len;i++)
             {
-               opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+               opus_val16 tmp = SROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
                S2 += SHR32(MULT16_16(tmp, tmp), 10);
             }
             /* This checks for an "explosion" in the synthesis. */
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 242e6df5..f91721bc 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -159,17 +159,17 @@ void celt_fir_c(
       sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
       sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
       xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
-      y[i  ] = ROUND16(sum[0], SIG_SHIFT);
-      y[i+1] = ROUND16(sum[1], SIG_SHIFT);
-      y[i+2] = ROUND16(sum[2], SIG_SHIFT);
-      y[i+3] = ROUND16(sum[3], SIG_SHIFT);
+      y[i  ] = SROUND16(sum[0], SIG_SHIFT);
+      y[i+1] = SROUND16(sum[1], SIG_SHIFT);
+      y[i+2] = SROUND16(sum[2], SIG_SHIFT);
+      y[i+3] = SROUND16(sum[3], SIG_SHIFT);
    }
    for (;i<N;i++)
    {
       opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
          sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
-      y[i] = ROUND16(sum, SIG_SHIFT);
+      y[i] = SROUND16(sum, SIG_SHIFT);
    }
    RESTORE_STACK;
 }
-- 
cgit v1.2.3


From 4c6bae5078be71b0a4d69d0c3fa77a5a849f9876 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Fri, 22 Jul 2022 02:32:04 -0400
Subject: More ubsan fixes for the debug macros themselves

Reviewed by Mark Harris
---
 silk/MacroDebug.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h
index e505d02a..aceeef7e 100644
--- a/silk/MacroDebug.h
+++ b/silk/MacroDebug.h
@@ -101,9 +101,9 @@ static OPUS_INLINE opus_int16 silk_SUB16_(opus_int16 a, opus_int16 b, char *file
 #undef silk_SUB32
 #define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){
-    opus_int32 ret;
+    opus_int64 ret;
 
-    ret = a - b;
+    ret = a - (opus_int64)b;
     if ( ret != silk_SUB_SAT32( a, b ) )
     {
         fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -333,8 +333,8 @@ static OPUS_INLINE opus_int32 silk_SMULWB_(opus_int32 a32, opus_int32 b32, char
 #define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){
     opus_int32 ret;
-    ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) );
-    if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
+    ret = silk_ADD32_ovflw( a32, silk_SMULWB( b32, c32 ) );
+    if ( ret != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
     {
         fprintf (stderr, "silk_SMLAWB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -465,7 +465,7 @@ static OPUS_INLINE opus_int32 silk_SMULWW_(opus_int32 a32, opus_int32 b32, char
 
     if ( fail )
     {
-        fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line);
+        fprintf (stderr, "silk_SMULWW(%d, %d) in %s: line %d\n", a32, b32, file, line);
 #ifdef FIXED_DEBUG_ASSERT
         silk_assert( 0 );
 #endif
@@ -723,7 +723,7 @@ static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int
 #define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (opus_int32)((opus_uint32)b << shift);
+    ret = silk_ADD32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -768,7 +768,7 @@ static OPUS_INLINE int silk_ADD_RSHIFT_(int a, int b, int shift, char *file, int
 #define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (b >> shift);
+    ret = silk_ADD32_ovflw(a, (b >> shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) )
     {
         fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -798,7 +798,7 @@ static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32
 #define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (opus_int32)((opus_uint32)b << shift);
+    ret = silk_SUB32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -813,7 +813,7 @@ static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opu
 #define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (b >> shift);
+    ret = silk_SUB32_ovflw(a, (b >> shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) )
     {
         fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
-- 
cgit v1.2.3


From 378b4e5fc31b63c1f1a9f6a87a62609c5a083724 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Fri, 22 Jul 2022 02:53:39 -0400
Subject: Ensuring we can see where crashes occur

Reviewed by Mark Harris
---
 tests/test_opus_encode.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_opus_encode.c b/tests/test_opus_encode.c
index 00795a1e..d6e8e2d3 100644
--- a/tests/test_opus_encode.c
+++ b/tests/test_opus_encode.c
@@ -297,6 +297,7 @@ int run_test1(int no_fuzz)
   /*FIXME: encoder api tests, fs!=48k, mono, VBR*/
 
    fprintf(stdout,"  Encode+Decode tests.\n");
+   fflush(stdout);
 
    enc = opus_encoder_create(48000, 2, OPUS_APPLICATION_VOIP, &err);
    if(err != OPUS_OK || enc==NULL)test_failed();
@@ -466,6 +467,7 @@ int run_test1(int no_fuzz)
             count++;
          }while(i<(SSAMPLES-MAX_FRAME_SAMP));
          fprintf(stdout,"    Mode %s FB encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate);
+         fflush(stdout);
       }
    }
 
@@ -543,6 +545,7 @@ int run_test1(int no_fuzz)
             count++;
          }while(i<(SSAMPLES/12-MAX_FRAME_SAMP));
          fprintf(stdout,"    Mode %s NB dual-mono MS encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate);
+         fflush(stdout);
       }
    }
 
@@ -612,6 +615,7 @@ int run_test1(int no_fuzz)
       i+=frame_size;
    }while(i<SAMPLES*4);
    fprintf(stdout,"    All framesize pairs switching encode, %d frames OK.\n",count);
+   fflush(stdout);
 
    if(opus_encoder_ctl(enc, OPUS_RESET_STATE)!=OPUS_OK)test_failed();
    opus_encoder_destroy(enc);
-- 
cgit v1.2.3


From 5413ef784941652448e703688fbe3b96ef6d7e86 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sun, 24 Jul 2022 02:12:03 -0400
Subject: Re-tuning the use of LTP scaling

Making LTP scaling depend on the bitrate and whether FEC is on.
The thresholds for scaling 1 and 2 are now independent.
---
 silk/fixed/LTP_scale_ctrl_FIX.c | 9 +++++++--
 silk/float/LTP_scale_ctrl_FLP.c | 8 +++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/silk/fixed/LTP_scale_ctrl_FIX.c b/silk/fixed/LTP_scale_ctrl_FIX.c
index 3dcedef8..b3afb70b 100644
--- a/silk/fixed/LTP_scale_ctrl_FIX.c
+++ b/silk/fixed/LTP_scale_ctrl_FIX.c
@@ -43,8 +43,13 @@ void silk_LTP_scale_ctrl_FIX(
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
         round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
-        psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT(
-            silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 );
+        if ( psEnc->sCmn.LBRR_flag ) {
+            /* LBRR reduces the effective loss. In practice, it does not square the loss because
+               losses aren't independent, but that still seems to work best. We also never go below 2%. */
+            round_loss = 2 + silk_SMULBB( round_loss, round_loss ) / 100;
+        }
+        psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 2900-psEnc->sCmn.SNR_dB_Q7 );
+        psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 3900-psEnc->sCmn.SNR_dB_Q7 );
     } else {
         /* Default is minimum scaling */
         psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/silk/float/LTP_scale_ctrl_FLP.c b/silk/float/LTP_scale_ctrl_FLP.c
index 8dbe29d0..1fed0993 100644
--- a/silk/float/LTP_scale_ctrl_FLP.c
+++ b/silk/float/LTP_scale_ctrl_FLP.c
@@ -42,7 +42,13 @@ void silk_LTP_scale_ctrl_FLP(
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
         round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
-        psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f );
+        if ( psEnc->sCmn.LBRR_flag ) {
+            /* LBRR reduces the effective loss. In practice, it does not square the loss because
+               losses aren't independent, but that still seems to work best. We also never go below 2%. */
+            round_loss = 2 + silk_SMULBB( round_loss, round_loss) / 100;
+        }
+        psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 2900 - psEnc->sCmn.SNR_dB_Q7 );
+        psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 3900 - psEnc->sCmn.SNR_dB_Q7 );
     } else {
         /* Default is minimum scaling */
         psEnc->sCmn.indices.LTP_scaleIndex = 0;
-- 
cgit v1.2.3


From fd9c0f1e1f1b74c46c5872217e3289a9edf69d48 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sun, 24 Jul 2022 02:14:53 -0400
Subject: More FEC tuning: lowering the LBRR bitrate a bit

---
 silk/control_codec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/silk/control_codec.c b/silk/control_codec.c
index 52aa8fde..784ffe66 100644
--- a/silk/control_codec.c
+++ b/silk/control_codec.c
@@ -415,7 +415,7 @@ static OPUS_INLINE opus_int silk_setup_LBRR(
             /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */
             psEncC->LBRR_GainIncreases = 7;
         } else {
-            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 );
+            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.2, 16 ) ), 3 );
         }
     }
 
-- 
cgit v1.2.3


From ab04fbb1b7d0b727636d28fc2cadb5df9febe515 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Sun, 24 Jul 2022 03:46:16 -0400
Subject: Smooth out the LBRR rate estimate

Reduces fluctuations in the non-FEC target bitrate.
---
 silk/enc_API.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/silk/enc_API.c b/silk/enc_API.c
index 55a33f37..548e0736 100644
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -270,6 +270,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                        psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
     ALLOC( buf, nSamplesFromInputMax, opus_int16 );
     while( 1 ) {
+        int curr_nBitsUsedLBRR = 0;
         nSamplesToBuffer  = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
         nSamplesToBuffer  = silk_min( nSamplesToBuffer, nSamplesToBufferMax );
         nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
@@ -342,6 +343,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                 opus_uint8 iCDF[ 2 ] = { 0, 0 };
                 iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal );
                 ec_enc_icdf( psRangeEnc, 0, iCDF, 8 );
+                curr_nBitsUsedLBRR = ec_tell( psRangeEnc );
 
                 /* Encode any LBRR data from previous packet */
                 /* Encode LBRR flags */
@@ -386,8 +388,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                 for( n = 0; n < encControl->nChannelsInternal; n++ ) {
                     silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) );
                 }
-
-                psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc );
+                curr_nBitsUsedLBRR = ec_tell( psRangeEnc ) - curr_nBitsUsedLBRR;
             }
 
             silk_HP_variable_cutoff( psEnc->state_Fxx );
@@ -396,6 +397,16 @@ opus_int silk_Encode(                                   /* O    Returns error co
             nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 );
             /* Subtract bits used for LBRR */
             if( !prefillFlag ) {
+                /* psEnc->nBitsUsedLBRR is an exponential moving average of the LBRR usage,
+                   except that for the first LBRR frame it does no averaging and for the first
+                   frame after LBRR, it goes back to zero immediately. */
+                if ( curr_nBitsUsedLBRR < 10 ) {
+                    psEnc->nBitsUsedLBRR = 0;
+                } else if ( psEnc->nBitsUsedLBRR < 10) {
+                    psEnc->nBitsUsedLBRR = curr_nBitsUsedLBRR;
+                } else {
+                    psEnc->nBitsUsedLBRR = ( psEnc->nBitsUsedLBRR + curr_nBitsUsedLBRR ) / 2;
+                }
                 nBits -= psEnc->nBitsUsedLBRR;
             }
             /* Divide by number of uncoded frames left in packet */
-- 
cgit v1.2.3


From 997fdf54e781ae1c04dee42018f35388a04fe483 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 4 Aug 2022 18:59:37 -0400
Subject: Change pitch scaling behavior wrt nFramesPerPacket

Not sure if it was the original intent, but we now reduce the
loss percentage threshold for pitch scaling as 1/nFramesPerPacket
since only the first frame will have pitch scaling anyway.
As a side effect, this brings back the original behavior of
disabling pitch scaling for 0% loss.
---
 silk/fixed/LTP_scale_ctrl_FIX.c | 2 +-
 silk/float/LTP_scale_ctrl_FLP.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/silk/fixed/LTP_scale_ctrl_FIX.c b/silk/fixed/LTP_scale_ctrl_FIX.c
index b3afb70b..db1016e0 100644
--- a/silk/fixed/LTP_scale_ctrl_FIX.c
+++ b/silk/fixed/LTP_scale_ctrl_FIX.c
@@ -42,7 +42,7 @@ void silk_LTP_scale_ctrl_FIX(
 
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
-        round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
+        round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
         if ( psEnc->sCmn.LBRR_flag ) {
             /* LBRR reduces the effective loss. In practice, it does not square the loss because
                losses aren't independent, but that still seems to work best. We also never go below 2%. */
diff --git a/silk/float/LTP_scale_ctrl_FLP.c b/silk/float/LTP_scale_ctrl_FLP.c
index 1fed0993..6f30ff09 100644
--- a/silk/float/LTP_scale_ctrl_FLP.c
+++ b/silk/float/LTP_scale_ctrl_FLP.c
@@ -41,7 +41,7 @@ void silk_LTP_scale_ctrl_FLP(
 
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
-        round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
+        round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
         if ( psEnc->sCmn.LBRR_flag ) {
             /* LBRR reduces the effective loss. In practice, it does not square the loss because
                losses aren't independent, but that still seems to work best. We also never go below 2%. */
-- 
cgit v1.2.3


From ee6c24b39ba62fd9d9caeacada896823b94f1af6 Mon Sep 17 00:00:00 2001
From: Harish Mahendrakar <harish.mahendrakar@ittiam.com>
Date: Thu, 11 Aug 2022 08:56:38 -0700
Subject: Ignore integer overflows in silk_noise_shape_quantizer_del_dec_sse4_1

Bug: 241956787
Test: POC in bug description
Change-Id: Ibad588250212844dfa0de2a51bba294bfc5c5a7a
---
 libopus_blocklist.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libopus_blocklist.txt b/libopus_blocklist.txt
index 51db6112..d876af66 100644
--- a/libopus_blocklist.txt
+++ b/libopus_blocklist.txt
@@ -24,6 +24,8 @@ fun:ec_decode_bin
 fun:silk_noise_shape_quantizer_del_dec
 # silk/NSQ.c:265:25: 1318152552 + 1068143768 cannot be represented in type 'int'
 fun:silk_noise_shape_quantizer
+# silk/x86/NSQ_del_dec_sse4_1.c:571:28: 1162446838 - -1165932966 cannot be represented in type 'int'
+fun:silk_noise_shape_quantizer_del_dec_sse4_1
 
 src:*/celt/kiss_fft.c
 
-- 
cgit v1.2.3


From bce1f392353d72d77d543bb2069a044ae1045e9d Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <eggenath@amazon.com>
Date: Sun, 4 Sep 2022 22:02:10 -0400
Subject: Fix typo in MacroDebug.h comment.

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
---
 silk/MacroDebug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h
index aceeef7e..3110da9a 100644
--- a/silk/MacroDebug.h
+++ b/silk/MacroDebug.h
@@ -829,7 +829,7 @@ static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opu
 static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){
     opus_int32 ret;
     ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1;
-    /* the marco definition can't handle a shift of zero */
+    /* the macro definition can't handle a shift of zero */
     if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) )
     {
         fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -844,7 +844,7 @@ static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift,
 #define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__)
 static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){
     opus_int64 ret;
-    /* the marco definition can't handle a shift of zero */
+    /* the macro definition can't handle a shift of zero */
     if ( (shift <= 0) || (shift>=64) )
     {
         fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line);
-- 
cgit v1.2.3


From d6b9e45d0cf5fe0771f21c5b0425fe4aaffa9a3e Mon Sep 17 00:00:00 2001
From: William Escande <wescande@google.com>
Date: Thu, 8 Sep 2022 22:38:48 -0700
Subject: [Bluetooth apex] Use new apex name

The Bluetooth apex is now named com.android.btservices.

Bug: 243054261
Test: Build
Change-Id: I8d1bf8b493e822662eacdc52ae15181af2d186db
---
 Android.bp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Android.bp b/Android.bp
index f2f09f8d..29a421c4 100644
--- a/Android.bp
+++ b/Android.bp
@@ -383,7 +383,7 @@ cc_library {
     apex_available: [
         "//apex_available:platform", // used by libstagefright_soft_opusdec
         "com.android.media.swcodec",
-        "com.android.bluetooth",
+        "com.android.btservices",
     ],
     min_sdk_version: "29",
 }
-- 
cgit v1.2.3


From d11ac2a40b7eb1fdbdcf0f7533d3623e4662c131 Mon Sep 17 00:00:00 2001
From: Harish Mahendrakar <harish.mahendrakar@ittiam.com>
Date: Fri, 23 Sep 2022 14:57:09 -0700
Subject: Ignore integer overflows in silk_burg_modified

Bug: 247938163
Test: POC in bug description
Change-Id: Iaae38589357b823498a40691c1f647df5cc05ead
---
 libopus_blocklist.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libopus_blocklist.txt b/libopus_blocklist.txt
index d876af66..84c19ee4 100644
--- a/libopus_blocklist.txt
+++ b/libopus_blocklist.txt
@@ -26,7 +26,11 @@ fun:silk_noise_shape_quantizer_del_dec
 fun:silk_noise_shape_quantizer
 # silk/x86/NSQ_del_dec_sse4_1.c:571:28: 1162446838 - -1165932966 cannot be represented in type 'int'
 fun:silk_noise_shape_quantizer_del_dec_sse4_1
-
+# silk/fixed/x86/burg_modified_FIX_sse4_1.c:277: 1940085720 + 252655088 cannot be represented
+# in type 'int'
+fun:silk_burg_modified_sse4_1
+# silk/fixed/burg_modified_FIX.c:181 1940085720 + 252655088 cannot be represented in type 'int'
+fun:silk_burg_modified_c
 src:*/celt/kiss_fft.c
 
 # assembly optimizations that know what they are doing
-- 
cgit v1.2.3


From 757c53f775a0b651b0512a1992d67f4b2159a378 Mon Sep 17 00:00:00 2001
From: Sam James <sam@gentoo.org>
Date: Wed, 23 Nov 2022 03:19:54 +0000
Subject: opus.m4: fix -Wstrict-prototypes

Signed-off-by: Sam James <sam@gentoo.org>
Signed-off-by: Mark Harris <mark.hsj@gmail.com>
---
 opus.m4 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opus.m4 b/opus.m4
index 47f5ec49..263470d4 100644
--- a/opus.m4
+++ b/opus.m4
@@ -63,7 +63,7 @@ dnl
 #include <string.h>
 #include <opus.h>
 
-int main ()
+int main (void)
 {
   system("touch conf.opustest");
   return 0;
-- 
cgit v1.2.3


From 8cf872a186b96085b1bb3a547afd598354ebeb87 Mon Sep 17 00:00:00 2001
From: Zheng Lv <lvzheng@google.com>
Date: Tue, 13 Sep 2022 14:40:52 +0800
Subject: Make CELT FFT twiddle complex type aligned

This makes kiss_twiddle_cpx 4-byte aligned (instead of 2-byte) for
fixed-point builds. On an armv6j+nofp development board, CELT encoding
becomes 1.4x as fast and decoding over 2x as fast.

Performance gain is mostly attributed to the proper alignment of the
static const array mdct_twiddles960.

Co-authored-by: David Gao <davidgao@google.com>
Signed-off-by: Felicia Lim <flim@google.com>
---
 celt/kiss_fft.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
index bffa2bfa..267f72f9 100644
--- a/celt/kiss_fft.h
+++ b/celt/kiss_fft.h
@@ -52,6 +52,10 @@ extern "C" {
 #  define kiss_fft_scalar opus_int32
 #  define kiss_twiddle_scalar opus_int16
 
+/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory
+ * access, and could benefit from additional alignment.
+ */
+#  define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32))
 
 #else
 # ifndef kiss_fft_scalar
@@ -62,6 +66,12 @@ extern "C" {
 # endif
 #endif
 
+#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT)
+#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT)))
+#else
+#define KISS_TWIDDLE_CPX_ALIGNED
+#endif
+
 typedef struct {
     kiss_fft_scalar r;
     kiss_fft_scalar i;
@@ -70,7 +80,7 @@ typedef struct {
 typedef struct {
    kiss_twiddle_scalar r;
    kiss_twiddle_scalar i;
-}kiss_twiddle_cpx;
+} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx;
 
 #define MAXFACTORS 8
 /* e.g. an fft of length 128 has 4 factors
-- 
cgit v1.2.3


From 6e94b3487d3d9df5272a9fdf683817544487f29c Mon Sep 17 00:00:00 2001
From: Ayushi Khopkar <ayushi.khopkar@ittiam.com>
Date: Thu, 2 Mar 2023 15:17:51 +0530
Subject: Updated fuzz_config in Android.bp file

Added new fields to fuzz_config: hotlists, description, vector,
service_privilege, users, and fuzzed_code_usage.

Bug: 271384401
Test: Build the updated fuzz targets

Change-Id: Ie5ef02563da28d176d34be591df5eec8bd9f167e
---
 fuzzer/Android.bp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fuzzer/Android.bp b/fuzzer/Android.bp
index 45ce6ab4..be47f44a 100644
--- a/fuzzer/Android.bp
+++ b/fuzzer/Android.bp
@@ -39,6 +39,14 @@ cc_defaults {
             "android-media-fuzzing-reports@google.com",
         ],
         componentid: 155276,
+        hotlists: [
+            "4593311",
+        ],
+        description: "The fuzzer targets the APIs of libopus",
+        vector: "remote",
+        service_privilege: "constrained",
+        users: "multi_user",
+        fuzzed_code_usage: "shipped",
     },
 }
 
-- 
cgit v1.2.3