diff options
Diffstat (limited to 'cras/src/dsp/drc_kernel.c')
-rw-r--r-- | cras/src/dsp/drc_kernel.c | 1060 |
1 files changed, 0 insertions, 1060 deletions
diff --git a/cras/src/dsp/drc_kernel.c b/cras/src/dsp/drc_kernel.c deleted file mode 100644 index 8c3404fc..00000000 --- a/cras/src/dsp/drc_kernel.c +++ /dev/null @@ -1,1060 +0,0 @@ -/* Copyright (c) 2013 The Chromium OS Authors. All rights reserved. - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -/* Copyright (C) 2011 Google Inc. All rights reserved. - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE.WEBKIT file. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "drc_math.h" -#include "drc_kernel.h" - -#define MAX_PRE_DELAY_FRAMES 1024 -#define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1) -#define DEFAULT_PRE_DELAY_FRAMES 256 -#define DIVISION_FRAMES 32 -#define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1) - -#define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)])) -#define assert_on_compile_is_power_of_2(n) \ - assert_on_compile((n) != 0 && (((n) & ((n)-1)) == 0)) - -const float uninitialized_value = -1; -static int drc_math_initialized; - -void dk_init(struct drc_kernel *dk, float sample_rate) -{ - int i; - - if (!drc_math_initialized) { - drc_math_initialized = 1; - drc_math_init(); - } - - dk->sample_rate = sample_rate; - dk->detector_average = 0; - dk->compressor_gain = 1; - dk->enabled = 0; - dk->processed = 0; - dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES; - dk->pre_delay_read_index = 0; - dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES; - dk->max_attack_compression_diff_db = -INFINITY; - dk->ratio = uninitialized_value; - dk->slope = uninitialized_value; - dk->linear_threshold = uninitialized_value; - dk->db_threshold = uninitialized_value; - dk->db_knee = uninitialized_value; - dk->knee_threshold = uninitialized_value; - dk->ratio_base = uninitialized_value; - dk->K = uninitialized_value; - - assert_on_compile_is_power_of_2(DIVISION_FRAMES); - assert_on_compile(DIVISION_FRAMES % 4 == 0); - /* Allocate predelay buffers */ - assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES); - for (i = 0; i < DRC_NUM_CHANNELS; i++) { - size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; - dk->pre_delay_buffers[i] = (float *)calloc(1, size); - } -} - -void dk_free(struct drc_kernel *dk) -{ - int i; - for (i = 0; i < DRC_NUM_CHANNELS; ++i) - free(dk->pre_delay_buffers[i]); -} - -/* Sets the pre-delay (lookahead) buffer size */ -static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time) -{ - int i; - /* Re-configure look-ahead section pre-delay if delay time has - * changed. */ - unsigned pre_delay_frames = pre_delay_time * dk->sample_rate; - pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1); - - /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we - * won't split a division of samples into two blocks of memory, so it is - * easier to process. This may make the actual delay time slightly less - * than the specified value, but the difference is less than 1ms. */ - pre_delay_frames &= ~DIVISION_FRAMES_MASK; - - /* We need at least one division buffer, so the incoming data won't - * overwrite the output data */ - pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES); - - if (dk->last_pre_delay_frames != pre_delay_frames) { - dk->last_pre_delay_frames = pre_delay_frames; - for (i = 0; i < DRC_NUM_CHANNELS; ++i) { - size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; - memset(dk->pre_delay_buffers[i], 0, size); - } - - dk->pre_delay_read_index = 0; - dk->pre_delay_write_index = pre_delay_frames; - } -} - -/* Exponential curve for the knee. It is 1st derivative matched at - * dk->linear_threshold and asymptotically approaches the value - * dk->linear_threshold + 1 / k. - * - * This is used only when calculating the static curve, not used when actually - * compress the input data (knee_curveK below is used instead). - */ -static float knee_curve(struct drc_kernel *dk, float x, float k) -{ - /* Linear up to threshold. */ - if (x < dk->linear_threshold) - return x; - - return dk->linear_threshold + - (1 - knee_expf(-k * (x - dk->linear_threshold))) / k; -} - -/* Approximate 1st derivative with input and output expressed in dB. This slope - * is equal to the inverse of the compression "ratio". In other words, a - * compression ratio of 20 would be a slope of 1/20. - */ -static float slope_at(struct drc_kernel *dk, float x, float k) -{ - if (x < dk->linear_threshold) - return 1; - - float x2 = x * 1.001; - - float x_db = linear_to_decibels(x); - float x2Db = linear_to_decibels(x2); - - float y_db = linear_to_decibels(knee_curve(dk, x, k)); - float y2Db = linear_to_decibels(knee_curve(dk, x2, k)); - - float m = (y2Db - y_db) / (x2Db - x_db); - - return m; -} - -static float k_at_slope(struct drc_kernel *dk, float desired_slope) -{ - float x_db = dk->db_threshold + dk->db_knee; - float x = decibels_to_linear(x_db); - - /* Approximate k given initial values. */ - float minK = 0.1; - float maxK = 10000; - float k = 5; - int i; - - for (i = 0; i < 15; ++i) { - /* A high value for k will more quickly asymptotically approach - * a slope of 0. */ - float slope = slope_at(dk, x, k); - - if (slope < desired_slope) { - /* k is too high. */ - maxK = k; - } else { - /* k is too low. */ - minK = k; - } - - /* Re-calculate based on geometric mean. */ - k = sqrtf(minK * maxK); - } - - return k; -} - -static void update_static_curve_parameters(struct drc_kernel *dk, - float db_threshold, float db_knee, - float ratio) -{ - if (db_threshold != dk->db_threshold || db_knee != dk->db_knee || - ratio != dk->ratio) { - /* Threshold and knee. */ - dk->db_threshold = db_threshold; - dk->linear_threshold = decibels_to_linear(db_threshold); - dk->db_knee = db_knee; - - /* Compute knee parameters. */ - dk->ratio = ratio; - dk->slope = 1 / dk->ratio; - - float k = k_at_slope(dk, 1 / dk->ratio); - dk->K = k; - /* See knee_curveK() for details */ - dk->knee_alpha = dk->linear_threshold + 1 / k; - dk->knee_beta = -expf(k * dk->linear_threshold) / k; - - dk->knee_threshold = decibels_to_linear(db_threshold + db_knee); - /* See volume_gain() for details */ - float y0 = knee_curve(dk, dk->knee_threshold, k); - dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope); - } -} - -/* This is the knee part of the compression curve. Returns the output level - * given the input level x. */ -static float knee_curveK(struct drc_kernel *dk, float x) -{ - /* The formula in knee_curveK is dk->linear_threshold + - * (1 - expf(-k * (x - dk->linear_threshold))) / k - * which simplifies to (alpha + beta * expf(gamma)) - * where alpha = dk->linear_threshold + 1 / k - * beta = -expf(k * dk->linear_threshold) / k - * gamma = -k * x - */ - return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x); -} - -/* Full compression curve with constant ratio after knee. Returns the ratio of - * output and input signal. */ -static float volume_gain(struct drc_kernel *dk, float x) -{ - float y; - - if (x < dk->knee_threshold) { - if (x < dk->linear_threshold) - return 1; - y = knee_curveK(dk, x) / x; - } else { - /* Constant ratio after knee. - * log(y/y0) = s * log(x/x0) - * => y = y0 * (x/x0)^s - * => y = [y0 * (1/x0)^s] * x^s - * => y = dk->ratio_base * x^s - * => y/x = dk->ratio_base * x^(s - 1) - * => y/x = dk->ratio_base * e^(log(x) * (s - 1)) - */ - y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1)); - } - - return y; -} - -void dk_set_parameters(struct drc_kernel *dk, float db_threshold, float db_knee, - float ratio, float attack_time, float release_time, - float pre_delay_time, float db_post_gain, - float releaseZone1, float releaseZone2, - float releaseZone3, float releaseZone4) -{ - float sample_rate = dk->sample_rate; - - update_static_curve_parameters(dk, db_threshold, db_knee, ratio); - - /* Makeup gain. */ - float full_range_gain = volume_gain(dk, 1); - float full_range_makeup_gain = 1 / full_range_gain; - - /* Empirical/perceptual tuning. */ - full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f); - - dk->main_linear_gain = - decibels_to_linear(db_post_gain) * full_range_makeup_gain; - - /* Attack parameters. */ - attack_time = max(0.001f, attack_time); - dk->attack_frames = attack_time * sample_rate; - - /* Release parameters. */ - float release_frames = sample_rate * release_time; - - /* Detector release time. */ - float sat_release_time = 0.0025f; - float sat_release_frames = sat_release_time * sample_rate; - dk->sat_release_frames_inv_neg = -1 / sat_release_frames; - dk->sat_release_rate_at_neg_two_db = - decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1; - - /* Create a smooth function which passes through four points. - * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4 - */ - float y1 = release_frames * releaseZone1; - float y2 = release_frames * releaseZone2; - float y3 = release_frames * releaseZone3; - float y4 = release_frames * releaseZone4; - - /* All of these coefficients were derived for 4th order polynomial curve - * fitting where the y values match the evenly spaced x values as - * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3) - */ - dk->kA = 0.9999999999999998f * y1 + 1.8432219684323923e-16f * y2 - - 1.9373394351676423e-16f * y3 + 8.824516011816245e-18f * y4; - dk->kB = -1.5788320352845888f * y1 + 2.3305837032074286f * y2 - - 0.9141194204840429f * y3 + 0.1623677525612032f * y4; - dk->kC = 0.5334142869106424f * y1 - 1.272736789213631f * y2 + - 0.9258856042207512f * y3 - 0.18656310191776226f * y4; - dk->kD = 0.08783463138207234f * y1 - 0.1694162967925622f * y2 + - 0.08588057951595272f * y3 - 0.00429891410546283f * y4; - dk->kE = -0.042416883008123074f * y1 + 0.1115693827987602f * y2 - - 0.09764676325265872f * y3 + 0.028494263462021576f * y4; - - /* x ranges from 0 -> 3 0 1 2 3 - * -15 -10 -5 0db - * - * y calculates adaptive release frames depending on the amount of - * compression. - */ - set_pre_delay_time(dk, pre_delay_time); -} - -void dk_set_enabled(struct drc_kernel *dk, int enabled) -{ - dk->enabled = enabled; -} - -/* Updates the envelope_rate used for the next division */ -static void dk_update_envelope(struct drc_kernel *dk) -{ - const float kA = dk->kA; - const float kB = dk->kB; - const float kC = dk->kC; - const float kD = dk->kD; - const float kE = dk->kE; - const float attack_frames = dk->attack_frames; - - /* Calculate desired gain */ - float desired_gain = dk->detector_average; - - /* Pre-warp so we get desired_gain after sin() warp below. */ - float scaled_desired_gain = warp_asinf(desired_gain); - - /* Deal with envelopes */ - - /* envelope_rate is the rate we slew from current compressor level to - * the desired level. The exact rate depends on if we're attacking or - * releasing and by how much. - */ - float envelope_rate; - - int is_releasing = scaled_desired_gain > dk->compressor_gain; - - /* compression_diff_db is the difference between current compression - * level and the desired level. */ - float compression_diff_db = - linear_to_decibels(dk->compressor_gain / scaled_desired_gain); - - if (is_releasing) { - /* Release mode - compression_diff_db should be negative dB */ - dk->max_attack_compression_diff_db = -INFINITY; - - /* Fix gremlins. */ - if (isbadf(compression_diff_db)) - compression_diff_db = -1; - - /* Adaptive release - higher compression (lower - * compression_diff_db) releases faster. Contain within range: - * -12 -> 0 then scale to go from 0 -> 3 - */ - float x = compression_diff_db; - x = max(-12.0f, x); - x = min(0.0f, x); - x = 0.25f * (x + 12); - - /* Compute adaptive release curve using 4th order polynomial. - * Normal values for the polynomial coefficients would create a - * monotonically increasing function. - */ - float x2 = x * x; - float x3 = x2 * x; - float x4 = x2 * x2; - float release_frames = - kA + kB * x + kC * x2 + kD * x3 + kE * x4; - -#define kSpacingDb 5 - float db_per_frame = kSpacingDb / release_frames; - envelope_rate = decibels_to_linear(db_per_frame); - } else { - /* Attack mode - compression_diff_db should be positive dB */ - - /* Fix gremlins. */ - if (isbadf(compression_diff_db)) - compression_diff_db = 1; - - /* As long as we're still in attack mode, use a rate based off - * the largest compression_diff_db we've encountered so far. - */ - dk->max_attack_compression_diff_db = - max(dk->max_attack_compression_diff_db, - compression_diff_db); - - float eff_atten_diff_db = - max(0.5f, dk->max_attack_compression_diff_db); - - float x = 0.25f / eff_atten_diff_db; - envelope_rate = 1 - powf(x, 1 / attack_frames); - } - - dk->envelope_rate = envelope_rate; - dk->scaled_desired_gain = scaled_desired_gain; -} - -/* For a division of frames, take the absolute values of left channel and right - * channel, store the maximum of them in output. */ -#if defined(__aarch64__) -static inline void max_abs_division(float *output, const float *data0, - const float *data1) -{ - int count = DIVISION_FRAMES / 4; - - // clang-format off - __asm__ __volatile__( - "1: \n" - "ld1 {v0.4s}, [%[data0]], #16 \n" - "ld1 {v1.4s}, [%[data1]], #16 \n" - "fabs v0.4s, v0.4s \n" - "fabs v1.4s, v1.4s \n" - "fmax v0.4s, v0.4s, v1.4s \n" - "st1 {v0.4s}, [%[output]], #16 \n" - "subs %w[count], %w[count], #1 \n" - "b.ne 1b \n" - : /* output */ - [data0]"+r"(data0), - [data1]"+r"(data1), - [output]"+r"(output), - [count]"+r"(count) - : /* input */ - : /* clobber */ - "v0", "v1", "memory", "cc"); - // clang-format on -} -#elif defined(__ARM_NEON__) -static inline void max_abs_division(float *output, const float *data0, - const float *data1) -{ - int count = DIVISION_FRAMES / 4; - - // clang-format off - __asm__ __volatile__( - "1: \n" - "vld1.32 {q0}, [%[data0]]! \n" - "vld1.32 {q1}, [%[data1]]! \n" - "vabs.f32 q0, q0 \n" - "vabs.f32 q1, q1 \n" - "vmax.f32 q0, q1 \n" - "vst1.32 {q0}, [%[output]]! \n" - "subs %[count], #1 \n" - "bne 1b \n" - : /* output */ - [data0]"+r"(data0), - [data1]"+r"(data1), - [output]"+r"(output), - [count]"+r"(count) - : /* input */ - : /* clobber */ - "q0", "q1", "memory", "cc"); - // clang-format on -} -#elif defined(__SSE3__) -#include <emmintrin.h> -static inline void max_abs_division(float *output, const float *data0, - const float *data1) -{ - __m128 x, y; - int count = DIVISION_FRAMES / 4; - // clang-format off - __asm__ __volatile__( - "1: \n" - "lddqu (%[data0]), %[x] \n" - "lddqu (%[data1]), %[y] \n" - "andps %[mask], %[x] \n" - "andps %[mask], %[y] \n" - "maxps %[y], %[x] \n" - "movdqu %[x], (%[output]) \n" - "add $16, %[data0] \n" - "add $16, %[data1] \n" - "add $16, %[output] \n" - "sub $1, %[count] \n" - "jnz 1b \n" - : /* output */ - [data0]"+r"(data0), - [data1]"+r"(data1), - [output]"+r"(output), - [count]"+r"(count), - [x]"=&x"(x), - [y]"=&x"(y) - : /* input */ - [mask]"x"(_mm_set1_epi32(0x7fffffff)) - : /* clobber */ - "memory", "cc"); - // clang-format on -} -#else -static inline void max_abs_division(float *output, const float *data0, - const float *data1) -{ - int i; - for (i = 0; i < DIVISION_FRAMES; i++) - output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i])); -} -#endif - -/* Update detector_average from the last input division. */ -static void dk_update_detector_average(struct drc_kernel *dk) -{ - float abs_input_array[DIVISION_FRAMES]; - const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg; - const float sat_release_rate_at_neg_two_db = - dk->sat_release_rate_at_neg_two_db; - float detector_average = dk->detector_average; - int div_start, i; - - /* Calculate the start index of the last input division */ - if (dk->pre_delay_write_index == 0) { - div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES; - } else { - div_start = dk->pre_delay_write_index - DIVISION_FRAMES; - } - - /* The max abs value across all channels for this frame */ - max_abs_division(abs_input_array, &dk->pre_delay_buffers[0][div_start], - &dk->pre_delay_buffers[1][div_start]); - - for (i = 0; i < DIVISION_FRAMES; i++) { - /* Compute compression amount from un-delayed signal */ - float abs_input = abs_input_array[i]; - - /* Calculate shaped power on undelayed input. Put through - * shaping curve. This is linear up to the threshold, then - * enters a "knee" portion followed by the "ratio" portion. The - * transition from the threshold to the knee is smooth (1st - * derivative matched). The transition from the knee to the - * ratio portion is smooth (1st derivative matched). - */ - float gain = volume_gain(dk, abs_input); - int is_release = (gain > detector_average); - if (is_release) { - if (gain > NEG_TWO_DB) { - detector_average += - (gain - detector_average) * - sat_release_rate_at_neg_two_db; - } else { - float gain_db = linear_to_decibels(gain); - float db_per_frame = - gain_db * sat_release_frames_inv_neg; - float sat_release_rate = - decibels_to_linear(db_per_frame) - 1; - detector_average += (gain - detector_average) * - sat_release_rate; - } - } else { - detector_average = gain; - } - - /* Fix gremlins. */ - if (isbadf(detector_average)) - detector_average = 1.0f; - else - detector_average = min(detector_average, 1.0f); - } - - dk->detector_average = detector_average; -} - -/* Calculate compress_gain from the envelope and apply total_gain to compress - * the next output division. */ -/* TODO(fbarchard): Port to aarch64 */ -#if defined(__ARM_NEON__) -#include <arm_neon.h> -static void dk_compress_output(struct drc_kernel *dk) -{ - const float main_linear_gain = dk->main_linear_gain; - const float envelope_rate = dk->envelope_rate; - const float scaled_desired_gain = dk->scaled_desired_gain; - const float compressor_gain = dk->compressor_gain; - const int div_start = dk->pre_delay_read_index; - float *ptr_left = &dk->pre_delay_buffers[0][div_start]; - float *ptr_right = &dk->pre_delay_buffers[1][div_start]; - int count = DIVISION_FRAMES / 4; - - /* See warp_sinf() for the details for the constants. */ - const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f); - const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f); - const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f); - const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f); - - /* Exponential approach to desired gain. */ - if (envelope_rate < 1) { - float c = compressor_gain - scaled_desired_gain; - float r = 1 - envelope_rate; - float32x4_t x0 = { c * r, c * r * r, c * r * r * r, - c * r * r * r * r }; - float32x4_t x, x2, x4, left, right, tmp1, tmp2; - - // clang-format off - __asm__ __volatile( - "b 2f \n" - "1: \n" - "vmul.f32 %q[x0], %q[r4] \n" - "2: \n" - "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" - "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" - "vadd.f32 %q[x], %q[x0], %q[base] \n" - /* Calculate warp_sin() for four values in x. */ - "vmul.f32 %q[x2], %q[x], %q[x] \n" - "vmov.f32 %q[tmp1], %q[A5] \n" - "vmov.f32 %q[tmp2], %q[A1] \n" - "vmul.f32 %q[x4], %q[x2], %q[x2] \n" - "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" - "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" - "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" - "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" - /* Now tmp2 contains the result of warp_sin(). */ - "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" - "vmul.f32 %q[left], %q[tmp2] \n" - "vmul.f32 %q[right], %q[tmp2] \n" - "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" - "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" - "subs %[count], #1 \n" - "bne 1b \n" - : /* output */ - "=r"(count), - "=r"(ptr_left), - "=r"(ptr_right), - "=w"(x0), - [x]"=&w"(x), - [x2]"=&w"(x2), - [x4]"=&w"(x4), - [left]"=&w"(left), - [right]"=&w"(right), - [tmp1]"=&w"(tmp1), - [tmp2]"=&w"(tmp2) - : /* input */ - [count]"0"(count), - [ptr_left]"1"(ptr_left), - [ptr_right]"2"(ptr_right), - [x0]"3"(x0), - [A1]"w"(A1), - [A3]"w"(A3), - [A5]"w"(A5), - [A7]"w"(A7), - [base]"w"(vdupq_n_f32(scaled_desired_gain)), - [r4]"w"(vdupq_n_f32(r*r*r*r)), - [g]"w"(vdupq_n_f32(main_linear_gain)) - : /* clobber */ - "memory", "cc"); - // clang-format on - dk->compressor_gain = x[3]; - } else { - float c = compressor_gain; - float r = envelope_rate; - float32x4_t x = { c * r, c * r * r, c * r * r * r, - c * r * r * r * r }; - float32x4_t x2, x4, left, right, tmp1, tmp2; - - // clang-format off - __asm__ __volatile( - "b 2f \n" - "1: \n" - "vmul.f32 %q[x], %q[r4] \n" - "2: \n" - "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" - "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" - "vmin.f32 %q[x], %q[one] \n" - /* Calculate warp_sin() for four values in x. */ - "vmul.f32 %q[x2], %q[x], %q[x] \n" - "vmov.f32 %q[tmp1], %q[A5] \n" - "vmov.f32 %q[tmp2], %q[A1] \n" - "vmul.f32 %q[x4], %q[x2], %q[x2] \n" - "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" - "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" - "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" - "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" - /* Now tmp2 contains the result of warp_sin(). */ - "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" - "vmul.f32 %q[left], %q[tmp2] \n" - "vmul.f32 %q[right], %q[tmp2] \n" - "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" - "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" - "subs %[count], #1 \n" - "bne 1b \n" - : /* output */ - "=r"(count), - "=r"(ptr_left), - "=r"(ptr_right), - "=w"(x), - [x2]"=&w"(x2), - [x4]"=&w"(x4), - [left]"=&w"(left), - [right]"=&w"(right), - [tmp1]"=&w"(tmp1), - [tmp2]"=&w"(tmp2) - : /* input */ - [count]"0"(count), - [ptr_left]"1"(ptr_left), - [ptr_right]"2"(ptr_right), - [x]"3"(x), - [A1]"w"(A1), - [A3]"w"(A3), - [A5]"w"(A5), - [A7]"w"(A7), - [one]"w"(vdupq_n_f32(1)), - [r4]"w"(vdupq_n_f32(r*r*r*r)), - [g]"w"(vdupq_n_f32(main_linear_gain)) - : /* clobber */ - "memory", "cc"); - // clang-format on - dk->compressor_gain = x[3]; - } -} -#elif defined(__SSE3__) && defined(__x86_64__) -#include <emmintrin.h> -static void dk_compress_output(struct drc_kernel *dk) -{ - const float main_linear_gain = dk->main_linear_gain; - const float envelope_rate = dk->envelope_rate; - const float scaled_desired_gain = dk->scaled_desired_gain; - const float compressor_gain = dk->compressor_gain; - const int div_start = dk->pre_delay_read_index; - float *ptr_left = &dk->pre_delay_buffers[0][div_start]; - float *ptr_right = &dk->pre_delay_buffers[1][div_start]; - int count = DIVISION_FRAMES / 4; - - /* See warp_sinf() for the details for the constants. */ - const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); - const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); - const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); - const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); - - /* Exponential approach to desired gain. */ - if (envelope_rate < 1) { - float c = compressor_gain - scaled_desired_gain; - float r = 1 - envelope_rate; - __m128 x0 = { c * r, c * r * r, c * r * r * r, - c * r * r * r * r }; - __m128 x, x2, x4, left, right, tmp1, tmp2; - - // clang-format off - __asm__ __volatile( - "jmp 2f \n" - "1: \n" - "mulps %[r4], %[x0] \n" - "2: \n" - "lddqu (%[ptr_left]), %[left] \n" - "lddqu (%[ptr_right]), %[right] \n" - "movaps %[x0], %[x] \n" - "addps %[base], %[x] \n" - /* Calculate warp_sin() for four values in x. */ - "movaps %[x], %[x2] \n" - "mulps %[x], %[x2] \n" - "movaps %[x2], %[x4] \n" - "movaps %[x2], %[tmp1] \n" - "movaps %[x2], %[tmp2] \n" - "mulps %[x2], %[x4] \n" - "mulps %[A7], %[tmp1] \n" - "mulps %[A3], %[tmp2] \n" - "addps %[A5], %[tmp1] \n" - "addps %[A1], %[tmp2] \n" - "mulps %[x4], %[tmp1] \n" - "addps %[tmp1], %[tmp2] \n" - "mulps %[x], %[tmp2] \n" - /* Now tmp2 contains the result of warp_sin(). */ - "mulps %[g], %[tmp2] \n" - "mulps %[tmp2], %[left] \n" - "mulps %[tmp2], %[right] \n" - "movdqu %[left], (%[ptr_left]) \n" - "movdqu %[right], (%[ptr_right]) \n" - "add $16, %[ptr_left] \n" - "add $16, %[ptr_right] \n" - "sub $1, %[count] \n" - "jne 1b \n" - : /* output */ - "=r"(count), - "=r"(ptr_left), - "=r"(ptr_right), - "=x"(x0), - [x]"=&x"(x), - [x2]"=&x"(x2), - [x4]"=&x"(x4), - [left]"=&x"(left), - [right]"=&x"(right), - [tmp1]"=&x"(tmp1), - [tmp2]"=&x"(tmp2) - : /* input */ - [count]"0"(count), - [ptr_left]"1"(ptr_left), - [ptr_right]"2"(ptr_right), - [x0]"3"(x0), - [A1]"x"(A1), - [A3]"x"(A3), - [A5]"x"(A5), - [A7]"x"(A7), - [base]"x"(_mm_set1_ps(scaled_desired_gain)), - [r4]"x"(_mm_set1_ps(r*r*r*r)), - [g]"x"(_mm_set1_ps(main_linear_gain)) - : /* clobber */ - "memory", "cc"); - // clang-format on - dk->compressor_gain = x[3]; - } else { - /* See warp_sinf() for the details for the constants. */ - __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); - __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); - __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); - __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); - - float c = compressor_gain; - float r = envelope_rate; - __m128 x = { c * r, c * r * r, c * r * r * r, - c * r * r * r * r }; - __m128 x2, x4, left, right, tmp1, tmp2; - - // clang-format off - __asm__ __volatile( - "jmp 2f \n" - "1: \n" - "mulps %[r4], %[x] \n" - "2: \n" - "lddqu (%[ptr_left]), %[left] \n" - "lddqu (%[ptr_right]), %[right] \n" - "minps %[one], %[x] \n" - /* Calculate warp_sin() for four values in x. */ - "movaps %[x], %[x2] \n" - "mulps %[x], %[x2] \n" - "movaps %[x2], %[x4] \n" - "movaps %[x2], %[tmp1] \n" - "movaps %[x2], %[tmp2] \n" - "mulps %[x2], %[x4] \n" - "mulps %[A7], %[tmp1] \n" - "mulps %[A3], %[tmp2] \n" - "addps %[A5], %[tmp1] \n" - "addps %[A1], %[tmp2] \n" - "mulps %[x4], %[tmp1] \n" - "addps %[tmp1], %[tmp2] \n" - "mulps %[x], %[tmp2] \n" - /* Now tmp2 contains the result of warp_sin(). */ - "mulps %[g], %[tmp2] \n" - "mulps %[tmp2], %[left] \n" - "mulps %[tmp2], %[right] \n" - "movdqu %[left], (%[ptr_left]) \n" - "movdqu %[right], (%[ptr_right]) \n" - "add $16, %[ptr_left] \n" - "add $16, %[ptr_right] \n" - "sub $1, %[count] \n" - "jne 1b \n" - : /* output */ - "=r"(count), - "=r"(ptr_left), - "=r"(ptr_right), - "=x"(x), - [x2]"=&x"(x2), - [x4]"=&x"(x4), - [left]"=&x"(left), - [right]"=&x"(right), - [tmp1]"=&x"(tmp1), - [tmp2]"=&x"(tmp2) - : /* input */ - [count]"0"(count), - [ptr_left]"1"(ptr_left), - [ptr_right]"2"(ptr_right), - [x]"3"(x), - [A1]"x"(A1), - [A3]"x"(A3), - [A5]"x"(A5), - [A7]"x"(A7), - [one]"x"(_mm_set1_ps(1)), - [r4]"x"(_mm_set1_ps(r*r*r*r)), - [g]"x"(_mm_set1_ps(main_linear_gain)) - : /* clobber */ - "memory", "cc"); - // clang-format on - dk->compressor_gain = x[3]; - } -} -#else -static void dk_compress_output(struct drc_kernel *dk) -{ - const float main_linear_gain = dk->main_linear_gain; - const float envelope_rate = dk->envelope_rate; - const float scaled_desired_gain = dk->scaled_desired_gain; - const float compressor_gain = dk->compressor_gain; - const int div_start = dk->pre_delay_read_index; - float *ptr_left = &dk->pre_delay_buffers[0][div_start]; - float *ptr_right = &dk->pre_delay_buffers[1][div_start]; - int count = DIVISION_FRAMES / 4; - - int i, j; - - /* Exponential approach to desired gain. */ - if (envelope_rate < 1) { - /* Attack - reduce gain to desired. */ - float c = compressor_gain - scaled_desired_gain; - float base = scaled_desired_gain; - float r = 1 - envelope_rate; - float x[4] = { c * r, c * r * r, c * r * r * r, - c * r * r * r * r }; - float r4 = r * r * r * r; - - i = 0; - while (1) { - for (j = 0; j < 4; j++) { - /* Warp pre-compression gain to smooth out sharp - * exponential transition points. - */ - float post_warp_compressor_gain = - warp_sinf(x[j] + base); - - /* Calculate total gain using main gain. */ - float total_gain = main_linear_gain * - post_warp_compressor_gain; - - /* Apply final gain. */ - *ptr_left++ *= total_gain; - *ptr_right++ *= total_gain; - } - - if (++i == count) - break; - - for (j = 0; j < 4; j++) - x[j] = x[j] * r4; - } - - dk->compressor_gain = x[3] + base; - } else { - /* Release - exponentially increase gain to 1.0 */ - float c = compressor_gain; - float r = envelope_rate; - float x[4] = { c * r, c * r * r, c * r * r * r, - c * r * r * r * r }; - float r4 = r * r * r * r; - - i = 0; - while (1) { - for (j = 0; j < 4; j++) { - /* Warp pre-compression gain to smooth out sharp - * exponential transition points. - */ - float post_warp_compressor_gain = - warp_sinf(x[j]); - - /* Calculate total gain using main gain. */ - float total_gain = main_linear_gain * - post_warp_compressor_gain; - - /* Apply final gain. */ - *ptr_left++ *= total_gain; - *ptr_right++ *= total_gain; - } - - if (++i == count) - break; - - for (j = 0; j < 4; j++) - x[j] = min(1.0f, x[j] * r4); - } - - dk->compressor_gain = x[3]; - } -} -#endif - -/* After one complete divison of samples have been received (and one divison of - * samples have been output), we calculate shaped power average - * (detector_average) from the input division, update envelope parameters from - * detector_average, then prepare the next output division by applying the - * envelope to compress the samples. - */ -static void dk_process_one_division(struct drc_kernel *dk) -{ - dk_update_detector_average(dk); - dk_update_envelope(dk); - dk_compress_output(dk); -} - -/* Copy the input data to the pre-delay buffer, and copy the output data back to - * the input buffer */ -static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[], - unsigned frame_index, int frames_to_process) -{ - int write_index = dk->pre_delay_write_index; - int read_index = dk->pre_delay_read_index; - int j; - - for (j = 0; j < DRC_NUM_CHANNELS; ++j) { - memcpy(&dk->pre_delay_buffers[j][write_index], - &data_channels[j][frame_index], - frames_to_process * sizeof(float)); - memcpy(&data_channels[j][frame_index], - &dk->pre_delay_buffers[j][read_index], - frames_to_process * sizeof(float)); - } - - dk->pre_delay_write_index = - (write_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK; - dk->pre_delay_read_index = - (read_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK; -} - -/* Delay the input sample only and don't do other processing. This is used when - * the kernel is disabled. We want to do this to match the processing delay in - * kernels of other bands. - */ -static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[], - unsigned count) -{ - int read_index = dk->pre_delay_read_index; - int write_index = dk->pre_delay_write_index; - int i = 0; - - while (i < count) { - int j; - int small = min(read_index, write_index); - int large = max(read_index, write_index); - /* chunk is the minimum of readable samples in contiguous - * buffer, writable samples in contiguous buffer, and the - * available input samples. */ - int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large); - chunk = min(chunk, count - i); - for (j = 0; j < DRC_NUM_CHANNELS; ++j) { - memcpy(&dk->pre_delay_buffers[j][write_index], - &data_channels[j][i], chunk * sizeof(float)); - memcpy(&data_channels[j][i], - &dk->pre_delay_buffers[j][read_index], - chunk * sizeof(float)); - } - read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; - write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; - i += chunk; - } - - dk->pre_delay_read_index = read_index; - dk->pre_delay_write_index = write_index; -} - -void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count) -{ - int i = 0; - int fragment; - - if (!dk->enabled) { - dk_process_delay_only(dk, data_channels, count); - return; - } - - if (!dk->processed) { - dk_update_envelope(dk); - dk_compress_output(dk); - dk->processed = 1; - } - - int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK; - while (i < count) { - fragment = min(DIVISION_FRAMES - offset, count - i); - dk_copy_fragment(dk, data_channels, i, fragment); - i += fragment; - offset = (offset + fragment) & DIVISION_FRAMES_MASK; - - /* Process the input division (32 frames). */ - if (offset == 0) - dk_process_one_division(dk); - } -} |