summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorandrew@webrtc.org <andrew@webrtc.org>2014-11-03 17:17:51 +0000
committerandrew@webrtc.org <andrew@webrtc.org>2014-11-03 17:17:51 +0000
commit7e2ad876db743f9c20671750b948cac3aa1c524b (patch)
treebe35ad102d0f4edc844b0d45378e6e05d5a3417b
parent63c5ce87e15f918b6d365509b1ebefc73fe73a5a (diff)
downloadwebrtc-7e2ad876db743f9c20671750b948cac3aa1c524b.tar.gz
replace inline assembly WebRtcNsx_AnalysisUpdate by intrinsics.
The modification only uses the unique part of the analysis_update function. Pass byte to byte conformance test on both ARMv7 and AArch64, and the single function performance is similar with original assembly version on different platforms. If not specified, the code is compiled by GCC 4.6. The result is the "X version / C version" ratio, and the less is better. | run 100k times | cortex-a7 | cortex-a9 | cortex-a15 | | use C as the base on each | (1.2Ghz) | (1.0Ghz) | (1.7Ghz) | | CPU target | | | | |----------------------------+-----------+-----------+------------| | Neon asm | 15.61% | 20.15% | 14.89% | | Neon inline asm (LLVM 3.4) | 25.98% | 33.96% | 18.18% | | Neon intrinsics (GCC 4.6) | 22.06% | 27.01% | 19.24% | | Neon intrinsics (GCC 4.8) | 17.28% | 18.23% | 18.55% | | Neon intrinsics (LLVM 3.4) | 21.02% | 19.98% | 16.76% | BUG= R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/28849004 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@7596 4adac7df-926f-26a2-2b94-8c16560cd09d
-rw-r--r--modules/audio_processing/ns/nsx_core_neon.c105
1 files changed, 47 insertions, 58 deletions
diff --git a/modules/audio_processing/ns/nsx_core_neon.c b/modules/audio_processing/ns/nsx_core_neon.c
index 52f35cc7..93099dbf 100644
--- a/modules/audio_processing/ns/nsx_core_neon.c
+++ b/modules/audio_processing/ns/nsx_core_neon.c
@@ -582,75 +582,64 @@ void WebRtcNsx_SynthesisUpdateNeon(NsxInst_t* inst,
void WebRtcNsx_AnalysisUpdateNeon(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
-
- int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
- int16_t* ptr_out = &inst->analysisBuffer[0];
+ assert(inst->blockLen10ms % 16 == 0);
+ assert(inst->anaLen % 16 == 0);
// For lower band update analysis buffer.
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
// inst->analysisBuffer + inst->blockLen10ms,
// inst->anaLen - inst->blockLen10ms);
- for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
- // Loop unrolled once, so both pointers are incremented by 8 twice.
- __asm__ __volatile__(
- "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
- "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
- "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
- "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
- :[ptr_ana]"+r"(ptr_ana),
- [ptr_out]"+r"(ptr_out)
- :
- :"d20", "d21", "d22", "d23"
- );
+ int16_t* p_start_src = inst->analysisBuffer + inst->blockLen10ms;
+ int16_t* p_end_src = inst->analysisBuffer + inst->anaLen;
+ int16_t* p_start_dst = inst->analysisBuffer;
+ while (p_start_src < p_end_src) {
+ int16x8_t frame = vld1q_s16(p_start_src);
+ vst1q_s16(p_start_dst, frame);
+
+ p_start_src += 8;
+ p_start_dst += 8;
}
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
// + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
- for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
- // Loop unrolled once, so both pointers are incremented by 8 twice.
- __asm__ __volatile__(
- "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
- "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
- "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
- "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
- :[ptr_ana]"+r"(ptr_ana),
- [ptr_out]"+r"(ptr_out)
- :
- :"d20", "d21", "d22", "d23"
- );
- }
-
- // Window data before FFT
- const int16_t* ptr_window = &inst->window[0];
- ptr_out = &out[0];
- ptr_ana = &inst->analysisBuffer[0];
- for (; ptr_out < &out[inst->anaLen];) {
+ p_start_src = new_speech;
+ p_end_src = new_speech + inst->blockLen10ms;
+ p_start_dst = inst->analysisBuffer + inst->anaLen - inst->blockLen10ms;
+ while (p_start_src < p_end_src) {
+ int16x8_t frame = vld1q_s16(p_start_src);
+ vst1q_s16(p_start_dst, frame);
- // Loop unrolled once, so all pointers are incremented by 4 twice.
- __asm__ __volatile__(
- "vld1.16 d20, [%[ptr_ana]]!\n\t"
- "vld1.16 d21, [%[ptr_window]]!\n\t"
- // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
- // inst->window[i], inst->analysisBuffer[i], 14); // Q0
- "vmull.s16 q10, d20, d21\n\t"
- "vrshrn.i32 d20, q10, #14\n\t"
- "vst1.16 d20, [%[ptr_out]]!\n\t"
-
- "vld1.16 d22, [%[ptr_ana]]!\n\t"
- "vld1.16 d23, [%[ptr_window]]!\n\t"
- // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
- // inst->window[i], inst->analysisBuffer[i], 14); // Q0
- "vmull.s16 q11, d22, d23\n\t"
- "vrshrn.i32 d22, q11, #14\n\t"
- "vst1.16 d22, [%[ptr_out]]!\n\t"
+ p_start_src += 8;
+ p_start_dst += 8;
+ }
- // Specify constraints.
- :[ptr_ana]"+r"(ptr_ana),
- [ptr_window]"+r"(ptr_window),
- [ptr_out]"+r"(ptr_out)
- :
- :"d20", "d21", "d22", "d23", "q10", "q11"
- );
+ // Window data before FFT.
+ int16_t* p_start_window = (int16_t*) inst->window;
+ int16_t* p_start_buffer = inst->analysisBuffer;
+ int16_t* p_start_out = out;
+ const int16_t* p_end_out = out + inst->anaLen;
+
+ // Load the first element to reduce pipeline bubble.
+ int16x8_t window = vld1q_s16(p_start_window);
+ int16x8_t buffer = vld1q_s16(p_start_buffer);
+ p_start_window += 8;
+ p_start_buffer += 8;
+
+ while (p_start_out < p_end_out) {
+ // Unroll loop.
+ int32x4_t tmp32_low = vmull_s16(vget_low_s16(window), vget_low_s16(buffer));
+ int32x4_t tmp32_high = vmull_s16(vget_high_s16(window),
+ vget_high_s16(buffer));
+ window = vld1q_s16(p_start_window);
+ buffer = vld1q_s16(p_start_buffer);
+
+ int16x4_t result_low = vrshrn_n_s32(tmp32_low, 14);
+ int16x4_t result_high = vrshrn_n_s32(tmp32_high, 14);
+ vst1q_s16(p_start_out, vcombine_s16(result_low, result_high));
+
+ p_start_buffer += 8;
+ p_start_window += 8;
+ p_start_out += 8;
}
}