Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
50#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52
53#include <stdio.h>
54#include <volk/volk_common.h>
55
56#ifdef LV_HAVE_GENERIC
57
59 const lv_32fc_t* input,
60 const float* taps,
61 unsigned int num_points)
62{
63
64 float res[2];
65 float *realpt = &res[0], *imagpt = &res[1];
66 const float* aPtr = (float*)input;
67 const float* bPtr = taps;
68 unsigned int number = 0;
69
70 *realpt = 0;
71 *imagpt = 0;
72
73 for (number = 0; number < num_points; number++) {
74 *realpt += ((*aPtr++) * (*bPtr));
75 *imagpt += ((*aPtr++) * (*bPtr++));
76 }
77
78 *result = *(lv_32fc_t*)(&res[0]);
79}
80
81#endif /*LV_HAVE_GENERIC*/
82
83#if LV_HAVE_AVX2 && LV_HAVE_FMA
84
85#include <immintrin.h>
86
87static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
88 const lv_32fc_t* input,
89 const float* taps,
90 unsigned int num_points)
91{
92
93 unsigned int number = 0;
94 const unsigned int sixteenthPoints = num_points / 16;
95
96 float res[2];
97 float *realpt = &res[0], *imagpt = &res[1];
98 const float* aPtr = (float*)input;
99 const float* bPtr = taps;
100
101 __m256 a0Val, a1Val, a2Val, a3Val;
102 __m256 b0Val, b1Val, b2Val, b3Val;
103 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
104
105 __m256 dotProdVal0 = _mm256_setzero_ps();
106 __m256 dotProdVal1 = _mm256_setzero_ps();
107 __m256 dotProdVal2 = _mm256_setzero_ps();
108 __m256 dotProdVal3 = _mm256_setzero_ps();
109
110 for (; number < sixteenthPoints; number++) {
111
112 a0Val = _mm256_load_ps(aPtr);
113 a1Val = _mm256_load_ps(aPtr + 8);
114 a2Val = _mm256_load_ps(aPtr + 16);
115 a3Val = _mm256_load_ps(aPtr + 24);
116
117 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
118 x1Val = _mm256_load_ps(bPtr + 8);
119 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
120 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
121 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
122 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
123
124 // TODO: it may be possible to rearrange swizzling to better pipeline data
125 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
126 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
127 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
128 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
129
130 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
131 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
132 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
133 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
134
135 aPtr += 32;
136 bPtr += 16;
137 }
138
139 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
140 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
141 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
142
143 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
144
145 _mm256_store_ps(dotProductVector,
146 dotProdVal0); // Store the results back into the dot product vector
147
148 *realpt = dotProductVector[0];
149 *imagpt = dotProductVector[1];
150 *realpt += dotProductVector[2];
151 *imagpt += dotProductVector[3];
152 *realpt += dotProductVector[4];
153 *imagpt += dotProductVector[5];
154 *realpt += dotProductVector[6];
155 *imagpt += dotProductVector[7];
156
157 number = sixteenthPoints * 16;
158 for (; number < num_points; number++) {
159 *realpt += ((*aPtr++) * (*bPtr));
160 *imagpt += ((*aPtr++) * (*bPtr++));
161 }
162
163 *result = *(lv_32fc_t*)(&res[0]);
164}
165
166#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
167
168#ifdef LV_HAVE_AVX
169
170#include <immintrin.h>
171
173 const lv_32fc_t* input,
174 const float* taps,
175 unsigned int num_points)
176{
177
178 unsigned int number = 0;
179 const unsigned int sixteenthPoints = num_points / 16;
180
181 float res[2];
182 float *realpt = &res[0], *imagpt = &res[1];
183 const float* aPtr = (float*)input;
184 const float* bPtr = taps;
185
186 __m256 a0Val, a1Val, a2Val, a3Val;
187 __m256 b0Val, b1Val, b2Val, b3Val;
188 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
189 __m256 c0Val, c1Val, c2Val, c3Val;
190
191 __m256 dotProdVal0 = _mm256_setzero_ps();
192 __m256 dotProdVal1 = _mm256_setzero_ps();
193 __m256 dotProdVal2 = _mm256_setzero_ps();
194 __m256 dotProdVal3 = _mm256_setzero_ps();
195
196 for (; number < sixteenthPoints; number++) {
197
198 a0Val = _mm256_load_ps(aPtr);
199 a1Val = _mm256_load_ps(aPtr + 8);
200 a2Val = _mm256_load_ps(aPtr + 16);
201 a3Val = _mm256_load_ps(aPtr + 24);
202
203 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
204 x1Val = _mm256_load_ps(bPtr + 8);
205 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
206 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
207 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
208 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
209
210 // TODO: it may be possible to rearrange swizzling to better pipeline data
211 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
212 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
213 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
214 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
215
216 c0Val = _mm256_mul_ps(a0Val, b0Val);
217 c1Val = _mm256_mul_ps(a1Val, b1Val);
218 c2Val = _mm256_mul_ps(a2Val, b2Val);
219 c3Val = _mm256_mul_ps(a3Val, b3Val);
220
221 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
222 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
223 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
224 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
225
226 aPtr += 32;
227 bPtr += 16;
228 }
229
230 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
231 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
232 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
233
234 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
235
236 _mm256_store_ps(dotProductVector,
237 dotProdVal0); // Store the results back into the dot product vector
238
239 *realpt = dotProductVector[0];
240 *imagpt = dotProductVector[1];
241 *realpt += dotProductVector[2];
242 *imagpt += dotProductVector[3];
243 *realpt += dotProductVector[4];
244 *imagpt += dotProductVector[5];
245 *realpt += dotProductVector[6];
246 *imagpt += dotProductVector[7];
247
248 number = sixteenthPoints * 16;
249 for (; number < num_points; number++) {
250 *realpt += ((*aPtr++) * (*bPtr));
251 *imagpt += ((*aPtr++) * (*bPtr++));
252 }
253
254 *result = *(lv_32fc_t*)(&res[0]);
255}
256
257#endif /*LV_HAVE_AVX*/
258
259
260#ifdef LV_HAVE_SSE
261
262
264 const lv_32fc_t* input,
265 const float* taps,
266 unsigned int num_points)
267{
268
269 unsigned int number = 0;
270 const unsigned int sixteenthPoints = num_points / 8;
271
272 float res[2];
273 float *realpt = &res[0], *imagpt = &res[1];
274 const float* aPtr = (float*)input;
275 const float* bPtr = taps;
276
277 __m128 a0Val, a1Val, a2Val, a3Val;
278 __m128 b0Val, b1Val, b2Val, b3Val;
279 __m128 x0Val, x1Val, x2Val, x3Val;
280 __m128 c0Val, c1Val, c2Val, c3Val;
281
282 __m128 dotProdVal0 = _mm_setzero_ps();
283 __m128 dotProdVal1 = _mm_setzero_ps();
284 __m128 dotProdVal2 = _mm_setzero_ps();
285 __m128 dotProdVal3 = _mm_setzero_ps();
286
287 for (; number < sixteenthPoints; number++) {
288
289 a0Val = _mm_load_ps(aPtr);
290 a1Val = _mm_load_ps(aPtr + 4);
291 a2Val = _mm_load_ps(aPtr + 8);
292 a3Val = _mm_load_ps(aPtr + 12);
293
294 x0Val = _mm_load_ps(bPtr);
295 x1Val = _mm_load_ps(bPtr);
296 x2Val = _mm_load_ps(bPtr + 4);
297 x3Val = _mm_load_ps(bPtr + 4);
298 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
299 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
300 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
301 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
302
303 c0Val = _mm_mul_ps(a0Val, b0Val);
304 c1Val = _mm_mul_ps(a1Val, b1Val);
305 c2Val = _mm_mul_ps(a2Val, b2Val);
306 c3Val = _mm_mul_ps(a3Val, b3Val);
307
308 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
309 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
310 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
311 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
312
313 aPtr += 16;
314 bPtr += 8;
315 }
316
317 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
318 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
319 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
320
321 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
322
323 _mm_store_ps(dotProductVector,
324 dotProdVal0); // Store the results back into the dot product vector
325
326 *realpt = dotProductVector[0];
327 *imagpt = dotProductVector[1];
328 *realpt += dotProductVector[2];
329 *imagpt += dotProductVector[3];
330
331 number = sixteenthPoints * 8;
332 for (; number < num_points; number++) {
333 *realpt += ((*aPtr++) * (*bPtr));
334 *imagpt += ((*aPtr++) * (*bPtr++));
335 }
336
337 *result = *(lv_32fc_t*)(&res[0]);
338}
339
340#endif /*LV_HAVE_SSE*/
341
342#if LV_HAVE_AVX2 && LV_HAVE_FMA
343
344#include <immintrin.h>
345
346static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
347 const lv_32fc_t* input,
348 const float* taps,
349 unsigned int num_points)
350{
351
352 unsigned int number = 0;
353 const unsigned int sixteenthPoints = num_points / 16;
354
355 float res[2];
356 float *realpt = &res[0], *imagpt = &res[1];
357 const float* aPtr = (float*)input;
358 const float* bPtr = taps;
359
360 __m256 a0Val, a1Val, a2Val, a3Val;
361 __m256 b0Val, b1Val, b2Val, b3Val;
362 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
363
364 __m256 dotProdVal0 = _mm256_setzero_ps();
365 __m256 dotProdVal1 = _mm256_setzero_ps();
366 __m256 dotProdVal2 = _mm256_setzero_ps();
367 __m256 dotProdVal3 = _mm256_setzero_ps();
368
369 for (; number < sixteenthPoints; number++) {
370
371 a0Val = _mm256_loadu_ps(aPtr);
372 a1Val = _mm256_loadu_ps(aPtr + 8);
373 a2Val = _mm256_loadu_ps(aPtr + 16);
374 a3Val = _mm256_loadu_ps(aPtr + 24);
375
376 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
377 x1Val = _mm256_load_ps(bPtr + 8);
378 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
379 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
380 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
381 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
382
383 // TODO: it may be possible to rearrange swizzling to better pipeline data
384 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
385 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
386 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
387 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
388
389 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
390 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
391 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
392 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
393
394 aPtr += 32;
395 bPtr += 16;
396 }
397
398 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
399 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
400 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
401
402 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
403
404 _mm256_store_ps(dotProductVector,
405 dotProdVal0); // Store the results back into the dot product vector
406
407 *realpt = dotProductVector[0];
408 *imagpt = dotProductVector[1];
409 *realpt += dotProductVector[2];
410 *imagpt += dotProductVector[3];
411 *realpt += dotProductVector[4];
412 *imagpt += dotProductVector[5];
413 *realpt += dotProductVector[6];
414 *imagpt += dotProductVector[7];
415
416 number = sixteenthPoints * 16;
417 for (; number < num_points; number++) {
418 *realpt += ((*aPtr++) * (*bPtr));
419 *imagpt += ((*aPtr++) * (*bPtr++));
420 }
421
422 *result = *(lv_32fc_t*)(&res[0]);
423}
424
425#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
426
427#ifdef LV_HAVE_AVX
428
429#include <immintrin.h>
430
432 const lv_32fc_t* input,
433 const float* taps,
434 unsigned int num_points)
435{
436
437 unsigned int number = 0;
438 const unsigned int sixteenthPoints = num_points / 16;
439
440 float res[2];
441 float *realpt = &res[0], *imagpt = &res[1];
442 const float* aPtr = (float*)input;
443 const float* bPtr = taps;
444
445 __m256 a0Val, a1Val, a2Val, a3Val;
446 __m256 b0Val, b1Val, b2Val, b3Val;
447 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
448 __m256 c0Val, c1Val, c2Val, c3Val;
449
450 __m256 dotProdVal0 = _mm256_setzero_ps();
451 __m256 dotProdVal1 = _mm256_setzero_ps();
452 __m256 dotProdVal2 = _mm256_setzero_ps();
453 __m256 dotProdVal3 = _mm256_setzero_ps();
454
455 for (; number < sixteenthPoints; number++) {
456
457 a0Val = _mm256_loadu_ps(aPtr);
458 a1Val = _mm256_loadu_ps(aPtr + 8);
459 a2Val = _mm256_loadu_ps(aPtr + 16);
460 a3Val = _mm256_loadu_ps(aPtr + 24);
461
462 x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
463 x1Val = _mm256_loadu_ps(bPtr + 8);
464 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
465 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
466 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
467 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
468
469 // TODO: it may be possible to rearrange swizzling to better pipeline data
470 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
471 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
472 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
473 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
474
475 c0Val = _mm256_mul_ps(a0Val, b0Val);
476 c1Val = _mm256_mul_ps(a1Val, b1Val);
477 c2Val = _mm256_mul_ps(a2Val, b2Val);
478 c3Val = _mm256_mul_ps(a3Val, b3Val);
479
480 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
481 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
482 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
483 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
484
485 aPtr += 32;
486 bPtr += 16;
487 }
488
489 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
490 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
491 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
492
493 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
494
495 _mm256_store_ps(dotProductVector,
496 dotProdVal0); // Store the results back into the dot product vector
497
498 *realpt = dotProductVector[0];
499 *imagpt = dotProductVector[1];
500 *realpt += dotProductVector[2];
501 *imagpt += dotProductVector[3];
502 *realpt += dotProductVector[4];
503 *imagpt += dotProductVector[5];
504 *realpt += dotProductVector[6];
505 *imagpt += dotProductVector[7];
506
507 number = sixteenthPoints * 16;
508 for (; number < num_points; number++) {
509 *realpt += ((*aPtr++) * (*bPtr));
510 *imagpt += ((*aPtr++) * (*bPtr++));
511 }
512
513 *result = *(lv_32fc_t*)(&res[0]);
514}
515#endif /*LV_HAVE_AVX*/
516
517#ifdef LV_HAVE_NEON
518#include <arm_neon.h>
519
/*
 * NEON kernel, unrolled by two: dot product of interleaved complex samples
 * with real taps.
 *
 * result      receives the complex sum (real part, imaginary part).
 * input       num_points complex samples, read as interleaved re/im floats.
 * taps        num_points real coefficients.
 * num_points  number of complex samples (and of taps).
 *
 * The vector loop consumes 8 points per pass, using two independent pairs of
 * accumulators; the remaining num_points % 8 points are handled by a scalar
 * loop.
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{
    unsigned int number;
    const unsigned int eighthPoints = num_points / 8;

    float res[2]; /* res[0] = real part, res[1] = imaginary part */
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector0, inputVector1;
    float32x4_t tapsVector0, tapsVector1;
    float32x4_t tmp_real0, tmp_imag0;
    float32x4_t tmp_real1, tmp_imag1;

    /* Start every accumulator lane at zero. */
    float32x4_t real_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t real_accumulator1 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator1 = vdupq_n_f32(0.0f);

    for (number = 0; number < eighthPoints; number++) {
        /* Two groups of four consecutive taps. */
        tapsVector0 = vld1q_f32(tapsPtr);
        tapsVector1 = vld1q_f32(tapsPtr + 4);

        /* De-interleaving loads: val[0] holds four real parts, val[1] the
         * four matching imaginary parts. */
        inputVector0 = vld2q_f32(inputPtr);
        inputVector1 = vld2q_f32(inputPtr + 8);

        tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
        tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);

        tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
        tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);

        real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
        imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);

        real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
        imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);

        tapsPtr += 8;
        inputPtr += 16;
    }

    /* Merge the two unrolled accumulator pairs. */
    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);

    /* Spill the lanes to memory and sum them into the scalar result. */
    vst1q_f32(accVector_real, real_accumulator0);
    vst1q_f32(accVector_imag, imag_accumulator0);
    res[0] =
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];

    res[1] =
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];

    /* Scalar tail for the points the vector loop did not cover. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        res[0] += ((*inputPtr++) * (*tapsPtr));
        res[1] += ((*inputPtr++) * (*tapsPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}
598
599#endif /*LV_HAVE_NEON*/
600
601#ifdef LV_HAVE_NEON
602#include <arm_neon.h>
603
/*
 * NEON kernel: dot product of interleaved complex samples with real taps.
 *
 * result      receives the complex sum (real part, imaginary part).
 * input       num_points complex samples, read as interleaved re/im floats.
 * taps        num_points real coefficients.
 * num_points  number of complex samples (and of taps).
 *
 * The vector loop consumes 4 points per pass; the remaining num_points % 4
 * points are handled by a scalar loop.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{
    const unsigned int vecPasses = num_points / 4;
    unsigned int i;

    float sum[2]; /* sum[0] = real part, sum[1] = imaginary part */
    const float* samplePtr = (const float*)input;
    const float* coeffPtr = taps;
    const float zeros[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    float realLanes[4];
    float imagLanes[4];

    /* Both accumulators start with all four lanes at zero. */
    float32x4_t accReal = vld1q_f32(zeros);
    float32x4_t accImag = vld1q_f32(zeros);

    for (i = 0; i < vecPasses; i++) {
        /* Four consecutive taps. */
        const float32x4_t coeffs = vld1q_f32(coeffPtr);

        /* De-interleaving load: val[0] holds four real parts, val[1] the four
         * matching imaginary parts. */
        const float32x4x2_t samples = vld2q_f32(samplePtr);

        accReal = vaddq_f32(accReal, vmulq_f32(coeffs, samples.val[0]));
        accImag = vaddq_f32(accImag, vmulq_f32(coeffs, samples.val[1]));

        coeffPtr += 4;
        samplePtr += 8;
    }

    /* Spill the lanes to memory and sum them into the scalar result. */
    vst1q_f32(realLanes, accReal);
    vst1q_f32(imagLanes, accImag);
    sum[0] = realLanes[0] + realLanes[1] + realLanes[2] + realLanes[3];
    sum[1] = imagLanes[0] + imagLanes[1] + imagLanes[2] + imagLanes[3];

    /* Scalar tail for the points the vector loop did not cover. */
    for (i = vecPasses * 4; i < num_points; i++) {
        sum[0] += samplePtr[0] * (*coeffPtr);
        sum[1] += samplePtr[1] * (*coeffPtr);
        samplePtr += 2;
        coeffPtr++;
    }

    *result = *(lv_32fc_t*)(&sum[0]);
}
668
669#endif /*LV_HAVE_NEON*/
670
671#ifdef LV_HAVE_NEONV7
672extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
673 const lv_32fc_t* input,
674 const float* taps,
675 unsigned int num_points);
676#endif /*LV_HAVE_NEONV7*/
677
678#ifdef LV_HAVE_NEONV7
679extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
680 const lv_32fc_t* input,
681 const float* taps,
682 unsigned int num_points);
683#endif /*LV_HAVE_NEONV7*/
684
685#ifdef LV_HAVE_NEONV7
686extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
687 const lv_32fc_t* input,
688 const float* taps,
689 unsigned int num_points);
690#endif /*LV_HAVE_NEONV7*/
691
692#ifdef LV_HAVE_SSE
693
695 const lv_32fc_t* input,
696 const float* taps,
697 unsigned int num_points)
698{
699
700 unsigned int number = 0;
701 const unsigned int sixteenthPoints = num_points / 8;
702
703 float res[2];
704 float *realpt = &res[0], *imagpt = &res[1];
705 const float* aPtr = (float*)input;
706 const float* bPtr = taps;
707
708 __m128 a0Val, a1Val, a2Val, a3Val;
709 __m128 b0Val, b1Val, b2Val, b3Val;
710 __m128 x0Val, x1Val, x2Val, x3Val;
711 __m128 c0Val, c1Val, c2Val, c3Val;
712
713 __m128 dotProdVal0 = _mm_setzero_ps();
714 __m128 dotProdVal1 = _mm_setzero_ps();
715 __m128 dotProdVal2 = _mm_setzero_ps();
716 __m128 dotProdVal3 = _mm_setzero_ps();
717
718 for (; number < sixteenthPoints; number++) {
719
720 a0Val = _mm_loadu_ps(aPtr);
721 a1Val = _mm_loadu_ps(aPtr + 4);
722 a2Val = _mm_loadu_ps(aPtr + 8);
723 a3Val = _mm_loadu_ps(aPtr + 12);
724
725 x0Val = _mm_loadu_ps(bPtr);
726 x1Val = _mm_loadu_ps(bPtr);
727 x2Val = _mm_loadu_ps(bPtr + 4);
728 x3Val = _mm_loadu_ps(bPtr + 4);
729 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
730 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
731 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
732 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
733
734 c0Val = _mm_mul_ps(a0Val, b0Val);
735 c1Val = _mm_mul_ps(a1Val, b1Val);
736 c2Val = _mm_mul_ps(a2Val, b2Val);
737 c3Val = _mm_mul_ps(a3Val, b3Val);
738
739 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
740 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
741 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
742 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
743
744 aPtr += 16;
745 bPtr += 8;
746 }
747
748 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
749 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
750 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
751
752 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
753
754 _mm_store_ps(dotProductVector,
755 dotProdVal0); // Store the results back into the dot product vector
756
757 *realpt = dotProductVector[0];
758 *imagpt = dotProductVector[1];
759 *realpt += dotProductVector[2];
760 *imagpt += dotProductVector[3];
761
762 number = sixteenthPoints * 8;
763 for (; number < num_points; number++) {
764 *realpt += ((*aPtr++) * (*bPtr));
765 *imagpt += ((*aPtr++) * (*bPtr++));
766 }
767
768 *result = *(lv_32fc_t*)(&res[0]);
769}
770
771#endif /*LV_HAVE_SSE*/
772
773
774#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:431
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:263
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:521
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:604
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:58
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:172
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:694
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74