r8brain-free-src
High-quality pro audio sample rate converter library
 
CDSPHBUpsampler.inc
// Auto-generated by `genhbc`, do not edit!

#if defined( R8B_SSE2 )

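// Each convolveN kernel below computes one output sample of a symmetric
// (linear-phase) half-band FIR filter with N coefficients per side,
// exploiting the coefficient symmetry:
//
//   op[ 1 ] = sum over i = 0 .. N-1 of flt[ i ] * ( rp[ i + 1 ] + rp[ -i ] )
//
// Here `op` is the output pointer, `rp` the centered input read pointer,
// and `flt` the coefficient array; the R8BHBC1/R8BHBC2 macros that open
// and close each kernel are presumably defined by the file that includes
// this .inc. In the SSE2 variants, _mm_shuffle_pd reverses the
// negative-offset sample pair so that it lines up with the positive-offset
// pair, two partial sums (s1, s3) accumulate independently to shorten the
// dependency chain, and a final shuffle-and-add forms the horizontal sum.
// An odd N is finished with a scalar multiply-add on the last coefficient.
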
R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve3 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve5 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve7 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve9 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve11 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve13 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 13 ); v1 = _mm_loadu_pd( rp + 13 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 12 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

#elif defined( R8B_NEON )

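// NEON variants of the same symmetric kernels: vextq_f64 rotates the
// negative-offset sample pair into alignment with the positive-offset
// pair, vmlaq_f64 multiply-accumulates into the two partial sums, and
// vaddvq_f64 performs the final across-vector (horizontal) add.
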
R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve3 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve5 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve7 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve9 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve11 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve13 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 13 ); v1 = vld1q_f64( rp + 13 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 12 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

#else // SIMD

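// Plain scalar fallback: the symmetric tap structure
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ] ) is written out directly.
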
R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ]);
R8BHBC2

R8BHBC1( convolve3 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ]);
R8BHBC2

R8BHBC1( convolve5 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ]);
R8BHBC2

R8BHBC1( convolve7 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ]);
R8BHBC2

R8BHBC1( convolve9 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ]);
R8BHBC2

R8BHBC1( convolve11 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ]);
R8BHBC2

R8BHBC1( convolve13 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
		+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
		+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ])
		+ flt[ 13 ] * ( rp[ 14 ] + rp[ -13 ]);
R8BHBC2

#endif // SIMD