		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
	R8BHBC2
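
	// R8BHBC1( name ) ... R8BHBC2 is the function envelope macro pair
	// defined by the including file; it supplies the op ( output ),
	// rp/rp1 ( input ) and flt ( filter tap ) pointers and, presumably,
	// the outer per-output-sample loop.
	//
	// convolve2..convolve14 below share one SSE2 scheme: for each tap
	// pair, the two samples right of the center are loaded as-is, the two
	// samples left of it are loaded and lane-swapped via _mm_shuffle_pd so
	// that symmetric samples line up, the pairs are added and multiplied
	// by two taps at once, halving the multiply count of the symmetric
	// half-band filter.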

	R8BHBC1( convolve2 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve3 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ] + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
	R8BHBC2
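
	// From convolve4 on, two independent accumulator chains ( s1/m1 and
	// s3/m3 ) are interleaved so the multiplies and adds of adjacent tap
	// pairs can execute in parallel; the chains are joined only once,
	// right before the final horizontal sum.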

	R8BHBC1( convolve4 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve5 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ] + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
	R8BHBC2
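
	// convolve6 and up reuse the v1/v2/m1 ( and v3/v4/m3 ) registers for
	// the additional tap pairs, accumulating into s1 and s3 with explicit
	// adds.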

	R8BHBC1( convolve6 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve7 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ] + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
	R8BHBC2

	R8BHBC1( convolve8 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve9 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ] + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
	R8BHBC2

	R8BHBC1( convolve10 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve11 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ] + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
	R8BHBC2

	R8BHBC1( convolve12 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve13 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ] + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
	R8BHBC2

	R8BHBC1( convolve14 )
		__m128d v1, v2, m1, s1;
		v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = m1;
		__m128d v3, v4, m3, s3;
		v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = m3;
		v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
		m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
			_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
		s3 = _mm_add_pd( s3, m3 );
		v2 = _mm_loadu_pd( rp - 13 ); v1 = _mm_loadu_pd( rp + 13 );
		m1 = _mm_mul_pd( _mm_load_pd( flt + 12 ),
			_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
		s1 = _mm_add_pd( s1, m1 );
		s1 = _mm_add_pd( s1, s3 );
		_mm_storel_pd( op, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
		op[ 0 ] += rp1[ 0 ];
	R8BHBC2

#elif defined( R8B_NEON )
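
	// The NEON variants mirror the SSE2 scheme: vextq_f64( v, v, 1 ) swaps
	// the two lanes of the left-side pair, vmlaq_f64 multiply-accumulates
	// a tap pair directly into the running sum, and vaddvq_f64 performs
	// the final across-lanes add, to which rp1[ 0 ] and any odd tail tap
	// are added in scalar code.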

	R8BHBC1( convolve1 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
	R8BHBC2

	R8BHBC1( convolve2 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve3 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ] + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
	R8BHBC2

	R8BHBC1( convolve4 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve5 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ] + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
	R8BHBC2

	R8BHBC1( convolve6 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve7 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ] + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
	R8BHBC2

	R8BHBC1( convolve8 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve9 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ] + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
	R8BHBC2

	R8BHBC1( convolve10 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve11 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ] + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
	R8BHBC2

	R8BHBC1( convolve12 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

	R8BHBC1( convolve13 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ] + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
	R8BHBC2

	R8BHBC1( convolve14 )
		float64x2_t v1, v2, s1;
		s1 = vdupq_n_f64( 0.0 );
		v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		float64x2_t v3, v4, s3;
		s3 = vdupq_n_f64( 0.0 );
		v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
		s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
			vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
		v2 = vld1q_f64( rp - 13 ); v1 = vld1q_f64( rp + 13 );
		s1 = vmlaq_f64( s1, vld1q_f64( flt + 12 ),
			vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
		s1 = vaddq_f64( s1, s3 );
		op[ 0 ] = vaddvq_f64( s1 ) + rp1[ 0 ];
	R8BHBC2

#else // SIMD
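
	// Plain C fallback: the symmetric half-band convolution computed
	// directly, fully unrolled per tap count, as
	// op[ 0 ] = rp1[ 0 ] + sum over i of flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]).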

	R8BHBC1( convolve1 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
	R8BHBC2

	R8BHBC1( convolve2 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ]);
	R8BHBC2

	R8BHBC1( convolve3 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
	R8BHBC2

	R8BHBC1( convolve4 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ]);
	R8BHBC2

	R8BHBC1( convolve5 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
	R8BHBC2

	R8BHBC1( convolve6 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ]);
	R8BHBC2

	R8BHBC1( convolve7 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
	R8BHBC2

	R8BHBC1( convolve8 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ]);
	R8BHBC2

	R8BHBC1( convolve9 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
			+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
	R8BHBC2

	R8BHBC1( convolve10 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
			+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
			+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ]);
	R8BHBC2

	R8BHBC1( convolve11 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
			+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
			+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
			+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
	R8BHBC2

	R8BHBC1( convolve12 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
			+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
			+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
			+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
			+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ]);
	R8BHBC2

	R8BHBC1( convolve13 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
			+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
			+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
			+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
			+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
			+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
	R8BHBC2

	R8BHBC1( convolve14 )
		op[ 0 ] = rp1[ 0 ] + flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
			+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
			+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
			+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
			+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
			+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
			+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
			+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
			+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
			+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
			+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
			+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
			+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ])
			+ flt[ 13 ] * ( rp[ 14 ] + rp[ -13 ]);