36 #ifndef C7100_MMA_INLINES_H
37 #define C7100_MMA_INLINES_H
45 #include <c7x_scalable.h>
// Primary template for the horizontal (across-lane) maximum of a vector.
//   V   - vector type of the argument
//   vin - vector whose lanes are reduced
// The result is delivered as uint64_t. Declaration only; the per-type
// behaviour comes from explicit specializations.
48 template <
typename V>
inline uint64_t c7x_horizontal_max(V vin);
// Explicit specialization for c7x::uchar_vec: maximum over all unsigned 8-bit lanes.
// Every step takes the lane-wise __max of the even-indexed and odd-indexed lanes,
// halving the width each time (32 -> 16 -> 8 -> 4 -> 2 lanes per the declared types).
// The last step produces a scalar that is widened to uint64_t in retVal.
// NOTE(review): the declared intermediate types imply vin has 64 lanes - confirm
// for the target vector width.
// NOTE(review): the statement returning retVal is not visible in this listing.
50 template <>
inline uint64_t c7x_horizontal_max(c7x::uchar_vec vin)
52 uchar32 m1 = __max(vin.even(), vin.odd());
53 uchar16 m2 = __max(m1.even(), m1.odd());
54 uchar8 m3 = __max(m2.even(), m2.odd());
55 uchar4 m4 = __max(m3.even(), m3.odd());
56 uchar2 m5 = __max(m4.even(), m4.odd());
57 uint64_t retVal = (uint64_t) __max(m5.even(), m5.odd());
// Explicit specialization for c7x::ushort_vec: maximum over all unsigned 16-bit lanes.
// Same even/odd __max halving as the other integer variants
// (16 -> 8 -> 4 -> 2 lanes per the declared types); the final scalar is widened
// to uint64_t in retVal.
// NOTE(review): the declared intermediate types imply vin has 32 lanes - confirm.
// NOTE(review): the statement returning retVal is not visible in this listing.
60 template <>
inline uint64_t c7x_horizontal_max(c7x::ushort_vec vin)
62 ushort16 m1 = __max(vin.even(), vin.odd());
63 ushort8 m2 = __max(m1.even(), m1.odd());
64 ushort4 m3 = __max(m2.even(), m2.odd());
65 ushort2 m4 = __max(m3.even(), m3.odd());
66 uint64_t retVal = (uint64_t) __max(m4.even(), m4.odd());
// Explicit specialization for c7x::uint_vec. Unlike the other variants this one
// does not reduce pairwise: it sorts the lanes in ascending order with __sort_asc
// and reads one element of the sorted vector, widened to uint64_t.
// NOTE(review): .sf() presumably names lane 0xf (15), i.e. the last lane of a
// uint16 and therefore the largest value after an ascending sort - confirm.
// NOTE(review): the statement returning retVal is not visible in this listing.
69 template <>
inline uint64_t c7x_horizontal_max(c7x::uint_vec vin)
71 uint64_t retVal = (uint64_t) (__sort_asc((uint16) vin).sf());
// Explicit specialization for c7x::ulong_vec: maximum over all unsigned 64-bit lanes.
// Even/odd __max halving (4 -> 2 lanes per the declared types), then a final
// __max of the two remaining lanes stored in retVal.
// NOTE(review): the declared intermediate types imply vin has 8 lanes - confirm.
// NOTE(review): the statement returning retVal is not visible in this listing.
74 template <>
inline uint64_t c7x_horizontal_max(c7x::ulong_vec vin)
76 ulong4 m1 = __max(vin.even(), vin.odd());
77 ulong2 m2 = __max(m1.even(), m1.odd());
78 uint64_t retVal = (uint64_t) __max(m2.even(), m2.odd());
// Primary template: horizontal maximum that also reports an associated index.
//   V         - vector type shared by the values and the indices
//   W         - scalar type of the reported maximum
//   maxValVec - vector of candidate values
//   vIdx      - vector whose lanes are paired lane-for-lane with maxValVec
//   maxVal    - out: maximum value, written through the pointer
//   maxIdx    - out: index belonging to the maximum, written through the pointer
// Declaration only.
84 template <
typename V,
typename W>
85 inline void c7x_horizontal_max_with_index(V maxValVec, V vIdx, W *maxVal,
int *maxIdx);
// float version: reduces maxValVec to its largest lane and carries the matching
// vIdx lane along with it.
//   maxValVec - candidate values
//   vIdx      - per-lane index values stored as floats, paired with maxValVec
//   maxVal    - out: receives the surviving value
//   maxIdx    - out: receives the surviving vIdx lane converted to int
// Each round builds vpMask from (even lane < odd lane) and feeds the same mask to
// both __select calls, so a value and its index are always chosen together.
// Assuming __select returns its first vector operand where the mask is set, the
// odd lane wins when it is strictly larger and the even lane wins otherwise
// (including when the two compare equal).
// NOTE(review): the declaration of vpMask is not visible in this listing.
88 inline void c7x_horizontal_max_with_index(c7x::float_vec maxValVec, c7x::float_vec vIdx,
float *maxVal,
int *maxIdx)
// Round 1: 16 -> 8 lanes (per the declared float8 type).
91 vpMask = __cmp_lt_pred(maxValVec.even(), maxValVec.odd());
92 float8 maxValVec1 = (float8) __select(vpMask, maxValVec.odd(), maxValVec.even());
93 float8 vIdx1 = (float8) __select(vpMask, vIdx.odd(), vIdx.even());
// Round 2: 8 -> 4 lanes.
95 vpMask = __cmp_lt_pred(maxValVec1.even(), maxValVec1.odd());
96 float4 maxValVec2 = (float4) __select(vpMask, maxValVec1.odd(), maxValVec1.even());
97 float4 vIdx2 = (float4) __select(vpMask, vIdx1.odd(), vIdx1.even());
// Round 3: 4 -> 2 lanes.
99 vpMask = __cmp_lt_pred(maxValVec2.even(), maxValVec2.odd());
100 float2 maxValVec3 = (float2) __select(vpMask, maxValVec2.odd(), maxValVec2.even());
101 float2 vIdx3 = (float2) __select(vpMask, vIdx2.odd(), vIdx2.even());
// Final round: the last pair is resolved and written to the outputs.
103 vpMask = __cmp_lt_pred(maxValVec3.even(), maxValVec3.odd());
104 *maxVal = (float) __select(vpMask, maxValVec3.odd(), maxValVec3.even());
105 *maxIdx = ((int) __select(vpMask, vIdx3.odd(), vIdx3.even()));
// double version: reduces maxValVec to its largest lane and carries the matching
// vIdx lane along with it.
//   maxValVec - candidate values
//   vIdx      - per-lane index values stored as doubles, paired with maxValVec
//   maxVal    - out: receives the surviving value
//   maxIdx    - out: receives the surviving vIdx lane converted to int
// Each round builds vpMask from (even lane < odd lane) and uses it for both the
// value and the index select, keeping the pair together. Assuming __select returns
// its first vector operand where the mask is set, the even lane is kept whenever
// the odd lane is not strictly larger.
// NOTE(review): the declaration of vpMask is not visible in this listing.
109 inline void c7x_horizontal_max_with_index(c7x::double_vec maxValVec, c7x::double_vec vIdx,
double *maxVal,
int *maxIdx)
// Round 1: 8 -> 4 lanes (per the declared double4 type).
112 vpMask = __cmp_lt_pred(maxValVec.even(), maxValVec.odd());
113 double4 maxValVec1 = (double4) __select(vpMask, maxValVec.odd(), maxValVec.even());
114 double4 vIdx1 = (double4) __select(vpMask, vIdx.odd(), vIdx.even());
// Round 2: 4 -> 2 lanes.
116 vpMask = __cmp_lt_pred(maxValVec1.even(), maxValVec1.odd());
117 double2 maxValVec2 = (double2) __select(vpMask, maxValVec1.odd(), maxValVec1.even());
118 double2 vIdx2 = (double2) __select(vpMask, vIdx1.odd(), vIdx1.even());
// Final round: the last pair is resolved and written to the outputs.
120 vpMask = __cmp_lt_pred(maxValVec2.even(), maxValVec2.odd());
121 *maxVal = (double) __select(vpMask, maxValVec2.odd(), maxValVec2.even());
122 *maxIdx = ((int) __select(vpMask, vIdx2.odd(), vIdx2.even()));
// int8_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// Phase 1 reduces the values with even/odd __max steps down to one scalar.
// Phase 2 keeps vIdx only in lanes equal to that maximum, replaces every other
// lane with -1, reinterprets the result as unsigned (so -1 reads as 255, assuming
// __as_uchar64 is a bit reinterpretation) and reduces with even/odd __min steps.
// NOTE(review): indices are compared as unsigned 8-bit values, so only index
// values 0..255 can be distinguished - confirm callers stay within that range.
// NOTE(review): the declaration of vpMask is not visible in this listing.
126 inline void c7x_horizontal_max_with_index(c7x::char_vec maxValVec, c7x::char_vec vIdx, int8_t *maxVal,
int *maxIdx)
129 typedef typename c7x::make_full_vector<int8_t>::type vec;
// Phase 1: pairwise maximum, halving the lane count at every step.
131 char32 maxValVec1 = maxValVec.even();
132 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
134 char16 maxValVec2 = maxValVec1.even();
135 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
137 char8 maxValVec3 = maxValVec2.even();
138 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
140 char4 maxValVec4 = maxValVec3.even();
141 maxValVec4 = __max(maxValVec3.odd(), maxValVec4);
143 char2 maxValVec5 = maxValVec4.even();
144 maxValVec5 = __max(maxValVec4.odd(), maxValVec5);
146 int8_t maxValVec6 = maxValVec5.even();
147 maxValVec6 = __max(maxValVec5.odd(), maxValVec6);
148 *maxVal = (int8_t) maxValVec6;
149 int8_t maxValScaler = (int8_t) maxValVec6;
// Phase 2: a lane holds the maximum exactly when (max - lane) == 0.
151 char64 zero_vec = vec(0);
152 char64 diff_vector = vec(maxValScaler) - maxValVec;
153 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
154 char64 masked_indices = __select(vpMask, vIdx, vec(-1));
155 uchar64 umasked_indices = __as_uchar64(masked_indices);
// Pairwise unsigned minimum over the masked indices.
157 uchar32 vIdx1 = (uchar32) __min(umasked_indices.even(), umasked_indices.odd());
158 uchar16 vIdx2 = (uchar16) __min(vIdx1.even(), vIdx1.odd());
159 uchar8 vIdx3 = (uchar8) __min(vIdx2.even(), vIdx2.odd());
160 uchar4 vIdx4 = (uchar4) __min(vIdx3.even(), vIdx3.odd());
161 uchar2 vIdx5 = (uchar2) __min(vIdx4.even(), vIdx4.odd());
162 *maxIdx = (uint8_t) __min(vIdx5.even(), vIdx5.odd());
// int16_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// The maximum is taken as the larger of lane 0 of the low half and lane 0 of the
// high half of the __vdsortdd16h_vv result, which suggests that intrinsic sorts
// each 16-lane half independently in descending order - confirm against the
// intrinsic documentation. The same pattern is used for the indices with
// __vdsortii16h_vv, where the smaller of lanes 0 and 16 is taken.
// The larger-of-two expression is evaluated twice: once for *maxVal and once for
// maxValScaler.
// NOTE(review): lanes that do not hold the maximum are given the index 255 before
// the minimum is taken, so a winning index above 255 would be reported as 255 -
// confirm the range of vIdx supplied by callers.
// NOTE(review): the declaration of vpMask is not visible in this listing.
166 inline void c7x_horizontal_max_with_index(c7x::short_vec maxValVec, c7x::short_vec vIdx, int16_t *maxVal,
int *maxIdx)
169 typedef typename c7x::make_full_vector<int16_t>::type vec;
171 vec sortIn = __vdsortdd16h_vv(maxValVec);
172 *maxVal = (short) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
174 short maxValScaler = (short) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
// A lane holds the maximum exactly when (max - lane) == 0.
176 short32 zero_vec = vec(0);
177 short32 diff_vector = vec(maxValScaler) - maxValVec;
178 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
179 short32 masked_indices = __select(vpMask, vIdx, vec(255));
180 short32 sorted_indices = __vdsortii16h_vv(masked_indices);
182 *maxIdx = (sorted_indices.s[0] < sorted_indices.s[16]) ? sorted_indices.s[0] : sorted_indices.s[16];
// int32_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// The maximum is lane 0 of the descending sort of maxValVec; the index is lane 0
// of the ascending sort of the masked index vector.
// NOTE(review): lanes that do not hold the maximum are given the index 255 before
// sorting, so a winning index above 255 would be reported as 255 - confirm the
// range of vIdx supplied by callers.
// NOTE(review): the declaration of vpMask is not visible in this listing.
186 inline void c7x_horizontal_max_with_index(c7x::int_vec maxValVec, c7x::int_vec vIdx, int32_t *maxVal,
int *maxIdx)
189 typedef typename c7x::make_full_vector<int32_t>::type vec;
191 vec sortIn = __sort_desc(maxValVec);
192 *maxVal = (int) sortIn.s[0];
194 int maxValScaler = (
int) sortIn.s[0];
// A lane holds the maximum exactly when (max - lane) == 0.
196 int16 zero_vec = vec(0);
197 int16 diff_vector = vec(maxValScaler) - maxValVec;
198 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
199 int16 masked_indices = __select(vpMask, vIdx, vec(255));
200 int16 sorted_indices = __sort_asc(masked_indices);
202 *maxIdx = sorted_indices.s[0];
// int64_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// Phase 1 reduces the values with even/odd __max steps; phase 2 masks the indices
// and reduces them with even/odd __min steps.
// NOTE(review): lanes that do not hold the maximum are given the index 255, so a
// winning index above 255 would be reported as 255 - confirm the range of vIdx.
// NOTE(review): the final index is cast to long and then stored into an int, so
// it is narrowed on assignment.
// NOTE(review): the declaration of vpMask is not visible in this listing.
206 inline void c7x_horizontal_max_with_index(c7x::long_vec maxValVec, c7x::long_vec vIdx, int64_t *maxVal,
int *maxIdx)
209 typedef typename c7x::make_full_vector<int64_t>::type vec;
// Phase 1: pairwise maximum, halving the lane count at every step.
211 long4 maxValVec1 = maxValVec.even();
212 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
214 long2 maxValVec2 = maxValVec1.even();
215 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
217 long maxValVec3 = maxValVec2.even();
218 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
219 *maxVal = (long) maxValVec3;
221 long maxValScaler = (long) maxValVec3;
// Phase 2: a lane holds the maximum exactly when (max - lane) == 0.
223 long8 zero_vec = vec(0);
224 long8 diff_vector = vec(maxValScaler) - maxValVec;
225 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
226 long8 maxIdxVec = vec(255);
227 long8 masked_indices = __select(vpMask, vIdx, maxIdxVec);
// Pairwise minimum over the masked indices.
229 long4 vIdx1 = (long4) __min(masked_indices.even(), masked_indices.odd());
230 long2 vIdx2 = (long2) __min(vIdx1.even(), vIdx1.odd());
231 *maxIdx = (long) __min(vIdx2.even(), vIdx2.odd());
// uint8_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// Phase 1 reduces the values with even/odd __max steps down to one scalar.
// Phase 2 keeps vIdx only in lanes equal to that maximum, sets all other lanes to
// 255 (the largest 8-bit value) and reduces with even/odd __min steps.
// NOTE(review): the declaration of vpMask is not visible in this listing.
235 inline void c7x_horizontal_max_with_index(c7x::uchar_vec maxValVec, c7x::uchar_vec vIdx, uint8_t *maxVal,
int *maxIdx)
238 typedef typename c7x::make_full_vector<uint8_t>::type vec;
// Phase 1: pairwise maximum, halving the lane count at every step.
240 uchar32 maxValVec1 = maxValVec.even();
241 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
243 uchar16 maxValVec2 = maxValVec1.even();
244 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
246 uchar8 maxValVec3 = maxValVec2.even();
247 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
249 uchar4 maxValVec4 = maxValVec3.even();
250 maxValVec4 = __max(maxValVec3.odd(), maxValVec4);
252 uchar2 maxValVec5 = maxValVec4.even();
253 maxValVec5 = __max(maxValVec4.odd(), maxValVec5);
255 uint8_t maxValVec6 = maxValVec5.even();
256 maxValVec6 = __max(maxValVec5.odd(), maxValVec6);
257 *maxVal = (uint8_t) maxValVec6;
258 uint8_t maxValScaler = (uint8_t) maxValVec6;
// Phase 2: a lane holds the maximum exactly when (max - lane) == 0.
260 uchar64 zero_vec = vec(0);
261 uchar64 diff_vector = vec(maxValScaler) - maxValVec;
262 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
263 uchar64 maxIdxVec = vec(255);
264 uchar64 masked_indices = __select(vpMask, vIdx, maxIdxVec);
// Pairwise minimum over the masked indices.
266 uchar32 vIdx1 = (uchar32) __min(masked_indices.even(), masked_indices.odd());
267 uchar16 vIdx2 = (uchar16) __min(vIdx1.even(), vIdx1.odd());
268 uchar8 vIdx3 = (uchar8) __min(vIdx2.even(), vIdx2.odd());
269 uchar4 vIdx4 = (uchar4) __min(vIdx3.even(), vIdx3.odd());
270 uchar2 vIdx5 = (uchar2) __min(vIdx4.even(), vIdx4.odd());
271 *maxIdx = (int) __min(vIdx5.even(), vIdx5.odd());
// uint16_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// The maximum is the larger of lane 0 of the low half and lane 0 of the high half
// of the __vdsortddu16h_vv result, which suggests each 16-lane half is sorted
// independently in descending order - confirm against the intrinsic documentation.
// The indices use __vdsortiiu16h_vv and take the smaller of lanes 0 and 16.
// The larger-of-two expression is evaluated twice: once for *maxVal and once for
// maxValScaler.
// NOTE(review): lanes that do not hold the maximum are given the index 255, so a
// winning index above 255 would be reported as 255 - confirm the range of vIdx.
// NOTE(review): the return-type line and the declaration of vpMask are not
// visible in this listing.
276 c7x_horizontal_max_with_index(c7x::ushort_vec maxValVec, c7x::ushort_vec vIdx, uint16_t *maxVal,
int *maxIdx)
279 typedef typename c7x::make_full_vector<uint16_t>::type vec;
281 vec sortIn = __vdsortddu16h_vv(maxValVec);
282 *maxVal = (ushort) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
284 ushort maxValScaler = (ushort) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
// A lane holds the maximum exactly when (max - lane) == 0.
286 ushort32 zero_vec = vec(0);
287 ushort32 diff_vector = vec(maxValScaler) - maxValVec;
288 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
289 ushort32 masked_indices = __select(vpMask, vIdx, vec(255));
290 ushort32 sorted_indices = __vdsortiiu16h_vv(masked_indices);
292 *maxIdx = (sorted_indices.s[0] < sorted_indices.s[16]) ? sorted_indices.s[0] : sorted_indices.s[16];
// uint32_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// The maximum is lane 0 of the descending sort of maxValVec; the index is lane 0
// of the ascending sort of the masked index vector.
// NOTE(review): lanes that do not hold the maximum are given the index 255, so a
// winning index above 255 would be reported as 255 - confirm the range of vIdx.
// NOTE(review): the declaration of vpMask is not visible in this listing.
296 inline void c7x_horizontal_max_with_index(c7x::uint_vec maxValVec, c7x::uint_vec vIdx, uint32_t *maxVal,
int *maxIdx)
299 typedef typename c7x::make_full_vector<uint32_t>::type vec;
301 vec sortIn = __sort_desc(maxValVec);
302 *maxVal = (uint) sortIn.s[0];
304 uint maxValScaler = (uint) sortIn.s[0];
// A lane holds the maximum exactly when (max - lane) == 0.
306 uint16 zero_vec = vec(0);
307 uint16 diff_vector = vec(maxValScaler) - maxValVec;
308 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
309 uint16 masked_indices = __select(vpMask, vIdx, vec(255));
310 uint16 sorted_indices = __sort_asc(masked_indices);
312 *maxIdx = sorted_indices.s[0];
// uint64_t version: writes the largest lane of maxValVec to *maxVal and, among the
// lanes holding that value, the smallest vIdx entry to *maxIdx.
//   maxValVec - candidate values
//   vIdx      - per-lane index values, paired with maxValVec
//   maxVal    - out: maximum value
//   maxIdx    - out: index associated with the maximum
// Phase 1 reduces the values with even/odd __max steps; phase 2 masks the indices
// and reduces them with even/odd __min steps.
// NOTE(review): lanes that do not hold the maximum are given the index 255, so a
// winning index above 255 would be reported as 255 - confirm the range of vIdx.
// NOTE(review): the final index is cast to ulong and then stored into an int, so
// it is narrowed on assignment.
// NOTE(review): the declaration of vpMask is not visible in this listing.
316 inline void c7x_horizontal_max_with_index(c7x::ulong_vec maxValVec, c7x::ulong_vec vIdx, uint64_t *maxVal,
int *maxIdx)
319 typedef typename c7x::make_full_vector<uint64_t>::type vec;
// Phase 1: pairwise maximum, halving the lane count at every step.
320 ulong4 maxValVec1 = maxValVec.even();
321 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
323 ulong2 maxValVec2 = maxValVec1.even();
324 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
326 ulong maxValVec3 = maxValVec2.even();
327 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
328 *maxVal = (ulong) maxValVec3;
330 ulong maxValScaler = (ulong) maxValVec3;
// Phase 2: a lane holds the maximum exactly when (max - lane) == 0.
332 ulong8 zero_vec = vec(0);
333 ulong8 diff_vector = vec(maxValScaler) - maxValVec;
334 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
335 ulong8 maxIdxVec = vec(255);
336 ulong8 masked_indices = __select(vpMask, vIdx, maxIdxVec);
// Pairwise minimum over the masked indices.
338 ulong4 vIdx1 = (ulong4) __min(masked_indices.even(), masked_indices.odd());
339 ulong2 vIdx2 = (ulong2) __min(vIdx1.even(), vIdx1.odd());
340 *maxIdx = (ulong) __min(vIdx2.even(), vIdx2.odd());
// Primary template: produces a full-width vector built from X out of a vector of
// type Y.
//   X   - type handed to c7x::make_full_vector to form the return type
//   Y   - type of the input vector
//   vec - input vector
// Declaration only.
345 template <
typename X,
typename Y>
inline typename c7x::make_full_vector<X>::type convert_long_to_int(Y vec);
// Explicit specialization for <int16, long8>: passes the long8 input through
// __as_int16 and stores the result in a full int32_t vector.
// NOTE(review): __as_int16 is presumably a bit reinterpretation of the same
// register contents rather than a per-lane value conversion - confirm.
// NOTE(review): the statement returning v8bits is not visible in this listing.
347 template <>
inline c7x::make_full_vector<int32_t>::type convert_long_to_int<int16, long8>(long8 vec)
349 typedef typename c7x::make_full_vector<int32_t>::type vecRet;
350 vecRet v8bits = __as_int16(vec);
// Explicit specialization for <uint16, ulong8>: passes the ulong8 input through
// __as_uint16 and stores the result in a full uint32_t vector.
// NOTE(review): __as_uint16 is presumably a bit reinterpretation of the same
// register contents rather than a per-lane value conversion - confirm.
// NOTE(review): the statement returning v8bits is not visible in this listing.
354 template <>
inline c7x::make_full_vector<uint32_t>::type convert_long_to_int<uint16, ulong8>(ulong8 vec)
356 typedef typename c7x::make_full_vector<uint32_t>::type vecRet;
357 vecRet v8bits = __as_uint16(vec);
// Primary template: widens a vector of type Y into a full-width vector built from X.
//   X         - type handed to c7x::make_full_vector to form the return type
//   Y         - type of the input vector
//   vecIn     - input vector
//   typeIndex - flag forwarded to the specializations
// Declaration only.
363 template <
typename X,
typename Y>
364 inline typename c7x::make_full_vector<X>::type convert_char_to_short(Y vecIn,
bool typeIndex);
// <short32, char32> version: widens 32 signed 8-bit lanes to 16-bit lanes.
// Two alternative assignments to vecOut are visible:
//   - reinterpret the input as uchar32 first and then convert, so the lanes are
//     widened as unsigned values;
//   - convert the signed input directly.
// NOTE(review): the declaration of vecOut, the condition that chooses between the
// two paths (presumably typeIndex) and the return statement are not visible in
// this listing - confirm which typeIndex value selects which path.
367 inline c7x::make_full_vector<int16_t>::type convert_char_to_short<short32, char32>(char32 vecIn,
bool typeIndex)
369 typedef typename c7x::make_full_vector<int16_t>::type vecRet;
// Path A: treat the bytes as unsigned before widening.
372 uchar32 vecInConv = __as_uchar32(vecIn);
373 vecOut = __convert_short32(vecInConv);
// Path B: widen the signed bytes directly.
376 vecOut = __convert_short32(vecIn);
// <ushort32, uchar32> version: widens 32 unsigned 8-bit lanes to unsigned 16-bit
// lanes with __convert_ushort32.
// typeIndex is not referenced by the visible statements.
// NOTE(review): the statement returning vecOut is not visible in this listing.
382 inline c7x::make_full_vector<uint16_t>::type convert_char_to_short<ushort32, uchar32>(uchar32 vecIn,
bool typeIndex)
384 typedef typename c7x::make_full_vector<uint16_t>::type vecRet;
385 vecRet vecOut = __convert_ushort32(vecIn);
// Primary template: multiplies two input vectors of type Y and delivers the
// result through output arguments of type X.
//   vecIn1, vecIn2   - input vectors
//   vecOut1, vecOut2 - outputs; the specializations instantiate X as a reference type
// Declaration only.
391 template <
typename X,
typename Y>
inline void mul_char_to_short(Y vecIn1, Y vecIn2, X vecOut1, X vecOut2);
// <short32 &, char32> version: widens both signed 8-bit inputs to short32 and
// stores their lane-wise product in vecOut1Short.
// NOTE(review): vecOut2 is not written by any visible statement - confirm whether
// callers expect it to be filled.
// NOTE(review): the return-type line is not visible in this listing.
395 mul_char_to_short<short32 &, char32>(char32 vecIn1, char32 vecIn2, c7x::short_vec &vecOut1Short, c7x::short_vec &vecOut2)
// Widen first so the multiplication is carried out on 16-bit lanes.
397 short32 vecInShort1 = __convert_short32(vecIn1);
398 short32 vecInShort2 = __convert_short32(vecIn2);
399 vecOut1Short = vecInShort1 * vecInShort2;
// <ushort32 &, uchar32> version: stores the result of __mpy_ext on the two
// unsigned 8-bit inputs in vecOut1uShort.
// NOTE(review): __mpy_ext presumably yields the widened (16-bit) products - confirm.
// NOTE(review): vecOut2 is not written by any visible statement - confirm whether
// callers expect it to be filled.
// NOTE(review): the return-type line is not visible in this listing.
404 mul_char_to_short<ushort32 &, uchar32>(uchar32 vecIn1, uchar32 vecIn2, c7x::ushort_vec &vecOut1uShort, c7x::ushort_vec &vecOut2)
406 vecOut1uShort = __mpy_ext(vecIn1, vecIn2);
// Primary template for the horizontal maximum of a floating-point vector.
//   dataType - scalar type of the result
//   V        - vector type of the argument
//   vin      - vector whose lanes are reduced
// Declaration only.
409 template <
typename dataType,
typename V>
inline dataType c7x_horizontal_max_fp(V vin);
// Explicit specialization for c7x::float_vec: maximum over all float lanes.
// Each step takes the lane-wise __max of the upper half (.hi()) and the lower half
// (.lo()), halving the width (8 -> 4 -> 2 lanes per the declared types) until a
// single float remains in maxVal.
// NOTE(review): the statement returning maxVal is not visible in this listing.
411 template <>
inline float c7x_horizontal_max_fp(c7x::float_vec vin)
414 float8 vin1 = __max(vin.hi(), vin.lo());
415 float4 vin2 = __max(vin1.hi(), vin1.lo());
416 float2 vin3 = __max(vin2.hi(), vin2.lo());
417 float maxVal = __max(vin3.hi(), vin3.lo());
// Explicit specialization for c7x::double_vec: maximum over all double lanes.
// Each step takes the lane-wise __max of the upper and lower halves
// (4 -> 2 lanes per the declared types) until a single double remains in maxVal.
// NOTE(review): the statement returning maxVal is not visible in this listing.
421 template <>
inline double c7x_horizontal_max_fp(c7x::double_vec vin)
423 double4 vin1 = __max(vin.hi(), vin.lo());
424 double2 vin2 = __max(vin1.hi(), vin1.lo());
425 double maxVal = __max(vin2.hi(), vin2.lo());
// Primary template for the horizontal minimum of a floating-point vector.
//   dataType - scalar type of the result
//   V        - vector type of the argument
//   vin      - vector whose lanes are reduced
// Declaration only.
429 template <
typename dataType,
typename V>
inline dataType c7x_horizontal_min_fp(V vin);
// Explicit specialization for c7x::float_vec: minimum over all float lanes.
// Each step takes the lane-wise __min of the upper half (.hi()) and the lower half
// (.lo()), halving the width (8 -> 4 -> 2 lanes per the declared types) until a
// single float remains in minVal.
// NOTE(review): the statement returning minVal is not visible in this listing.
430 template <>
inline float c7x_horizontal_min_fp(c7x::float_vec vin)
433 float8 vin1 = __min(vin.hi(), vin.lo());
434 float4 vin2 = __min(vin1.hi(), vin1.lo());
435 float2 vin3 = __min(vin2.hi(), vin2.lo());
436 float minVal = __min(vin3.hi(), vin3.lo());
// Explicit specialization for c7x::double_vec: minimum over all double lanes.
// Each step takes the lane-wise __min of the upper and lower halves
// (4 -> 2 lanes per the declared types) until a single double remains in minVal.
// NOTE(review): the statement returning minVal is not visible in this listing.
440 template <>
inline double c7x_horizontal_min_fp(c7x::double_vec vin)
442 double4 vin1 = __min(vin.hi(), vin.lo());
443 double2 vin2 = __min(vin1.hi(), vin1.lo());
444 double minVal = __min(vin2.hi(), vin2.lo());
// Primary template: sums all lanes of a vector.
//   V             - vector type of the argument
//   W             - scalar type of the sum
//   inVec         - vector whose lanes are added together
//   horizontalSum - out: the sum, written through the pointer
// Declaration only.
449 template <
typename V,
typename W>
inline void c7x_horizontal_add(V inVec, W *horizontalSum);
// Explicit specialization for c7x::float_vec: writes the sum of all float lanes
// to *horizontalSum.
// The sum is formed as a tree: the upper half is added to the lower half
// repeatedly (8 -> 4 -> 2 lanes per the declared types), then the last two lanes
// are added. Because float addition is not associative, the result can differ in
// the last bits from a left-to-right accumulation of the same lanes.
451 template <>
inline void c7x_horizontal_add(c7x::float_vec inVec,
float *horizontalSum)
453 float8 inVec1 = inVec.hi() + inVec.lo();
454 float4 inVec2 = inVec1.hi() + inVec1.lo();
455 float2 inVec3 = inVec2.hi() + inVec2.lo();
456 *horizontalSum = inVec3.hi() + inVec3.lo();
// Explicit specialization for c7x::double_vec: writes the sum of all double lanes
// to *horizontalSum.
// The sum is formed as a tree: the upper half is added to the lower half
// (4 -> 2 lanes per the declared types), then the last two lanes are added.
// Because floating-point addition is not associative, the result can differ in
// the last bits from a left-to-right accumulation of the same lanes.
459 template <>
inline void c7x_horizontal_add(c7x::double_vec inVec,
double *horizontalSum)
461 double4 inVec1 = inVec.hi() + inVec.lo();
462 double2 inVec2 = inVec1.hi() + inVec1.lo();
463 *horizontalSum = inVec2.hi() + inVec2.lo();
// Primary template: horizontal minimum that also reports an associated index.
//   V         - vector type shared by the values and the indices
//   W         - scalar type of the reported minimum
//   minValVec - vector of candidate values
//   vIdx      - vector whose lanes are paired lane-for-lane with minValVec
//   minVal    - out: minimum value, written through the pointer
//   minIdx    - out: index belonging to the minimum, written through the pointer
// Declaration only.
465 template <
typename V,
typename W>
466 inline void c7x_horizontal_min_with_index(V minValVec, V vIdx, W *minVal,
int *minIdx);
// float version: reduces minValVec to its smallest lane and carries the matching
// vIdx lane along with it.
//   minValVec - candidate values
//   vIdx      - per-lane index values stored as floats, paired with minValVec
//   minVal    - out: receives the surviving value
//   minIdx    - out: receives the surviving vIdx lane converted to int
// Each round builds vpMask from (even lane < odd lane) and feeds the same mask to
// both __select calls, so a value and its index are always chosen together.
// Assuming __select returns its first vector operand where the mask is set, the
// even lane wins when it is strictly smaller and the odd lane wins otherwise
// (including when the two compare equal).
// NOTE(review): the declaration of vpMask is not visible in this listing.
468 inline void c7x_horizontal_min_with_index(c7x::float_vec minValVec, c7x::float_vec vIdx,
float *minVal,
int *minIdx)
// Round 1: 16 -> 8 lanes (per the declared float8 type).
471 vpMask = __cmp_lt_pred(minValVec.even(), minValVec.odd());
472 float8 minValVec1 = (float8) __select(vpMask, minValVec.even(), minValVec.odd());
473 float8 vIdx1 = (float8) __select(vpMask, vIdx.even(), vIdx.odd());
// Round 2: 8 -> 4 lanes.
475 vpMask = __cmp_lt_pred(minValVec1.even(), minValVec1.odd());
476 float4 minValVec2 = (float4) __select(vpMask, minValVec1.even(), minValVec1.odd());
477 float4 vIdx2 = (float4) __select(vpMask, vIdx1.even(), vIdx1.odd());
// Round 3: 4 -> 2 lanes.
479 vpMask = __cmp_lt_pred(minValVec2.even(), minValVec2.odd());
480 float2 minValVec3 = (float2) __select(vpMask, minValVec2.even(), minValVec2.odd());
481 float2 vIdx3 = (float2) __select(vpMask, vIdx2.even(), vIdx2.odd());
// Final round: the last pair is resolved and written to the outputs.
483 vpMask = __cmp_lt_pred(minValVec3.even(), minValVec3.odd());
484 *minVal = (float) __select(vpMask, minValVec3.even(), minValVec3.odd());
485 *minIdx = ((int) __select(vpMask, vIdx3.even(), vIdx3.odd()));
// double version: reduces minValVec to its smallest lane and carries the matching
// vIdx lane along with it.
//   minValVec - candidate values
//   vIdx      - per-lane index values stored as doubles, paired with minValVec
//   minVal    - out: receives the surviving value
//   minIdx    - out: receives the surviving vIdx lane converted to int
// Each round builds vpMask from (even lane < odd lane) and uses it for both the
// value and the index select, keeping the pair together. Assuming __select returns
// its first vector operand where the mask is set, the odd lane is kept whenever
// the even lane is not strictly smaller.
// NOTE(review): the declaration of vpMask is not visible in this listing.
489 inline void c7x_horizontal_min_with_index(c7x::double_vec minValVec, c7x::double_vec vIdx,
double *minVal,
int *minIdx)
// Round 1: 8 -> 4 lanes (per the declared double4 type).
492 vpMask = __cmp_lt_pred(minValVec.even(), minValVec.odd());
493 double4 minValVec1 = (double4) __select(vpMask, minValVec.even(), minValVec.odd());
494 double4 vIdx1 = (double4) __select(vpMask, vIdx.even(), vIdx.odd());
// Round 2: 4 -> 2 lanes.
496 vpMask = __cmp_lt_pred(minValVec1.even(), minValVec1.odd());
497 double2 minValVec2 = (double2) __select(vpMask, minValVec1.even(), minValVec1.odd());
498 double2 vIdx2 = (double2) __select(vpMask, vIdx1.even(), vIdx1.odd());
// Final round: the last pair is resolved and written to the outputs.
500 vpMask = __cmp_lt_pred(minValVec2.even(), minValVec2.odd());
501 *minVal = (double) __select(vpMask, minValVec2.even(), minValVec2.odd());
502 *minIdx = ((int) __select(vpMask, vIdx2.even(), vIdx2.odd()));