36 #ifndef C7120_MMA_INLINES_H
37 #define C7120_MMA_INLINES_H
45 #include <c7x_scalable.h>
// Primary template for the horizontal maximum of a vector: takes a vector
// `vin` by value and returns the result as uint64_t. It is only declared
// here; the explicit specializations below provide the per-type bodies.
49 template <
typename V>
inline uint64_t c7x_horizontal_max(V vin);
// c7x::uchar_vec specialization: tree reduction built from __max.
// Every step passes the even() and odd() element subsets of the previous
// result to __max, halving the vector (uchar32 -> uchar16 -> ... -> uchar2);
// the final step yields a single value that is cast to uint64_t.
// assumes c7x::uchar_vec has 64 lanes, because the first step produces a
// uchar32 — TODO confirm for this target.
// NOTE(review): the braces and the statement that returns retVal are not
// visible in this view.
51 template <>
inline uint64_t c7x_horizontal_max(c7x::uchar_vec vin)
53 uchar32 m1 = __max(vin.even(), vin.odd());
54 uchar16 m2 = __max(m1.even(), m1.odd());
55 uchar8 m3 = __max(m2.even(), m2.odd());
56 uchar4 m4 = __max(m3.even(), m3.odd());
57 uchar2 m5 = __max(m4.even(), m4.odd());
// Last level: two scalars remain, their max is the result.
58 uint64_t retVal = (uint64_t) __max(m5.even(), m5.odd());
// c7x::ushort_vec specialization: same even()/odd() __max tree as the other
// integer versions, here going ushort16 -> ushort8 -> ushort4 -> ushort2 and
// then to a single value cast to uint64_t.
// assumes c7x::ushort_vec has 32 lanes (first step yields ushort16) — TODO
// confirm.
// NOTE(review): braces and the return of retVal are not visible in this view.
61 template <>
inline uint64_t c7x_horizontal_max(c7x::ushort_vec vin)
63 ushort16 m1 = __max(vin.even(), vin.odd());
64 ushort8 m2 = __max(m1.even(), m1.odd());
65 ushort4 m3 = __max(m2.even(), m2.odd());
66 ushort2 m4 = __max(m3.even(), m3.odd());
67 uint64_t retVal = (uint64_t) __max(m4.even(), m4.odd());
// c7x::uint_vec specialization: instead of a reduction tree, the vector is
// cast to uint16, passed through __sort_asc, and the .sf() element of the
// sorted result is taken and cast to uint64_t.
// presumably .sf() is the last of 16 lanes, i.e. the largest value after an
// ascending sort — confirm against the C7x vector accessor documentation.
// NOTE(review): braces and the return of retVal are not visible in this view.
70 template <>
inline uint64_t c7x_horizontal_max(c7x::uint_vec vin)
72 uint64_t retVal = (uint64_t) (__sort_asc((uint16) vin).sf());
// c7x::ulong_vec specialization: even()/odd() __max tree going
// ulong4 -> ulong2 -> single value, which is cast to uint64_t.
// assumes c7x::ulong_vec has 8 lanes (first step yields ulong4) — TODO
// confirm.
// NOTE(review): braces and the return of retVal are not visible in this view.
75 template <>
inline uint64_t c7x_horizontal_max(c7x::ulong_vec vin)
77 ulong4 m1 = __max(vin.even(), vin.odd());
78 ulong2 m2 = __max(m1.even(), m1.odd());
79 uint64_t retVal = (uint64_t) __max(m2.even(), m2.odd());
// Primary template for "maximum with index".
//   maxValVec : vector of candidate values
//   vIdx      : vector of the same type V holding the index that belongs to
//               each lane of maxValVec
//   maxVal    : out, receives the selected value (type W)
//   maxIdx    : out, receives the index associated with that value
// Only declared here; the per-type definitions follow.
85 template <
typename V,
typename W>
86 inline void c7x_horizontal_max_with_index(V maxValVec, V vIdx, W *maxVal,
int *maxIdx);
// c7x::float_vec version: tournament reduction that carries the index along
// with each candidate value.
// At every level vpMask = __cmp_lt_pred(even, odd); the same mask is then
// used to pick both the value and its index, so the two stay paired.
// presumably __select(mask, a, b) yields a where the mask is set and b
// elsewhere, so the odd element wins only when it is strictly greater and
// ties keep the even element — confirm against the intrinsic documentation.
// Indices travel as floats in vIdx and are converted to int at the end.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
89 inline void c7x_horizontal_max_with_index(c7x::float_vec maxValVec, c7x::float_vec vIdx,
float *maxVal,
int *maxIdx)
// Level 1: full vector -> float8.
92 vpMask = __cmp_lt_pred(maxValVec.even(), maxValVec.odd());
93 float8 maxValVec1 = (float8) __select(vpMask, maxValVec.odd(), maxValVec.even());
94 float8 vIdx1 = (float8) __select(vpMask, vIdx.odd(), vIdx.even());
// Level 2: float8 -> float4.
96 vpMask = __cmp_lt_pred(maxValVec1.even(), maxValVec1.odd());
97 float4 maxValVec2 = (float4) __select(vpMask, maxValVec1.odd(), maxValVec1.even());
98 float4 vIdx2 = (float4) __select(vpMask, vIdx1.odd(), vIdx1.even());
// Level 3: float4 -> float2.
100 vpMask = __cmp_lt_pred(maxValVec2.even(), maxValVec2.odd());
101 float2 maxValVec3 = (float2) __select(vpMask, maxValVec2.odd(), maxValVec2.even());
102 float2 vIdx3 = (float2) __select(vpMask, vIdx2.odd(), vIdx2.even());
// Final level: the two remaining candidates produce the scalar outputs.
104 vpMask = __cmp_lt_pred(maxValVec3.even(), maxValVec3.odd());
105 *maxVal = (float) __select(vpMask, maxValVec3.odd(), maxValVec3.even());
106 *maxIdx = ((int) __select(vpMask, vIdx3.odd(), vIdx3.even()));
// c7x::double_vec version: same tournament reduction as the float version,
// with one level fewer (full vector -> double4 -> double2 -> scalar).
// vpMask = __cmp_lt_pred(even, odd) is reused for the value and for its
// index so both are selected from the same lane.
// presumably ties keep the even element (odd is chosen only where
// even < odd) — confirm __select operand order.
// Indices travel as doubles in vIdx and are converted to int at the end.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
110 inline void c7x_horizontal_max_with_index(c7x::double_vec maxValVec, c7x::double_vec vIdx,
double *maxVal,
int *maxIdx)
113 vpMask = __cmp_lt_pred(maxValVec.even(), maxValVec.odd());
114 double4 maxValVec1 = (double4) __select(vpMask, maxValVec.odd(), maxValVec.even());
115 double4 vIdx1 = (double4) __select(vpMask, vIdx.odd(), vIdx.even());
117 vpMask = __cmp_lt_pred(maxValVec1.even(), maxValVec1.odd());
118 double2 maxValVec2 = (double2) __select(vpMask, maxValVec1.odd(), maxValVec1.even());
119 double2 vIdx2 = (double2) __select(vpMask, vIdx1.odd(), vIdx1.even());
// Final level: scalar value and its index.
121 vpMask = __cmp_lt_pred(maxValVec2.even(), maxValVec2.odd());
122 *maxVal = (double) __select(vpMask, maxValVec2.odd(), maxValVec2.even());
123 *maxIdx = ((int) __select(vpMask, vIdx2.odd(), vIdx2.even()));
// c7x::char_vec (int8_t) version. Works in two phases:
//   1) reduce maxValVec to a scalar maximum with an even()/odd() __max tree;
//   2) locate it: lanes whose value equals the maximum keep their vIdx
//      entry, the rest are replaced by -1; the result is reinterpreted as
//      unsigned and reduced with __min.
// presumably -1 reinterpreted as uchar is the largest value, so it can never
// beat a real index, and the smallest index among equal maxima is returned —
// confirm __select / __as_uchar64 semantics.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
127 inline void c7x_horizontal_max_with_index(c7x::char_vec maxValVec, c7x::char_vec vIdx, int8_t *maxVal,
int *maxIdx)
130 typedef typename c7x::make_full_vector<int8_t>::type vec;
// Phase 1: value reduction, char64 -> char32 -> ... -> scalar.
132 char32 maxValVec1 = maxValVec.even();
133 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
135 char16 maxValVec2 = maxValVec1.even();
136 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
138 char8 maxValVec3 = maxValVec2.even();
139 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
141 char4 maxValVec4 = maxValVec3.even();
142 maxValVec4 = __max(maxValVec3.odd(), maxValVec4);
144 char2 maxValVec5 = maxValVec4.even();
145 maxValVec5 = __max(maxValVec4.odd(), maxValVec5);
147 int8_t maxValVec6 = maxValVec5.even();
148 maxValVec6 = __max(maxValVec5.odd(), maxValVec6);
149 *maxVal = (int8_t) maxValVec6;
150 int8_t maxValScaler = (int8_t) maxValVec6;
// Phase 2: a lane matches the maximum when (max - lane) == 0.
152 char64 zero_vec = vec(0);
153 char64 diff_vector = vec(maxValScaler) - maxValVec;
154 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
// Non-matching lanes get -1 as a "no candidate" marker.
155 char64 masked_indices = __select(vpMask, vIdx, vec(-1));
156 uchar64 umasked_indices = __as_uchar64(masked_indices);
// Unsigned __min tree over the masked indices.
158 uchar32 vIdx1 = (uchar32) __min(umasked_indices.even(), umasked_indices.odd());
159 uchar16 vIdx2 = (uchar16) __min(vIdx1.even(), vIdx1.odd());
160 uchar8 vIdx3 = (uchar8) __min(vIdx2.even(), vIdx2.odd());
161 uchar4 vIdx4 = (uchar4) __min(vIdx3.even(), vIdx3.odd());
162 uchar2 vIdx5 = (uchar2) __min(vIdx4.even(), vIdx4.odd());
163 *maxIdx = (uint8_t) __min(vIdx5.even(), vIdx5.odd());
// c7x::short_vec (int16_t) version.
// The maximum is taken as the larger of element 0 of the lo() and hi()
// halves of __vdsortdd16h_vv(maxValVec).
// presumably that intrinsic sorts each 16-element half in descending order,
// which is why both halves' first elements are compared — confirm.
// The index is found by keeping vIdx only in lanes equal to the maximum,
// replacing all other lanes with 255, sorting with __vdsortii16h_vv and
// taking the smaller of elements 0 and 16.
// NOTE(review): 255 acts as the "no candidate" marker, so this looks like it
// expects every valid index in vIdx to be below 255 — confirm with callers.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
167 inline void c7x_horizontal_max_with_index(c7x::short_vec maxValVec, c7x::short_vec vIdx, int16_t *maxVal,
int *maxIdx)
170 typedef typename c7x::make_full_vector<int16_t>::type vec;
172 vec sortIn = __vdsortdd16h_vv(maxValVec);
173 *maxVal = (short) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
// Same expression as above, kept as a scalar for the equality test below.
175 short maxValScaler = (short) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
// A lane matches the maximum when (max - lane) == 0.
177 short32 zero_vec = vec(0);
178 short32 diff_vector = vec(maxValScaler) - maxValVec;
179 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
180 short32 masked_indices = __select(vpMask, vIdx, vec(255));
181 short32 sorted_indices = __vdsortii16h_vv(masked_indices);
// Elements 0 and 16 are the first entries of the two sorted halves.
183 *maxIdx = (sorted_indices.s[0] < sorted_indices.s[16]) ? sorted_indices.s[0] : sorted_indices.s[16];
// c7x::int_vec (int32_t) version.
// The maximum is element 0 of __sort_desc(maxValVec).
// The index is found by keeping vIdx only in lanes equal to the maximum,
// replacing the others with 255, sorting ascending and taking element 0.
// presumably this returns the smallest index among equal maxima — confirm
// __select operand order.
// NOTE(review): 255 is the "no candidate" marker, so valid indices in vIdx
// are apparently expected to be below 255 — confirm with callers.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
187 inline void c7x_horizontal_max_with_index(c7x::int_vec maxValVec, c7x::int_vec vIdx, int32_t *maxVal,
int *maxIdx)
190 typedef typename c7x::make_full_vector<int32_t>::type vec;
192 vec sortIn = __sort_desc(maxValVec);
193 *maxVal = (int) sortIn.s[0];
195 int maxValScaler = (
int) sortIn.s[0];
// A lane matches the maximum when (max - lane) == 0.
197 int16 zero_vec = vec(0);
198 int16 diff_vector = vec(maxValScaler) - maxValVec;
199 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
200 int16 masked_indices = __select(vpMask, vIdx, vec(255));
201 int16 sorted_indices = __sort_asc(masked_indices);
203 *maxIdx = sorted_indices.s[0];
// c7x::long_vec (int64_t) version. Two phases:
//   1) even()/odd() __max tree: long8 -> long4 -> long2 -> scalar maximum;
//   2) lanes equal to the maximum keep their vIdx entry, the others are
//      replaced by 255, and a __min tree picks the smallest entry.
// NOTE(review): 255 is the "no candidate" marker, so valid indices in vIdx
// are apparently expected to be below 255 — confirm with callers.
// NOTE(review): the final result is cast to long and then stored through an
// int pointer, i.e. it is narrowed on assignment.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
207 inline void c7x_horizontal_max_with_index(c7x::long_vec maxValVec, c7x::long_vec vIdx, int64_t *maxVal,
int *maxIdx)
210 typedef typename c7x::make_full_vector<int64_t>::type vec;
// Phase 1: value reduction.
212 long4 maxValVec1 = maxValVec.even();
213 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
215 long2 maxValVec2 = maxValVec1.even();
216 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
218 long maxValVec3 = maxValVec2.even();
219 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
220 *maxVal = (long) maxValVec3;
222 long maxValScaler = (long) maxValVec3;
// Phase 2: a lane matches the maximum when (max - lane) == 0.
224 long8 zero_vec = vec(0);
225 long8 diff_vector = vec(maxValScaler) - maxValVec;
226 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
227 long8 maxIdxVec = vec(255);
228 long8 masked_indices = __select(vpMask, vIdx, maxIdxVec);
// __min tree over the masked indices.
230 long4 vIdx1 = (long4) __min(masked_indices.even(), masked_indices.odd());
231 long2 vIdx2 = (long2) __min(vIdx1.even(), vIdx1.odd());
232 *maxIdx = (long) __min(vIdx2.even(), vIdx2.odd());
// c7x::uchar_vec (uint8_t) version. Two phases:
//   1) even()/odd() __max tree: uchar64 -> uchar32 -> ... -> scalar maximum;
//   2) lanes equal to the maximum keep their vIdx entry, the others are
//      replaced by 255, and a __min tree picks the smallest entry.
// presumably this yields the lowest index among equal maxima — confirm
// __select operand order.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
236 inline void c7x_horizontal_max_with_index(c7x::uchar_vec maxValVec, c7x::uchar_vec vIdx, uint8_t *maxVal,
int *maxIdx)
239 typedef typename c7x::make_full_vector<uint8_t>::type vec;
// Phase 1: value reduction.
241 uchar32 maxValVec1 = maxValVec.even();
242 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
244 uchar16 maxValVec2 = maxValVec1.even();
245 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
247 uchar8 maxValVec3 = maxValVec2.even();
248 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
250 uchar4 maxValVec4 = maxValVec3.even();
251 maxValVec4 = __max(maxValVec3.odd(), maxValVec4);
253 uchar2 maxValVec5 = maxValVec4.even();
254 maxValVec5 = __max(maxValVec4.odd(), maxValVec5);
256 uint8_t maxValVec6 = maxValVec5.even();
257 maxValVec6 = __max(maxValVec5.odd(), maxValVec6);
258 *maxVal = (uint8_t) maxValVec6;
259 uint8_t maxValScaler = (uint8_t) maxValVec6;
// Phase 2: a lane matches the maximum when (max - lane) == 0.
261 uchar64 zero_vec = vec(0);
262 uchar64 diff_vector = vec(maxValScaler) - maxValVec;
263 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
// 255 marks lanes that do not hold the maximum.
264 uchar64 maxIdxVec = vec(255);
265 uchar64 masked_indices = __select(vpMask, vIdx, maxIdxVec);
// __min tree over the masked indices.
267 uchar32 vIdx1 = (uchar32) __min(masked_indices.even(), masked_indices.odd());
268 uchar16 vIdx2 = (uchar16) __min(vIdx1.even(), vIdx1.odd());
269 uchar8 vIdx3 = (uchar8) __min(vIdx2.even(), vIdx2.odd());
270 uchar4 vIdx4 = (uchar4) __min(vIdx3.even(), vIdx3.odd());
271 uchar2 vIdx5 = (uchar2) __min(vIdx4.even(), vIdx4.odd());
272 *maxIdx = (int) __min(vIdx5.even(), vIdx5.odd());
// c7x::ushort_vec (uint16_t) version; unsigned counterpart of the short
// version.
// The maximum is the larger of element 0 of the lo() and hi() halves of
// __vdsortddu16h_vv(maxValVec).
// presumably that intrinsic sorts each 16-element half in descending order —
// confirm.
// The index is found by keeping vIdx only in lanes equal to the maximum,
// replacing the others with 255, sorting with __vdsortiiu16h_vv and taking
// the smaller of elements 0 and 16.
// NOTE(review): 255 is the "no candidate" marker, so valid indices in vIdx
// are apparently expected to be below 255 — confirm with callers.
// NOTE(review): the lines carrying `template <>` / the return type, the
// braces and the declaration of vpMask are not visible in this view.
277 c7x_horizontal_max_with_index(c7x::ushort_vec maxValVec, c7x::ushort_vec vIdx, uint16_t *maxVal,
int *maxIdx)
280 typedef typename c7x::make_full_vector<uint16_t>::type vec;
282 vec sortIn = __vdsortddu16h_vv(maxValVec);
283 *maxVal = (ushort) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
// Same expression as above, kept as a scalar for the equality test below.
285 ushort maxValScaler = (ushort) (sortIn.lo().s[0] > sortIn.hi().s[0] ? sortIn.lo().s[0] : sortIn.hi().s[0]);
// A lane matches the maximum when (max - lane) == 0.
287 ushort32 zero_vec = vec(0);
288 ushort32 diff_vector = vec(maxValScaler) - maxValVec;
289 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
290 ushort32 masked_indices = __select(vpMask, vIdx, vec(255));
291 ushort32 sorted_indices = __vdsortiiu16h_vv(masked_indices);
// Elements 0 and 16 are the first entries of the two sorted halves.
293 *maxIdx = (sorted_indices.s[0] < sorted_indices.s[16]) ? sorted_indices.s[0] : sorted_indices.s[16];
// c7x::uint_vec (uint32_t) version; unsigned counterpart of the int version.
// The maximum is element 0 of __sort_desc(maxValVec).
// The index is found by keeping vIdx only in lanes equal to the maximum,
// replacing the others with 255, sorting ascending and taking element 0.
// NOTE(review): 255 is the "no candidate" marker, so valid indices in vIdx
// are apparently expected to be below 255 — confirm with callers.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
297 inline void c7x_horizontal_max_with_index(c7x::uint_vec maxValVec, c7x::uint_vec vIdx, uint32_t *maxVal,
int *maxIdx)
300 typedef typename c7x::make_full_vector<uint32_t>::type vec;
302 vec sortIn = __sort_desc(maxValVec);
303 *maxVal = (uint) sortIn.s[0];
305 uint maxValScaler = (uint) sortIn.s[0];
// A lane matches the maximum when (max - lane) == 0.
307 uint16 zero_vec = vec(0);
308 uint16 diff_vector = vec(maxValScaler) - maxValVec;
309 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
310 uint16 masked_indices = __select(vpMask, vIdx, vec(255));
311 uint16 sorted_indices = __sort_asc(masked_indices);
313 *maxIdx = sorted_indices.s[0];
// c7x::ulong_vec (uint64_t) version; unsigned counterpart of the long
// version. Two phases:
//   1) even()/odd() __max tree: ulong8 -> ulong4 -> ulong2 -> scalar maximum;
//   2) lanes equal to the maximum keep their vIdx entry, the others are
//      replaced by 255, and a __min tree picks the smallest entry.
// NOTE(review): 255 is the "no candidate" marker, so valid indices in vIdx
// are apparently expected to be below 255 — confirm with callers.
// NOTE(review): the final result is cast to ulong and then stored through an
// int pointer, i.e. it is narrowed on assignment.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
317 inline void c7x_horizontal_max_with_index(c7x::ulong_vec maxValVec, c7x::ulong_vec vIdx, uint64_t *maxVal,
int *maxIdx)
320 typedef typename c7x::make_full_vector<uint64_t>::type vec;
// Phase 1: value reduction.
321 ulong4 maxValVec1 = maxValVec.even();
322 maxValVec1 = __max(maxValVec.odd(), maxValVec1);
324 ulong2 maxValVec2 = maxValVec1.even();
325 maxValVec2 = __max(maxValVec1.odd(), maxValVec2);
327 ulong maxValVec3 = maxValVec2.even();
328 maxValVec3 = __max(maxValVec2.odd(), maxValVec3);
329 *maxVal = (ulong) maxValVec3;
331 ulong maxValScaler = (ulong) maxValVec3;
// Phase 2: a lane matches the maximum when (max - lane) == 0.
333 ulong8 zero_vec = vec(0);
334 ulong8 diff_vector = vec(maxValScaler) - maxValVec;
335 vpMask = __cmp_eq_pred(diff_vector, zero_vec);
336 ulong8 maxIdxVec = vec(255);
337 ulong8 masked_indices = __select(vpMask, vIdx, maxIdxVec);
// __min tree over the masked indices.
339 ulong4 vIdx1 = (ulong4) __min(masked_indices.even(), masked_indices.odd());
340 ulong2 vIdx2 = (ulong2) __min(vIdx1.even(), vIdx1.odd());
341 *maxIdx = (ulong) __min(vIdx2.even(), vIdx2.odd());
// Primary template: takes a vector `vec` of type Y and returns
// c7x::make_full_vector<X>::type. Only declared here; the specializations
// below handle the long8 -> int16 and ulong8 -> uint16 cases.
346 template <
typename X,
typename Y>
inline typename c7x::make_full_vector<X>::type convert_long_to_int(Y vec);
// Specialization for <int16, long8>: passes the long8 input through
// __as_int16 and stores the result in a full int32_t vector.
// presumably __as_int16 reinterprets the bits rather than converting lane
// values — confirm against the intrinsic documentation.
// NOTE(review): the braces and whatever follows the assignment (e.g. the
// return statement) are not visible in this view.
348 template <>
inline c7x::make_full_vector<int32_t>::type convert_long_to_int<int16, long8>(long8 vec)
350 typedef typename c7x::make_full_vector<int32_t>::type vecRet;
351 vecRet v8bits = __as_int16(vec);
// Specialization for <uint16, ulong8>: passes the ulong8 input through
// __as_uint16 and stores the result in a full uint32_t vector.
// presumably __as_uint16 reinterprets the bits rather than converting lane
// values — confirm against the intrinsic documentation.
// NOTE(review): the braces and whatever follows the assignment (e.g. the
// return statement) are not visible in this view.
355 template <>
inline c7x::make_full_vector<uint32_t>::type convert_long_to_int<uint16, ulong8>(ulong8 vec)
357 typedef typename c7x::make_full_vector<uint32_t>::type vecRet;
358 vecRet v8bits = __as_uint16(vec);
// Primary template: widens a vector `vecIn` of type Y and returns
// c7x::make_full_vector<X>::type. `typeIndex` is a flag whose use is only
// visible in the specializations. Only declared here.
364 template <
typename X,
typename Y>
365 inline typename c7x::make_full_vector<X>::type convert_char_to_short(Y vecIn,
bool typeIndex);
// Specialization for <short32, char32>. Two conversion paths are visible:
//   - reinterpret the input as uchar32, then __convert_short32;
//   - __convert_short32 applied to the signed input directly.
// presumably typeIndex chooses between them (unsigned vs. signed
// interpretation of the bytes) — the branch itself, the declaration of
// vecOut, the braces and the return are not visible in this view; confirm
// against the full file.
368 inline c7x::make_full_vector<int16_t>::type convert_char_to_short<short32, char32>(char32 vecIn,
bool typeIndex)
370 typedef typename c7x::make_full_vector<int16_t>::type vecRet;
// Path A: go through an unsigned view of the input first.
373 uchar32 vecInConv = __as_uchar32(vecIn);
374 vecOut = __convert_short32(vecInConv);
// Path B: convert the signed input as is.
377 vecOut = __convert_short32(vecIn);
// Specialization for <ushort32, uchar32>: a single __convert_ushort32 of the
// input. typeIndex is not referenced in the visible lines.
// NOTE(review): the `template <>` line, the braces and the return statement
// are not visible in this view.
383 inline c7x::make_full_vector<uint16_t>::type convert_char_to_short<ushort32, uchar32>(uchar32 vecIn,
bool typeIndex)
385 typedef typename c7x::make_full_vector<uint16_t>::type vecRet;
386 vecRet vecOut = __convert_ushort32(vecIn);
// Primary template: multiplies two input vectors of type Y and delivers the
// result through parameters of type X (the specializations instantiate X as
// a reference type, so these act as outputs). Only declared here.
392 template <
typename X,
typename Y>
inline void mul_char_to_short(Y vecIn1, Y vecIn2, X vecOut1, X vecOut2);
// Specialization for <short32 &, char32>: both char32 inputs are converted
// to short32 and multiplied element by element; the product is written to
// vecOut1Short. vecOut2 is not written in the visible lines.
// NOTE(review): the lines carrying `template <>` / the return type and the
// braces are not visible in this view.
396 mul_char_to_short<short32 &, char32>(char32 vecIn1, char32 vecIn2, c7x::short_vec &vecOut1Short, c7x::short_vec &vecOut2)
398 short32 vecInShort1 = __convert_short32(vecIn1);
399 short32 vecInShort2 = __convert_short32(vecIn2);
400 vecOut1Short = vecInShort1 * vecInShort2;
// Specialization for <ushort32 &, uchar32>: the product comes from a single
// __mpy_ext call on the two uchar32 inputs and is written to vecOut1uShort.
// presumably __mpy_ext is a widening multiply (8-bit inputs, 16-bit
// products) — confirm against the intrinsic documentation.
// vecEx is not written in the visible line.
// NOTE(review): the lines carrying `template <>` / the return type and the
// braces are not visible in this view.
405 mul_char_to_short<ushort32 &, uchar32>(uchar32 vecIn1, uchar32 vecIn2, c7x::ushort_vec &vecOut1uShort, c7x::ushort_vec &vecEx)
407 vecOut1uShort = __mpy_ext(vecIn1, vecIn2);
// Primary template for the floating-point horizontal maximum: takes a vector
// `vin` of type V and returns a scalar of type dataType. Only declared here;
// float and double specializations follow.
409 template <
typename dataType,
typename V>
inline dataType c7x_horizontal_max_fp(V vin);
// c7x::float_vec specialization: tree reduction that applies __max to the
// hi() and lo() halves at each level (float8 -> float4 -> float2 -> scalar).
// assumes c7x::float_vec has 16 lanes (first step yields float8) — TODO
// confirm.
// NOTE(review): the braces and the statement returning maxVal are not
// visible in this view.
411 template <>
inline float c7x_horizontal_max_fp(c7x::float_vec vin)
414 float8 vin1 = __max(vin.hi(), vin.lo());
415 float4 vin2 = __max(vin1.hi(), vin1.lo());
416 float2 vin3 = __max(vin2.hi(), vin2.lo());
417 float maxVal = __max(vin3.hi(), vin3.lo());
// c7x::double_vec specialization: tree reduction that applies __max to the
// hi() and lo() halves at each level (double4 -> double2 -> scalar).
// assumes c7x::double_vec has 8 lanes (first step yields double4) — TODO
// confirm.
// NOTE(review): the braces and the statement returning maxVal are not
// visible in this view.
421 template <>
inline double c7x_horizontal_max_fp(c7x::double_vec vin)
423 double4 vin1 = __max(vin.hi(), vin.lo());
424 double2 vin2 = __max(vin1.hi(), vin1.lo());
425 double maxVal = __max(vin2.hi(), vin2.lo());
// Primary template for the floating-point horizontal minimum: takes a vector
// `vin` of type V and returns a scalar of type dataType. Only declared here;
// float and double specializations follow.
429 template <
typename dataType,
typename V>
inline dataType c7x_horizontal_min_fp(V vin);
// c7x::float_vec specialization: tree reduction that applies __min to the
// hi() and lo() halves at each level (float8 -> float4 -> float2 -> scalar).
// assumes c7x::float_vec has 16 lanes (first step yields float8) — TODO
// confirm.
// NOTE(review): the braces and the statement returning minVal are not
// visible in this view.
430 template <>
inline float c7x_horizontal_min_fp(c7x::float_vec vin)
433 float8 vin1 = __min(vin.hi(), vin.lo());
434 float4 vin2 = __min(vin1.hi(), vin1.lo());
435 float2 vin3 = __min(vin2.hi(), vin2.lo());
436 float minVal = __min(vin3.hi(), vin3.lo());
// c7x::double_vec specialization: tree reduction that applies __min to the
// hi() and lo() halves at each level (double4 -> double2 -> scalar).
// assumes c7x::double_vec has 8 lanes (first step yields double4) — TODO
// confirm.
// NOTE(review): the braces and the statement returning minVal are not
// visible in this view.
440 template <>
inline double c7x_horizontal_min_fp(c7x::double_vec vin)
442 double4 vin1 = __min(vin.hi(), vin.lo());
443 double2 vin2 = __min(vin1.hi(), vin1.lo());
444 double minVal = __min(vin2.hi(), vin2.lo());
// Primary template for the horizontal sum: adds up the lanes of `inVec` and
// writes the scalar result through `horizontalSum`. Only declared here;
// float and double specializations follow.
449 template <
typename V,
typename W>
inline void c7x_horizontal_add(V inVec, W *horizontalSum);
// c7x::float_vec specialization: pairwise tree sum. Each level adds the
// hi() and lo() halves (float8 -> float4 -> float2 -> scalar) and the final
// sum is stored in *horizontalSum.
// The additions are grouped as a tree, not left to right, so the rounding
// can differ from a sequential loop over the lanes.
// NOTE(review): the braces are not visible in this view.
451 template <>
inline void c7x_horizontal_add(c7x::float_vec inVec,
float *horizontalSum)
453 float8 inVec1 = inVec.hi() + inVec.lo();
454 float4 inVec2 = inVec1.hi() + inVec1.lo();
455 float2 inVec3 = inVec2.hi() + inVec2.lo();
456 *horizontalSum = inVec3.hi() + inVec3.lo();
// c7x::double_vec specialization: pairwise tree sum. Each level adds the
// hi() and lo() halves (double4 -> double2 -> scalar) and the final sum is
// stored in *horizontalSum.
// The additions are grouped as a tree, not left to right, so the rounding
// can differ from a sequential loop over the lanes.
// NOTE(review): the braces are not visible in this view.
459 template <>
inline void c7x_horizontal_add(c7x::double_vec inVec,
double *horizontalSum)
461 double4 inVec1 = inVec.hi() + inVec.lo();
462 double2 inVec2 = inVec1.hi() + inVec1.lo();
463 *horizontalSum = inVec2.hi() + inVec2.lo();
// Primary template for "minimum with index".
//   minValVec : vector of candidate values
//   vIdx      : vector of the same type V holding the index that belongs to
//               each lane of minValVec
//   minVal    : out, receives the selected value (type W)
//   minIdx    : out, receives the index associated with that value
// Only declared here; the per-type definitions follow.
466 template <
typename V,
typename W>
467 inline void c7x_horizontal_min_with_index(V minValVec, V vIdx, W *minVal,
int *minIdx);
// c7x::float_vec version: tournament reduction that carries the index along
// with each candidate value.
// At every level vpMask = __cmp_lt_pred(even, odd); the same mask picks both
// the value and its index so the two stay paired.
// presumably __select(mask, a, b) yields a where the mask is set and b
// elsewhere, so the even element is kept only when strictly smaller and ties
// fall to the odd element — confirm against the intrinsic documentation.
// Indices travel as floats in vIdx and are converted to int at the end.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view.
469 inline void c7x_horizontal_min_with_index(c7x::float_vec minValVec, c7x::float_vec vIdx,
float *minVal,
int *minIdx)
// Level 1: full vector -> float8.
472 vpMask = __cmp_lt_pred(minValVec.even(), minValVec.odd());
473 float8 minValVec1 = (float8) __select(vpMask, minValVec.even(), minValVec.odd());
474 float8 vIdx1 = (float8) __select(vpMask, vIdx.even(), vIdx.odd());
// Level 2: float8 -> float4.
476 vpMask = __cmp_lt_pred(minValVec1.even(), minValVec1.odd());
477 float4 minValVec2 = (float4) __select(vpMask, minValVec1.even(), minValVec1.odd());
478 float4 vIdx2 = (float4) __select(vpMask, vIdx1.even(), vIdx1.odd());
// Level 3: float4 -> float2.
480 vpMask = __cmp_lt_pred(minValVec2.even(), minValVec2.odd());
481 float2 minValVec3 = (float2) __select(vpMask, minValVec2.even(), minValVec2.odd());
482 float2 vIdx3 = (float2) __select(vpMask, vIdx2.even(), vIdx2.odd());
// Final level: the two remaining candidates produce the scalar outputs.
484 vpMask = __cmp_lt_pred(minValVec3.even(), minValVec3.odd());
485 *minVal = (float) __select(vpMask, minValVec3.even(), minValVec3.odd());
486 *minIdx = ((int) __select(vpMask, vIdx3.even(), vIdx3.odd()));
// c7x::double_vec version: same tournament reduction as the float version,
// with one level fewer (full vector -> double4 -> double2 -> scalar).
// vpMask = __cmp_lt_pred(even, odd) is reused for the value and for its
// index so both are selected from the same lane.
// presumably the even element is kept only where even < odd, so ties fall to
// the odd element — confirm __select operand order.
// Indices travel as doubles in vIdx and are converted to int at the end.
// NOTE(review): the `template <>` line, the braces and the declaration of
// vpMask are not visible in this view, and the definition may continue past
// the last visible line.
490 inline void c7x_horizontal_min_with_index(c7x::double_vec minValVec, c7x::double_vec vIdx,
double *minVal,
int *minIdx)
493 vpMask = __cmp_lt_pred(minValVec.even(), minValVec.odd());
494 double4 minValVec1 = (double4) __select(vpMask, minValVec.even(), minValVec.odd());
495 double4 vIdx1 = (double4) __select(vpMask, vIdx.even(), vIdx.odd());
497 vpMask = __cmp_lt_pred(minValVec1.even(), minValVec1.odd());
498 double2 minValVec2 = (double2) __select(vpMask, minValVec1.even(), minValVec1.odd());
499 double2 vIdx2 = (double2) __select(vpMask, vIdx1.even(), vIdx1.odd());
// Final level: scalar value and its index.
501 vpMask = __cmp_lt_pred(minValVec2.even(), minValVec2.odd());
502 *minVal = (double) __select(vpMask, minValVec2.even(), minValVec2.odd());
503 *minIdx = ((int) __select(vpMask, vIdx2.even(), vIdx2.odd()));