// ELEMENT_COUNT(x): shorthand for c7x::element_count_of<x>::value, i.e. the
// element count reported by the c7x trait for type x.
34 #define ELEMENT_COUNT(x) c7x::element_count_of<x>::value
// MATHLIB_tanh_exp: vector kernel. For each of numBlocks iterations it reads one
// vector from streaming engine 0, multiplies |x| by negTwo, evaluates an
// exponential-style approximation (range reduction + cubic polynomial + K/J
// product), adds `one`, and stores the result via streaming address generator 0
// at pDst.
// NOTE(review): this listing is missing lines from the original file (the embedded
// line numbers skip, e.g. 57 -> 60 -> 69). The template header / return type, the
// braces, and the assignments of half, one, negTwo, absNf, minusN, r2, r3, J, K,
// upperBitsK/lowerBitsK and upperBitsJ/lowerBitsJ are not visible here, so the
// comments below describe only the visible statements.
// se0Params, sa0Params and pSrc are not referenced in the visible lines.
57 MATHLIB_tanh_exp(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst,
size_t numBlocks)
// Full-width vector type for element type T.
60 typedef typename c7x::make_full_vector<T>::type vec;
69 vec log2_base_x16, half, negativeHalf, C0, C1, C2, one, negTwo;
// 23.0831206542234 is 16 / ln(2): multiplying by it expresses a value in steps of ln(2)/16.
73 log2_base_x16 = (vec) 23.0831206542234f;
75 negativeHalf = (vec) -0.5f;
// The use of mask is not visible in this listing.
76 mask = (c7x::uint_vec) 0x3u;
// 0.0433216987816623 is ln(2) / 16, held in double precision for the residual computation below.
77 p = (c7x::double_vec) 0.0433216987816623;
// C0, C1, C2 are close to 1/6, 1/2 and 1; they weight r3, r2 and r in `pol` below.
82 C0 = (vec) 0.166668549286041f;
83 C1 = (vec) 0.500016170012920f;
84 C2 = (vec) 0.999999998618401f;
86 for (
size_t i = 0; i < numBlocks; i++) {
// Fetch the next input vector from streaming engine 0 and advance the stream.
87 vec inVec = c7x::strm_eng<0, vec>::get_adv();
// Per-iteration scratch: float vectors, 32-bit integer vectors, and double-precision half-vectors.
92 vec pol, r, r2, r3, outVec, Nf, absNf, rVals_odd, rVals_even;
93 c7x::uint_vec J, K, uN, dTAdjusted_32_63, dT_32_63, dT_0_31, upperBitsK, lowerBitsK, upperBitsJ, lowerBitsJ;
94 c7x::int_vec N, minusN;
95 c7x::double_vec KVals_8_15, KVals_0_7, JVals_8_15, JVals_0_7, dTVals_8_15, dTVals_0_7, pol_0_7, pol_8_15,
96 outVec_0_7, outVec_8_15, inVecVals_odd, inVecVals_even, NVals_odd, NVals_even;
// Replace the input by |x| * negTwo (negTwo's value is not shown; presumably -2 -- TODO confirm).
98 inVec = __abs(inVec) * negTwo;
// Nf = scaled input measured in units of ln(2)/16.
101 Nf = inVec * log2_base_x16;
// Integer step count taken from absNf (the assignment of absNf is not visible here).
103 N = c7x::convert<c7x::int_vec>(absNf);
// Lanes with Nf < -0.5 take minusN instead of N (the assignment of minusN is not visible here).
109 __vpred cmp_N = __cmp_lt_pred(Nf, negativeHalf);
110 N = __select(cmp_N, minusN, N);
// Residual: input - p * N, evaluated in double precision on the two halves produced by the
// __high_* / __low_* conversions, then narrowed back to float.
117 inVecVals_odd = __high_float_to_double(inVec);
118 inVecVals_even = __low_float_to_double(inVec);
119 NVals_odd = __high_int_to_double(N);
120 NVals_even = __low_int_to_double(N);
121 rVals_odd = __double_to_float((inVecVals_odd - (p * NVals_odd)));
122 rVals_even = __double_to_float((inVecVals_even - (p * NVals_even)));
// Merge the two partial float results into the single vector r with a byte permute.
125 r = c7x::reinterpret<vec>(__permute_even_even_int(MATHLIB_vperm_data_interweave_0_63,
126 c7x::as_uchar_vec(rVals_odd), c7x::as_uchar_vec(rVals_even)));
// pol = C2*r + C1*r2 + C0*r3, with no constant term (r2 and r3 are assigned on lines not
// shown; presumably r^2 and r^3 -- TODO confirm).
131 pol = (r * C2) + ((r3 * C0) + (r2 * C1));
// Unsigned copy of N; used below to adjust the upper words of the K*J product.
138 uN = c7x::convert<c7x::uint_vec>(N);
// Pair the upperBits*/lowerBits* 32-bit words into double-precision vectors for K and J,
// one vector per half (suffixes 0_7 and 8_15). The code that fills those words is not visible.
148 KVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
149 MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsK), c7x::as_uchar_vec(lowerBitsK)));
150 KVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
151 MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsK), c7x::as_uchar_vec(lowerBitsK)));
152 JVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
153 MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsJ), c7x::as_uchar_vec(lowerBitsJ)));
154 JVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
155 MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsJ), c7x::as_uchar_vec(lowerBitsJ)));
// Double-precision product of the K and J values.
158 dTVals_8_15 = KVals_8_15 * JVals_8_15;
159 dTVals_0_7 = KVals_0_7 * JVals_0_7;
// Split the products back into separate 32-bit word vectors (named dT_32_63 and dT_0_31).
166 dT_32_63 = c7x::reinterpret<c7x::uint_vec>(__permute_odd_odd_int(
167 MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(dTVals_8_15), c7x::as_uchar_vec(dTVals_0_7)));
168 dT_0_31 = c7x::reinterpret<c7x::uint_vec>(__permute_even_even_int(
169 MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(dTVals_8_15), c7x::as_uchar_vec(dTVals_0_7)));
// Discard the low 4 bits of uN, move the remainder to bit 20 and above, and add it to the
// dT_32_63 words. NOTE(review): bit 20 of the upper word is where an IEEE-754 double's
// exponent field starts, so this looks like a power-of-two scaling by N/16 -- confirm.
171 uN = (uN >> 4) << 20;
172 dTAdjusted_32_63 = dT_32_63 + uN;
// Reassemble double-precision values from the adjusted upper words and the untouched lower words.
175 dTVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
176 MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(dTAdjusted_32_63), c7x::as_uchar_vec(dT_0_31)));
177 dTVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
178 MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(dTAdjusted_32_63), c7x::as_uchar_vec(dT_0_31)));
// Widen pol to double and arrange it into the same 0_7 / 8_15 halves as dTVals.
180 pol_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(MATHLIB_vperm_data_dp_interweave_0_63,
181 c7x::as_uchar_vec(__high_float_to_double(pol)),
182 c7x::as_uchar_vec(__low_float_to_double(pol))));
183 pol_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(MATHLIB_vperm_data_dp_interweave_0_63,
184 c7x::as_uchar_vec(__high_float_to_double(pol)),
185 c7x::as_uchar_vec(__low_float_to_double(pol))));
// Combine: dT * (1 + pol), evaluated in double precision per half.
187 outVec_0_7 = dTVals_0_7 * (1.0f + pol_0_7);
188 outVec_8_15 = dTVals_8_15 * (1.0f + pol_8_15);
// Narrow both halves to float and merge them into one float vector.
190 outVec = c7x::reinterpret<vec>(__permute_even_even_int(MATHLIB_vperm_data_0_63,
191 c7x::as_uchar_vec(__double_to_float(outVec_8_15)),
192 c7x::as_uchar_vec(__double_to_float(outVec_0_7))));
// Add `one` to the result (the assignment of `one` is not visible in this listing).
194 outVec = outVec + one;
// Take the store predicate and the next destination address from address generator 0,
// then store outVec under that predicate.
196 __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
197 vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
198 __vstore_pred(tmp, addr, outVec);
// MATHLIB_tanh_pol: vector kernel. For each of numBlocks iterations it reads one
// vector from streaming engine 1 (expOut) and one from streaming engine 0 (inVec),
// builds two candidate results (a reciprocal-based value and an odd polynomial in
// |x|), picks one per lane according to |x|, applies the sign and stores the
// result via streaming address generator 0 at pDst.
// NOTE(review): this listing is missing lines from the original file (the return
// type, braces, and the assignments of two, zero, limit, sign, x4..x12 and the
// first assignment of pol are not visible), so the comments below describe only
// the visible statements.
// se0Params, sa0Params and pSrc are not referenced in the visible lines.
204 template <
typename T>
206 MATHLIB_tanh_pol(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst,
size_t numBlocks)
// Full-width vector type for element type T.
209 typedef typename c7x::make_full_vector<T>::type vec;
217 vec pol_bound, C16, C14, C12, C10, C8, C6, C4, C2, two, zero, fltMax, limit;
// |x| <= pol_bound selects the polynomial result; pol_bound (1.0) is also the value
// used for lanes that fall in neither range (see the selects below).
218 pol_bound = (vec) 1.0f;
// Polynomial coefficients. C2, C4 and C6 equal -1/3, 2/15 and -17/315, the leading
// Taylor coefficients of tanh(x)/x - 1 in powers of x^2; the higher-order ones differ
// slightly from the Taylor values.
220 C16 = (vec) 0.000244528812992865f;
221 C14 = (vec) -0.00119005741172407f;
222 C12 = (vec) 0.00349212803657248f;
223 C10 = (vec) -0.00886323552990220f;
224 C8 = (vec) 0.0218794885361552f;
225 C6 = (vec) -0.0539682539682540f;
226 C4 = (vec) 0.133333333333333f;
227 C2 = (vec) -0.333333333333333f;
// Largest finite single-precision value.
230 fltMax = (vec) 3.40282347e+38f;
232 for (
size_t i = 0; i < numBlocks; i++) {
// Fetch the next vector from streaming engine 1 and advance the stream.
// NOTE(review): presumably the output of MATHLIB_tanh_exp -- confirm against the caller.
233 vec expOut = c7x::strm_eng<1, vec>::get_adv();
238 vec inVec_abs, x2, x4, x6, x8, x10, x12, pol1, pol2, pol, x1, computeDiv, expRecip, outVec, sign;
// Reciprocal estimate of expOut followed by two refinement steps of the form
// x1 * (two - expOut * x1); the second refined value is multiplied by `two`.
// (`two` is assigned on a line not shown; presumably 2.0, which would make these
// Newton-Raphson steps and computeDiv = 2 / expOut -- TODO confirm.)
245 x1 = __recip(expOut);
246 x1 = x1 * (two - (expOut * x1));
247 computeDiv = (x1 * (two - (expOut * x1))) * two;
// Lanes where |expOut| exceeds the largest finite float get `zero` instead.
256 __vpred cmp_gt_flt = __cmp_lt_pred(fltMax, __abs(expOut));
257 computeDiv = __select(cmp_gt_flt, zero, computeDiv);
// Reciprocal-based candidate result: computeDiv - 1.
259 expRecip = computeDiv - 1.0f;
// Fetch the next input vector from streaming engine 0 and advance the stream.
264 vec inVec = c7x::strm_eng<0, vec>::get_adv();
265 inVec_abs = __abs(inVec);
// x2 = |x|^2 (x4..x12 are assigned on lines not shown; presumably higher even powers).
267 x2 = inVec_abs * inVec_abs;
// pol1 gathers the C2..C8 terms; pol2 gathers the C10..C16 terms, with the C12..C16
// part written as a nested expression multiplied by x12.
274 pol1 = ((C8 * x8) + (C6 * x6)) + ((C4 * x4) + (C2 * x2));
275 pol2 = (((C16 * x4) + (C14 * x2) + C12) * x12) + (C10 * x10);
// Polynomial candidate result: |x| * (1 + pol). The preceding assignment of pol is not
// visible (presumably pol1 + pol2 -- TODO confirm).
278 pol = (pol * inVec_abs) + inVec_abs;
// Lanes with |x| <= pol_bound use the polynomial result.
298 __vpred cmp_lt_pol = __cmp_le_pred(inVec_abs, pol_bound);
// Lanes with |x| < limit (limit's value is not shown) are in range for the reciprocal-based result.
303 __vpred cmp_le_exp = __cmp_lt_pred(inVec_abs, limit);
// Lanes that satisfy neither of the two conditions above.
305 __vpred cmp_else_exp = __negate(__or(cmp_le_exp, cmp_lt_pol));
// Those lanes get pol_bound (1.0); the others get expRecip, then polynomial lanes are overridden with pol.
307 outVec = __select(cmp_else_exp, pol_bound, expRecip);
308 outVec = __select(cmp_lt_pol, pol, outVec);
// Negate `sign` in lanes where the original input is negative, then apply it to the result.
// (The initial value of `sign` is assigned on a line not shown.)
310 __vpred cmp_sign = __cmp_lt_pred(inVec, zero);
311 sign = __select(cmp_sign, -sign, sign);
313 outVec = outVec * sign;
// Take the store predicate and the next destination address from address generator 0,
// then store outVec under that predicate.
315 __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
316 vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
317 __vstore_pred(tmp, addr, outVec);
// Fragment of a function body whose header is not visible in this listing
// (NOTE(review): presumably MATHLIB_tanh_vector -- confirm against the original file).
// Both counters start at zero and are computed from `length` below.
325 size_t numBlocks = 0;
326 size_t remNumBlocks = 0;
// Full-width vector type for element type T.
329 typedef typename c7x::make_full_vector<T>::type vec;
// Start from the generated default streaming engine / streaming address generator templates.
331 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
332 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
// numBlocks: number of whole vectors contained in `length` elements.
337 numBlocks = length / c7x::element_count_of<vec>::value;
// remNumBlocks: despite the name, this is the leftover element count
// (length modulo the vector's element count), not a block count.
338 remNumBlocks = length % c7x::element_count_of<vec>::value;
static void MATHLIB_tanh_pol(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst, size_t numBlocks)
static void MATHLIB_tanh_vector(size_t length, T *pSrc, T *pDst)
template MATHLIB_STATUS MATHLIB_tanh< float >(size_t length, float *pSrc, float *pDst)
static void MATHLIB_tanh_exp(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst, size_t numBlocks)
#define MATHLIB_KTABLE_OFFSET
static c7x::uint_vec MATHLIB_LUTReadLowerBits(vecType vecOffset)
This method reads bits 31-0 of LUT value at vecOffset.
#define MATHLIB_JTABLE_OFFSET
static c7x::uint_vec MATHLIB_LUTReadUpperBits(vecType vecOffset)
This method reads bits 63-32 of LUT value at vecOffset.
static void MATHLIB_SE0SE1SA0Open(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc0, T *pSrc1)
This method performs SE0, SE1, and SA0 open.
static void MATHLIB_SE0SA0Close()
This method performs SE0 and SA0 close.
static void MATHLIB_SE0SA01DSequentialInit(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, size_t length, T *pSrc, T *pDst)
static MATHLIB_STATUS MATHLIB_checkParams(size_t length, T *pSrc, T *pDst)
This method performs parameter checks for MATHLIB function.
static void MATHLIB_SE0SA0Open(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc)
This method performs SE0 and SA0 open.
MATHLIB_STATUS MATHLIB_tanh_sp(size_t length, float *pSrc, float *pDst)
This function is the C interface for MATHLIB_tanh; it accepts float pointers.
MATHLIB_STATUS MATHLIB_tanh(size_t length, T *pSrc, T *pDst)
Performs the elementwise hyperbolic tangent of an input vector. Function can be overloaded with float...
MATHLIB_STATUS_NAME
The enumeration of all status codes.