MATHLIB User Guide
MATHLIB_tanh.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (C) 2024 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 #define ELEMENT_COUNT(x) c7x::element_count_of<x>::value
35 
36 /******************************************************************************/
37 /* */
38 /* Includes */
39 /* */
40 /******************************************************************************/
41 
42 #include "MATHLIB_ilut.h"
43 #include "MATHLIB_lut.h"
44 #include "MATHLIB_permute.h"
45 #include "MATHLIB_tanh_scalar.h"
46 #include "MATHLIB_types.h"
47 #include "MATHLIB_utility.h"
48 
49 /******************************************************************************/
50 /* */
51 /* MATHLIB_tanh */
52 /* */
53 /******************************************************************************/
54 
55 template <typename T>
56 static inline void
57 MATHLIB_tanh_exp(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst, size_t numBlocks)
58 {
59  // derive c7x vector type from template typename
60  typedef typename c7x::make_full_vector<T>::type vec;
61 
62  // open SE0, SE1, and SA0 for reading and writing operands
63  MATHLIB_SE0SA0Open(se0Params, sa0Params, pSrc);
64 
65  /**********************************************************************/
66  /* Create and assign values for constants employed on exp computation */
67  /**********************************************************************/
68 
69  vec log2_base_x16, half, negativeHalf, C0, C1, C2, one, negTwo;
70  c7x::uint_vec mask;
71  c7x::double_vec p;
72 
73  log2_base_x16 = (vec) 23.0831206542234f; // 1.442695041 * 16.0
74  half = (vec) 0.5f;
75  negativeHalf = (vec) -0.5f;
76  mask = (c7x::uint_vec) 0x3u;
77  p = (c7x::double_vec) 0.0433216987816623; // 1/log2_base_x16
78  one = (vec) 1.0f;
79  negTwo = (vec) -2.0f;
80 
81  // coefficients to approximate the decimal part of the result
82  C0 = (vec) 0.166668549286041f;
83  C1 = (vec) 0.500016170012920f;
84  C2 = (vec) 0.999999998618401f;
85 
86  for (size_t i = 0; i < numBlocks; i++) {
87  vec inVec = c7x::strm_eng<0, vec>::get_adv();
88 
89  /*****************************************************************/
90  /* Create variables employed on exp computation */
91  /*****************************************************************/
92  vec pol, r, r2, r3, outVec, Nf, absNf, rVals_odd, rVals_even;
93  c7x::uint_vec J, K, uN, dTAdjusted_32_63, dT_32_63, dT_0_31, upperBitsK, lowerBitsK, upperBitsJ, lowerBitsJ;
94  c7x::int_vec N, minusN;
95  c7x::double_vec KVals_8_15, KVals_0_7, JVals_8_15, JVals_0_7, dTVals_8_15, dTVals_0_7, pol_0_7, pol_8_15,
96  outVec_0_7, outVec_8_15, inVecVals_odd, inVecVals_even, NVals_odd, NVals_even;
97 
98  inVec = __abs(inVec) * negTwo;
99 
100  // Get N such that |N - inVec*16/ln(2)| is minimized
101  Nf = inVec * log2_base_x16;
102  absNf = Nf + half;
103  N = c7x::convert<c7x::int_vec>(absNf);
104  minusN = N - 1;
105 
106  // if ((x * log2_base_x16) < - half) {
107  // N--;
108  // }
109  __vpred cmp_N = __cmp_lt_pred(Nf, negativeHalf);
110  N = __select(cmp_N, minusN, N);
111 
112  /**********************************************************************/
113  /* Calculate Taylor series approximation for exp */
114  /**********************************************************************/
115 
116  // Split vectors to compute r with double precision
117  inVecVals_odd = __high_float_to_double(inVec);
118  inVecVals_even = __low_float_to_double(inVec);
119  NVals_odd = __high_int_to_double(N);
120  NVals_even = __low_int_to_double(N);
121  rVals_odd = __double_to_float((inVecVals_odd - (p * NVals_odd)));
122  rVals_even = __double_to_float((inVecVals_even - (p * NVals_even)));
123 
124  // Rejoin vectors to create r with floating point precision
125  r = c7x::reinterpret<vec>(__permute_even_even_int(MATHLIB_vperm_data_interweave_0_63,
126  c7x::as_uchar_vec(rVals_odd), c7x::as_uchar_vec(rVals_even)));
127 
128  // Taylor series approximation
129  r2 = r * r;
130  r3 = r2 * r;
131  pol = (r * C2) + ((r3 * C0) + (r2 * C1));
132 
133  /**********************************************************************/
134  /* Get index of LUT and 2^M values */
135  /**********************************************************************/
136 
137  // Create vectors of LUT indices
138  uN = c7x::convert<c7x::uint_vec>(N);
139  K = ((uN << 28u) >> 30) + MATHLIB_KTABLE_OFFSET;
140  J = (uN & mask) + MATHLIB_JTABLE_OFFSET;
141 
142  // Read values from LUT and convert and store as doubles in split vectors
143  upperBitsK = MATHLIB_LUTReadUpperBits(K);
144  lowerBitsK = MATHLIB_LUTReadLowerBits(K);
145  upperBitsJ = MATHLIB_LUTReadUpperBits(J);
146  lowerBitsJ = MATHLIB_LUTReadLowerBits(J);
147 
148  KVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
149  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsK), c7x::as_uchar_vec(lowerBitsK)));
150  KVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
151  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsK), c7x::as_uchar_vec(lowerBitsK)));
152  JVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
153  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsJ), c7x::as_uchar_vec(lowerBitsJ)));
154  JVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
155  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsJ), c7x::as_uchar_vec(lowerBitsJ)));
156 
157  // Multiply LUT values
158  dTVals_8_15 = KVals_8_15 * JVals_8_15;
159  dTVals_0_7 = KVals_0_7 * JVals_0_7;
160 
161  /**********************************************************************/
162  /* Scale exponent to adjust for 2^M */
163  /**********************************************************************/
164 
165  // Upper 32 bits of all dT values and lower 32 bits of all dT values
166  dT_32_63 = c7x::reinterpret<c7x::uint_vec>(__permute_odd_odd_int(
167  MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(dTVals_8_15), c7x::as_uchar_vec(dTVals_0_7)));
168  dT_0_31 = c7x::reinterpret<c7x::uint_vec>(__permute_even_even_int(
169  MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(dTVals_8_15), c7x::as_uchar_vec(dTVals_0_7)));
170 
171  uN = (uN >> 4) << 20;
172  dTAdjusted_32_63 = dT_32_63 + uN;
173 
174  // Concatenate the adjusted upper 32 bits of dT values to the lower 32 bits, convert dT to float
175  dTVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
176  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(dTAdjusted_32_63), c7x::as_uchar_vec(dT_0_31)));
177  dTVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
178  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(dTAdjusted_32_63), c7x::as_uchar_vec(dT_0_31)));
179 
180  pol_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(MATHLIB_vperm_data_dp_interweave_0_63,
181  c7x::as_uchar_vec(__high_float_to_double(pol)),
182  c7x::as_uchar_vec(__low_float_to_double(pol))));
183  pol_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(MATHLIB_vperm_data_dp_interweave_0_63,
184  c7x::as_uchar_vec(__high_float_to_double(pol)),
185  c7x::as_uchar_vec(__low_float_to_double(pol))));
186 
187  outVec_0_7 = dTVals_0_7 * (1.0f + pol_0_7);
188  outVec_8_15 = dTVals_8_15 * (1.0f + pol_8_15);
189 
190  outVec = c7x::reinterpret<vec>(__permute_even_even_int(MATHLIB_vperm_data_0_63,
191  c7x::as_uchar_vec(__double_to_float(outVec_8_15)),
192  c7x::as_uchar_vec(__double_to_float(outVec_0_7))));
193 
194  outVec = outVec + one;
195 
196  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
197  vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
198  __vstore_pred(tmp, addr, outVec);
199  }
200 
202 }
203 
204 template <typename T>
205 static inline void
206 MATHLIB_tanh_pol(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst, size_t numBlocks)
207 {
208  // derive c7x vector type from template typename
209  typedef typename c7x::make_full_vector<T>::type vec;
210 
211  // open SE0, SE1, and SA0 for reading and writing operands
212  MATHLIB_SE0SE1SA0Open(se0Params, sa0Params, pSrc, pDst);
213 
214  /*************************************************************************************************/
215  /* Create and assign values for constants employed on tanh polynomial and reciprocal computation */
216  /*************************************************************************************************/
217  vec pol_bound, C16, C14, C12, C10, C8, C6, C4, C2, two, zero, fltMax, limit;
218  pol_bound = (vec) 1.0f;
219  limit = (vec) 9.0f;
220  C16 = (vec) 0.000244528812992865f;
221  C14 = (vec) -0.00119005741172407f;
222  C12 = (vec) 0.00349212803657248f;
223  C10 = (vec) -0.00886323552990220f;
224  C8 = (vec) 0.0218794885361552f;
225  C6 = (vec) -0.0539682539682540f;
226  C4 = (vec) 0.133333333333333f;
227  C2 = (vec) -0.333333333333333f;
228  two = (vec) 2.0f;
229  zero = (vec) 0.0f;
230  fltMax = (vec) 3.40282347e+38f;
231 
232  for (size_t i = 0; i < numBlocks; i++) {
233  vec expOut = c7x::strm_eng<1, vec>::get_adv(); // e^(-2x)
234 
235  /**********************************************************************************************/
236  /* Create variables employed on tanh polynomial and reciprocal computation */
237  /**********************************************************************************************/
238  vec inVec_abs, x2, x4, x6, x8, x10, x12, pol1, pol2, pol, x1, computeDiv, expRecip, outVec, sign;
239 
240  sign = (vec) 1.0;
241 
242  /**************************************************************/
243  /* Division computation done for inputs 1 < |input| <= 9 */
244  /**************************************************************/
245  x1 = __recip(expOut);
246  x1 = x1 * (two - (expOut * x1));
247  computeDiv = (x1 * (two - (expOut * x1))) * two;
248 
249  /************************************************/
250  /* Boundary checking for reciprocal computation */
251  /************************************************/
252 
253  // if (__abs(expOut) > max) {
254  // computeDiv = 0;
255  // }
256  __vpred cmp_gt_flt = __cmp_lt_pred(fltMax, __abs(expOut));
257  computeDiv = __select(cmp_gt_flt, zero, computeDiv);
258 
259  expRecip = computeDiv - 1.0f; // -1 + 2 / (1 + e^(-2x))
260 
261  /**************************************************************/
262  /* Polynomial computation done for inputs |input| < 1 */
263  /**************************************************************/
264  vec inVec = c7x::strm_eng<0, vec>::get_adv();
265  inVec_abs = __abs(inVec);
266  // calculate the power of input vector
267  x2 = inVec_abs * inVec_abs;
268  x4 = x2 * x2;
269  x6 = x2 * x4;
270  x8 = x4 * x4;
271  x10 = x6 * x4;
272  x12 = x8 * x4;
273 
274  pol1 = ((C8 * x8) + (C6 * x6)) + ((C4 * x4) + (C2 * x2));
275  pol2 = (((C16 * x4) + (C14 * x2) + C12) * x12) + (C10 * x10);
276 
277  pol = pol1 + pol2;
278  pol = (pol * inVec_abs) + inVec_abs;
279 
280  /**************************************************************/
281  /* Determining tanh computation for each input */
282  /**************************************************************/
283 
284  /**************************************************************************/
285  /* The type of calculation for tanh(x) depends on the value of x. */
286  /* x_abs < 1.0, res = pol_est_tanhsp_i(input x) */
287  /* */
288  /* x_abs >= 1.0, res = -1 + 2 / (1 + e^(-2x)) */
289  /* */
290  /* x_abs > 9.0, res = 1.0 (maximum value for tanh) */
291  /* where x_abs is the absolute value of the input and res is the */
292  /* calculated value for tanh(x). */
293  /**************************************************************************/
294 
295  // if(x < pol_bound){
296  // compute polynomial
297  // }
298  __vpred cmp_lt_pol = __cmp_le_pred(inVec_abs, pol_bound); // |x| <= 1
299 
300  // if(x_abs <= limit){
301  // exp computation with |x|
302  // }
303  __vpred cmp_le_exp = __cmp_lt_pred(inVec_abs, limit); // 1.0 < |x|<= 9
304 
305  __vpred cmp_else_exp = __negate(__or(cmp_le_exp, cmp_lt_pol)); // |x| > 9
306 
307  outVec = __select(cmp_else_exp, pol_bound, expRecip);
308  outVec = __select(cmp_lt_pol, pol, outVec);
309 
310  __vpred cmp_sign = __cmp_lt_pred(inVec, zero);
311  sign = __select(cmp_sign, -sign, sign);
312 
313  outVec = outVec * sign;
314 
315  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
316  vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
317  __vstore_pred(tmp, addr, outVec);
318  }
319 }
320 
321 // this method performs tanh computation of input vector
322 template <typename T> static inline void MATHLIB_tanh_vector(size_t length, T *pSrc, T *pDst)
323 {
324  // variables
325  size_t numBlocks = 0; // compute loop's iteration count
326  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
327 
328  // derive c7x vector type from template typename
329  typedef typename c7x::make_full_vector<T>::type vec;
330 
331  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
332  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
333 
334  MATHLIB_SE0SA01DSequentialInit(&se0Params, &sa0Params, length, pSrc, pDst);
335 
336  // calculate compute loop's iteration counter
337  numBlocks = length / c7x::element_count_of<vec>::value;
338  remNumBlocks = length % c7x::element_count_of<vec>::value;
339  if (remNumBlocks) {
340  numBlocks++;
341  }
342 
343  /***********************************************************************/
344  /* Exp computation for tanh */
345  /***********************************************************************/
346 
347  MATHLIB_tanh_exp(&se0Params, &sa0Params, pSrc, pDst, numBlocks);
348 
349  /***********************************************************************/
350  /* Polynomial and reciprocal computation for tanh */
351  /***********************************************************************/
352  MATHLIB_tanh_pol(&se0Params, &sa0Params, pSrc, pDst, numBlocks);
353 }
354 
355 // this method performs exponential computation of input vector
356 template <typename T> MATHLIB_STATUS MATHLIB_tanh(size_t length, T *pSrc, T *pDst)
357 {
358  MATHLIB_STATUS status = MATHLIB_SUCCESS; // return function status
359 
360  // check for null pointers and non-zero length
361  status = MATHLIB_checkParams(length, pSrc, pDst);
362 
363  if (status == MATHLIB_SUCCESS) {
364  MATHLIB_tanh_vector(length, pSrc, pDst);
365  }
366  return status;
367 }
368 
369 /******************************************************************************/
370 /* */
371 /* Explicit templatization for datatypes supported by MATHLIB_tanh */
372 /* */
373 /******************************************************************************/
374 
375 // single precision
376 template MATHLIB_STATUS MATHLIB_tanh<float>(size_t length, float *pSrc, float *pDst);
377 
378 /******************************************************************************/
379 /* */
380 /* C-interface wrapper functions */
381 /* */
382 /******************************************************************************/
383 
384 extern "C" {
385 
386 // single-precision wrapper
387 MATHLIB_STATUS MATHLIB_tanh_sp(size_t length, float *pSrc, float *pDst)
388 {
389  MATHLIB_STATUS status = MATHLIB_tanh(length, pSrc, pDst);
390  return status;
391 }
392 
393 } // extern "C"
static void MATHLIB_tanh_pol(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst, size_t numBlocks)
static void MATHLIB_tanh_vector(size_t length, T *pSrc, T *pDst)
template MATHLIB_STATUS MATHLIB_tanh< float >(size_t length, float *pSrc, float *pDst)
static void MATHLIB_tanh_exp(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc, T *pDst, size_t numBlocks)
#define MATHLIB_KTABLE_OFFSET
Definition: MATHLIB_lut.h:64
static c7x::uint_vec MATHLIB_LUTReadLowerBits(vecType vecOffset)
This method reads bits 31-0 of LUT value at vecOffset.
Definition: MATHLIB_lut.h:111
#define MATHLIB_JTABLE_OFFSET
Definition: MATHLIB_lut.h:65
static c7x::uint_vec MATHLIB_LUTReadUpperBits(vecType vecOffset)
This method reads bits 63-32 of LUT value at vecOffset.
Definition: MATHLIB_lut.h:86
static void MATHLIB_SE0SE1SA0Open(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc0, T *pSrc1)
This method performs SE0, SE1, and SA0 open.
static void MATHLIB_SE0SA0Close()
This method performs SE0 and SA0 close.
static void MATHLIB_SE0SA01DSequentialInit(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, size_t length, T *pSrc, T *pDst)
static MATHLIB_STATUS MATHLIB_checkParams(size_t length, T *pSrc, T *pDst)
This method performs parameter checks for MATHLIB function.
static void MATHLIB_SE0SA0Open(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc)
This method performs SE0 and SA0 open.
MATHLIB_STATUS MATHLIB_tanh_sp(size_t length, float *pSrc, float *pDst)
This function is the C interface for MATHLIB_tanh. Function accepts float pointers.
MATHLIB_STATUS MATHLIB_tanh(size_t length, T *pSrc, T *pDst)
Performs the elementwise hyperbolic tangent of an input vector. Function can be overloaded with float...
MATHLIB_STATUS_NAME
The enumeration of all status codes.
@ MATHLIB_SUCCESS