MATHLIB User Guide
MATHLIB_pow.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 #define ELEMENT_COUNT(x) c7x::element_count_of<x>::value
35 
36 /******************************************************************************/
37 /* */
38 /* Includes */
39 /* */
40 /******************************************************************************/
41 
42 #include "MATHLIB_ilut.h"
43 #include "MATHLIB_lut.h"
44 #include "MATHLIB_permute.h"
45 #include "MATHLIB_pow_scalar.h"
46 #include "MATHLIB_types.h"
47 #include "MATHLIB_utility.h"
48 
49 /******************************************************************************/
50 /* */
51 /* MATHLIB_pow */
52 /* */
53 /******************************************************************************/
54 template <typename T>
55 static inline void MATHLIB_pow_vector(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst);
56 
57 template <typename T>
58 static inline MATHLIB_STATUS MATHLIB_pow_exp(__SE_TEMPLATE_v1 *restrict se0Params,
59  __SA_TEMPLATE_v1 *restrict sa0Params,
60  size_t length,
61  T *restrict pSrc0,
62  T *restrict pDst);
63 
64 template <typename T>
65 static inline MATHLIB_STATUS MATHLIB_pow_cond(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst);
66 
67 template <typename vecType> static inline vecType MATHLIB_pow_log(vecType inVec);
68 
69 template <> inline c7x::float_vec MATHLIB_pow_log<c7x::float_vec>(c7x::float_vec inVec)
70 {
71  /**********************************************************************/
72  /* Create and assign values for constants employed on log computation */
73  /**********************************************************************/
74  typedef typename c7x::make_full_vector<float>::type vecType;
75 
76  vecType C1, C2, C3, C4, C5, eMax, outVecMax;
77  c7x::double_vec ln2_vec;
78  c7x::uint_vec zero;
79  zero = (c7x::uint_vec) 0;
80 
81  ln2_vec = (c7x::double_vec) 0.693147180559945;
82  C1 = (vecType) -0.2302894f;
83  C2 = (vecType) 0.1908169f;
84  C3 = (vecType) -0.2505905f;
85  C4 = (vecType) 0.3333164f;
86  C5 = (vecType) -0.5000002f;
87  eMax = (vecType) 3.402823466e+38f;
88  outVecMax = (vecType) 88.72283905313f;
89 
90  /**********************************************************************/
91  /* Create variables employed on log computation */
92  /**********************************************************************/
93 
94  vecType Pol, R1, R2, R3, R4, outvec;
95  c7x::double_vec inVecVals_odd, inVecVals_even, inVecVals_oddReciprocal, inVecVals_evenReciprocal,
96  inVecReciprocalApprox_8_15, inVecReciprocalApprox_0_7, inVecVals_8_15, inVecVals_0_7, rVals_0_7, rVals_8_15,
97  TVals_8_15, TVals_0_7, NVals_odd, NVals_even, NVals_0_7, NVals_8_15, Pol_0_7, Pol_8_15, outvec_8_15, outvec_0_7;
98  c7x::uint_vec inVecReciprocal_32_63, inVecReciprocalClr_32_63, inVecReciprocalApprox_32_63, indexT, upperBitsIndexT,
99  lowerBitsIndexT;
100  c7x::int_vec Nvec;
101 
102  /**********************************************************************/
103  /* Calculate Taylor series approximation for log */
104  /**********************************************************************/
105 
106  // Split vectors to compute r with double precision
107  inVecVals_odd = __high_float_to_double(inVec);
108  inVecVals_even = __low_float_to_double(inVec);
109  inVecVals_oddReciprocal = __recip(inVecVals_odd);
110  inVecVals_evenReciprocal = __recip(inVecVals_even);
111 
112  // Create floating point reciprocal approximation
113  // Upper 32 bits of all inVec reciprocal values
114  inVecReciprocal_32_63 = c7x::as_uint_vec(
115  __permute_odd_odd_int(MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(inVecVals_oddReciprocal),
116  c7x::as_uchar_vec(inVecVals_evenReciprocal)));
117 
118  // Clear bits 0-16 inclusive
119  inVecReciprocalClr_32_63 = inVecReciprocal_32_63 & (c7x::uint_vec)0xFFFE0000U;
120 
121  // Concatenate cleared bit reciprocal with zero bits
122  inVecReciprocalApprox_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
123  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(inVecReciprocalClr_32_63), c7x::as_uchar_vec(zero)));
124  inVecReciprocalApprox_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
125  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(inVecReciprocalClr_32_63), c7x::as_uchar_vec(zero)));
126 
127  // Split inVec into two vectors with double precision
128  inVecVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
129  MATHLIB_vperm_data_dp_interweave_0_63, c7x::as_uchar_vec(inVecVals_odd), c7x::as_uchar_vec(inVecVals_even)));
130  inVecVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
131  MATHLIB_vperm_data_dp_interweave_0_63, c7x::as_uchar_vec(inVecVals_odd), c7x::as_uchar_vec(inVecVals_even)));
132 
133  // Calculate r in double precision
134  rVals_0_7 = (inVecReciprocalApprox_0_7 * inVecVals_0_7) - 1.0;
135  rVals_8_15 = (inVecReciprocalApprox_8_15 * inVecVals_8_15) - 1.0;
136 
137  // Convert r to float, compute r to the power of 2, 3, 4
138  R1 = c7x::reinterpret<vecType>(__permute_even_even_int(MATHLIB_vperm_data_0_63,
139  c7x::as_uchar_vec(__double_to_float(rVals_8_15)),
140  c7x::as_uchar_vec(__double_to_float(rVals_0_7))));
141  R2 = R1 * R1;
142  R3 = R1 * R2;
143  R4 = R2 * R2;
144 
145  // Compute Taylor series polynomial
146  Pol = (C5 * R2) + ((C4 * R3) + ((((C2 * R1) + C3) + (C1 * R2)) * R4));
147 
148  /**********************************************************************/
149  /* Calculate Nvec */
150  /**********************************************************************/
151 
152  // Upper 32 bits of all inVec reciprocal approximation values
153  inVecReciprocalApprox_32_63 = c7x::as_uint_vec(
154  __permute_odd_odd_int(MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(inVecReciprocalApprox_8_15),
155  c7x::as_uchar_vec(inVecReciprocalApprox_0_7)));
156 
157  // Nvec = c7x::convert_int_vec(((inVecReciprocalApprox_32_63 << (c7x::uint_vec)1U) >> (c7x::uint_vec)21U) - (c7x::uint_vec)1023U);
158  // Nvec = c7x::convert_int_vec(((inVecReciprocalApprox_32_63 << 1U) >> 21U) - 1023U);
159  c7x::uint_vec Nvec_temp = __shift_left(inVecReciprocalApprox_32_63, (c7x::uint_vec)1U);
160  Nvec_temp = __shift_right(Nvec_temp, (c7x::uint_vec)21U);
161  Nvec_temp = Nvec_temp - (c7x::uint_vec)1023U;
162  Nvec = c7x::convert_int_vec(Nvec_temp);
163 
164 
165  // Covert Nvec to double precision for later calculation with LUT values
166  NVals_odd = __high_int_to_double(Nvec);
167  NVals_even = __low_int_to_double(Nvec);
168  NVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
169  MATHLIB_vperm_data_dp_interweave_0_63, c7x::as_uchar_vec(NVals_odd), c7x::as_uchar_vec(NVals_even)));
170  NVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
171  MATHLIB_vperm_data_dp_interweave_0_63, c7x::as_uchar_vec(NVals_odd), c7x::as_uchar_vec(NVals_even)));
172 
173  /**********************************************************************/
174  /* Determine LUT values */
175  /**********************************************************************/
176 
177  // Calculate LUT index
178  // indexT = (((inVecReciprocalApprox_32_63 << 12U) >> 29U) + MATHLIB_LOGTABLE_OFFSET);
179  indexT = __shift_left(inVecReciprocalApprox_32_63, (c7x::uint_vec)12U);
180  indexT = __shift_right(indexT, (c7x::uint_vec)29U);
181  indexT = indexT + (c7x::uint_vec)MATHLIB_LOGTABLE_OFFSET;
182 
183  // Read from LUT and reconstruct double values split into two vectors
184 
185  upperBitsIndexT = MATHLIB_LUTReadUpperBits(indexT);
186  lowerBitsIndexT = MATHLIB_LUTReadLowerBits(indexT);
187 
188  TVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
189  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsIndexT), c7x::as_uchar_vec(lowerBitsIndexT)));
190  TVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
191  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsIndexT), c7x::as_uchar_vec(lowerBitsIndexT)));
192 
193  // Calculate an adjusted T
194  TVals_8_15 = TVals_8_15 - (ln2_vec * NVals_8_15);
195  TVals_0_7 = TVals_0_7 - (ln2_vec * NVals_0_7);
196 
197  /**********************************************************************/
198  /* Calculate output with adjusted LUT and Taylor series values */
199  /**********************************************************************/
200 
201  // Split polynomial result into two vectors with double precision
202  Pol_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(MATHLIB_vperm_data_dp_interweave_0_63,
203  c7x::as_uchar_vec(__high_float_to_double(Pol)),
204  c7x::as_uchar_vec(__low_float_to_double(Pol))));
205  Pol_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(MATHLIB_vperm_data_dp_interweave_0_63,
206  c7x::as_uchar_vec(__high_float_to_double(Pol)),
207  c7x::as_uchar_vec(__low_float_to_double(Pol))));
208 
209  // Add LUT values and Taylor series values
210  outvec_0_7 = rVals_0_7 + TVals_0_7 + Pol_0_7;
211  outvec_8_15 = rVals_8_15 + TVals_8_15 + Pol_8_15;
212 
213  // Combine output vector into one floating point result
214  outvec = c7x::reinterpret<vecType>(__permute_even_even_int(MATHLIB_vperm_data_0_63,
215  c7x::as_uchar_vec(__double_to_float(outvec_8_15)),
216  c7x::as_uchar_vec(__double_to_float(outvec_0_7))));
217 
218  /**********************************************************************/
219  /* Bounds checking */
220  /**********************************************************************/
221 
222  // if (a > MAXe) {
223  // res = 709.7827f;
224  // }
225  __vpred cmp_max = __cmp_lt_pred(eMax, inVec);
226  outvec = __select(cmp_max, outVecMax, outvec);
227 
228  return outvec;
229 }
230 
231 template <>
233 MATHLIB_pow_cond<float>(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
234 {
235 
236  // variables
237  MATHLIB_STATUS status = MATHLIB_SUCCESS; // return function status
238  size_t numBlocks = 0; // compute loop's iteration count
239  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
240 
241  // derive c7x vector type from template typename
242  typedef typename c7x::make_full_vector<float>::type vec;
243 
244  // Compile-time decision: float_vec => int_vec and double_vec=> long_vec
245  typedef
246  typename std::conditional<ELEMENT_COUNT(c7x::float_vec) == ELEMENT_COUNT(vec), c7x::int_vec, c7x::long_vec>::type
247  vec_type;
248 
249  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
250  __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
251  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
252 
253  // calculate compute loop's iteration counter
254  numBlocks = length / c7x::element_count_of<vec>::value;
255  remNumBlocks = length % c7x::element_count_of<vec>::value;
256  if (remNumBlocks) {
257  numBlocks++;
258  }
259 
260  __SE_ELETYPE SE_ELETYPE;
261  __SE_VECLEN SE_VECLEN;
262  __SA_VECLEN SA_VECLEN;
263 
264  // derive c7x vector type from template typename
265  typedef typename c7x::make_full_vector<float>::type vec;
266 
267  // assign SE and SA params based on vector type
268  SE_VECLEN = c7x::se_veclen<vec>::value;
269  SA_VECLEN = c7x::sa_veclen<vec>::value;
270  SE_ELETYPE = c7x::se_eletype<vec>::value;
271 
272  se0Params.ICNT0 = length;
273  se0Params.ELETYPE = SE_ELETYPE;
274  se0Params.VECLEN = SE_VECLEN;
275  se0Params.DIMFMT = __SE_DIMFMT_1D;
276 
277  se1Params.DIMFMT = __SE_DIMFMT_3D;
278  se1Params.ICNT0 = c7x::element_count_of<vec>::value;
279  se1Params.ELETYPE = SE_ELETYPE;
280  se1Params.VECLEN = SE_VECLEN;
281  se1Params.ICNT1 = 2;
282  se1Params.ICNT2 = numBlocks;
283  se1Params.DIM2 = c7x::element_count_of<vec>::value;
284  se1Params.DECDIM1 = __SE_DECDIM_DIM2;
285  se1Params.DECDIM1_WIDTH = length;
286  se1Params.DIM1 = ((float *) pSrc1) - ((float *) pSrc0);
287 
288  sa0Params.ICNT0 = length;
289  sa0Params.DIM1 = length;
290  sa0Params.VECLEN = SA_VECLEN;
291  sa0Params.DIMFMT = __SA_DIMFMT_1D;
292 
293  // open SE0, SE1, and SA0 for reading and writing operands
294  __SE0_OPEN(pDst, se0Params);
295  __SE1_OPEN(pSrc0, se1Params);
296  __SA0_OPEN(sa0Params);
297 
298  /**********************************************************************/
299  /* Create and assign values for constants employed on pow computation */
300  /**********************************************************************/
301  vec Zero, one, intMax, inf;
302  vec_type zeroInt, signNegative;
303 
304  Zero = (vec) 0.0f;
305  one = (vec) 1.0f;
306  intMax = (vec) 0x7fffffffu;
307  inf = (vec) 0x7F800000u;
308  zeroInt = (vec_type) 0;
309  signNegative = (vec_type) -1;
310  // compute loop to perform vector pow
311  for (size_t i = 0; i < numBlocks; i++) {
312  vec inVec0 = c7x::strm_eng<1, vec>::get_adv();
313  vec outVec1 = c7x::strm_eng<0, vec>::get_adv();
314  vec inVec1 = c7x::strm_eng<1, vec>::get_adv();
315 
316  /**********************************************************************/
317  /* Create variables employed on pow computation */
318  /**********************************************************************/
319  vec_type inVec1_round, inVec1_roundEven, sign;
320 
321  sign = (vec_type) 1;
322 
323  /**********************************************************************/
324  /* Calculate sign of final result */
325  /**********************************************************************/
326  // Negative base w/ odd-integer power should be negative
327  inVec1_round = __float_to_int(inVec1);
328  inVec1_roundEven = inVec1_round & 1;
329 
330  // Check if base is < 0, if the power is an integer, if the power is odd
331  __vpred cmp_zero = __cmp_lt_pred(inVec0, Zero);
332  __vpred cmp_floateq = __cmp_eq_pred(inVec1, __int_to_float(inVec1_round));
333  __vpred cmp_odd = __negate(__cmp_eq_pred(inVec1_roundEven, zeroInt));
334  __vpred cond1 = __and(cmp_zero, cmp_floateq);
335  __vpred cond2 = __and(cond1, cmp_odd);
336 
337  // If all conditions are true, result is negative
338  sign = __select(cond2, signNegative, sign);
339 
340  outVec1 = __int_to_float(sign) * outVec1;
341 
342  /**********************************************************************/
343  /* Bounds checking */
344  /**********************************************************************/
345  // Error if base < 0 and power is not an integer
346  __vpred cond3 = __and(cmp_zero, __negate(cmp_floateq));
347  outVec1 = __select(cond3, intMax, outVec1);
348 
349  __vpred cmp_inVec0_zero = __cmp_eq_pred(inVec0, Zero);
350  __vpred cmp_inVec1_zero_lt = __cmp_lt_pred(inVec1, Zero);
351  __vpred cond4 = __and(cmp_inVec0_zero, __negate(cmp_inVec1_zero_lt));
352  __vpred cond5 = __and(cmp_inVec0_zero, cmp_inVec1_zero_lt);
353 
354  // 0 if base is 0 and power is greater than or equal to 0
355  // inf if base is 0 and power is less than Zero
356  outVec1 = __select(cond4, Zero, outVec1);
357  outVec1 = __select(cond5, inf, outVec1);
358 
359  // 1 if power is 0
360  __vpred cmp_inVec1_zero = __cmp_eq_pred(inVec1, Zero);
361  outVec1 = __select(cmp_inVec1_zero, one, outVec1);
362 
363  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
364  vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
365  __vstore_pred(tmp, addr, outVec1);
366  }
368 
369  return status;
370 }
371 
372 template <>
373 inline MATHLIB_STATUS MATHLIB_pow_exp<float>(__SE_TEMPLATE_v1 *restrict se0Params,
374  __SA_TEMPLATE_v1 *restrict sa0Params,
375  size_t length,
376  float *restrict pSrc0,
377  float *restrict pDst)
378 {
379  // variables
380  MATHLIB_STATUS status = MATHLIB_SUCCESS; // return function status
381  size_t numBlocks = 0; // compute loop's iteration count
382  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
383 
384  // derive c7x vector type from template typename
385  typedef typename c7x::make_full_vector<float>::type vec;
386 
387  // calculate compute loop's iteration counter
388  numBlocks = length / c7x::element_count_of<vec>::value;
389  remNumBlocks = length % c7x::element_count_of<vec>::value;
390  if (remNumBlocks) {
391  numBlocks++;
392  }
393 
394  // open SE0, SE1, and SA0 for reading and writing operands
395  MATHLIB_SE0SA0Open(se0Params, sa0Params, pDst);
396 
397  /**********************************************************************/
398  /* Create and assign values for constants employed on pow computation */
399  /**********************************************************************/
400 
401  vec log2_base_x16, half, negativeHalf, Zero, LnMin, LnMax, Max, C0, c1, c2, P1, P2;
402  c7x::uint_vec mask, inVec_min;
403 
404  log2_base_x16 = (vec) 23.083120654f;
405  half = (vec) 0.5f;
406  negativeHalf = (vec) -0.5f;
407  Zero = (vec) 0.0f;
408  LnMin = (vec) -87.33654475f;
409  LnMax = (vec) 88.72283905f;
410  Max = (vec) 3.402823466E+38f;
411  C0 = (vec) 0.166668549286041f;
412  c1 = (vec) 0.500016170012920f;
413  c2 = (vec) 0.999999998618401f;
414  P1 = (vec) 0.04331970214844f;
415  P2 = (vec) 1.99663646e-6f;
416  mask = (c7x::uint_vec) 0x3u;
417  inVec_min = (c7x::uint_vec) 114u;
418 
419  // compute loop to perform vector pow
420  for (size_t i = 0; i < numBlocks; i++) {
421  vec inVec = c7x::strm_eng<0, vec>::get_adv();
422 
423  /**********************************************************************/
424  /* Create variables employed on exp computation */
425  /**********************************************************************/
426 
427  vec pol_vec, R, R2_vec, R3_vec, outVec, Nf, absNf;
428  c7x::uint_vec J, K, uN, dTAdjusted_32_63, dT_32_63, dT_0_31, inVec_small, upperBitsK, lowerBitsK, upperBitsJ,
429  lowerBitsJ;
430  c7x::int_vec N_vec, minusN;
431  c7x::double_vec KVals_8_15, KVals_0_7, JVals_8_15, JVals_0_7, dTVals_8_15, dTVals_0_7, pol_0_7, pol_8_15,
432  outVec_0_7, outVec_8_15;
433 
434  // Get N_vec such that |N_vec - inVec*16/ln(2)| is minimized
435  Nf = inVec * log2_base_x16;
436  absNf = Nf + half;
437  N_vec = c7x::convert<c7x::int_vec>(absNf);
438  minusN = N_vec - 1;
439 
440  // if ((x * log2_base_x16) < - half) {
441  // N_vec--;
442  // }
443  __vpred cmp_N = __cmp_lt_pred(Nf, negativeHalf);
444  N_vec = __select(cmp_N, minusN, N_vec);
445 
446  /**********************************************************************/
447  /* Calculate Taylor series approximation for exp */
448  /**********************************************************************/
449 
450  // Split vectors to compute r with double precision
451  R = (inVec - (P1 * __int_to_float(N_vec))) - (P2 * __int_to_float(N_vec));
452 
453  // Taylor series approximation
454  R2_vec = R * R;
455  R3_vec = R2_vec * R;
456  pol_vec = (R * c2) + ((R3_vec * C0) + (R2_vec * c1));
457 
458  /**********************************************************************/
459  /* Get index of LUT and 2^M values */
460  /**********************************************************************/
461 
462  // Create vectors of LUT indices
463  uN = c7x::convert<c7x::uint_vec>(N_vec);
464  K = ((uN << 28u) >> 30) + MATHLIB_KTABLE_OFFSET;
465  J = (uN & mask) + MATHLIB_JTABLE_OFFSET;
466 
467  // Read values from LUT and convert and store as doubles in split vectors
468  upperBitsK = MATHLIB_LUTReadUpperBits(K);
469  lowerBitsK = MATHLIB_LUTReadLowerBits(K);
470  upperBitsJ = MATHLIB_LUTReadUpperBits(J);
471  lowerBitsJ = MATHLIB_LUTReadLowerBits(J);
472 
473  KVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
474  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsK), c7x::as_uchar_vec(lowerBitsK)));
475  KVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
476  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsK), c7x::as_uchar_vec(lowerBitsK)));
477  JVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
478  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsJ), c7x::as_uchar_vec(lowerBitsJ)));
479  JVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
480  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(upperBitsJ), c7x::as_uchar_vec(lowerBitsJ)));
481 
482  // Multiply LUT values
483  dTVals_8_15 = KVals_8_15 * JVals_8_15;
484  dTVals_0_7 = KVals_0_7 * JVals_0_7;
485 
486  /**********************************************************************/
487  /* Scale exponent to adjust for 2^M */
488  /**********************************************************************/
489 
490  // Upper 32 bits of all dT values and lower 32 bits of all dT values
491  dT_32_63 = c7x::reinterpret<c7x::uint_vec>(__permute_odd_odd_int(
492  MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(dTVals_8_15), c7x::as_uchar_vec(dTVals_0_7)));
493  dT_0_31 = c7x::reinterpret<c7x::uint_vec>(__permute_even_even_int(
494  MATHLIB_vperm_data_0_63, c7x::as_uchar_vec(dTVals_8_15), c7x::as_uchar_vec(dTVals_0_7)));
495 
496  uN = (uN >> 4) << 20;
497  dTAdjusted_32_63 = dT_32_63 + uN;
498 
499  // Concatenate the adjusted upper 32 bits of dT values to the lower 32 bits, convert dT to float
500  dTVals_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(
501  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(dTAdjusted_32_63), c7x::as_uchar_vec(dT_0_31)));
502  dTVals_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(
503  MATHLIB_vperm_data_interweave_0_63, c7x::as_uchar_vec(dTAdjusted_32_63), c7x::as_uchar_vec(dT_0_31)));
504 
505  pol_0_7 = c7x::reinterpret<c7x::double_vec>(__permute_low_low(MATHLIB_vperm_data_dp_interweave_0_63,
506  c7x::as_uchar_vec(__high_float_to_double(pol_vec)),
507  c7x::as_uchar_vec(__low_float_to_double(pol_vec))));
508  pol_8_15 = c7x::reinterpret<c7x::double_vec>(__permute_high_high(MATHLIB_vperm_data_dp_interweave_0_63,
509  c7x::as_uchar_vec(__high_float_to_double(pol_vec)),
510  c7x::as_uchar_vec(__low_float_to_double(pol_vec))));
511 
512  outVec_0_7 = dTVals_0_7 * (1.0f + pol_0_7);
513  outVec_8_15 = dTVals_8_15 * (1.0f + pol_8_15);
514 
515  outVec = c7x::reinterpret<vec>(__permute_even_even_int(MATHLIB_vperm_data_0_63,
516  c7x::as_uchar_vec(__double_to_float(outVec_8_15)),
517  c7x::as_uchar_vec(__double_to_float(outVec_0_7))));
518 
519  /**********************************************************************/
520  /* Bounds checking */
521  /**********************************************************************/
522 
523  inVec_small = ((c7x::as_uint_vec(inVec) << 1u) >> 24u);
524  __vpred cmp_inVec = __cmp_gt_pred(inVec_min, inVec_small);
525  outVec = __select(cmp_inVec, 1.0f + inVec, outVec);
526 
527  // < LnMin returns 0
528  // if (x < LnMin) {
529  // res = 0.0f;
530  // }
531  __vpred cmp_min = __cmp_lt_pred(inVec, LnMin);
532  outVec = __select(cmp_min, Zero, outVec);
533 
534  // > LnMax returns MAX
535  // if (x > LnMax) {
536  // res = Max;
537  // }
538  __vpred cmpMax = __cmp_lt_pred(LnMax, inVec);
539  outVec = __select(cmpMax, Max, outVec);
540 
541  // Store exp computation result in
542  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
543  vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
544  __vstore_pred(tmp, addr, outVec);
545  }
547  return status;
548 }
549 
550 template <>
551 inline void MATHLIB_pow_vector<float>(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
552 {
553  size_t numBlocks = 0; // compute loop's iteration count
554  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
555 
556  // derive c7x vector type from template typename
557  typedef typename c7x::make_full_vector<float>::type vec;
558 
559  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
560  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
561 
562  // calculate compute loop's iteration counter
563  numBlocks = length / c7x::element_count_of<vec>::value;
564  remNumBlocks = length % c7x::element_count_of<vec>::value;
565  if (remNumBlocks) {
566  numBlocks++;
567  }
568  MATHLIB_SE0SA01DSequentialInit(&se0Params, &sa0Params, length, pSrc0, pDst);
569 
570  // open SE0, SE1, and SA0 for reading and writing operands
571  MATHLIB_SE0SE1SA0Open(&se0Params, &sa0Params, pSrc0, pSrc1);
572 
573  /**********************************************************************/
574  /* Create and assign values for constants employed on pow computation */
575  /**********************************************************************/
576 
577  vec zero_vec, one;
578 
579  zero_vec = (vec) 0.0f;
580  one = (vec) 1.0f;
581 
582  // compute loop to perform vector pow
583  for (size_t i = 0; i < numBlocks; i++) {
584  vec inVec0 = c7x::strm_eng<0, vec>::get_adv();
585  vec inVec1 = c7x::strm_eng<1, vec>::get_adv();
586 
587  /**********************************************************************/
588  /* Create variables employed on pow computation */
589  /**********************************************************************/
590  vec inVec_log, inVec_exp;
591 
592  /**********************************************************************/
593  /* Log and exp computation */
594  /**********************************************************************/
595  inVec_log = __abs(inVec0);
596  inVec_exp = inVec1 * MATHLIB_pow_log(inVec_log);
597  __vpred cmp_one = __cmp_eq_pred(inVec_log, one);
598  inVec_exp = __select(cmp_one, zero_vec, inVec_exp);
599 
600  // Store log computation result in
601  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
602  vec *addr = c7x::strm_agen<0, vec>::get_adv(pDst);
603  __vstore_pred(tmp, addr, inVec_exp);
604  }
606 
607  MATHLIB_pow_exp(&se0Params, &sa0Params, length, pSrc0, pDst);
608  MATHLIB_pow_cond(length, pSrc0, pSrc1, pDst);
609 }
610 
611 static inline c7x::double_vec cmn_DIVDP_opt(c7x::double_vec a, c7x::double_vec b)
612 {
613 
614  c7x::double_vec Two = (c7x::double_vec)(2.0f);
615  c7x::double_vec X;
616 
617  X = __recip(b);
618  X = X * (Two - (b * X));
619  X = X * (Two - (b * X));
620  X = X * (Two - (b * X));
621  X = a * X;
622 
623  return X;
624 }
625 
626 template <typename T>
627 static inline void MATHLIB_pow_log_opt(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst);
628 
629 template <>
630 inline void
631 MATHLIB_pow_log_opt<double>(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
632 {
633  // variables
634  size_t numBlocks = 0; // compute loop's iteration count
635  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
636 
637  // derive c7x vector type from template typename
638  typedef typename c7x::make_full_vector<double>::type vec;
639 
640  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
641  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
642 
643  MATHLIB_SE0SA01DSequentialInit(&se0Params, &sa0Params, length, pDst, pDst);
644 
645  // calculate compute loop's iteration counter
646  numBlocks = length / c7x::element_count_of<vec>::value;
647  remNumBlocks = length % c7x::element_count_of<vec>::value;
648  if (remNumBlocks) {
649  numBlocks++;
650  }
651 
652  // open SE0, SE1, and SA0 for reading and writing operands
653  MATHLIB_SE0SA0Open(&se0Params, &sa0Params, pSrc0);
654 
655  vec Half, MAXe, srHalf, Half_sq, MINe, a0, a1, a2, b0, b1, b2, c1_vec, c2_vec, W, X, Y, Z, zn, zd, Rz, Sa, Bd, Cn, Da;
656 
657  Half = (vec) 0.5;
658  Half_sq = (vec) 0.5 * 0.5;
659  MAXe = (vec) 1.7976931348623157e+308;
660  srHalf = (vec) 0.70710678118654752440; /* sqrt(0.5) */
661  MINe = (vec) 2.2250738585072014e-308;
662  a0 = (vec) -0.64124943423745581147e+2;
663  a1 = (vec) 0.16383943563021534222e+2;
664  a2 = (vec) -0.78956112887491257267e+0;
665  b0 = (vec) -0.76949932108494879777e+3;
666  b1 = (vec) 0.31203222091924532844e+3;
667  b2 = (vec) -0.35667977739034646171e+2; /* Note b3 = 1.0 */
668  c1_vec = (vec) 0.693359375; /* 355/512 */
669  c2_vec = (vec) -2.121944400546905827679e-4;
670 
671  c7x::long_vec long_zero_vec = (c7x::long_vec) 0;
672  vec double_zero_vec = (vec) 0.0;
673  vec outMAX = (vec)(308.254715974092);
674 
675  for (size_t i = 0; i < numBlocks; i++) {
676 
677  vec a = c7x::strm_eng<0, vec>::get_adv();
678  Y = __abs(a);
679  c7x::long_vec exp_ = c7x::as_long_vec((c7x::as_ulong_vec(Y) << 1) >> 53);
680 
681  c7x::ulong_vec upper = c7x::as_ulong_vec(Y) & (0x000FFFFF00000000u);
682  upper = 0x3FE0000000000000u | upper;
683 
684  Z = c7x::as_double_vec((0x00000000FFFFFFFFu & c7x::as_ulong_vec(Y)) | upper);
685 
686  __vpred cmp1 = __cmp_eq_pred(exp_, long_zero_vec);
687  Z = __select(cmp1, double_zero_vec, Z);
688 
689  vec z_minus_half = Z - Half;
690  vec z_mul_half = (Z * Half) + Half;
691  __vpred cmp2 = __cmp_lt_pred(srHalf, Z);
692  zn = __select(cmp2, (z_minus_half - Half), z_minus_half);
693  zd = __select(cmp2, z_mul_half, (z_mul_half - Half_sq));
694 
695  X = cmn_DIVDP_opt(zn, zd);
696  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
697  vec * addr = c7x::strm_agen<0, vec>::get_adv(pDst);
698  __vstore_pred(tmp, addr, X);
699  }
701 
702  MATHLIB_SE0SA0Open(&se0Params, &sa0Params, pDst);
703 
704  for (size_t i = 0; i < numBlocks; i++) {
705 
706  X = c7x::strm_eng<0, vec>::get_adv();
707 
708  W = X * X;
709  Bd = ((((W + b2) * W) + b1) * W) + b0;
710  Cn = (((W * a2) + a1) * W) + a0;
711  Rz = W * cmn_DIVDP_opt(Cn, Bd);
712  Sa = X + (X * Rz);
713  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
714  vec * addr = c7x::strm_agen<0, vec>::get_adv(pDst);
715  __vstore_pred(tmp, addr, Sa);
716  }
717 
719 
720  MATHLIB_SE0SA0Open(&se0Params, &sa0Params, pSrc0);
721 
722  __SE1_OPEN(pDst, se0Params);
723  for (size_t i = 0; i < numBlocks; i++) {
724 
725  vec a = c7x::strm_eng<0, vec>::get_adv();
726  Sa = c7x::strm_eng<1, vec>::get_adv();
727 
728  Y = a;
729  c7x::long_vec exp_ = c7x::as_long_vec((c7x::as_ulong_vec(Y) << 1) >> 53);
730  c7x::long_vec N_vec = exp_ - 1022;
731 
732  c7x::ulong_vec upper = c7x::as_ulong_vec(Y) & (0x000FFFFF00000000u);
733  upper = 0x3FE0000000000000u | upper;
734 
735  Z = c7x::as_double_vec((0x00000000FFFFFFFFu & c7x::as_ulong_vec(Y)) | upper);
736 
737  __vpred cmp1 = __cmp_eq_pred(exp_, long_zero_vec);
738  Z = __select(cmp1, double_zero_vec, Z);
739 
740  __vpred cmp2 = __cmp_lt_pred(srHalf, Z);
741 
742  N_vec = __select(cmp2, N_vec, (N_vec - 1));
743 
744  Cn = __low_int_to_double(c7x::as_int_vec(N_vec));
745  Da = ((Cn * c2_vec) + Sa) + (Cn * c1_vec);
746 
747  /**********************************************************************/
748  /* Bounds checking */
749  /**********************************************************************/
750 
751  // if (Y < MINe) {
752  // Da = -MAXe;
753  // }
754  __vpred cmp_min_vec = __cmp_lt_pred(Y, MINe);
755  Da = __select(cmp_min_vec, -MAXe, Da);
756 
757  // if (Y > MAXe) {
758  // Da = 308.254715974092;
759  // }
760  __vpred cmp_max_vec = __cmp_lt_pred(MAXe, Y);
761  Da = __select(cmp_max_vec, outMAX, Da);
762 
763  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
764  vec * addr = c7x::strm_agen<0, vec>::get_adv(pDst);
765  __vstore_pred(tmp, addr, Da);
766  }
767  __SE1_CLOSE();
769 }
770 
771 template <>
773 MATHLIB_pow_cond<double>(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
774 {
775  // variables
776  MATHLIB_STATUS status = MATHLIB_SUCCESS; // return function status
777  size_t numBlocks = 0; // compute loop's iteration count
778  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
779 
780  // derive c7x vector type from template typename
781  typedef typename c7x::make_full_vector<double>::type vec;
782 
783  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
784  __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
785  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
786 
787  // calculate compute loop's iteration counter
788  numBlocks = length / c7x::element_count_of<vec>::value;
789  remNumBlocks = length % c7x::element_count_of<vec>::value;
790  if (remNumBlocks) {
791  numBlocks++;
792  }
793 
794  __SE_ELETYPE SE_ELETYPE;
795  __SE_VECLEN SE_VECLEN;
796  __SA_VECLEN SA_VECLEN;
797 
798  // assign SE and SA params based on vector type
799  SE_VECLEN = c7x::se_veclen<vec>::value;
800  SA_VECLEN = c7x::sa_veclen<vec>::value;
801  SE_ELETYPE = c7x::se_eletype<vec>::value;
802 
803  se0Params.ICNT0 = length;
804  se0Params.ELETYPE = SE_ELETYPE;
805  se0Params.VECLEN = SE_VECLEN;
806  se0Params.DIMFMT = __SE_DIMFMT_1D;
807 
808  se1Params.DIMFMT = __SE_DIMFMT_3D;
809  se1Params.ICNT0 = c7x::element_count_of<vec>::value;
810  se1Params.ELETYPE = SE_ELETYPE;
811  se1Params.VECLEN = SE_VECLEN;
812  se1Params.ICNT1 = 2;
813  se1Params.ICNT2 = numBlocks;
814  se1Params.DIM2 = c7x::element_count_of<vec>::value;
815  se1Params.DECDIM1 = __SE_DECDIM_DIM2;
816  se1Params.DECDIM1_WIDTH = length;
817  se1Params.DIM1 = ((double *) pSrc1) - ((double *) pSrc0);
818 
819  sa0Params.ICNT0 = length;
820  sa0Params.DIM1 = length;
821  sa0Params.VECLEN = SA_VECLEN;
822  sa0Params.DIMFMT = __SA_DIMFMT_1D;
823 
824  __SE0_OPEN(pDst, se0Params);
825  __SE1_OPEN(pSrc0, se1Params);
826  __SA0_OPEN(sa0Params);
827 
828  vec zeroVec, one, X2;
829 
830  zeroVec = (vec) 0.0f;
831  one = (vec) 1.0f;
832  c7x::long_vec zero_int = (c7x::long_vec)(0);
833  c7x::long_vec one_int = (c7x::long_vec)(1);
834  c7x::int_vec Sign = (c7x::int_vec)(1);
835  c7x::int_vec negate_Sign = (c7x::int_vec)(-1);
836 
837  c7x::long_vec valueL1 = (c7x::long_vec)(0X7fffffffffffffff);
838  c7x::long_vec valueL2 = (c7x::long_vec)(0X7ff0000000000000);
839 
840  for (size_t i = 0; i < numBlocks; i++) {
841  vec inVec0 = c7x::strm_eng<1, vec>::get_adv();
842  vec inVec1 = c7x::strm_eng<1, vec>::get_adv();
843  vec W = c7x::strm_eng<0, vec>::get_adv();
844 
845  c7x::int_vec y = __double_to_int(inVec1);
846 
847  __vpred cmp_cond1 = __cmp_lt_pred(inVec0, zeroVec);
848  __vpred cmp_cond2 = __cmp_eq_pred(inVec1, __low_int_to_double(y));
849 
850  cmp_cond2 = __and(cmp_cond2, cmp_cond1);
851 
852  c7x::long_vec y_long = c7x::as_long_vec(y);
853  __vpred cmp_cond3 = __cmp_eq_pred((y_long & one_int), zero_int);
854 
855  cmp_cond3 = __and(cmp_cond2, __negate(cmp_cond3));
856 
857  Sign = __select(cmp_cond3, negate_Sign, Sign);
858 
859  X2 = __low_int_to_double(Sign) * W;
860 
861  __vpred cmp_cond5 = __cmp_eq_pred(inVec1, __low_int_to_double(y));
862  cmp_cond5 = __and(cmp_cond1, __negate(cmp_cond5));
863  X2 = __select(cmp_cond5, c7x::as_double_vec(valueL1), X2);
864 
865  __vpred cmp_cond6 = __cmp_eq_pred(inVec0, zeroVec);
866  __vpred cmp_cond7 = __cmp_lt_pred(inVec1, zeroVec);
867  cmp_cond7 = __and(cmp_cond6, cmp_cond7);
868  X2 = __select(cmp_cond7, c7x::as_double_vec(valueL2), X2);
869 
870  // Store log computation result in
871  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
872  vec * addr = c7x::strm_agen<0, vec>::get_adv(pDst);
873  __vstore_pred(tmp, addr, X2);
874  }
876  return status;
877 }
878 
879 template <>
880 inline MATHLIB_STATUS MATHLIB_pow_exp<double>(__SE_TEMPLATE_v1 *restrict se0Params,
881  __SA_TEMPLATE_v1 *restrict sa0Params,
882  size_t length,
883  double *restrict pSrc1,
884  double *restrict pDst)
885 {
886 
887  // variables
888  MATHLIB_STATUS status = MATHLIB_SUCCESS; // return function status
889  size_t numBlocks = 0; // compute loop's iteration count
890  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
891 
892  // derive c7x vector type from template typename
893  typedef typename c7x::make_full_vector<double>::type vec;
894 
895  // calculate compute loop's iteration counter
896  numBlocks = length / c7x::element_count_of<vec>::value;
897  remNumBlocks = length % c7x::element_count_of<vec>::value;
898  if (remNumBlocks) {
899  numBlocks++;
900  }
901 
902  // open SE0, SE1, and SA0 for reading and writing operands
903  MATHLIB_SE0SA0Open(se0Params, sa0Params, pDst);
904 
905  __SE1_OPEN(pSrc1, *se0Params);
906 
907  vec Halfe, Maxe, LnMaxe, LnMine, a0e, a1e, a2e, b0e, b1e, b2e, c1e, C2e, L2e, Zero;
908 
909  Halfe = (vec) 0.5;
910  Maxe = (vec) 1.7976931348623157e+308;
911  LnMaxe = (vec) 709.78271289338;
912  LnMine = (vec) -708.3964185322641;
913  a0e = (vec) 0.249999999999999993;
914  a1e = (vec) 0.694360001511792852e-2;
915  a2e = (vec) 0.165203300268279130e-4;
916  b0e = (vec) 0.5;
917  b1e = (vec) 0.555538666969001188e-1;
918  b2e = (vec) 0.495862884905441294e-3;
919  c1e = (vec) 0.693359375; /* 355/512 */
920  C2e = (vec) -2.1219444005469058277e-4;
921  L2e = (vec) 1.4426950408889634074;
922  Zero = (vec) 0.0f;
923 
924  vec Ye, Xe, We, Re, Se, Be, Ce, De;
925 
926  // compute loop to perform vector pow
927  for (size_t i = 0; i < numBlocks; i++) {
928  vec invec = c7x::strm_eng<0, vec>::get_adv();
929  vec b = c7x::strm_eng<1, vec>::get_adv();
930 
931  Ye = invec * b;
932  Ce = Ye * L2e;
933 
934  c7x::int_vec Ne = __double_to_int(Ce);
935 
936  Se = __low_int_to_double(Ne);
937  Xe = (Ye - (Se * c1e)) - (Se * C2e); /* range reduction */
938 
939  We = Xe * Xe;
940  Be = (((b2e * We) + b1e) * We) + b0e; /* denominator */
941  De = ((((a2e * We) + a1e) * We) + a0e) * Xe; /* numerator */
942  Re = Halfe + cmn_DIVDP_opt(De, Be - De);
943 
944  Ye = invec;
945 
946  Ce = Ye * L2e;
947 
948  c7x::int_vec upper = 1024 + (Ne);
949  upper = c7x::as_int_vec((c7x::as_uint_vec(upper) << 20));
950  Se = c7x::as_double_vec(c7x::as_long_vec(upper) << 32);
951 
952  Ce = Re * Se;
953 
954  __vpred cmp_Max = __cmp_lt_pred(LnMaxe, invec);
955  Ce = __select(__negate(cmp_Max), Ce, Maxe);
956 
957  __vpred cmp_Min = __cmp_lt_pred(invec, LnMine);
958  Ce = __select(cmp_Min, Zero, Ce);
959 
960  // Store exp computation result in
961  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
962  vec * addr = c7x::strm_agen<0, vec>::get_adv(pDst);
963  __vstore_pred(tmp, addr, Ce);
964  }
965 
967  __SE1_CLOSE();
968  return status;
969 }
970 
971 template <>
972 inline void
973 MATHLIB_pow_vector<double>(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
974 {
975  size_t numBlocks = 0; // compute loop's iteration count
976  size_t remNumBlocks = 0; // when numBlocks is not a multiple of SIMD width
977 
978  // derive c7x vector type from template typename
979  typedef typename c7x::make_full_vector<double>::type vec;
980 
981  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
982  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
983 
984  // calculate compute loop's iteration counter
985  numBlocks = length / c7x::element_count_of<vec>::value;
986  remNumBlocks = length % c7x::element_count_of<vec>::value;
987  if (remNumBlocks) {
988  numBlocks++;
989  }
990  MATHLIB_SE0SA01DSequentialInit(&se0Params, &sa0Params, length, pSrc0, pDst);
991 
992  /**********************************************************************/
993  /* Create and assign values for constants employed on pow computation */
994  /**********************************************************************/
995 
996  MATHLIB_pow_log_opt(length, pSrc0, pSrc1, pDst);
997 
998  MATHLIB_pow_exp(&se0Params, &sa0Params, length, pSrc1, pDst);
999 
1000  MATHLIB_pow_cond(length, pSrc0, pSrc1, pDst);
1001 }
1002 
1003 // this method performs power computation of input vectors
1004 template <typename T> MATHLIB_STATUS MATHLIB_pow(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst)
1005 {
1006  // variables
1007  MATHLIB_STATUS status = MATHLIB_SUCCESS; // return function status
1008 
1009  status = MATHLIB_checkParams(length, pSrc0, pSrc1, pDst);
1010 
1011  if (status == MATHLIB_SUCCESS) {
1012  if (length < 2){
1013  pDst[0] = MATHLIB_pow_scalar_ci<T>(pSrc0[0], pSrc1[0]);
1014  }
1015  else {
1016  MATHLIB_pow_vector<T>(length, pSrc0, pSrc1, pDst);
1017  }
1018  }
1019 
1020  return status;
1021 }
1022 
1023 /******************************************************************************/
1024 /* */
1025 /* Explicit templatization for datatypes supported by MATHLIB_pow */
1026 /* */
1027 /******************************************************************************/
1028 
1029 // single precision
1031 MATHLIB_pow<float>(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst);
1032 
1033 // double precision
1035 MATHLIB_pow<double>(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst);
1036 
1037 /******************************************************************************/
1038 /* */
1039 /* C-interface wrapper functions */
1040 /* */
1041 /******************************************************************************/
1042 
1043 extern "C" {
1044 
1045 // single-precision wrapper
1046 MATHLIB_STATUS MATHLIB_pow_sp(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
1047 {
1048  MATHLIB_STATUS status = MATHLIB_pow<float>(length, pSrc0, pSrc1, pDst);
1049  return status;
1050 }
1051 
1052 // double-precision wrapper
1053 MATHLIB_STATUS MATHLIB_pow_dp(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
1054 {
1055  MATHLIB_STATUS status = MATHLIB_pow<double>(length, pSrc0, pSrc1, pDst);
1056  return status;
1057 }
1058 
1059 
1060 } // extern "C"
template MATHLIB_STATUS MATHLIB_pow< double >(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
static c7x::double_vec cmn_DIVDP_opt(c7x::double_vec a, c7x::double_vec b)
MATHLIB_STATUS MATHLIB_pow_dp(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
MATHLIB_STATUS MATHLIB_pow_sp(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
MATHLIB_STATUS MATHLIB_pow_cond< double >(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
static MATHLIB_STATUS MATHLIB_pow_exp(__SE_TEMPLATE_v1 *restrict se0Params, __SA_TEMPLATE_v1 *restrict sa0Params, size_t length, T *restrict pSrc0, T *restrict pDst)
MATHLIB_STATUS MATHLIB_pow_cond< float >(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
static vecType MATHLIB_pow_log(vecType inVec)
static void MATHLIB_pow_vector(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst)
static void MATHLIB_pow_log_opt(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst)
void MATHLIB_pow_vector< double >(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
#define ELEMENT_COUNT(x)
Definition: MATHLIB_pow.cpp:34
MATHLIB_STATUS MATHLIB_pow_exp< double >(__SE_TEMPLATE_v1 *restrict se0Params, __SA_TEMPLATE_v1 *restrict sa0Params, size_t length, double *restrict pSrc1, double *restrict pDst)
void MATHLIB_pow_log_opt< double >(size_t length, double *restrict pSrc0, double *restrict pSrc1, double *restrict pDst)
static MATHLIB_STATUS MATHLIB_pow_cond(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst)
template MATHLIB_STATUS MATHLIB_pow< float >(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
void MATHLIB_pow_vector< float >(size_t length, float *restrict pSrc0, float *restrict pSrc1, float *restrict pDst)
MATHLIB_STATUS MATHLIB_pow_exp< float >(__SE_TEMPLATE_v1 *restrict se0Params, __SA_TEMPLATE_v1 *restrict sa0Params, size_t length, float *restrict pSrc0, float *restrict pDst)
MATHLIB_STATUS MATHLIB_pow(size_t length, T *restrict pSrc0, T *restrict pSrc1, T *restrict pDst)
#define MATHLIB_KTABLE_OFFSET
Definition: MATHLIB_lut.h:64
#define MATHLIB_LOGTABLE_OFFSET
Definition: MATHLIB_lut.h:66
static c7x::uint_vec MATHLIB_LUTReadLowerBits(vecType vecOffset)
This method reads bits 31-0 of LUT value at vecOffset.
Definition: MATHLIB_lut.h:111
#define MATHLIB_JTABLE_OFFSET
Definition: MATHLIB_lut.h:65
static c7x::uint_vec MATHLIB_LUTReadUpperBits(vecType vecOffset)
This method reads bits 63-32 of LUT value at vecOffset.
Definition: MATHLIB_lut.h:86
static void MATHLIB_SE0SE1SA0Open(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc0, T *pSrc1)
This method performs SE0, SE1, and SA0 open.
static void MATHLIB_SE0SA0Close()
This method performs SE0 and SA0 close.
static void MATHLIB_SE0SE1SA0Close()
This method performs SE0, SE1, and SA0 close.
static void MATHLIB_SE0SA01DSequentialInit(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, size_t length, T *pSrc, T *pDst)
static MATHLIB_STATUS MATHLIB_checkParams(size_t length, T *pSrc, T *pDst)
This method performs parameter checks for MATHLIB function.
static void MATHLIB_SE0SA0Open(__SE_TEMPLATE_v1 *se0Params, __SA_TEMPLATE_v1 *sa0Params, T *pSrc)
This method performs SE0 and SA0 open.
MATHLIB_STATUS_NAME
The enumeration of all status codes.
@ MATHLIB_SUCCESS