const非整数指数的pow的优化？

如何解决const非整数指数的pow的优化？

| 我的代码中有一些热点，我正在做pow()，大约占我执行时间的10-20％。我对pow(x,y)的输入非常具体，因此我想知道是否有一种方法可以以更高的性能滚动两个two0ѭ近似值（每个指数一个）。我有两个常数指数：2.4和1 / 2.4。指数为2.4时，x的范围为（0.090473935，1.0]。当指数为1 / 2.4时，x的范围为（0.0031308，1.0]。我正在使用SSE / AVXfloat向量。如果可以利用特定平台的优势，那就继续吧！尽管我也对全精度（对于3分）算法感兴趣，但最大错误率约为0.01％是理想的。我已经在使用快速的pow()近似值，但是没有考虑这些约束。有可能做得更好吗？

解决方法

在IEEE 754骇客技术中，这是另一种更快且更少“神奇”的解决方案。它在大约十二个时钟周期内（在p = 2.4的情况下，在Intel Merom上）实现了0.08％的错误余量。中央处理器）。浮点数最初是作为对数的近似值发明的，因此您可以使用整数值作为log2的近似值。通过将整数转换指令应用于浮点值以获得另一个浮点值，这在某种程度上是可以实现的。要完成pow的计算，您可以乘以一个常数因子，并使用convert-to-integer指令将对数转换回。在SSE上，相关说明为cvtdq2ps和cvtps2dq。不过，它并不是那么简单。 IEEE 754中的指数字段是带符号的，偏差值为127表示指数为零。必须在乘以对数之前消除此偏差，并在取幂之前将其重新加法。此外，通过减法进行的偏差调整不会在零上起作用。幸运的是，两种调整都可以通过预先乘以一个常数系数来实现。

x^p
= exp2( p * log2( x ) )
= exp2( p * ( log2( x ) + 127 - 127 ) - 127 + 127 )
= cvtps2dq( p * ( log2( x ) + 127 - 127 - 127 / p ) )
= cvtps2dq( p * ( log2( x ) + 127 - log2( exp2( 127 - 127 / p ) ) )
= cvtps2dq( p * ( log2( x * exp2( 127 / p - 127 ) ) + 127 ) )
= cvtps2dq( p * ( cvtdq2ps( x * exp2( 127 / p - 127 ) ) ) )

exp2( 127 / p - 127 )是常数。此函数相当专业：它不适用于小分数指数，因为常数因数随指数的倒数成指数增长，并且会溢出。它不适用于负指数。大的指数会导致较高的错误，因为尾数位通过乘法与指数位混合在一起。但是，它只有4个快速指令。预乘，从\“ integer \”转换（对数），幂乘，转换为\“ integer \”（对数）。此SSE实施的转换速度非常快。我们也可以将一个额外的常数压缩到第一个乘法中。

template< unsigned expnum,unsigned expden,unsigned coeffnum,unsigned coeffden >
__m128 fastpow( __m128 arg ) {
        __m128 ret = arg;
//      std::printf( \"arg = %,vg\\n\",ret );
        // Apply a constant pre-correction factor.
        ret = _mm_mul_ps( ret,_mm_set1_ps( exp2( 127. * expden / expnum - 127. )
                * pow( 1. * coeffnum / coeffden,1. * expden / expnum ) ) );
//      std::printf( \"scaled = %,ret );
        // Reinterpret arg as integer to obtain logarithm.
        asm ( \"cvtdq2ps %1,%0\" : \"=x\" (ret) : \"x\" (ret) );
//      std::printf( \"log = %,ret );
        // Multiply logarithm by power.
        ret = _mm_mul_ps( ret,_mm_set1_ps( 1. * expnum / expden ) );
//      std::printf( \"powered = %,ret );
        // Convert back to \"integer\" to exponentiate.
        asm ( \"cvtps2dq %1,%0\" : \"=x\" (ret) : \"x\" (ret) );
//      std::printf( \"result = %,ret );
        return ret;
}

一些指数为2.4的试验表明，这一数字被高估了约5％。（总是保证例程会过高估计。）您可以简单地乘以0.95，但是再加上一些指令将为我们提供大约4个十进制数字的精度，这对于图形来说应该足够了。关键是要使高估与低估相匹配，然后取平均值。计算x ^ 0.8：四条指令，错误〜+ 3％。计算x ^ -0.4：一个rsqrtps。（这已经足够准确了，但是确实牺牲了以零工作的能力。）计算x ^ 0.4：一个mulps。计算x ^ -0.2：一个rsqrtps。计算x ^ 2：一个mulps。计算x ^ 3：一个mulps。 x ^ 2.4 = x ^ 2 * x ^ 0.4：1mulps。这是高估了。 x ^ 2.4 = x ^ 3 * x ^ -0.4 * x ^ -0.2：两个mulps。这是低估了。平均上述价格：一分20英镑，一分14英镑。指令计数：14，包括两次转换（延迟= 5）和两次倒数平方根估计（吞吐量= 4）。为了正确地取平均值，我们希望通过估计的预期误差对其进行加权。低估将误差提高到0.6与0.4的幂，因此我们希望它是错误的1.5倍。加权不添加任何说明；可以在预因子中完成。调用系数a：a ^ 0.5 = 1.5 a ^ -0.75，a = 1.38316186。最终误差约为0.015％，比初始“ 22”结果好2个数量级。对于带有ѭ23个源变量和目标变量的繁忙循环，运行时大约需要十几个周期……尽管它与迭代重叠，但实际使用情况还将看到指令级并行性。考虑到SIMD，这是每3个周期一个标量结果的吞吐量！

int main() {
        __m128 const x0 = _mm_set_ps( 0.01,1,5,1234.567 );
        std::printf( \"Input: %,x0 );

        // Approx 5% accuracy from one call. Always an overestimate.
        __m128 x1 = fastpow< 24,10,1 >( x0 );
        std::printf( \"Direct x^2.4: %,x1 );

        // Lower exponents provide lower initial error,but too low causes overflow.
        __m128 xf = fastpow< 8,int( 1.38316186 * 1e9 ),int( 1e9 ) >( x0 );
        std::printf( \"1.38 x^0.8: %,xf );

        // Imprecise 4-cycle sqrt is still far better than fastpow,good enough.
        __m128 xfm4 = _mm_rsqrt_ps( xf );
        __m128 xf4 = _mm_mul_ps( xf,xfm4 );

        // Precisely calculate x^2 and x^3
        __m128 x2 = _mm_mul_ps( x0,x0 );
        __m128 x3 = _mm_mul_ps( x2,x0 );

        // Overestimate of x^2 * x^0.4
        x2 = _mm_mul_ps( x2,xf4 );

        // Get x^-0.2 from x^0.4. Combine with x^-0.4 into x^-0.6 and x^2.4.
        __m128 xfm2 = _mm_rsqrt_ps( xf4 );
        x3 = _mm_mul_ps( x3,xfm4 );
        x3 = _mm_mul_ps( x3,xfm2 );

        std::printf( \"x^2 * x^0.4: %,x2 );
        std::printf( \"x^3 / x^0.6: %,x3 );
        x2 = _mm_mul_ps( _mm_add_ps( x2,x3 ),_mm_set1_ps( 1/ 1.960131704207789 ) );
        // Final accuracy about 0.015%,200x better than x^0.8 calculation.
        std::printf( \"average = %,x2 );
}

好吧...抱歉，我无法尽快发布。并将其扩展到x ^ 1 / 2.4作为练习； v）。统计更新我实现了一个小的测试工具，并实现了与上述相对应的两个x（5⁄12）案例。

#include <cstdio>
#include <xmmintrin.h>
#include <cmath>
#include <cfloat>
#include <algorithm>
using namespace std;

template< unsigned expnum,unsigned coeffden >
__m128 fastpow( __m128 arg ) {
    __m128 ret = arg;
//  std::printf( \"arg = %,ret );
    // Apply a constant pre-correction factor.
    ret = _mm_mul_ps( ret,_mm_set1_ps( exp2( 127. * expden / expnum - 127. )
        * pow( 1. * coeffnum / coeffden,1. * expden / expnum ) ) );
//  std::printf( \"scaled = %,ret );
    // Reinterpret arg as integer to obtain logarithm.
    asm ( \"cvtdq2ps %1,%0\" : \"=x\" (ret) : \"x\" (ret) );
//  std::printf( \"log = %,ret );
    // Multiply logarithm by power.
    ret = _mm_mul_ps( ret,_mm_set1_ps( 1. * expnum / expden ) );
//  std::printf( \"powered = %,ret );
    // Convert back to \"integer\" to exponentiate.
    asm ( \"cvtps2dq %1,%0\" : \"=x\" (ret) : \"x\" (ret) );
//  std::printf( \"result = %,ret );
    return ret;
}

__m128 pow125_4( __m128 arg ) {
    // Lower exponents provide lower initial error,but too low causes overflow.
    __m128 xf = fastpow< 4,int( 1e9 ) >( arg );

    // Imprecise 4-cycle sqrt is still far better than fastpow,good enough.
    __m128 xfm4 = _mm_rsqrt_ps( xf );
    __m128 xf4 = _mm_mul_ps( xf,xfm4 );

    // Precisely calculate x^2 and x^3
    __m128 x2 = _mm_mul_ps( arg,arg );
    __m128 x3 = _mm_mul_ps( x2,arg );

    // Overestimate of x^2 * x^0.4
    x2 = _mm_mul_ps( x2,xf4 );

    // Get x^-0.2 from x^0.4,and square it for x^-0.4. Combine into x^-0.6.
    __m128 xfm2 = _mm_rsqrt_ps( xf4 );
    x3 = _mm_mul_ps( x3,xfm4 );
    x3 = _mm_mul_ps( x3,xfm2 );

    return _mm_mul_ps( _mm_add_ps( x2,_mm_set1_ps( 1/ 1.960131704207789 * 0.9999 ) );
}

__m128 pow512_2( __m128 arg ) {
    // 5/12 is too small,so compute the sqrt of 10/12 instead.
    __m128 x = fastpow< 5,6,int( 0.992245 * 1e9 ),int( 1e9 ) >( arg );
    return _mm_mul_ps( _mm_rsqrt_ps( x ),x );
}

__m128 pow512_4( __m128 arg ) {
    // 5/12 is too small,so compute the 4th root of 20/12 instead.
    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
    __m128 xf = fastpow< 2,3,int( 0.629960524947437 * 1e9 ),int( 1e9 ) >( arg );
    __m128 xover = _mm_mul_ps( arg,xf );

    __m128 xfm1 = _mm_rsqrt_ps( xf );
    __m128 x2 = _mm_mul_ps( arg,arg );
    __m128 xunder = _mm_mul_ps( x2,xfm1 );

    // sqrt2 * over + 2 * sqrt2 * under
    __m128 xavg = _mm_mul_ps( _mm_set1_ps( 1/( 3 * 0.629960524947437 ) * 0.999852 ),_mm_add_ps( xover,xunder ) );

    xavg = _mm_mul_ps( xavg,_mm_rsqrt_ps( xavg ) );
    xavg = _mm_mul_ps( xavg,_mm_rsqrt_ps( xavg ) );
    return xavg;
}

__m128 mm_succ_ps( __m128 arg ) {
    return (__m128) _mm_add_epi32( (__m128i) arg,_mm_set1_epi32( 4 ) );
}

void test_pow( double p,__m128 (*f)( __m128 ) ) {
    __m128 arg;

    for ( arg = _mm_set1_ps( FLT_MIN / FLT_EPSILON );
            ! isfinite( _mm_cvtss_f32( f( arg ) ) );
            arg = mm_succ_ps( arg ) ) ;

    for ( ; _mm_cvtss_f32( f( arg ) ) == 0;
            arg = mm_succ_ps( arg ) ) ;

    std::printf( \"Domain from %g\\n\",_mm_cvtss_f32( arg ) );

    int n;
    int const bucket_size = 1 << 25;
    do {
        float max_error = 0;
        double total_error = 0,cum_error = 0;
        for ( n = 0; n != bucket_size; ++ n ) {
            float result = _mm_cvtss_f32( f( arg ) );

            if ( ! isfinite( result ) ) break;

            float actual = ::powf( _mm_cvtss_f32( arg ),p );

            float error = ( result - actual ) / actual;
            cum_error += error;
            error = std::abs( error );
            max_error = std::max( max_error,error );
            total_error += error;

            arg = mm_succ_ps( arg );
        }

        std::printf( \"error max = %8g\\t\" \"avg = %8g\\t\" \"|avg| = %8g\\t\" \"to %8g\\n\",max_error,cum_error / n,total_error / n,_mm_cvtss_f32( arg ) );
    } while ( n == bucket_size );
}

int main() {
    std::printf( \"4 insn x^12/5:\\n\" );
    test_pow( 12./5,& fastpow< 12,1059,1000 > );
    std::printf( \"14 insn x^12/5:\\n\" );
    test_pow( 12./5,& pow125_4 );
    std::printf( \"6 insn x^5/12:\\n\" );
    test_pow( 5./12,& pow512_2 );
    std::printf( \"14 insn x^5/12:\\n\" );
    test_pow( 5./12,& pow512_4 );
}

输出：

4 insn x^12/5:
Domain from 1.36909e-23
error max =      inf    avg =      inf  |avg| =      inf    to 8.97249e-19
error max =  2267.14    avg =  139.175  |avg| =  139.193    to 5.88021e-14
error max = 0.123606    avg = -0.000102963  |avg| = 0.0371122   to 3.85365e-09
error max = 0.123607    avg = -0.000108978  |avg| = 0.0368548   to 0.000252553
error max =  0.12361    avg = 7.28909e-05   |avg| = 0.037507    to  16.5513
error max = 0.123612    avg = -0.000258619  |avg| = 0.0365618   to 1.08471e+06
error max = 0.123611    avg = 8.70966e-05   |avg| = 0.0374369   to 7.10874e+10
error max =  0.12361    avg = -0.000103047  |avg| = 0.0371122   to 4.65878e+15
error max = 0.123609    avg =      nan  |avg| =      nan    to 1.16469e+16
14 insn x^12/5:
Domain from 1.42795e-19
error max =      inf    avg =      nan  |avg| =      nan    to 9.35823e-15
error max = 0.000936462 avg = 2.0202e-05    |avg| = 0.000133764 to 6.13301e-10
error max = 0.000792752 avg = 1.45717e-05   |avg| = 0.000129936 to 4.01933e-05
error max = 0.000791785 avg = 7.0132e-06    |avg| = 0.000129923 to  2.63411
error max = 0.000787589 avg = 1.20745e-05   |avg| = 0.000129347 to   172629
error max = 0.000786553 avg = 1.62351e-05   |avg| = 0.000132397 to 1.13134e+10
error max = 0.000785586 avg = 8.25205e-06   |avg| = 0.00013037  to 6.98147e+12
6 insn x^5/12:
Domain from 9.86076e-32
error max = 0.0284339   avg = 0.000441158   |avg| = 0.00967327  to 6.46235e-27
error max = 0.0284342   avg = -5.79938e-06  |avg| = 0.00897913  to 4.23516e-22
error max = 0.0284341   avg = -0.000140706  |avg| = 0.00897084  to 2.77556e-17
error max = 0.028434    avg = 0.000440504   |avg| = 0.00967325  to 1.81899e-12
error max = 0.0284339   avg = -6.11153e-06  |avg| = 0.00897915  to 1.19209e-07
error max = 0.0284298   avg = -0.000140597  |avg| = 0.00897084  to 0.0078125
error max = 0.0284371   avg = 0.000439748   |avg| = 0.00967319  to      512
error max = 0.028437    avg = -7.74294e-06  |avg| = 0.00897924  to 3.35544e+07
error max = 0.0284369   avg = -0.000142036  |avg| = 0.00897089  to 2.19902e+12
error max = 0.0284368   avg = 0.000439183   |avg| = 0.0096732   to 1.44115e+17
error max = 0.0284367   avg = -7.41244e-06  |avg| = 0.00897923  to 9.44473e+21
error max = 0.0284366   avg = -0.000141706  |avg| = 0.00897088  to 6.1897e+26
error max = 0.485129    avg = -0.0401671    |avg| = 0.048422    to 4.05648e+31
error max = 0.994932    avg = -0.891494 |avg| = 0.891494    to 2.65846e+36
error max = 0.999329    avg =      nan  |avg| =      nan    to       -0
14 insn x^5/12:
Domain from 2.64698e-23
error max =  0.13556    avg = 0.00125936    |avg| = 0.00354677  to 1.73472e-18
error max = 0.000564988 avg = 2.51458e-06   |avg| = 0.000113709 to 1.13687e-13
error max = 0.000565065 avg = -1.49258e-06  |avg| = 0.000112553 to 7.45058e-09
error max = 0.000565143 avg = 1.5293e-06    |avg| = 0.000112864 to 0.000488281
error max = 0.000565298 avg = 2.76457e-06   |avg| = 0.000113713 to       32
error max = 0.000565453 avg = -1.61276e-06  |avg| = 0.000112561 to 2.09715e+06
error max = 0.000565531 avg = 1.42628e-06   |avg| = 0.000112866 to 1.37439e+11
error max = 0.000565686 avg = 2.71505e-06   |avg| = 0.000113715 to 9.0072e+15
error max = 0.000565763 avg = -1.56586e-06  |avg| = 0.000112415 to 1.84467e+19

我怀疑更精确的5/12的准确性受到rsqrt操作的限制。 ,另一个答案，因为这与我以前的答案有很大不同，而且进展很快。相对误差为3e-8。想要更高的准确性？再添加几个切比雪夫条款。最好保持序为奇数，因为这会使2 ^n-ε和2 ^ n + epsil之间有小的不连续性。

#include <stdlib.h>
#include <math.h>

// Returns x^(5/12) for x in [1,2),to within 3e-8 (relative error).
// Want more precision? Add more Chebychev polynomial coefs.
double pow512norm (
   double x)
{
   static const int N = 8;

   // Chebychev polynomial terms.
   // Non-zero terms calculated via
   //   integrate (2/pi)*ChebyshevT[n,u]/sqrt(1-u^2)*((u+3)/2)^(5/12)
   //   from -1 to 1
   // Zeroth term is similar except it uses 1/pi rather than 2/pi.
   static const double Cn[N] = { 
       1.1758200232996901923,0.16665763094889061230,-0.0083154894939042125035,0.00075187976780420279038,// Wolfram alpha doesn\'t want to compute the remaining terms
      // to more precision (it times out).
      -0.0000832402,0.0000102292,-1.3401e-6,1.83334e-7};

   double Tn[N];

   double u = 2.0*x - 3.0;

   Tn[0] = 1.0;
   Tn[1] = u;
   for (int ii = 2; ii < N; ++ii) {
      Tn[ii] = 2*u*Tn[ii-1] - Tn[ii-2];
   }   

   double y = 0.0;
   for (int ii = N-1; ii >= 0; --ii) {
      y += Cn[ii]*Tn[ii];
   }   

   return y;
}


// Returns x^(5/12) to within 3e-8 (relative error).
double pow512 (
   double x)
{
   static const double pow2_512[12] = {
      1.0,pow(2.0,5.0/12.0),pow(4.0,pow(8.0,pow(16.0,pow(32.0,pow(64.0,pow(128.0,pow(256.0,pow(512.0,pow(1024.0,pow(2048.0,5.0/12.0)
   };

   double s;
   int iexp;

   s = frexp (x,&iexp);
   s *= 2.0;
   iexp -= 1;

   div_t qr = div (iexp,12);
   if (qr.rem < 0) {
      qr.quot -= 1;
      qr.rem += 12;
   }

   return ldexp (pow512norm(s)*pow2_512[qr.rem],5*qr.quot);
}

附录：这是怎么回事？根据请求，以下内容说明了以上代码的工作方式。总览上面的代码定义了两个函数，29ѭ和double pow512 (double x)。后者是该套件的入口。这是用户代码应调用以计算x ^（5/12）的函数。函数pow512norm(x)使用Chebyshev多项式近似x ^（5/12），但仅适用于[1,2]范围内的x。（对于超出此范围的x值使用pow512norm(x)，结果将是垃圾。）函数pow512(x)将传入的x分成对(double s,int n)，使得that36ѭ和1≤s<2。将n进一步划分为(int q,unsigned int r)，使得n = 12*q + r和r小于12，这使我将找到x ^（5/12）的问题分成了几个部分：（42ѭ通过（uv）^ a =（u ^ a）（v ^ a）获得正u，v和实数a。 s^(5/12)是通过ѭ44calculated计算得出的。 substitution45ѭ通过替代。 2^(12*q+r)=(2^(12*q))*(2^r)至u^(a+b)=(u^a)*(u^b)表示正u，实数a，b。 (2^(12*q+r))^(5/12)=(2^(5*q))*((2^r)^(5/12))通过更多的操纵。 (2^r)^(5/12)由查找表pow2_512计算出。计算pow512norm(s)*pow2_512[qr.rem]，我们快到了。这里的qr.rem是在上面的步骤3中计算的r值。所需要做的就是将其乘以54来得到所需的结果。这正是数学库函数ldexp所做的。函数近似这里的目标是提出一个容易计算的f（x）= x ^（5/12）近似值，它对于当前的问题“足够好”。在某种意义上，我们的近似值应接近f（x）。修辞性问题：“接近”是什么意思？两种相互竞争的解释是最小化均方误差与最小化最大绝对误差。我将使用股市类比来描述两者之间的区别。假设您想存钱以备将来退休。如果您二十多岁，最好的办法是投资股票或股票市场基金。这是因为在足够长的时间范围内，股票市场平均优于任何其他投资计划。但是，我们都已经看到，把钱投入股票是一件非常糟糕的事情。如果您是五十多岁或六十岁（如果您想退休，则为四十岁），则需要更加保守地进行投资。这些下降可能会严重影响您的退休投资组合。返回函数近似值：作为某种近似值的使用者，您通常担心最坏情况的错误，而不是“平均”的性能。使用一些构造的近似值以“在平均水平”（例如最小二乘）给出最佳性能，墨菲定律规定，在性能远不如平均水平的情况下，您的程序将花费大量时间使用近似值。您想要的是一个极大极小值近似值，它可以最大程度地减小某些域上的最大绝对误差。一个好的数学库将采用minimax方法而不是最小二乘方法，因为这使数学库的作者可以保证其库的性能。数学库通常使用多项式或有理多项式在a≤x≤b的某个域上近似某个函数f（x）。假设函数f（x）在这个域上是解析的，并且您想用一个次数为N的多项式p（x）来近似该函数。对于给定的次数N，存在一些神奇的唯一多项式p（x）使得p（x x）-f（x）在[a，b]上具有N + 2极值，因此这些N + 2极值的绝对值都彼此相等。找到这个神奇的多项式p（x）是函数逼近器的圣杯。我没有找到适合您的圣杯。我改用Chebyshev近似值。第一种Chebyshev多项式是一组正交（但不是正交）的多项式，在函数逼近时具有一些非常好的特征。 Chebyshev逼近通常非常接近该神奇多项式p（x）。（实际上，确实发现圣杯多项式的Remez交换算法通常以Chebyshev近似开始。） pow512范数（x）此函数使用Chebyshev逼近来找到一些近似x ^（5/12）的多项式p *（x）。在这里，我使用p *（x）来区分这种切比雪夫逼近与上述神奇多项式p（x）。切比雪夫近似值p *（x）很容易找到；发现p（x）是熊。 Chebyshev近似值p *（x）是sum_i Cn [i] * Tn（i，x），其中Cn [i]是Chebyshev系数，Tn（i，x）是在x处求值的Chebyshev多项式。我用Wolfram alpha为我找到了切比雪夫系数Cn。例如，计算得出Cn[1]。输入框后的第一个框具有所需的答案，在这种情况下为0.166658。那不是我想要的位数。单击\“更多数字\”，瞧，您会得到更多的数字。 Wolfram alpha是免费的。它会执行多少计算是有限制的。它达到了更高阶条件的限制。（如果您购买或可以使用mathematica，则可以高精度地计算那些高阶系数。） Chebyshev多项式Tn（x）在数组ѭ58中计算。除了给出非常接近魔术多项式p（x）的值之外，使用Chebyshev逼近的另一个原因是可以轻松计算这些Chebyshev多项式的值：从Tn[0]=1和Tn[1]=x开始，然后迭代计算Tn[i]=2*x*Tn[i-1] - Tn[i-2]。（我在代码中使用\'ii \'而不是\'i \'作为索引变量。我从不使用\'i \'作为变量名。英语中有多少个单词带有\'i \'单词中有两个连续的'i'？） pow512（x） pow512是用户代码应调用的功能。我已经在上面描述了此功能的基础。其他一些细节：数学库函数frexp(x)返回输入x的有效位数s和指数iexp。（次要问题：我希望s在1到2之间与pow512norm一起使用，但frexp返回一个介于0.5到1之间的值。）数学库函数ѭ70the返回商和整数除以一个骤升环。最后，我使用数学库函数ldexp将这三个部分放在一起形成最终答案。 ,伊恩·斯蒂芬森（Ian Stephenson）编写了这段代码，声称胜过pow()。他将想法描述如下：战俘基本上是使用记录：pow(a,b)=x(logx(a)*b)。所以我们需要快速记录和快速指数-它 x是多少都没有关系，所以我们使用2。诀窍是浮点数数字已经是对数样式格式：

a=M*2E

以双方的日志为准：

log2(a)=log2(M)+E

或更简单地说：

log2(a)~=E

换句话说，如果我们采取浮动数字的点表示，以及提取我们拥有的指数这是一个很好的起点作为其日志。原来，当我们通过按摩位模式来做到这一点，尾数最终给一个好误差的近似值效果很好。简单就足够了照明计算，但如果需要更好的东西，然后可以提取尾数，并使用它来计算二次校正因子这是非常准确的。 , 首先，使用浮点数在当今大多数机器上都不会买太多东西。实际上，双打可以更快。您的功效1.0 / 2.4为5/12或1/3 *（1 + 1/4）。即使这叫cbrt（一次）和sqrt（两次！），它仍然是使用pow（）的两倍。（优化：-O3，编译器：i686-apple-darwin10-g ++-4.2.1）。

#include <math.h> // cmath does not provide cbrt; C99 does.
double xpow512 (double x) {
  double cbrtx = cbrt(x);
  return cbrtx*sqrt(sqrt(cbrtx));
}

, 这可能无法回答您的问题。 2.4f和1/2.4f使我非常怀疑，因为这些正是用于在sRGB和线性RGB颜色空间之间转换的能力。因此，您实际上可能正在尝试对此进行优化。我不知道，这就是为什么这可能无法回答您的问题。如果是这种情况，请尝试使用查找表。就像是：

__attribute__((aligned(64))
static const unsigned short SRGB_TO_LINEAR[256] = { ... };
__attribute__((aligned(64))
static const unsigned short LINEAR_TO_SRGB[256] = { ... };

void apply_lut(const unsigned short lut[256],unsigned char *src,...

如果使用的是16位数据，请进行相应更改。无论如何，我都会将该表设置为16位，因此在处理8位数据时，如有必要，您可以抖动结果。如果您的数据是以浮点数开头的，那么这显然不会很好地工作，但是将sRGB数据存储在浮点数中并没有任何意义，因此您也可以转换为16位/ 8位，然后进行从线性到sRGB的转换。（sRGB之所以不能作为浮点数，是因为HDR应该是线性的，而sRGB仅便于存储在磁盘上或在屏幕上显示，而不便于操作。） , 二项式级数确实说明了一个恒定的指数，但是只有将所有输入标准化为[1,2）时，您才可以使用它。（请注意，它计算（1 + x）^ a）。您必须进行一些分析，才能确定需要多少个术语才能达到所需的准确性。 , 我将回答您真正想问的问题，即如何进行快速sRGB <->线性RGB转换。为了精确有效地做到这一点，我们可以使用多项式逼近。 sollya生成了以下多项式逼近，其最坏情况下的相对误差为0.0144％。

inline double poly7(double x,double a,double b,double c,double d,double e,double f,double g,double h) {
    double ab,cd,ef,gh,abcd,efgh,x2,x4;
    x2 = x*x; x4 = x2*x2;
    ab = a*x + b; cd = c*x + d;
    ef = e*x + f; gh = g*x + h;
    abcd = ab*x2 + cd; efgh = ef*x2 + gh;
    return abcd*x4 + efgh;
}

inline double srgb_to_linear(double x) {
    if (x <= 0.04045) return x / 12.92;

    // Polynomial approximation of ((x+0.055)/1.055)^2.4.
    return poly7(x,0.15237971711927983387,-0.57235993072870072762,0.92097986411523535821,-0.90208229831912012386,0.88348956209696805075,0.48110797889132134175,0.03563925285274562038,0.00084585397227064120);
}

inline double linear_to_srgb(double x) {
    if (x <= 0.0031308) return x * 12.92;

    // Piecewise polynomial approximation (divided by x^3)
    // of 1.055 * x^(1/2.4) - 0.055.
    if (x <= 0.0523) return poly7(x,-6681.49576364495442248881,1224.97114922729451791383,-100.23413743425112443219,6.60361150127077944916,0.06114808961060447245,-0.00022244138470139442,0.00000041231840827815,-0.00000000035133685895) / (x*x*x);

    return poly7(x,-0.18730034115395793881,0.64677431008037400417,-0.99032868647877825286,1.20939072663263713636,0.33433459165487383613,-0.01345095746411287783,0.00044351684288719036,-0.00000664263587520855) / (x*x*x);
}

sollya输入用于生成多项式：

suppressmessage(174);
f = ((x+0.055)/1.055)^2.4;
p0 = fpminimax(f,7,[|D...|],[0.04045;1],relative);
p = fpminimax(f/(p0(1)+1e-18),relative);
print(\"relative:\",dirtyinfnorm((f-p)/f,[s;1]));
print(\"absolute:\",dirtyinfnorm((f-p),[s;1]));
print(canonical(p));

s = 0.0523;
z = 3;
f = 1.055 * x^(1/2.4) - 0.055;

p = fpminimax(1.055 * (x^(z+1/2.4) - 0.055*x^z/1.055),[0.0031308;s],relative)/x^z;
print(\"relative:\",[0.0031308;s]));
print(\"absolute:\",[0.0031308;s]));
print(canonical(p));

p = fpminimax(1.055 * (x^(z+1/2.4) - 0.055*x^z/1.055),[s;1],[s;1]));
print(canonical(p));

, 对于2.4的指数，如果表不够准确（基本上是一个大的日志表），则可以为所有2.4值创建一个查询表并使用lirp或更高阶的函数来填充inbetweem值。或者，将值的平方*值乘以2/5，这可以从函数的前半部分开始取初始平方值，然后再取第5个根。对于第5个根，您可以进行牛顿运算或使用其他一些快速逼近器，尽管老实说，一旦达到这一点，您最好自己使用适当的缩写系列函数执行exp和log函数。 , 以下是可以与任何快速计算方法一起使用的想法。它能否帮助事情更快发展取决于您的数据如何到达。您可以使用这样的事实：如果您知道x和ѭ84use，则可以使用幂的变化率来计算较小的delta的pow(x + delta,n)的合理近似值，并进行一次乘法和加法（或多或少）。如果您输入的幂函数的连续值足够接近，则将分摊多次函数调用中的精确计算的全部成本。请注意，您不需要额外的战俘计算就可以得到导数。您可以将其扩展为使用二阶导数，以便可以使用二次函数，这将增加您可以使用的delta，并且仍然具有相同的精度。 ,因此，传统上，powf(x,p) = x^p是通过将x改写为x=2^(log2(x))而得到powf(x,p) = 2^(p*log2(x))，从而将问题转换为two92ѭ和log2()的两个近似值。这具有使用较大功率with94ѭ的优势，但是缺点是，这对于恒定功率p以及超过指定输入范围0 ≤ x ≤ 1而言不是最佳解决方案。当幂p > 1时，答案是在the96ѭ上的平凡极大极小多项式，p = 12/5 = 2.4就是这种情况，如下所示：

float pow12_5(float x){
    float mp;
    // Minimax horner polynomials for x^(5/12),Note: choose the accurarcy required then implement with fma() [Fused Multiply Accumulates]
    // mp = 0x4.a84a38p-12 + x * (-0xd.e5648p-8 + x * (0xa.d82fep-4 + x * 0x6.062668p-4)); // 1.13705697e-3
    mp = 0x1.117542p-12 + x * (-0x5.91e6ap-8 + x * (0x8.0f50ep-4 + x * (0xa.aa231p-4 + x * (-0x2.62787p-4))));  // 2.6079002e-4
    // mp = 0x5.a522ap-16 + x * (-0x2.d997fcp-8 + x * (0x6.8f6d1p-4 + x * (0xf.21285p-4 + x * (-0x7.b5b248p-4 + x * 0x2.32b668p-4))));  // 8.61377e-5
    // mp = 0x2.4f5538p-16 + x * (-0x1.abcdecp-8 + x * (0x5.97464p-4 + x * (0x1.399edap0 + x * (-0x1.0d363ap0 + x * (0xa.a54a3p-4 + x * (-0x2.e8a77cp-4))))));  // 3.524655e-5
    return(mp);
}

但是，当p < 1时，边界0 ≤ x ≤ 1上的极小极大逼近不能适当收敛到所需的精度。一种选择[不是真的]是重写问题ѭ103，其中m=1,2,3是一个正整数，使新的功率近似值p > 1，但这会引入固有的减法运算。但是，还有另一个选择是将输入ѭ34分解为浮点指数和尾数形式：

x = mx* 2^(ex) where 1 ≤ mx < 2
y = x^(5/12) = mx^(5/12) * 2^((5/12)*ex),let ey = floor(5*ex/12),k = (5*ex) % 12
  = mx^(5/12) * 2^(k/12) * 2^(ey)

现在，1 ≤ mx < 2的mx^(5/12)的极小极大逼近比以前快得多，没有除法，但是for110ѭ需要12点LUT。代码如下：

float powk_12LUT[] = {0x1.0p0,0x1.0f38fap0,0x1.1f59acp0,0x1.306fep0,0x1.428a3p0,0x1.55b81p0,0x1.6a09e6p0,0x1.7f910ep0,0x1.965feap0,0x1.ae89fap0,0x1.c823ep0,0x1.e3437ep0};
float pow5_12(float x){
    union{float f; uint32_t u;} v,e2;
    float poff,m,e,ei;
    int xe;

    v.f = x;
    xe = ((v.u >> 23) - 127);

    if(xe < -127) return(0.0f);

    // Calculate remainder k in 2^(k/12) to find LUT
    e = xe * (5.0f/12.0f);
    ei = floorf(e);
    poff = powk_12LUT[(int)(12.0f * (e - ei))];

    e2.u = ((int)ei + 127) << 23;   // Calculate the exponent
    v.u = (v.u & ~(0xFFuL << 23)) | (0x7FuL << 23); // Normalize exponent to zero

    // Approximate mx^(5/12) on [1,with appropriate degree minimax
    // m = 0x8.87592p-4 + v.f * (0x8.8f056p-4 + v.f * (-0x1.134044p-4));    // 7.6125e-4
    // m = 0x7.582138p-4 + v.f * (0xb.1666bp-4 + v.f * (-0x2.d21954p-4 + v.f * 0x6.3ea0cp-8));  // 8.4522726e-5
    m = 0x6.9465cp-4 + v.f * (0xd.43015p-4 + v.f * (-0x5.17b2a8p-4 + v.f * (0x1.6cb1f8p-4 + v.f * (-0x2.c5b76p-8))));   // 1.04091259e-5
    // m = 0x6.08242p-4 + v.f * (0xf.352bdp-4 + v.f * (-0x7.d0c1bp-4 + v.f * (0x3.4d153p-4 + v.f * (-0xc.f7a42p-8 + v.f * 0x1.5d840cp-8))));    // 1.367401e-6

    return(m * poff * e2.f);
}

const非整数指数的pow的优化？

如何解决const非整数指数的pow的优化？

解决方法

相关推荐