factor: much faster, and very slightly larger isqrt()

function                                             old     new   delta
isqrt_odd                                             70      88     +18

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2017-04-11 07:02:42 +02:00
parent cc1f8ba489
commit 10673c44f1

View File

@ -48,41 +48,17 @@ typedef unsigned long half_t;
static inline half_t isqrt(wide_t N) static inline half_t isqrt(wide_t N)
{ {
half_t x; half_t x;
unsigned shift;
// Never called with N < 1 shift = WIDE_BITS - 2;
//if (N == 0) x = 0;
// return 0; do {
x = (x << 1) + 1;
x = HALF_MAX; if ((wide_t)x * x > (N >> shift))
/* First approximation of x+1 > sqrt(N) - all-ones, half as many bits: x--; /* whoops, that +1 was too much */
* 1xxxxx -> 111 (six bits to three) shift -= 2;
* 01xxxx -> 111 } while ((int)shift >= 0);
* 001xxx -> 011 return x;
* 0001xx -> 011 and so on.
*/
// It is actually not performance-critical at all.
// Can simply start Newton loop with very conservative x=0xffffffff.
//wide_t mask_2bits;
//mask_2bits = TOPMOST_WIDE_BIT | (TOPMOST_WIDE_BIT >> 1);
//while (!(N & mask_2bits)) {
// x >>= 1;
// mask_2bits >>= 2;
//}
dbg("x:%"HALF_FMT"x", x);
for (;;) {
half_t y = (x + N/x) / 2;
dbg("y:%x y^2:%llx", y, (wide_t)y * y);
/*
* "real" y may be one bit wider: 0x100000000 and get truncated to 0.
* In this case, "real" y is > x. The first check below is for this case:
*/
if (y == 0 || y >= x) {
dbg("isqrt(%llx)=%"HALF_FMT"x", N, x);
return x;
}
x = y;
}
} }
static NOINLINE half_t isqrt_odd(wide_t N) static NOINLINE half_t isqrt_odd(wide_t N)