ARM code square root routines

Because of the complete lack of a FAQ for the newsgroup comp.sys.arm, and because some questions come up again and again, I decided to archive the latest discussion on square root routines for the ARM processor on the web where people may be able to find it.

I have not explicitly obtained permission from the posters to reproduce their postings. Anyone who objects should mail me at the address given at the end of this web page.

I have felt free to edit the the postings to avoid repetition, to change the formatting and to cut out anything I deemed irrelevant. The order of the routines in this document is more or less the order in which they were posted in this thread. Some of the routines are better than others.

I have no intention of maintaining a FAQ for comp.sys.arm, however, I also have archived another thread on reciprocal square root routines.

4 cycle/bit, 70 cycle worst case ARM code routine

Andreas Joos reposted an article by Dan Maloney which contained the following routine:

;       Optimised 32-bit integer sqrt
;       Calculates square root of R0
;       Result returned in R0, uses R1 and R2
;       Worst case execution time exactly 70 S cycles

.intsqr
        MOV     r1,#0
        CMP     r1,r0,LSR #8
        BEQ     bit8            ; &000000xx numbers

        CMP     r1,r0,LSR #16
        BEQ     bit16           ; &0000xxxx numbers

        CMP     r1,r0,LSR #24
        BEQ     bit24           ; &00xxxxxx numbers

        SUBS    r2,r0,#&40000000
        MOVCS   r0,r2
        MOVCS   r1,#&10000

        ADD     r2,r1,#&4000
        SUBS    r2,r0,r2,LSL#14
        MOVCS   r0,r2
        ADDCS   r1,r1,#&8000

        ADD     r2,r1,#&2000
        SUBS    r2,r0,r2,LSL#13
        MOVCS   r0,r2
        ADDCS   r1,r1,#&4000

.bit24
        ADD     r2,r1,#&1000
        SUBS    r2,r0,r2,LSL#12
        MOVCS   r0,r2
        ADDCS   r1,r1,#&2000

        ADD     r2,r1,#&800
        SUBS    r2,r0,r2,LSL#11
        MOVCS   r0,r2
        ADDCS   r1,r1,#&1000

        ADD     r2,r1,#&400
        SUBS    r2,r0,r2,LSL#10
        MOVCS   r0,r2
        ADDCS   r1,r1,#&800

        ADD     r2,r1,#&200
        SUBS    r2,r0,r2,LSL#9
        MOVCS   r0,r2
        ADDCS   r1,r1,#&400

.bit16
        ADD     r2,r1,#&100
        SUBS    r2,r0,r2,LSL#8
        MOVCS   r0,r2
        ADDCS   r1,r1,#&200

        ADD     r2,r1,#&80
        SUBS    r2,r0,r2,LSL#7
        MOVCS   r0,r2
        ADDCS   r1,r1,#&100

        ADD     r2,r1,#&40
        SUBS    r2,r0,r2,LSL#6
        MOVCS   r0,r2
        ADDCS   r1,r1,#&80

        ADD     r2,r1,#&20
        SUBS    r2,r0,r2,LSL#5
        MOVCS   r0,r2
        ADDCS   r1,r1,#&40

.bit8
        ADD     r2,r1,#&10
        SUBS    r2,r0,r2,LSL#4
        MOVCS   r0,r2
        ADDCS   r1,r1,#&20

        ADD     r2,r1,#&8
        SUBS    r2,r0,r2,LSL#3
        MOVCS   r0,r2
        ADDCS   r1,r1,#&10

        ADD     r2,r1,#&4
        SUBS    r2,r0,r2,LSL#2
        MOVCS   r0,r2
        ADDCS   r1,r1,#&8

        ADD     r2,r1,#&2
        SUBS    r2,r0,r2,LSL#1
        MOVCS   r0,r2
        ADDCS   r1,r1,#&4

        ADD     r2,r1,#&1
        CMP     r0,r2
        ADDCS   r1,r1,#&2

        MOV     r0,r1,LSR#1     ; result in R0

Dan’s original comments on the routine were that

He had optimised the code himself but didn’t know where he had got the original from.
If instead of a true integer routine you needed a fixed point routine, then he had one of those too.

Newton Raphson method

Peter Teichmann suggested that you use iteration formula Y1 = (Y0 + W/Y0)/2 where Y1 = (N+1)th estimation; Y0 = Nth estimation; W = number you want the square root from. You should stop the iteration when Y1 = Y0.

Peter also warns that care should be taken with the first estimation,if you take 1 it is fast for small square roots and slow for big. If you take 65536 it is the other way round.

Peter also gave the following implementation of the method in ARM code with the input and output in R0:

	stmfd	r13!,{r14}

	mov	r3,r0		;r3=W
	mov	r0,#16384*2	;r0=start estimation * 2

loop
	mov	r14,r0,lsr#1
	mov	r0,r3
	mov	r1,r14
	bl	FDIV		;r0=r0/r1  r2=scrapped
	add	r0,r0,r14
	cmp	r14,r0,lsr#1
	bne	loop

	mov	r0,r0,lsr#1

	ldmfd	r13!,{pc}^

However, I suspect there has been a transcription error in the routine. I think the references to r14 inside the loop should be references to r4 as r14 will get trashed by the BL.

He suggested a division routine on his home page.

Peter commented that on a StrongARM 202.8 MHz it takes between 2 and 2.5 seconds with the above start estimation for 1000000 sqrts (2.5 for sqrt(1) and sqrt(4e9)). By my reckoning, that’s between 400 and 500 cycles per sqrt or 12 to 16 cycles per bit (to get it in the same units as the other routines).

5 and 4 cycle per bit C and ARM code routines

Robert Harley suggested the following C routine to calculate floor(sqrt(n)):

unsigned long isqrt(const unsigned long n);

/*-- isqrt -----------------------------------------------------------------*/

unsigned long isqrt(unsigned long n) {
  unsigned long s, t;

#define sqrtBit(k) \
  t = s+(1UL<<(k-1)); t <<= k+1; if (n >= t) { n -= t; s |= 1UL<<k; }

  s = 0UL;
#ifdef __alpha
  if (n >= 1UL<<62) { n -= 1UL<<62; s = 1UL<<31; }
  sqrtBit(30); sqrtBit(29); sqrtBit(28); sqrtBit(27); sqrtBit(26);
  sqrtBit(25); sqrtBit(24); sqrtBit(23); sqrtBit(22); sqrtBit(21);
  sqrtBit(20); sqrtBit(19); sqrtBit(18); sqrtBit(17); sqrtBit(16);
  sqrtBit(15);
#else
  if (n >= 1UL<<30) { n -= 1UL<<30; s = 1UL<<15; }
#endif
  sqrtBit(14); sqrtBit(13); sqrtBit(12); sqrtBit(11); sqrtBit(10);
  sqrtBit(9); sqrtBit(8); sqrtBit(7); sqrtBit(6); sqrtBit(5);
  sqrtBit(4); sqrtBit(3); sqrtBit(2); sqrtBit(1);
  if (n > s<<1) s |= 1UL;

#undef sqrtBit

  return s;
} /* end isqrt */

Robert also reported that cross-compiling the above routine with gcc version 2.8.1 and the options -O2 -fomit-frame-pointer produced the following ARM code:

@ Generated by gcc 2.8.1 for ARM/coff
.text
	.align	0
	.global	_isqrt
_isqrt:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, current_function_anonymous_args = 0
	mov	r2, #0
	cmn	r0, #-1073741823
	addhi	r0, r0, #-1073741824
	movhi	r2, #32768
.L2:
	add	r3, r2, #8192
	mov	r3, r3, asl #15
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #16384
.L3:
	add	r3, r2, #4096
	mov	r3, r3, asl #14
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #8192
.L4:
	add	r3, r2, #2048
	mov	r3, r3, asl #13
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #4096
.L5:
	add	r3, r2, #1024
	mov	r3, r3, asl #12
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #2048
.L6:
	add	r3, r2, #512
	mov	r3, r3, asl #11
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #1024
.L7:
	add	r3, r2, #256
	mov	r3, r3, asl #10
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #512
.L8:
	add	r3, r2, #128
	mov	r3, r3, asl #9
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #256
.L9:
	add	r3, r2, #64
	mov	r3, r3, asl #8
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #128
.L10:
	add	r3, r2, #32
	mov	r3, r3, asl #7
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #64
.L11:
	add	r3, r2, #16
	mov	r3, r3, asl #6
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #32
.L12:
	add	r3, r2, #8
	mov	r3, r3, asl #5
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #16
.L13:
	add	r3, r2, #4
	mov	r3, r3, asl #4
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #8
.L14:
	add	r3, r2, #2
	mov	r3, r3, asl #3
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #4
.L15:
	add	r3, r2, #1
	mov	r3, r3, asl #2
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #2
.L16:
	cmp	r0, r2, asl #1
	movls	r0, r2
	orrhi	r0, r2, #1
	mov	pc, lr

This code takes 5 cycles per bit. Robert noted that the code was obviously suboptimal and replacing each sequence of 5 instructions of the form:

	add	r3, r2, #8192
	mov	r3, r3, asl #15
	cmp	r0, r3
	rsbcs	r0, r3, r0
	orrcs	r2, r2, #16384

with

	add	r3, r2, #8192
	cmp	r0, r3, asl #15
	subcs	r0, r0, r3, asl #15
	orrcs	r2, r2, #16384

reduces the execution time to 4 cycles per bit.

9 cycle/bit C routine

Rich Walker suggested the following C routine. He noted that it was optimal in the sense that compiling it with the Norcroft C compiler produced exactly the assembler he expected.

static __pure unsigned int isqrt(unsigned int i)
{
  int max;
  int j;
  unsigned int a, c, d, s;
  a=0; c=0;
  if (i>=0x10000) max=15; else max=7;
  s = 1<<(max*2);
  j=max;
  do {
    d = c + (a<<(j+1)) + s;
    if (d<=i) { c=d; a |= 1<<j; };
    if (d!=i) { s>>=2; j--; } 
  } while ((j>=0)&&(d!=i));
  return a;
}

He noted that on C compilers other than Norcroft’s thou may want to remove the __pure directive and tell the C compiler in some other way that the routine is pure.

He also noted that the routine would probably benefit from being unrolled.

Wilco Dijkstra suggested optimising the C code to produce the following code which takes 9 cycles/bit:

typedef unsigned int uint32;

uint32 isqrt3(uint32 n)
{
   uint32 root = 0, bit, trial;
   bit = (n >= 0x10000) ? 1<<30 : 1<<14;
   do
   {
      trial = root+bit;
      if (n >= trial)
      {
         n -= trial;
         root = trial+bit;
      }
      root >>= 1;
      bit >>= 2;
   } while (bit);
   return root;
}

3 cycle/bit, 51 cycle total ARM code routine

Wilco Dijkstra provided the following ARM code routine which takes 3 cycles per bit with a 3 cycle overhead for a total of 51 cycles. He even showed how he derived it. Everything from here till the heading at the start of the next section is taken from the original article which was posted on the 20th of June 1996.

Goal

Calculate root = INT (SQRT (N)), for N = 0 .. 2^32 -1

Idea

Successive approximation of the equation (root + delta) ^ 2 = N until delta < 1. If delta < 1 we have the integer part of SQRT (N). Use delta = 2^i for i = 15 .. 0.

Init

Start with root = 0 and delta = 2^15.

Step

If (root + delta) ^ 2 <= N then
  root += delta
endif
delta = delta / 2

This step is repeated until delta = 0. Now root = INT (SQRT (N)).

Idea

(root + delta) ^ 2 = root^2 + 2 * root * delta + delta^2 = root^2 + delta * (2 * root + delta)
Now take M = N - root^2, and update M in each step of the algorithm:

Init

M = N, delta = 2^15

Step

if delta * (2 * root + delta) <= M then
  M -= delta * (2 * root + delta)
  root += delta
endif
delta = delta / 2

We need to calculate delta * (2 * root + delta) fast, without any multiplications. This is possible because delta is a power of 2. In ARM code we might optimize a bit by taking root' = root *2, and using loop unrolling for delta = 2^i.

   MOV   M, N
[ unroll for i = 15 .. 0
   ADD   t, root, #1 << i    ; t = 2 * root + delta
   CMP   M, t, LSL #i        ; if t * delta <= M then
   SUBHS M, M, t, LSL #i     ;   M -= t * delta
   ADDHS root, root, #2 << i ;   root += delta
]
   MOV   root, root, LSR #1

OK, now we have a 4-cycle per bit routine. For 3-cycle per bit we have to leave one instruction out... The idea is to calculate 2 * root + delta successively (so we don't need the ADD t, root, #1 << i instruction). In binary the value 2 * root + delta looks like this:

  xxxx01  ; the xxxx bits are the value of root, the zero is from 2 * root, 
            and the 1 is the delta bit.

Now look at successive values of 2 * root + delta:

  xxxx010000   
  xxxxa01000  ; a is the new bit of root (from root += delta)
  xxxxab0100  ; b new bit of root
  xxxxabc010

It can be seen that the new bit of the new approximation of root is put on the zero bit of the previous approximation, while the delta bit shifts one position. We want to calculate this new approximation in only ONE instruction! If you forget about the delta bit, and shift root to the right, you would get:

   xxxx
  xxxxa
 xxxxab

This is an ADC root, root, root instruction. Now, we use a rotate right, instead of a shift right:

010000xxxx
01000xxxxa
0100xxxxab

This can be done with an ADC root, offset, root, LSL #1 instruction. Take offset = 3^30:

010000xxxx  ; original root
10000xxxx0  ; root LSL #1
10000xxxxa  ; root LSL #1 + carry bit
01000xxxxa  ; ADC root, offset, root, LSL #1

Et voila! Now we have the following ARM routine:

; IN :  n 32 bit unsigned integer
; OUT:  root = INT (SQRT (n))
; TMP:  offset

MOV    offset, #3 << 30
MOV    root, #1 << 30
[ unroll for i = 0 .. 15
CMP    n, root, ROR #2 * i
SUBHS  n, n, root, ROR #2 * i
ADC    root, offset, root, LSL #1
]
BIC    root, root, #3 << 30  ; for rounding add: CMP n, root  ADC root, #1

This routine takes 3 * 16 + 3 = 51 cycles for a root of a 32 bit integer. Of course, there are many other possibilities. The routine can be used with some extra code which uses early termination if n is less than 32 bit. Also fixed point versions are possible, or roots of 64 bit numbers. Even looped versions are no problem: Shift in 8 or 16 bits at a time, so you need only to unroll 4 or 8 times (overhead 6 cycles).

4 cycle/bit C routine

Wilco Dijkstra also provided the following C code which produces optimised ARM code which takes 4 cycles per bit:

typedef unsigned uint32;

#define iter1(N) \
    try = root + (1 << (N)); \
    if (n >= try << (N))   \
    {   n -= try << (N);   \
        root |= 2 << (N); \
    }

uint32 sqrt (uint32 n)
{
    uint32 root = 0, try;
    iter1 (15);    iter1 (14);    iter1 (13);    iter1 (12);
    iter1 (11);    iter1 (10);    iter1 ( 9);    iter1 ( 8);
    iter1 ( 7);    iter1 ( 6);    iter1 ( 5);    iter1 ( 4);
    iter1 ( 3);    iter1 ( 2);    iter1 ( 1);    iter1 ( 0);
    return root >> 1;
}

Extending the precision to 32 bits

All the routines above take a 32 bit integer and give a 16 bit square root. For a lot of purposes, this is sufficient accuracy, but occasionally, more precision is needed.

Jan Vlietinck pointed out that you can if you need more precision, then instead of running an extra 16 iterations, you can instead us an error estimate to obtain the last 16 bits in one go at the cost of a multiply.

He provided an explanation of how the method works and a complete program demonstrating it with floating point numbers. He uses his own 4 cycle/bit square root routine to get the first 16 bits and then extends using his new method.

It might be an interesting exercise to combine this routine with Wilco’s 3 cycle/bit routine for the first 16 bits.

The following is from Jan’s article with only minor formatting changes:

First a quick word how the algorithm works. To find sqr(x) we write the following equality.
x = sqr(x)*sqr(x)

Now we multiply left and right side of the equation with a factor 1+2^-a for a=0 to 15,
x*(1+2^-a) = sqr(x)*sqr(x*(1+2^-a)²)
y = sqr(x)*sqr(x')

The purpose of this is to drive the next x' to 1. We only do the factor multiplication if x' < 1, assuming that x is first normalized so that 1 < x <= 0.25.
If we repeat this process some iterations we end up with:
y = sqr(x)*sqr(1+ err) = y
= sqr(x) + err'

So we find a good approximation for sqr(x), depending on the number of iterations. To reduce the number of iterations one can get a more accurate result earlier with some simple final calculations:
y = sqr(x)*sqr(1+err) =~ sqr(x)*(1+err/2)
So as our final square root value we calculate:
y_final = y/(1+ err/2) =~ y*(1-err/2)
This final correction requires one multiply but allows to increase the accuracy from 16 bits to 32 bits instead of doing the same with an extra 16 iterations.
Straight forward implementing this in BASIC yield following procedure:
DEF PROCroot(X)
  Y=X
  FOR A=0 TO 15
    IF X*(1+2^-A)^2<1 THEN X=X*(1+2^-A)^2:Y=Y*(1+2^-A)
  NEXT
  Y=Y*(1+(1-X)/2)
  PRINT Y
ENDPROC
An efficient ARM implementation of the same yields following instructions to be executed per iteration cycle.
ADDS   T,X,X,LSR #I  
ADDLOS T,T,T,LSR #I
MOVLO  X,T
ADDLO  Y,Y,Y,LSR #I
Iterating this for i=1..16, with X and Y initialized to x of y=sqr(x) gives the first approximation of sqr(x) in Y. A final correction with a MUL gives the desired approximation.
Implementing this in full BBC ARM assembler gives the following routine calculating the square root of a floating point number (BBC format). You can choose either or not for loop unrolling. With unrolling the total cost of this routine is about 80 cycles to calc the square root to full precision of a floating point number with 32 bit mantise.
Does anyone do better on the ARM ?
Jan Vlietinck.
vlietin@intec.rug.ac.be
Btw1: I have another routine that is more suited to calculate the integer part of the square root of a 32-bit integer ...
Btw2: It just springs to my mind that the same tricks could be applied to do divisions, which would lead to an ARM floating point division of an estimated 64 cycles !
Here follows the full program:
MODE 0
UNROLL=TRUE
PROCsqr
PRINT '"Calculating 50000 times the square root of a real"
IF UNROLL THEN
 PRINT "With unrolling main loop"
ELSE
 PRINT "Without unrolling main loop"
ENDIF
REPEAT
 PRINT '"Please enter x >0 "
 PRINT  "================="
 INPUT X
 |man_x=X
 TIME=0
 CALL sqr_x
 T=TIME/100
 PRINT '"Own routine"
 PRINT "sqr(x)= ";|man_sqr_x
 PRINT "Time= ";T;" seconds"'
 PRINT "Basic reference"
 PRINT "sqr(x)= ";SQR(X)
UNTIL FALSE


DEF PROCsqr
X=0:Y=1:T=2:M=3:H=10
DIM Q% 40000
FOR PASS=0 TO 2 STEP 2
P%=Q%
[OPT PASS
.man_x EQUD 0
.exp_x EQUB 0
ALIGN
.man_sqr_x EQUD 0
.exp_sqr_x EQUB 0
ALIGN
.dat1 EQUD 50000
.sqr_x
LDR H,dat1
.lop1
LDR X,man_x
LDRB T,exp_x
ORR X,X,#1<<31           ; Replace sign bit of mantise with MSBit set
MOVS T,T,LSR #1          ; If exponent is odd
MOVCS X,X,LSR #1         ; then scale mantise with 2, decrementing exponent 
ADC T,T,#64              ; and divide new exponent with 2
STRB T,exp_sqr_x
MOV Y,X
]
IF UNROLL THEN
FOR I=1 TO 16
[OPT PASS
ADDS   T,X,X,LSR #I
ADDLOS T,T,T,LSR #I
MOVLO  X,T
ADDLO  Y,Y,Y,LSR #I
]
NEXT I
ELSE
[OPT PASS
MOV M,#0
.for
ADD M,M,#1
ADDS   T,X,X,LSR M
ADDLOS T,T,T,LSR M
MOVLO  X,T
ADDLO  Y,Y,Y,LSR M
ADD M,M,#1
ADDS   T,X,X,LSR M
ADDLOS T,T,T,LSR M
MOVLO  X,T
ADDLO  Y,Y,Y,LSR M
CMP M,#16
BNE for
]
ENDIF
[OPT PASS
MOV T,#1<<31
SUB X,T,X,LSR #1      ; (1-X)/2
MOV T,Y,LSR #16
MUL T,X,T
ADD Y,Y,T,LSR #16     ; SQR(X)=Y+Y*(1-X)/2
ADD Y,Y,#1
BIC Y,Y,#1<<31        ; clear sign bit of mantise
STR Y,man_sqr_x
SUBS H,H,#1
BNE lop1
MOV PC,R14
]
NEXT
ENDPROC

Up to the welcome page.

Comments should be addressed to webmaster@pertinentdetail.org.

Copyright of the original messages belongs with the original authors. I do not claim copyright on reformatting the messages (so, if you have permission to use the original messages, you may use my reformatted versions). Extra comments added when creating this compilation are copyright © 1998 Steven Singer. If you have permission to use the original messages then I grant you permission to use my comments. If you’re trying to trace ownership, the following information may be helpful. These messages were originally posted in the Usenet newsgroup comp.sys.arm in 1998 and the message ID of the first message in the thread was <2c76468f48%David@chescam.demon.co.uk>. At the time of writing (2020), the thread can be found on Google’s Usenet archive by following this link. The first message in this thread included a repost of a message posted in comp.sys.acorn.programmer in 1997 with message ID <na.f94e5f47fd.a702a0dmaloney@argonet.co.uk> available at this link at time of writing. For Wilco Dijkstra’s code, it may be worth reading the email exchange captured in the LICENSE file for WebRTC.