CMPE 310 Lecture 6,

    <- previous    index    next ->

Lecture 6 Branching and loops

UGH! Note that < and > are interpreted by HTML,
thus source code, physically included, has & gt ; rather than symbol.
Be sure to download from link, not from HTML.

The basic integer compare instruction is  "cmp"
Following this instruction is typically one of:
  JL  label  ; jump on less than  "<"
  JLE label  ; jump on less than or equal "<="
  JG  label  ; jump on greater than ">"
  JGE label  ; jump on greater than or equal ">="
  JE  label  ; jump on equal "=="
  JNE label  ; jump on not equal "!="

After many integer arithmetic instructions
  JZ  label  ; jump on zero
  JNZ label  ; jump on non zero
  JS  label  ; jump on sign plus
  JNS label;  ; jump on sign not plus

Note: Use 'cmp' rather than 'sub' for comparison.
Overflow can occur on subtraction resulting in sign inversion.

if-then-else in assembly language
Convert a "C" 'if' statement to nasm assembly ifint_64.asm
The significant features are:
1) use a compare instruction for the test
2) put a label on the start of the false branch (e.g. false1:)
3) put a label after the end of the 'if' statement (e.g. exit1:)
4) choose a conditional jump that goes to the false part
5) put an unconditional jump to (e.g. exit1:) at the end of the true part

; ifint_64.asm  code ifint_64.c for nasm 
; /* ifint_64.c an 'if' statement that will be coded for nasm */
; #include <stdio.h>
; int main()
; {
;   long int a=1;
;   long int b=2;
;   long int c=3;
;   if(a<b)
;     printf("true a < b \n");
;   else
;     printf("wrong on a < b \n");
;   if(b>c)
;     printf("wrong on b > c \n");
;   else
;     printf("false b > c \n");
;   return 0;
;}
; result of executing both "C" and assembly is:
; true a < b
; false b > c 
	
	global	main		; define for linker
        extern	printf		; tell linker we need this C function
        section .data		; Data section, initialized variables
a:	dq 1
b:	dq 2
c:	dq 3
fmt1:   db "true a < b ",10,0
fmt2:   db "wrong on a < b ",10,0
fmt3:   db "wrong on b > c ",10,0
fmt4:   db "false b > c ",10,0

	section .text
main:	push	rbp		; set up stack
	mov	rax,[a]		; a
	cmp	rax,[b]		; compare a to b
	jge	false1		; choose jump to false part
	; a < b sign is set
        mov	rdi, fmt1	; printf("true a < b \n"); 
        call    printf	
        jmp	exit1		; jump over false part
false1:	;  a < b is false 
        mov	rdi, fmt2	; printf("wrong on a < b \n");
        call    printf
exit1:				; finished 'if' statement

	mov	rax,[b]		; b
	cmp	rax,[c]		; compare b to c
	jle	false2		; choose jump to false part
	; b > c sign is not set
        mov	rdi, fmt3	; printf("wrong on b > c \n");
        call    printf	
        jmp	exit2		; jump over false part
false2:	;  b > c is false 
        mov	rdi, fmt4	; printf("false b > c \n");
        call    printf
exit2:				; finished 'if' statement

	pop	rbp		; restore stack
	mov	rax,0		; normal, no error, return value
	ret			; return 0;



loop in assembly language
Convert a "C" loop to nasm assembly  loopint_64.asm
The significant features are:
1) "C" long int  is 8-bytes, thus  dd1[1] becomes  dword [dd1+8]
                              dd1[99] becomes  dword [dd1+8*99]

2) "C" long int  is 8-bytes, thus  dd1[i]; i++; becomes  add edi,8
   since "i" is never stored, the register  edi  holds "i"

3) the 'cmp' instruction sets flags that control the jump instruction.
   cmp  edi,8*99   is like  i<99 in "C"
   jne  loop1      jumps if register  edi  is not  8*99

; loopint_64.asm  code loopint.c for nasm 
; /* loopint_64.c a very simple loop that will be coded for nasm */
; #include <stdio.h>
; int main()
; {
;   long int dd1[100]; // 100 could be 3 gigabytes
;   long int i;        // must be long for more than 2 gigabytes
;   dd1[0]=5; /* be sure loop stays 1..98 */
;   dd1[99]=9;
;   for(i=1; i<99; i++) dd1[i]=7;
;   printf("dd1[0]=%ld, dd1[1]=%ld, dd1[98]=%ld, dd1[99]=%ld\n",
;           dd1[0], dd1[1], dd1[98],dd1[99]);
;   return 0;
;}
; execution output is dd1[0]=5, dd1[1]=7, dd1[98]=7, dd1[99]=9
 
	section	.bss
dd1:	resq	100			; reserve 100 long int
i:	resq	1			; actually unused, kept in register

        section .data			; Data section, initialized variables
fmt:    db "dd1[0]=%ld, dd1[1]=%ld, dd1[98]=%ld, dd1[99]=%ld",10,0
	
        extern	printf			; the C function, to be called

	section .text
	global	main
main:	push	rbp			; set up stack

	mov	qword [dd1],5	   	; dd1[0]=5;  memory to memory
	mov	qword [dd1+99*8],9 	; dd1[99]=9; indexed 99 qword

	mov 	rdi, 1*8		; i=1; index, will move by 8 bytes
loop1:	mov 	qword [dd1+rdi],7	; dd1[i]=7;
	add	rdi, 8			; i++;  8 bytes 
	cmp	rdi, 8*99		; i<99
	jne	loop1			; loop until incremented i=99
	
	mov	rdi, fmt		; pass address of format
	mov	rsi, qword [dd1]	; dd1[0]   first list parameter
	mov	rdx, qword [dd1+1*8]	; dd1[1]   second list parameter
	mov	rcx, qword [dd1+98*8]	; dd1[98]  third list parameter
	mov	r8,  qword [dd1+99*8]	; dd1[99]  fourth list parameter
	mov	rax, 0			; no xmm used
        call    printf			; Call C function

	pop	rbp			; restore stack
	mov	rax,0			; normal, no error, return value
	ret				; return 0;

	
logic operations in assembly language
Previously, integer arithmetic in "C" was converted to
NASM assembly language. The following is very similar
(cut and past) of intarith_64.asm to intlogic_64.asm that
shows the "C" operators "&" and, "|" or, "^" xor, "~" not.

intlogic_64.asm

; intlogic_64.asm    show some simple C code and corresponding nasm code
;                    the nasm code is one sample, not unique
;
; compile:	nasm -f elf64 -l intlogic_64.lst  intlogic_64.asm
; link:		gcc -m64 -o intlogic_64  intlogic_64.o
; run:		./intlogic_64 > intlogic_64.out
;
; the output from running intlogic_64.asm and intlogic.c is
; c=5  , a=3, b=5, c=15
; c=a&b, a=3, b=5, c=1
; c=a|b, a=3, b=5, c=7
; c=a^b, a=3, b=5, c=6
; c=~a , a=3, b=5, c=-4
;
;The file  intlogic.c  is:
;  #include <stdio.h>
;  int main()
;  { 
;    long int a=3, b=5, c;
;
;    c=15;
;    printf("%s, a=%d, b=%d, c=%d\n","c=5  ", a, b, c);
;    c=a&b; /* and */
;    printf("%s, a=%d, b=%d, c=%d\n","c=a&b", a, b, c);
;    c=a|b; /* or */
;    printf("%s, a=%d, b=%d, c=%d\n","c=a|b", a, b, c);
;    c=a^b; /* xor */
;    printf("%s, a=%d, b=%d, c=%d\n","c=a^b", a, b, c);
;    c=~a;  /* not */
;    printf("%s, a=%d, b=%d, c=%d\n","c=~a", a, b, c);
;    return 0;
; }

        extern printf		; the C function to be called

%macro	pabc 1			; a "simple" print macro
	section .data
.str	db	%1,0		; %1 is first actual in macro call
	section .text
        mov	rdi, fmt        ; address of format string
	mov	rsi, .str 	; users string
	mov	rdx, [a]	; long int a
	mov	rcx, [b]	; long int b 
	mov	r8, [c]		; long int c
	mov     rax, 0	        ; no xmm used
        call    printf          ; Call C function
%endmacro
	
	section .data  		; preset constants, writeable
a:	dq	3		; 64-bit variable a initialized to 3
b:	dq	5		; 64-bit variable b initializes to 4
fmt:    db "%s, a=%ld, b=%ld, c=%ld",10,0 ; format string for printf
	
	section .bss 		; unitialized space
c:	resq	1		; reserve a 64-bit word

	section .text		; instructions, code segment
	global	 main		; for gcc standard linking
main:				; label
	push	rbp		; set up stack
	
lit5:				; c=5;
	mov	rax,15	 	; 5 is a literal constant
	mov	[c],rax		; store into c
	pabc	"c=5  "		; invoke the print macro
	
andb:				; c=a&b;
	mov	rax,[a]	 	; load a
	and	rax,[b]		; and with b
	mov	[c],rax		; store into c
	pabc	"c=a&b"		; invoke the print macro
	
orw:				; c=a-b;
	mov	rax,[a]	 	; load a
	or	rax,[b]		; logical or with b
	mov	[c],rax		; store into c
	pabc	"c=a|b"		; invoke the print macro
	
xorw:				; c=a^b;
	mov	rax,[a]	 	; load a
	xor	rax,[b] 	; exclusive or with b
	mov	[c],rax		; store result in c
	pabc	"c=a^b"		; invoke the print macro
	
notw:				; c=~a;
	mov	rax,[a]	 	; load c
	not	rax	 	; not, complement
	mov	[c],rax		; store result into c
	pabc	"c=~a "		; invoke the print macro

	pop	rbp		; restore stack
	mov     rax,0           ; exit code, 0=normal
	ret			; main returns to operating system


loops in assembly language
One significant use of loops is to evaluate polynomials and
convert numbers from one base to another.
(Yes, this is related to project 1 for CMPE 310)

The following program has three loops.

Loop3 (h3loop) uses Horners method to evaluate a polynomial,
       using 'rdi' as an index, 'rcx' and 'loop' to do the loop.
       a_0 is first in the array, n=4.

Loop4 (h4loop) uses Horners method, with data order optimized,
      using 'rcx' as both index and loop counter, to get a
      three instruction loop.
      a_4 is first in the array, n=4.

Loop5 (h5loop) uses Horners method to evaluate a polynomial
      using double precision floating point. Note 8 byte
      increment and quad word to xmm0, to printf.


Horners method to evaluate polynomials in assembly language
Study horner_64.asm to understand
the NASM coding of the loops.

; horner_64.asm  Horners method of evaluating polynomials
;
; given a polynomial  Y = a_n X^n + a_n-1 X^n-1 + ... a_1 X + a_0
; a_n is the coefficient 'a' with subscript n. X^n is X to nth power
; compute y_1 = a_n * X + a_n-1
; compute y_2 = y_1 * X + a_n-2
; compute y_i = y_i-1 * X + a_n-i   i=3..n
; thus    y_n = Y = value of polynomial 
;
; in assembly language:
;   load some register with a_n, multiply by X
;   add a_n-1, multiply by X, add a_n-2, multiply by X, ...
;   finishing with the add  a_0
;
; output from execution:
; a  6319
; aa 6319
; af 6.319000e+03

	extern	printf
	section	.data
	global	main

	section	.data
fmta:	db	"a  %ld",10,0
fmtaa:	db	"aa %ld",10,0
fmtflt:	db	"af %e",10,0

	section	.text
main:	push	rbp		; set up stack

; evaluate an integer polynomial, X=7, using a count

	section	.data
a:	dq	2,5,-7,22,-9	; coefficients of polynomial, a_n first
X:	dq	7		; X = 7
				; n=4, 8 bytes per coefficient
	section	.text
	mov	rax,[a]		; accumulate value here, get coefficient a_n
	mov	rdi,1		; subscript initialization
	mov	rcx,4		; loop iteration count initialization, n
h3loop:	imul	rax,[X]		; * X     (ignore edx)
	add	rax,[a+8*rdi]	; + a_n-i
	inc	rdi		; increment subscript
	loop	h3loop		; decrement rcx, jump on non zero

	mov	rsi, rax	; print rax
	mov	rdi, fmta	; format
	mov	rax, 0		; no float
	call	printf


; evaluate an integer polynomial, X=7, using a count as index
; optimal organization of data allows a three instruction loop
	
	section	.data
aa:	dq	-9,22,-7,5,2	; coefficients of polynomial, a_0 first
n:	dq	4		; n=4, 8 bytes per coefficient
	section	.text
	mov	rax,[aa+4*8]	; accumulate value here, get coefficient a_n
	mov	rcx,[n]		; loop iteration count initialization, n
h4loop:	imul	rax,[X]		; * X     (ignore edx)
	add	rax,[aa+8*rcx-8]; + aa_n-i
	loop	h4loop		; decrement rcx, jump on non zero

	mov	rsi, rax	; print rax
	mov	rdi, fmtaa	; format
	mov	rax, 0		; no float
	call	printf

; evaluate a double floating polynomial, X=7.0, using a count as index
; optimal organization of data allows a three instruction loop
	
	section	.data
af:	dq	-9.0,22.0,-7.0,5.0,2.0	; coefficients of polynomial, a_0 first
XF:	dq	7.0
Y:	dq	0.0
N:	dd	4

	section	.text
	mov	rcx,[N]		; loop iteration count initialization, n
	fld	qword [af+8*rcx]; accumulate value here, get coefficient a_n
h5loop:	fmul	qword [XF]	; * XF
	fadd	qword [af+8*rcx-8] ; + aa_n-i
	loop	h5loop		; decrement rcx, jump on non zero

	fstp	qword [Y]	; store Y in order to print Y
	movq	xmm0, qword [Y]	; well, may just mov reg
	mov	rdi, fmtflt	; format
	mov	rax, 1		; one float
	call	printf

	pop	rbp		; restore stack
	mov	rax,0		; normal return
	ret			; return

A "C" version with same data, slightly different code sequence.

// horner_64.c long integer and double Horners method of evaluating polynomials
//             everything 64-bit
// given a polynomial  Y = a_n X^n + a_n-1 X^n-1 + ... a_1 X + a_0
// a_n is the coefficient 'a' with subscript n. X^n is X to nth power
// compute y_1 = a_n * X + a_n-1
// compute y_2 = y_1 * X + a_n-2
// compute y_i = y_i-1 * X + a_n-i   i=3..n
// thus    y_n = Y = value of polynomial 

 #include <stdio.h>
int main(int argc, char *argv[])
{
  long int a[]  = {2, 5, -7, 22, -9}; // a_n first
  long int aa[] = {-9, 22, -7, 5, 2}; // aa_0 first
  double af[]   = {-9.0, 22.0, -7.0, 5.0, 2.0}; // af_0 first
  long int n    = 4;
  long int X, Y;
  double XF, YF; 
  long int i;

  // evaluate an integer polynomial a, X=7, using a_n first, count n
  X = 7;
  Y = a[0]*X + a[1];
  for(i=2; i<=n; i++) Y = Y*X + a[i];
  printf("a  %ld\n", Y);

  // evaluate an integer polynomial aa , X=7, using a_0 first, count n
  X = 7;
  Y = aa[n]*X + aa[n-1];
  for(i=n-2; i>=0; i--) Y = Y*X + aa[i];
  printf("aa %ld\n", Y);

  // evaluate a double floating polynomial, X=7.0, using af_0 first, n
  XF = 7.0;
  YF = af[n]*X + af[n-1];
  for(i=n-2; i>=0; i--) YF = YF*XF + af[i];
  printf("af %e\n", YF);

  return 0;
}

Same output:
a  6319
aa 6319
af 6.319000e+03

serial vs parallel, slow vs fast
Multiply hardware, serial


Multiply hardware, parallel











Then for wiring ground and power


Possibly many mask layers


Many complete chips are baked on a wafer