<- previous index next ->
Both integer and floating point arithmetic are demonstrated.
In order to make the source code smaller, a macro is defined
to print out results. The equivalent "C" program is given as
comments.
First, see how to call the "C" library function, printf, to make
it easier to print values:
Look at the file printf1_64.asm
; printf1_64.asm print an integer from storage and from a register
; Assemble: nasm -f elf64 -l printf1_64.lst printf1_64.asm
; Link: gcc -m64 -o printf1_64 printf1_64.o
; Run: ./printf1_64 > printf1_64.out
; Output: a=5, rax=7
; Equivalent C code
; /* printf1.c print a long int, 64-bit, and an expression */
; #include <stdio.h>
; int main()
; {
; long int a=5;
; printf("a=%ld, rax=%ld\n", a, a+2);
; return 0;
; }
; Declare external function
extern  printf                          ; libc routine that formats and writes the text

section .data                           ; initialized, writable storage
a:      dq      5                       ; long int a = 5;
fmt:    db      "a=%ld, rax=%ld", 10, 0 ; printf format, then newline (10) and NUL (0)

section .text
global  main                            ; gcc's startup code calls main

; int main(void)
; Prints a straight from memory and a+2 from a register, then returns 0.
main:
        push    rbp                     ; 8-byte push keeps rsp 16-byte aligned at the call
        lea     rdi, [rel fmt]          ; printf arg 1: format string
        mov     rsi, [rel a]            ; printf arg 2: a, read from storage
        mov     rax, rsi
        add     rax, 2                  ; rax = a+2
        mov     rdx, rax                ; printf arg 3: the value built in rax
        xor     eax, eax                ; al = 0: no xmm registers carry arguments
        call    printf
        pop     rbp                     ; undo the push above
        xor     eax, eax                ; return 0, normal exit
        ret
Printing floating point
Now, we may need to print "float" and "double" and calling printf
gets more complicated. Still easier than doing your own conversion.
Look at the file printf2.asm
Output is printf2.out
; printf2_64.asm use "C" printf on char, string, int, long int, float, double
;
; Assemble: nasm -f elf64 -l printf2_64.lst printf2_64.asm
; Link: gcc -m64 -o printf2_64 printf2_64.o
; Run: ./printf2_64 > printf2_64.out
; Output: cat printf2_64.out
;
; A similar "C" program printf2_64.c
; #include <stdio.h>
; int main()
; {
; char char1='a'; /* sample character */
; char str1[]="mystring"; /* sample string */
; int len=9; /* length of str1, counting the terminating 0 */
; int inta1=12345678; /* sample integer 32-bit */
; long int inta2=12345678900; /* sample long integer 64-bit */
; long int hex1=0x123456789ABCD; /* sample hexadecimal 64-bit*/
; float flt1=5.327e-30; /* sample float 32-bit */
; double flt2=-123.456789e300; /* sample double 64-bit*/
;
; printf("printf2_64: flt2=%e\n", flt2);
; printf("char1=%c, str1=%s, len=%d\n", char1, str1, len);
; printf("char1=%c, str1=%s, len=%d, inta1=%d, inta2=%ld\n",
; char1, str1, len, inta1, inta2);
; printf("hex1=%lX, flt1=%e, flt2=%e\n", hex1, flt1, flt2);
; return 0;
; }
extern  printf                          ; the C function to be called

section .data                           ; Data section
; format strings for printf
fmt2:   db      "printf2: flt2=%e", 10, 0
fmt3:   db      "char1=%c, str1=%s, len=%d", 10, 0
fmt4:   db      "char1=%c, str1=%s, len=%d, inta1=%d, inta2=%ld", 10, 0
fmt5:   db      "hex1=%lX, flt1=%e, flt2=%e", 10, 0

char1:  db      'a'                     ; a character, exactly one byte
str1:   db      "mystring",0            ; a C string, "string" needs 0
len:    equ     $-str1                  ; assemble-time value (counts the 0), not an address
inta1:  dd      12345678                ; 32-bit integer, note dd
inta2:  dq      12345678900             ; 64-bit integer, note dq
hex1:   dq      0x123456789ABCD         ; 64-bit hex constant, note dq
flt1:   dd      5.327e-30               ; 32-bit floating point, note dd
flt2:   dq      -123.456789e300         ; 64-bit floating point, note dq

section .bss
flttmp: resq    1                       ; flt1 widened to a 64-bit double for printf

section .text                           ; Code section.
global  main                            ; "C" main program

; int main(void)
; Prints every datum above through printf and returns 0.
; Integer and pointer arguments travel in rdi, rsi, rdx, rcx, r8, r9;
; doubles travel in xmm0, xmm1; al holds the number of xmm registers
; used by each variadic call.
; Each variable is loaded at its declared width: reading 8 bytes from the
; 1-byte char1 or the 4-byte inta1 would pull in the neighbouring data.
main:
        push    rbp                     ; 8-byte push keeps rsp 16-byte aligned at each call

        ; %e expects a double, so widen the 32-bit flt1 once up front
        fld     dword [rel flt1]        ; the load converts to the 80-bit x87 format
        fstp    qword [rel flttmp]      ; store as 64-bit and pop the x87 stack

        ; printf(fmt2, flt2)
        lea     rdi, [rel fmt2]         ; first arg, format
        movq    xmm0, qword [rel flt2]  ; first double
        mov     eax, 1                  ; 1 xmm register used
        call    printf

        ; printf(fmt3, char1, str1, len)
        lea     rdi, [rel fmt3]         ; first arg, format
        movzx   esi, byte [rel char1]   ; second arg, char: one byte, zero-extended
        lea     rdx, [rel str1]         ; third arg, address of the string
        mov     ecx, len                ; fourth arg, int (an immediate, len is an equ)
        xor     eax, eax                ; no xmm used
        call    printf

        ; printf(fmt4, char1, str1, len, inta1, inta2)
        lea     rdi, [rel fmt4]         ; first arg, format
        movzx   esi, byte [rel char1]   ; second arg, char
        lea     rdx, [rel str1]         ; third arg, address of the string
        mov     ecx, len                ; fourth arg, int
        movsxd  r8, dword [rel inta1]   ; fifth arg, inta1: 32 bits, sign-extended to 64
        mov     r9, [rel inta2]         ; sixth arg, inta2, full 64 bits
        xor     eax, eax                ; no xmm used
        call    printf

        ; printf(fmt5, hex1, (double)flt1, flt2)
        lea     rdi, [rel fmt5]         ; first arg, format
        mov     rsi, [rel hex1]         ; second arg, 64-bit value for %lX
        movq    xmm0, qword [rel flttmp] ; first double, flt1 widened above
        movq    xmm1, qword [rel flt2]  ; second double
        mov     eax, 2                  ; 2 xmm registers used
        call    printf

        pop     rbp                     ; restore stack
        xor     eax, eax                ; exit code, 0=normal
        ret                             ; main returns to operating system
Integer arithmetic
Now, for integer arithmetic, look at the file intarith_64.asm
Output is intarith_64.out
C version is intarith_64.c
Since all the lines use the same format, a macro was created
to do the call on printf.
; intarith_64.asm show some simple C code and corresponding nasm code
; the nasm code is one sample, not unique
;
; compile: nasm -f elf64 -l intarith_64.lst intarith_64.asm
; link: gcc -m64 -o intarith_64 intarith_64.o
; run: ./intarith_64 > intarith_64.out
;
; the output from running intarith_64.asm and intarith.c is:
; c=5 , a=3, b=4, c=5
; c=a+b, a=3, b=4, c=7
; c=a-b, a=3, b=4, c=-1
; c=a*b, a=3, b=4, c=12
; c=c/a, a=3, b=4, c=4
;
;The file intarith.c is:
; /* intarith.c */
; #include <stdio.h>
; int main()
; {
; long int a=3, b=4, c;
; c=5;
; printf("%s, a=%ld, b=%ld, c=%ld\n","c=5 ", a, b, c);
; c=a+b;
; printf("%s, a=%ld, b=%ld, c=%ld\n","c=a+b", a, b, c);
; c=a-b;
; printf("%s, a=%ld, b=%ld, c=%ld\n","c=a-b", a, b, c);
; c=a*b;
; printf("%s, a=%ld, b=%ld, c=%ld\n","c=a*b", a, b, c);
; c=c/a;
; printf("%s, a=%ld, b=%ld, c=%ld\n","c=c/a", a, b, c);
; return 0;
; }
extern  printf                          ; the C function to be called

; pabc "text"
; Prints the quoted text followed by the current values of a, b and c by
; calling printf(fmt4, text, a, b, c).
; The text is placed in .data under a macro-local label (%%str), which is
; unique per expansion, so the macro may be invoked any number of times and
; does not depend on a fresh label preceding each invocation.
; Leaves the assembler in section .text.  printf is free to overwrite the
; caller-saved registers, so treat rax, rcx, rdx, rsi, rdi, r8-r11 as lost.
%macro  pabc 1                          ; a "simple" print macro
        section .data
%%str:  db      %1,0                    ; %1 is first actual in macro call
        section .text
        lea     rdi, [rel fmt4]         ; first arg, format
        lea     rsi, [rel %%str]        ; second arg, the text of this invocation
        mov     rdx, [rel a]            ; third arg
        mov     rcx, [rel b]            ; fourth arg
        mov     r8, [rel c]             ; fifth arg
        xor     eax, eax                ; al = 0, no xmm used
        call    printf                  ; Call C function
%endmacro
section .data                           ; preset constants, writable
a:      dq      3                       ; 64-bit variable a initialized to 3
b:      dq      4                       ; 64-bit variable b initialized to 4
fmt4:   db      "%s, a=%ld, b=%ld, c=%ld",10,0 ; format string for printf

section .bss                            ; uninitialized space
c:      resq    1                       ; reserve a 64-bit word

section .text                           ; instructions, code segment
global  main                            ; for gcc standard linking

; int main(void)
; Walks through c=5, c=a+b, c=a-b, c=a*b and c=c/a on 64-bit signed
; integers, printing a, b and c after every step with the pabc macro.
; Returns 0.  Nothing is kept in registers across a pabc invocation; every
; step reloads its operands from memory.
main:                                   ; label
        push    rbp                     ; 8-byte push keeps rsp 16-byte aligned at each call
lit5:                                   ; c=5;
        mov     qword [rel c], 5        ; 5 is a literal constant
        pabc    "c=5 "                  ; invoke the print macro
addb:                                   ; c=a+b;
        mov     rax, [rel a]            ; load a
        add     rax, [rel b]            ; add b
        mov     [rel c], rax            ; store into c
        pabc    "c=a+b"                 ; invoke the print macro
subb:                                   ; c=a-b;
        mov     rax, [rel a]            ; load a
        sub     rax, [rel b]            ; subtract b
        mov     [rel c], rax            ; store into c
        pabc    "c=a-b"                 ; invoke the print macro
mulb:                                   ; c=a*b;
        mov     rax, [rel a]            ; load a (must be rax for one-operand multiply)
        imul    qword [rel b]           ; signed multiply, 128-bit product in rdx:rax
        mov     [rel c], rax            ; store bottom half of product into c
        pabc    "c=a*b"                 ; invoke the print macro
diva:                                   ; c=c/a;
        mov     rax, [rel c]            ; load c, the low half of the dividend
        cqo                             ; sign-extend rax into rdx; zeroing rdx is only
                                        ; right when the dividend is not negative
        idiv    qword [rel a]           ; divide rdx:rax by a, quotient in rax
        mov     [rel c], rax            ; store quotient into c
        pabc    "c=c/a"                 ; invoke the print macro

        pop     rbp                     ; pop stack
        xor     eax, eax                ; exit code, 0=normal
        ret                             ; main returns to operating system
Note that two registers are used for general multiply and divide.
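             bbbb  [mem]    a product of 64-bits times 64-bits is 128-bits
    imul     bbbb  rax
         --------
    rdx  bbbbbbbb  rax      the upper part of the product is in rdx
                            the lower part of the product is in rax

    rdx  bbbbbbbb  rax      before divide, the upper part of dividend is in rdx
                            the lower part of dividend is in rax
    idiv     bbbb  [mem]    the divisor
         --------
                            after divide, the quotient is in rax
                            the remainder is in rdx

For a signed divide (idiv) the upper part in rdx must be the sign extension
of rax; the cqo instruction does exactly that. Putting zero in rdx gives the
right answer only when the dividend is not negative.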
Floating point arithmetic
Now, for floating point arithmetic, look at the file fltarith_64.asm
Output is fltarith_64.out
C version is fltarith_64.c
Since all the lines use the same format, a macro was created
to do the call on printf.
Note the many similarities to integer arithmetic, yet some basic differences.
; fltarith_64.asm show some simple C code and corresponding nasm code
; the nasm code is one sample, not unique
;
; compile nasm -f elf64 -l fltarith_64.lst fltarith_64.asm
; link gcc -m64 -o fltarith_64 fltarith_64.o
; run ./fltarith_64 > fltarith_64.out
;
; the output from running fltarith and fltarithc is:
; c=5.0, a=3.000000e+00, b=4.000000e+00, c=5.000000e+00
; c=a+b, a=3.000000e+00, b=4.000000e+00, c=7.000000e+00
; c=a-b, a=3.000000e+00, b=4.000000e+00, c=-1.000000e+00
; c=a*b, a=3.000000e+00, b=4.000000e+00, c=1.200000e+01
; c=c/a, a=3.000000e+00, b=4.000000e+00, c=4.000000e+00
; a=i , a=8.000000e+00, b=1.600000e+01, c=1.600000e+01
; a<=b , a=8.000000e+00, b=1.600000e+01, c=1.600000e+01
; b==c , a=8.000000e+00, b=1.600000e+01, c=1.600000e+01
;The file fltarith.c is:
; #include <stdio.h>
; int main()
; {
; double a=3.0, b=4.0, c;
; long int i=8;
;
; c=5.0;
; printf("%s, a=%e, b=%e, c=%e\n","c=5.0", a, b, c);
; c=a+b;
; printf("%s, a=%e, b=%e, c=%e\n","c=a+b", a, b, c);
; c=a-b;
; printf("%s, a=%e, b=%e, c=%e\n","c=a-b", a, b, c);
; c=a*b;
; printf("%s, a=%e, b=%e, c=%e\n","c=a*b", a, b, c);
; c=c/a;
; printf("%s, a=%e, b=%e, c=%e\n","c=c/a", a, b, c);
; a=i;
; b=a+i;
; i=b;
; c=i;
; printf("%s, a=%e, b=%e, c=%e\n","a=i ", a, b, c);
; if(a<=b) printf("%s, a=%e, b=%e, c=%e\n","a<=b ", a, b, c);
; else printf("%s, a=%e, b=%e, c=%e\n","a>b ", a, b, c);
; if(b==c)printf("%s, a=%e, b=%e, c=%e\n","b==c ", a, b, c);
; else printf("%s, a=%e, b=%e, c=%e\n","b!=c ", a, b, c);
; return 0;
; }
extern  printf                          ; the C function to be called

; pabc "text"
; Prints the quoted text followed by the current values of the doubles
; a, b and c by calling printf(fmt, text, a, b, c).
; The two pointers go in rdi and rsi, the three doubles in xmm0-xmm2, and
; al tells the variadic printf that three xmm registers are in use.
; The text is placed in .data under a macro-local label (%%str), which is
; unique per expansion, so the macro may be invoked any number of times and
; does not depend on a fresh label preceding each invocation.
; Leaves the assembler in section .text.
%macro  pabc 1                          ; a "simple" print macro
        section .data
%%str:  db      %1,0                    ; %1 is macro call first actual parameter
        section .text
        lea     rdi, [rel fmt]          ; address of format string
        lea     rsi, [rel %%str]        ; string passed to macro
        movq    xmm0, qword [rel a]     ; first floating point in fmt
        movq    xmm1, qword [rel b]     ; second floating point
        movq    xmm2, qword [rel c]     ; third floating point
        mov     eax, 3                  ; 3 floating point arguments to printf
        call    printf                  ; Call C function
%endmacro
section .data                           ; preset constants, writable
a:      dq      3.0                     ; 64-bit variable a initialized to 3.0
b:      dq      4.0                     ; 64-bit variable b initialized to 4.0
i:      dq      8                       ; a 64 bit integer
five:   dq      5.0                     ; constant 5.0
fmt:    db      "%s, a=%e, b=%e, c=%e",10,0 ; format string for printf

section .bss                            ; uninitialized space
c:      resq    1                       ; reserve a 64-bit word

section .text                           ; instructions, code segment
global  main                            ; for gcc standard linking

; int main(void)
; Walks through c=5.0, c=a+b, c=a-b, c=a*b, c=c/a, the integer/double
; conversions a=i, b=a+i, i=b, c=i, and two comparisons, printing a, b and
; c after each step with the pabc macro.  Returns 0.
; All arithmetic uses the x87 stack.  Every sequence pops what it pushes,
; so the x87 stack is empty at each printf call and at the final ret.
main:                                   ; label
        push    rbp                     ; 8-byte push keeps rsp 16-byte aligned at each call
lit5:                                   ; c=5.0;
        fld     qword [rel five]        ; 5.0 constant
        fstp    qword [rel c]           ; store into c
        pabc    "c=5.0"                 ; invoke the print macro
addb:                                   ; c=a+b;
        fld     qword [rel a]           ; load a (pushed on flt pt stack, st0)
        fadd    qword [rel b]           ; floating add b (to st0)
        fstp    qword [rel c]           ; store into c (pop flt pt stack)
        pabc    "c=a+b"                 ; invoke the print macro
subb:                                   ; c=a-b;
        fld     qword [rel a]           ; load a (pushed on flt pt stack, st0)
        fsub    qword [rel b]           ; floating subtract b (to st0)
        fstp    qword [rel c]           ; store into c (pop flt pt stack)
        pabc    "c=a-b"                 ; invoke the print macro
mulb:                                   ; c=a*b;
        fld     qword [rel a]           ; load a (pushed on flt pt stack, st0)
        fmul    qword [rel b]           ; floating multiply by b (to st0)
        fstp    qword [rel c]           ; store product into c (pop flt pt stack)
        pabc    "c=a*b"                 ; invoke the print macro
diva:                                   ; c=c/a;
        fld     qword [rel c]           ; load c (pushed on flt pt stack, st0)
        fdiv    qword [rel a]           ; floating divide by a (to st0)
        fstp    qword [rel c]           ; store quotient into c (pop flt pt stack)
        pabc    "c=c/a"                 ; invoke the print macro
intflt:                                 ; a=i; b=a+i; i=b; c=i;
        fild    qword [rel i]           ; load integer as floating point
        fst     qword [rel a]           ; a=i; store, value stays in st0
        fadd    st0, st0                ; b=a+i; both operands are the value in st0
        fst     qword [rel b]           ; store sum, value stays in st0
        fistp   qword [rel i]           ; i=b; store as integer and pop
        fild    qword [rel i]           ; c=i; load again from ram (redundant)
        fstp    qword [rel c]           ; store into c and pop
        pabc    "a=i "                  ; invoke the print macro
cmpflt:                                 ; if(a<=b) ... else ...
        fld     qword [rel b]           ; into st0, becomes st1 after the next load
        fld     qword [rel a]           ; in st0
        fcomip  st0, st1                ; compare a with b, pop a
        fstp    st0                     ; pop b too; fstp leaves the flags alone
        ja      cmpfl2                  ; fcomip reports in CF/ZF/PF like an unsigned
                                        ; compare, so use ja, not the signed jg.
                                        ; An unordered result (NaN) sets CF and
                                        ; therefore falls through as well.
        pabc    "a<=b "
        jmp     cmpfl3
cmpfl2:
        pabc    "a>b "
cmpfl3:                                 ; if(b==c) ... else ...
        fld     qword [rel b]           ; reloaded, nothing is kept across printf
        fld     qword [rel c]           ; should equal [b]
        fcomip  st0, st1                ; compare c with b, pop c
        fstp    st0                     ; pop b, x87 stack is empty again
        jne     cmpfl4                  ; ZF clear means the values differ
        pabc    "b==c "
        jmp     cmpfl5
cmpfl4:
        pabc    "b!=c "
cmpfl5:
        pop     rbp                     ; pop stack
        xor     eax, eax                ; exit code, 0=normal
        ret                             ; main returns to operating system
Shift data in a register
Refer to nasmdoc.txt for details.
A brief summary is provided here.
"reg" is an 8-bit, 16-bit or 32-bit or 64-bit register
"count" is a number of bits to shift
"right" moves contents of the register to the right, makes it smaller
"left" moves contents of the register to the left, makes it bigger
SAL reg,count shift arithmetic left
SAR reg,count shift arithmetic right (sign extension)
SHL reg,count shift left (logical, zero fill)
SHR reg,count shift right (logical, zero fill)
ROL reg,count rotate left
ROR reg,count rotate right
SHLD reg1,reg2,count shift left double-register
SHRD reg1,reg2,count shift right double-register
An example of using the various shifts is in: shift_64.asm
Output is shift_64.out
Just to make it easy to check, we keep all shift amounts a multiple
of 4, 4 bits per hex digit in output.
; shift_64.asm the nasm code is one sample, not unique
;
; compile: nasm -f elf64 -l shift_64.lst shift_64.asm
; link: gcc -m64 -o shift_64 shift_64.o
; run: ./shift_64 > shift_64.out
;
; the output from running shift.asm (zero filled) is:
; shl rax,4, old rax=ABCDEF0987654321, new rax=BCDEF09876543210,
; shl rax,8, old rax=ABCDEF0987654321, new rax=CDEF098765432100,
; shr rax,4, old rax=ABCDEF0987654321, new rax= ABCDEF098765432,
; sal rax,8, old rax=ABCDEF0987654321, new rax=CDEF098765432100,
; sar rax,4, old rax=ABCDEF0987654321, new rax=FABCDEF098765432,
; rol rax,4, old rax=ABCDEF0987654321, new rax=BCDEF0987654321A,
; ror rax,4, old rax=ABCDEF0987654321, new rax=1ABCDEF098765432,
; shld rdx,rax,8, old rdx:rax=0,ABCDEF0987654321,
; new rax=ABCDEF0987654321 rdx= AB,
; shl rax,8 , old rdx:rax=0,ABCDEF0987654321,
; new rax=CDEF098765432100 rdx= AB,
; shrd rdx,rax,8, old rdx:rax=0,ABCDEF0987654321,
; new rax=ABCDEF0987654321 rdx=2100000000000000,
; shr rax,8 , old rdx:rax=0,ABCDEF0987654321,
; new rax= ABCDEF09876543 rdx=2100000000000000,
extern  printf                          ; the C function to be called

; prt "text"
; Prints the quoted text and the value currently in rax by calling
; printf(fmt, text, rax).  Invoke it right after the shift, while rax
; still holds the result.
; The text is placed in .data under a macro-local label (%%str), which is
; unique per expansion, so the macro may be invoked any number of times and
; does not depend on a fresh label preceding each invocation.
; Leaves the assembler in section .text.  rax is overwritten here and printf
; is free to overwrite the other caller-saved registers.
%macro  prt 1                           ; old and new rax
        section .data
%%str:  db      %1,0                    ; %1 is which shift string
        section .text
        lea     rdi, [rel fmt]          ; address of format string
        lea     rsi, [rel %%str]        ; callers string
        mov     rdx, rax                ; new value, copied before rax is reused
        xor     eax, eax                ; al = 0, no floating point
        call    printf                  ; Call C function
%endmacro
; prt2 "text"
; Prints the quoted text and the values currently in rax and rdx by calling
; printf(fmt2, text, rax, rdx).
; The text is placed in .data under a macro-local label (%%str), which is
; unique per expansion, so the macro may be invoked any number of times and
; does not depend on a fresh label preceding each invocation.
; Leaves the assembler in section .text.  rax and rdx are overwritten here
; and printf is free to overwrite the other caller-saved registers, so save
; both beforehand if they are needed afterwards.
%macro  prt2 1                          ; old and new rax,rdx
        section .data
%%str:  db      %1,0                    ; %1 is which shift
        section .text
        lea     rdi, [rel fmt2]         ; address of format string
        lea     rsi, [rel %%str]        ; callers string
        mov     rcx, rdx                ; new rdx, copied before rdx is reused below
        mov     rdx, rax                ; new rax
        xor     eax, eax                ; al = 0, no floating point
        call    printf                  ; Call C function
%endmacro
section .bss
raxsave: resq   1                       ; keeps rax across a printf call
rdxsave: resq   1                       ; keeps rdx across a printf call

section .data                           ; preset constants, writable
b64:    dq      0xABCDEF0987654321      ; the pattern every example starts from
fmt:    db      "%s, old rax=ABCDEF0987654321, new rax=%16lX, ",10,0 ; format string
fmt2:   db      "%s, old rdx:rax=0,ABCDEF0987654321,",10," new rax=%16lX rdx=%16lX, ",10,0

section .text                           ; instructions, code segment
global  main                            ; for gcc standard linking

; int main(void)
; Applies each shift and rotate instruction to the pattern in b64 and prints
; the result with the prt / prt2 macros.  Every shift count is a multiple of
; 4 so the movement shows up as whole hex digits.  Returns 0.
main:   push    rbp                     ; 8-byte push keeps rsp 16-byte aligned at each call

        ; ---- single-register shifts and rotates, result printed from rax ----
shl1:   mov     rax, [rel b64]
        shl     rax, 4                  ; logical left, one hex digit
        prt     "shl rax,4 "
shl4:   mov     rax, [rel b64]
        shl     rax, 8                  ; logical left, two hex digits
        prt     "shl rax,8 "
shr4:   mov     rax, [rel b64]
        shr     rax, 4                  ; logical right, zero fill
        prt     "shr rax,4 "
sal4:   mov     rax, [rel b64]
        sal     rax, 8                  ; arithmetic left
        prt     "sal rax,8 "
sar4:   mov     rax, [rel b64]
        sar     rax, 4                  ; arithmetic right, sign bit is copied in
        prt     "sar rax,4 "
rol4:   mov     rax, [rel b64]
        rol     rax, 4                  ; rotate left
        prt     "rol rax,4 "
ror4:   mov     rax, [rel b64]
        ror     rax, 4                  ; rotate right
        prt     "ror rax,4 "

        ; ---- double-register left shift: shld moves bits into rdx only ----
shld4:  mov     rax, [rel b64]
        xor     edx, edx                ; rdx = 0, the register receiving bits
        shld    rdx, rax, 8
        mov     [rel rdxsave], rdx      ; both registers are lost in the print call
        mov     [rel raxsave], rax
        prt2    "shld rdx,rax,8"
shla:   mov     rdx, [rel rdxsave]      ; bring both registers back
        mov     rax, [rel raxsave]
        shl     rax, 8                  ; finish double shift, both registers
        prt2    "shl rax,8 "

        ; ---- double-register right shift: shrd moves bits into rdx only ----
shrd4:  mov     rax, [rel b64]
        xor     edx, edx                ; rdx = 0, the register receiving bits
        shrd    rdx, rax, 8
        mov     [rel rdxsave], rdx      ; both registers are lost in the print call
        mov     [rel raxsave], rax
        prt2    "shrd rdx,rax,8"
shra:   mov     rdx, [rel rdxsave]      ; bring both registers back
        mov     rax, [rel raxsave]
        shr     rax, 8                  ; finish double shift, both registers
        prt2    "shr rax,8 "

        pop     rbp                     ; restore stack
        xor     eax, eax                ; exit code, 0=normal
        ret                             ; main returns to operating system
First project is assigned.
You may want to do this in Lab this Friday.
www.cs.umbc.edu/~squire/cmpe310_proj.shtml
Instructions and data come from the cache
The "cache" is very high speed memory on the CPU chip.
Typical CPUs can get a word out of the cache every clock.
In order to be as fast as the logic on the CPU, the cache
cannot be as large as the main memory. Typical cache sizes
are hundreds of kilobytes to a few megabytes.
There is typically a level 1 instruction cache and a level 1
data cache. These would be in the blocks on our project
schematic labeled instruction memory and data memory.
Then, there is typically a level 2 unified cache that is
larger and may be slower than the level 1 caches. Unified
means it is used for both instructions and data.
Some computers have a level 3 cache that is larger and
slower than the level 2 cache. Multi core computers
have at least a L1 instruction cache and a L1 data cache
for every core. Some have a L3 unified cache that is
available to all cores. Thus data can go from one core
to another without going through RAM.
+-----------+   +-----------+
| L1 Icache |   | L1 Dcache |
+-----------+   +-----------+
      |               |
+---------------------------+
|     L2 unified cache      |
+---------------------------+
              |
          +------+
          | RAM  |
          +------+
              |
          +------+
          | Disc |  or Solid State Drive, SSD
          +------+
The goal of the computer system is to use the cache for instructions
and data in order to execute instructions as fast as possible.
Typical RAM requires 5 to 10 clocks to get an instruction or
data word. A typical CPU does prefetching and branch prediction
to bring instructions into the cache in order to minimize
stalls waiting for instructions. You will simulate a cache and
the associated stalls in part 3 of your project.
Intel IA-64 cache structure, page 3
IA-64 Itanium
An approximate hierarchy is:
            size       response
CPU                    0.5 ns    2 GHz clock
L1 cache    .032MB     0.5 ns    one for instructions, another for data
L2 cache    4MB        1.0 ns
RAM         4000MB     4.0 ns
disk        500000MB   4.0 ms = 4,000,000 ns
A program is loaded from disk, into RAM, then as needed
into L2 cache, then as needed into L1 cache, then as needed
into the CPU pipelines.
1) The CPU initiates the request by sending the L1 cache an address.
If the L1 cache has the value at that address, the value is quickly
sent to the CPU.
2) If the L1 cache does not have the value, the address is passed to
the L2 cache. If the L2 cache has the value, the value is quickly
passed to the L1 cache. The L1 cache passes the value to the CPU.
3) If the L2 cache does not have the value at the address, the
address is passed to a memory controller that must access RAM
in order to get the value. The value passes from RAM, through
the memory controller to the L2 cache then to the L1 cache then
to the CPU.
This may seem tedious yet each level is optimized to provide good
performance for the total system. One reason the system is fast is
because of wide data paths. The RAM data path may be 128-bits or
256-bits wide. This wide data path may continue through the
L2 cache and L1 cache. The cache is organized in blocks
(lines or entries may be used in place of the word blocks)
that provide for many bytes of data to be accessed in parallel.
When reading from a cache, it is like combinational logic, it
is not clocked. When writing into a cache it must write on
a clock edge.
A cache receives an address, a computer address, a binary number.
The parts of the cache are all powers of two. The basic unit of
an address is a byte. For our study, four bytes, one word, will
always be fetched from the cache. When working the homework
problems be sure to read the problem carefully to determine if
the addresses given are byte addresses or word addresses.
It will be easiest and less error prone if all addresses are
converted to binary for working the homework.
The basic elements of a cache are:
A valid bit: This is a 1 if values are in the cache block
A tag field: This is the upper part of the address for
the values in the cache block.
Cache block: The values that may be instructions or data
Here is the absolutely simplest cache with one word blocks
<- previous index next ->