<- previous    index    next ->
The Intel x86-64 has many registers and named sub-registers.
This is why your 16-bit Intel programs will still run.
Here are some that are used in assembly language programming
and debugging (the "dash number" gives the number of bits):
Typically typed lower case.
+---------------------------------+  A register
|RAX-64                           |                          
|    +---------------------------+|  RAX really extended accumulator
|    | EAX-32 +-----------------+||  EAX extended accumulator
|    |        |       AX-16     |||  AX  accumulator
|    |        |+--------+------+|||  (A  multiplicand before multiply)
|    |        ||  AH-8  | AL-8 ||||  (A  lower part of dividend before divide)
|    |        |+--------+------+|||  (A  lower part of product)
|    |        +-----------------+||  (H for high, L for low byte)
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  B register
|RBX-64                           |
|    +---------------------------+|  RBX really extended base pointer
|    | EBX-32 +-----------------+||  (EBX is double word segment)
|    |        |       BX-16     |||  (BX is word segment)
|    |        |+--------+------+|||
|    |        ||  BH-8  | BL-8 ||||
|    |        |+--------+------+|||
|    |        +-----------------+||
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  C register
|RCX-64                           |
|    +---------------------------+|  RCX 64-bit counter
|    | ECX-32 +-----------------+||  (string and loop operations)
|    |        |       CX-16     |||  (ECX is a 32 bit counter)
|    |        |+--------+------+|||  (CX is a 16 bit counter)
|    |        ||  CH-8  | CL-8 ||||  (see  loop   instruction)
|    |        |+--------+------+|||
|    |        +-----------------+||
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  D register
|RDX-64                           |
|    +---------------------------+|  RDX extended EDX extended DX
|    | EDX-32 +-----------------+||  (DX holds the I/O port address for IN and OUT)
|    |        |       DX-16     |||  (D  remainder after divide)
|    |        |+--------+------+|||  (D  upper part of dividend)
|    |        ||  DH-8  | DL-8 ||||  (D  upper part of product)
|    |        |+--------+------+|||
|    |        +-----------------+||
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  Stack Pointer
|RSP-64                           |
|    +---------------------------+|  RSP 64-bit stack pointer
|    | ESP-32     +-------------+||  ESP extended stack pointer
|    |            | SP-16       |||  SP  stack pointer
|    |            +-------------+||  (used by PUSH and POP)
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  Base Pointer
|RBP-64                           |
|    +---------------------------+|  RBP 64-bit base pointer
|    | EBP-32     +-------------+||  EBP extended base pointer
|    |            | BP-16       |||  (by convention, callers stack)
|    |            +-------------+||  (BP in SS segment)
|    +---------------------------+|  We save it, push then pop
+---------------------------------+
+---------------------------------+  Source Index
|RSI-64                           |
|    +---------------------------+|  RSI 64-bit source index
|    | ESI-32     +-------------+||  ESI extended source index
|    |            | SI-16       |||  SI  source index
|    |            +-------------+||  (SI in DS segment)
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  Destination Index
|RDI-64                           |
|    +---------------------------+|  RDI 64-bit destination index
|    | EDI-32     +-------------+||  EDI extended destination index
|    |            | DI-16       |||  DI  destination index
|    |            +-------------+||  (DI in ES segment)
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  Instruction Pointer
|RIP-64                           |
|    +---------------------------+|  RIP 64-bit instruction pointer
|    | EIP-32     +-------------+||  EIP extended instruction pointer
|    |            | IP-16       |||  IP  instruction pointer
|    |            +-------------+||  set by jump and call
|    +---------------------------+|
+---------------------------------+
+---------------------------------+  Flags indicating errors
|RFLAGS-64                        |
|    +---------------------------+|   RFLAGS 64-bit flags
|    | EFLAGS-32  +-------------+||   EFLAGS extended flags
|    |            | FLAGS-16    |||   FLAGS
|    |            +-------------+||   (not a register name!)
|    +---------------------------+|   (must use PUSHF and POPF)
+---------------------------------+
Additional 64-bit registers are R8, R9, R10, R11, R12, R13, R14, R15
128-bit Registers for SSE instructions and printf are  xmm0, ..., xmm15
Additional floating point stack, fld, fst, fstp, st0, st1, ... 80 bit
Use of registers and little endian
see  testreg_64.asm for register syntax 
see  testreg_64.lst for binary encoding
Just a snippet of testreg_64.asm :
	section .data  		; preset constants, writeable
aa8:	db	8		; 8-bit
aa16:	dw	16		; 16-bit
aa32:	dd	32		; 32-bit
aa64:	dq	64		; 64-bit
		
	section .text		; instructions, code segment
				; Each load below targets a narrower name of the same A register.
	mov	rax,[aa64]	; 64-bit load; RAX contains five named registers: RAX, EAX, AX, AH, AL
	mov	eax,[aa32]	; 32-bit load; EAX contains four: EAX, AX, AH, AL (also zeroes upper half of RAX)
	mov	ax,[aa16]	; 16-bit load into AX; the bits of RAX above AX are left unchanged
	mov	ah,[aa8]	; 8-bit load into AH, the high byte of AX
	mov	al,[aa8]	; 8-bit load into AL, the low byte of AX
Just a snippet of testreg_64.lst
(line number, hex address in segment, hex data, assembly language)
((note byte 10 hex is 16 decimal, 20 hex is 32 decimal, etc))
((( note little endian, least significant byte first.)))
     8 00000000 08                      aa8:	db	8
     9 00000001 1000                    aa16:	dw	16
    10 00000003 20000000                aa32:	dd	32
    11 00000007 4000000000000000        aa64:	dq	64
    24 00000001 488B0425[07000000]      	mov	rax,[aa64]
    25 00000009 8B0425[03000000]        	mov	eax,[aa32]
    26 00000010 668B0425[01000000]      	mov	ax,[aa16]
    27 00000018 8A2425[00000000]        	mov	ah,[aa8]
    28 0000001F 8A0425[00000000]        	mov	al,[aa8]
Oh! Did I forget to mention that Intel is a "little endian" machine?
The bytes are stored backwards to English.
The little end, least significant byte is first, smallest address.
Other registers, which are not extended and remain 16 bits, include:
              +-------------+   CS code segment
              | CS-16       |
              +-------------+
              +-------------+   SS stack segment
              | SS-16       |
              +-------------+
              +-------------+   DS data segment
              | DS-16       |   (current module)
              +-------------+
              +-------------+   ES data segment
              | ES-16       |   (calling module, destination string)
              +-------------+
              +-------------+   FS heap segment
              | FS-16       |
              +-------------+
              +-------------+   GS global segment
              | GS-16       |   (shared)
              +-------------+
There are also 80-bit or more, floating point registers ST0, ..., ST7
(These are actually a stack, note FST vs FSTP etc)
There are also control registers CR0, ..., CR4
There are also debug registers DR0, DR1, DR2, DR3, DR6, DR7
There are also test registers TR3, ...., TR7
Basic NASM syntax
The basic syntax for a line in NASM is:
label:  opcode  operand(s) ; comment
The "label" is a case sensitive user name, followed by a colon.
The label is optional and when not present, indent the opcode.
The label should start in column one of the line.
The label may be on a line with nothing else or a comment.
In assembly language the "label" is an address,
not a value as it is in a compiled language.
The "opcode" is not case sensitive and may be a machine instruction
or an assembler directive (pseudo operation) or a macro call.
Typically, all "opcode" fields are neatly lined up starting in the
same column. Use of "tab" is OK.
Machine instructions may be preceded by a "prefix" such as:
a16, a32, o16, o32, and others.
"operand(s)" depend on the choice of "opcode".
An operand may have several parts separated by commas.
The parts may be a combination of register names, constants,
memory references in brackets [ ], or empty.
Comments are optional, yet encouraged.
Everything from the semicolon to the end of the line is
a comment, ignored by the assembler.
The semicolon may be in column one, making the entire line
a comment. Some editors put in two semicolons; this makes no difference.
Sections or segments:
One specific assembler directive is the "section" or "SECTION"
directive. Four types of section are predefined for ELF format:
        section  .data    ; initialized data
                          ; writeable, not executable
                          ; default alignment 4 bytes (NASM ELF default, align=4)
        section  .bss     ; uninitialized space for data
                          ; writeable, not executable
                          ; default alignment 4 bytes (NASM ELF default, align=4)
        section  .rodata  ; initialized data
                          ; read only, not executable
                          ; default alignment 4 bytes (NASM ELF default, align=4)
        section  .text    ; instructions (code)
                          ; not writeable, executable
                          ; default alignment 16 bytes
        section  other    ; any name other than .data, .bss,
                          ; .rodata, .text
                          ; your stuff
                          ; not executable, not writeable
                          ; default alignment 1 byte
                          ; (use  align=N  on the section line to override a default)
Efficiency and samples
A few comments on efficiency:
My experience is that a good assembly language programmer
can make a small (about 100 lines) "C" program more
efficient than the  gcc  compiler. But, for larger
programs, the compiler will be more efficient.
Exceptions are, for example, the SGI IRIX  cc  compiler
that has super optimization for that specific machine.
For the Intel x86-64 here are some samples in nasm and from gcc
(different syntax but you should be able to recognize the instructions)
Focus on the loop, there is prologue and epilogue code that should
be included, yet was omitted. Note the test has "check" values
at each end of the array. There is no range testing in
either "C" or assembly language.
A simple loop loopint_64.asm
; loopint_64.asm  code loopint.c for nasm 
; /* loopint_64.c a very simple loop that will be coded for nasm */
; #include <stdio.h>
; int main()
; {
;   long int dd1[100]; // 100 could be 3 gigabytes
;   long int i;        // must be long for more than 2 gigabytes
;   dd1[0]=5; /* be sure loop stays 1..98 */
;   dd1[99]=9;
;   for(i=1; i<99; i++) dd1[i]=7;
;   printf("dd1[0]=%ld, dd1[1]=%ld, dd1[98]=%ld, dd1[99]=%ld\n",
;           dd1[0], dd1[1], dd1[98],dd1[99]);
;   return 0;
;}
; execution output is dd1[0]=5, dd1[1]=7, dd1[98]=7, dd1[99]=9
;
; ABI: System V AMD64. Static data is addressed RIP-relative and printf
; is called through the PLT, so the object links whether or not gcc
; builds a position-independent executable.
 
	default rel			; [label] means [rel label] from here on

	section	.bss
dd1:	resq	100			; reserve 100 long int
i:	resq	1			; actually unused, kept in register
        section .data			; Data section, initialized variables
fmt:    db "dd1[0]=%ld, dd1[1]=%ld, dd1[98]=%ld, dd1[99]=%ld",10,0
	
        extern	printf			; the C function, to be called
	section .text
	global	main
main:	push	rbp			; save rbp; rsp is now 16-byte aligned for the call
	lea	rax, [dd1]		; rax = &dd1[0]  (RIP-relative cannot take an index)
	mov	qword [rax],5	   	; dd1[0]=5;  immediate to memory
	mov	qword [rax+99*8],9 	; dd1[99]=9; indexed 99 qword
	mov 	rdi, 1*8		; i=1; byte offset, will move by 8 bytes
loop1:	mov 	qword [rax+rdi],7	; dd1[i]=7;
	add	rdi, 8			; i++;  8 bytes 
	cmp	rdi, 8*99		; i<99
	jne	loop1			; loop until incremented i=99
	
	lea	rdi, [fmt]		; pass address of format
	mov	rsi, qword [rax]	; dd1[0]   first list parameter
	mov	rdx, qword [rax+1*8]	; dd1[1]   second list parameter
	mov	rcx, qword [rax+98*8]	; dd1[98]  third list parameter
	mov	r8,  qword [rax+99*8]	; dd1[99]  fourth list parameter
	mov	rax, 0			; no xmm used (base address no longer needed)
        call    printf wrt ..plt	; Call C function
	pop	rbp			; restore stack
	mov	rax,0			; normal, no error, return value
	ret				; return
The simplest loop in NASM requires use of register  rcx
"C"  for(j=n; j>0; j--) // count down from n, do not use j==0
     {
       a[j] = 0.0;
     }
      section .data
n:    dq  9               ; loop count; j takes the values n, n-1, ..., 1
zero: dq  0.0             ; the double stored into each element
      section .bss
a:    resq 10             ; a[0] .. a[9]; a[0] unused because j never reaches 0
      section .text
      mov rcx, [n]        ; j = 9;  // start loop
jlab: fld  qword [zero]   ; push 0.0 on the floating point stack
      fstp qword [a+8*rcx]  ; a[j] = 0.0;  each  a  is 8 bytes (pops the stack)
      loop jlab           ; j=j-1, jump to jlab: if  j>0 
A loop that counts up requires you to do the increment and compare:
"C"  for(j=0; j<n; j++)  // count up from zero, do not use  j==n
     {
       a[j] = 0.0;
     }
      section .data
n:    dq  9               ; loop limit; j takes the values 0 .. n-1
zero: dq  0.0             ; the double stored into each element
      section .bss
a:    resq 9              ; a[0] .. a[8], 8 bytes each
      section .text
      mov rax,0        ; j = 0;  // start loop
jlab: fld  qword [zero]    ; push 0.0 on the floating point stack
      fstp qword [a+8*rax] ;  a[j] = 0.0; each  a  is 8 bytes
      inc  rax         ;  j = j + 1
      cmp  rax,[n]     ;  j <n ?
      jl jlab          ; jump to jlab: if  j<n   (jl is a signed comparison)
Speed consideration must take into account cache and virtual memory
performance, number of bytes transferred from RAM and clock cycles.
On modern computer architectures, this is almost impossible. For example,
the Pentium 4 translates the 80x86 code into RISC pipeline code and
is actually executing instructions that are different from the
assembly language. Carefully benchmarking complete applications is
about the only conclusive measure of efficiency.
"C" and other programming languages may call subroutines, functions,
procedures written in assembly language. Here is a small sample
using floating point just to show use of ST registers, mentioned in comments.
Main C program test_callf1_64.c
callf1_64.h
// test_callf1_64.c   test  callf1_64.asm 
// nasm -f elf64 -l callf1_64.lst callf1_64.asm
// gcc -m64 -o test_callf1_64 test_callf1_64.c callf1_64.o
// ./test_callf1_64 > test_callf1_64.out
#include "callf1_64.h"
#include <stdio.h>

/* Driver for the assembly routine callf1_64: hands it a two element
   array of doubles and prints the elements after the call. */
int main()
{
  double vals[2] = { 1.0, 2.0 };  /* the array the notes call L */

  printf("test_callf1_64.c using callf1_64.asm\n");

  /* the routine updates the array in place:
     adds 3.0 to element 0 and 4.0 to element 1 */
  callf1_64(vals);

  printf("L[0]=%e, L[1]=%e \n", vals[0], vals[1]);
  return 0;
}
Full with debug callf1_64.asm
Stripped down  callf1_64.asm  with no demo, no debug:
; callf1_64.asm  a basic structure for a subroutine to be called from "C"
; Parameter:   double *L   (arrives in rdi, System V AMD64 convention)
; Result: L[0]=L[0]+3.0  L[1]=L[1]+4.0
; Clobbers:    rax, flags; the floating point stack is left as it was found
        default rel		; address a3, a4 RIP-relative so the object
				; links whether or not gcc builds a PIE
        global callf1_64	; linker must know name of subroutine
        SECTION .data		; Data section, initialized variables
a3:	dq	3.0		; 64-bit variable a3 initialized to 3.0
a4:	dq	4.0		; 64-bit variable a4 initialized to 4.0
	SECTION .text           ; Code section.
callf1_64:			; name must appear as a nasm label
        push	rbp		; save rbp
	mov	rax,rdi		; first, only, in parameter, address
				; add 3.0 to L[0]
	fld	qword [rax] 	; load L[0] (pushed on flt pt stack, st0)
	fadd	qword [a3]	; floating add 3.0 (to st0)
	fstp	qword [rax]	; store into L[0] (pop flt pt stack)
				; add 4.0 to L[1]
	fld	qword [rax+8] 	; load L[1] (pushed on flt pt stack, st0)
	fadd	qword [a4]	; floating add 4.0 (to st0)
	fstp	qword [rax+8]	; store into L[1] (pop flt pt stack)
	pop	rbp	        ; restore callers rbp
        ret			; return
We did not need to save floating point stack, we left it unchanged.
We could have used dt and tword for 80 bit floating point.
Calling printf uses xmm registers.
Simple loop using register  rcx
loop_64.asm
loop_64_asm.out
; loop_64.asm  simple using rcx and loop
;              for(i=9; i>0; i--) A[i] = 0;
; Assemble:	  nasm -f elf64  loop_64.asm
; Link:		  gcc -m64 -o loop_64  loop_64.o
; Run:		  ./loop_64 > loop_64.out
; Output:	  cat loop_64.out
;
; ABI: System V AMD64. Static data is addressed RIP-relative and printf
; is called through the PLT, so the link line above works whether or
; not gcc builds a position-independent executable.
	
        default rel		; [label] means [rel label] from here on
        extern	printf		; the C function, to be called
        section .data		; Data section, initialized variables
fmt:    db "A[%ld]=0", 10, 0  ; The printf format, "\n", terminating 0
	section .bss
A:	resq	10	        ; A[0] .. A[9] in C  A[0] unused
sav:	resq	1		; holds rcx across printf (rcx is caller-saved)
	
        section .text           ; Code section.
        global main		; the standard gcc entry point
main:				; the program label for the entry point
        push    rbp		; set up stack frame, must be aligned
	
	mov	rcx,9		; i = 9
loop1:	lea	rdx,[A]		; base of A, reloaded each pass (printf may clobber rdx)
	mov	qword [rdx+rcx*8],0 ; A[i] = 0
	mov	[sav],rcx       ; printf clobbers rcx  debug printout 
	lea	rdi,[fmt]       ; address of format, standard register rdi
	mov	rsi,rcx         ; value of i, first data, standard register rsi
	mov	rax,0		; no float or double in xmm
        call    printf wrt ..plt ; Call C function
	mov	rcx,[sav]	; get the loop counter back
	
	loop	loop1		; decrement rcx, jump if rcx is not zero
	                        ; [rdx+rcx*8-8] would give  A[8] .. A[0]
	pop	rbp		; restore stack
	mov	rax,0		; normal, no error, return value
	ret			; return
Output in loop_64_asm.out is shown below. (Addressing with [A+rcx*8-8] instead would zero A[8] .. A[0].)
A[9]=0
A[8]=0
A[7]=0
A[6]=0
A[5]=0
A[4]=0
A[3]=0
A[2]=0
A[1]=0
		   
A bunch of information, much you do not need, on  nasm
nasmh.txt  from  nasm -h
    <- previous    index    next ->