CMPE 310 Lecture 11,

    <- previous    index    next ->

Lecture 11 Privileged instructions

The Intel 80x86 processors have privilege levels.
There are instructions that can only be executed at the highest
privilege level, CPL = 0. This would be reserved for the
operating system in order to prevent the average user from
causing chaos. e.g. The average user could issue a HLT instruction
to halt the machine and thus every process would be dead.
Other CPL=0 only instructions include:
  CLTS  Clear Task Switched flag in cr0
  INVD  Invalidate cache (without writing it back)
  INVLPG Invalidate a translation lookaside buffer (TLB) entry
  WBINVD Write Back and Invalidate cache

It should be obvious that, when running a multiprocessing operating
system, there are many instructions that only the operating
system should use.

The operating system controls the resources of the computer,
including RAM, I/O and user processes. Some sample protections
are tested by the following sample programs:

A few simple tests to be sure protections are working.
These three programs result in segfault, intentionally.
safe_64.asm store into read only section
; safe_64.asm   for testing protections within sections
;               (attempts a store into the read only .rodata section)
; Assemble:	nasm -f elf64  safe_64.asm
; Link:		gcc -o safe_64  safe_64.o
; Run:		./safe_64
; Output:
; it should stop with a system caught error;
; "Bad, still running" is printed only if the store was NOT trapped
;
; ABI: System V AMD64.  Data is addressed RIP-relative and printf is
; called through the PLT so that the plain gcc link line above also
; works when gcc builds position independent executables by default.

        default rel		; [label] operands become RIP-relative

        global main		; the standard gcc entry point
        extern	printf		; the C function, to be called

        section .rodata		; read only data section, constants
a:	dq	5		; long int a=5;
fmt:    db "Bad, still running",10,0


        section .text           ; Code section. not writeable
main:				; the program label for the entry point
        push    rbp		; save rbp, rsp is now 16-byte aligned for the call

	mov	rax,0x789abcde
	mov	[a],rax		; should be error, read only section  !!!!!!!!!!

				; only reached when the protection failed
        lea	rdi,[fmt]	; address of format string
	xor	eax,eax		; al=0, no vector registers passed to printf
	call	printf wrt ..plt

        pop     rbp
	xor	eax,eax		; normal, no error, return value
	ret			; return

        ; mark the stack non-executable, newer GNU ld warns without this note
        section .note.GNU-stack noalloc noexec nowrite progbits
	

safe1_64.asm store into code section
; safe1_64.asm  for testing protections within sections
;               (attempts a store into the .text code section)
; Assemble:	nasm -f elf64   safe1_64.asm
; Link:		gcc -o safe1_64  safe1_64.o
; Run:		./safe1_64
; Output:
; it should stop with a system caught error;
; "Bad, still running" is printed only if the store was NOT trapped
;
; ABI: System V AMD64.  Data is addressed RIP-relative and printf is
; called through the PLT so that the plain gcc link line above also
; works when gcc builds position independent executables by default.

        default rel		; [label] operands become RIP-relative

        global main		; the standard gcc entry point
        extern	printf		; the C function, to be called

        section .rodata		; read only data section, constants
a:	dq	5		; long int a=5;  (not used by this test)
fmt:    db "Bad, still running",10,0


        section .text           ; Code section. not writeable
main:				; the program label for the entry point
        push    rbp		; save rbp, rsp is now 16-byte aligned for the call

	mov	rax,0x789abcde
	mov	[main],rax	; should be error, can not change code .text !!!!!!

				; only reached when the protection failed
        lea	rdi,[fmt]	; address of format string
	xor	eax,eax		; al=0, no vector registers passed to printf
	call	printf wrt ..plt

        pop     rbp
	xor	eax,eax		; normal, no error, return value
	ret			; return

        ; mark the stack non-executable, newer GNU ld warns without this note
        section .note.GNU-stack noalloc noexec nowrite progbits
	

safe2_64.asm jump (execute) data
; safe2_64.asm   for testing protections within sections
;                (attempts to jump to, i.e. execute, data in .rodata)
; Assemble:	nasm -f elf64  safe2_64.asm
; Link:		gcc -o safe2_64  safe2_64.o
; Run:		./safe2_64
; Output:
; it should stop with a system caught error
;
; ABI: System V AMD64.  Data is addressed RIP-relative and printf is
; called through the PLT so that the plain gcc link line above also
; works when gcc builds position independent executables by default.

        default rel		; [label] operands become RIP-relative

        global main		; the standard gcc entry point
        extern	printf		; the C function, to be called

        section .rodata		; read only data section, constants
a:	dq	5		; long int a=5;  the jump target, data not code
fmt:    db "Bad, still running",10,0


        section .text           ; Code section. not writeable
main:				; the program label for the entry point
        push    rbp		; save rbp, rsp is now 16-byte aligned for the call

	mov	rax,0x789abcde
	jmp	a		; should be error, can not execute data !!!!!!!!

				; the jmp is unconditional, so control never
				; falls through to the lines below; they are
				; kept to match the other safe*_64 tests
        lea	rdi,[fmt]	; address of format string
	xor	eax,eax		; al=0, no vector registers passed to printf
	call	printf wrt ..plt

        pop     rbp
	xor	eax,eax		; normal, no error, return value
	ret			; return

        ; mark the stack non-executable, newer GNU ld warns without this note
        section .note.GNU-stack noalloc noexec nowrite progbits
	

A few simple tests to be sure privileged instructions can not execute.
priv_64.asm hlt instruction to halt computer
; priv_64.asm   for testing that average user
;               can not execute privileged instructions 
; Assemble:	nasm -f elf64 priv_64.asm
; Link:		gcc -o priv_64  priv_64.o
; Run:		./priv_64
; Output:
; it should stop with a system caught error;
; "bad! Still running" is printed only if hlt was NOT trapped
;
; ABI: System V AMD64.  Data is addressed RIP-relative and printf is
; called through the PLT so that the plain gcc link line above also
; works when gcc builds position independent executables by default.

        default rel		; [label] operands become RIP-relative

        global main		; the standard gcc entry point
        extern	printf		; the C function, to be called

        section .rodata		; read only data, keep the string out of .text
fmt:    db "bad! Still running",10,0	; The printf format, "\n", NUL terminator


        section .text           ; try to halt the computer
main:				; the program label for the entry point
        push    rbp		; save rbp, rsp is now 16-byte aligned for the call

	hlt			; should be error, only allowed in CPL=0  !!!!!!!

				; only reached when the protection failed
        lea	rdi,[fmt]	; address of format string
	xor	eax,eax		; al=0, no vector registers passed to printf
	call	printf wrt ..plt

        pop     rbp
	xor	eax,eax		; normal, no error, return value
	ret			; return

        ; mark the stack non-executable, newer GNU ld warns without this note
        section .note.GNU-stack noalloc noexec nowrite progbits
	

	

priv1_64.asm other privileged instructions
; priv1_64.asm   for testing that average user
;                can not execute privileged instructions 
; Assemble:	nasm -f elf64 priv1_64.asm
; Link:		gcc -o priv1_64  priv1_64.o
; Run:		./priv1_64
; Output:
; it should stop with a system caught error;
; "bad! Still running" is printed only if neither instruction was trapped
;
; ABI: System V AMD64.  Data is addressed RIP-relative and printf is
; called through the PLT so that the plain gcc link line above also
; works when gcc builds position independent executables by default.

        default rel		; [label] operands become RIP-relative

        global main		; the standard gcc entry point
        extern	printf		; the C function, to be called

        section .rodata		; read only data, keep the string out of .text
fmt:    db "bad! Still running",10,0	; The printf format, "\n", NUL terminator


        section .text           ; try other privileged instructions
main:				; the program label for the entry point
        push    rbp		; save rbp, rsp is now 16-byte aligned for the call

	clts			; should be error, only allowed in CPL=0  !!!!!!!
        wbinvd			; never gets to this one, it is also an error

				; only reached when the protection failed
        lea	rdi,[fmt]	; address of format string
	xor	eax,eax		; al=0, no vector registers passed to printf
	call	printf wrt ..plt

        pop     rbp
	xor	eax,eax		; normal, no error, return value
	ret			; return

        ; mark the stack non-executable, newer GNU ld warns without this note
        section .note.GNU-stack noalloc noexec nowrite progbits
	

	

In order to allow the user some access, controlled access, to
system resources, an interface to the operating system, or kernel,
is provided. You will see in the next lecture that some BIOS
functions are also provided as Linux kernel calls.


Need for speed: Some Brief History:
  The ISA card slots were replaced by PCI card slots, which are
  in turn being replaced by external USB devices. The
  serial port for RS232 devices is replaced by the USB port.
  Floppy disks are disappearing along with that connector on
  the motherboard. RAM still uses DIMM's and the slots have
  grown to handle 4, 8 and 16 gigabytes of memory. ATA hard
  drives are replaced by SATA hard drives, 4TB becoming available.
  Some rotating hard drives are being replaced by SSD, solid
  state drives. The printer port will be going as will the
  AGP graphics connector. That expensive graphics card you
  bought will probably not work in your new computer.

A standard engineering statement is:
Fast, Cheap, Reliable - pick any two.

The best method of measuring a computer's performance
is to use benchmarks. Some suggestions from my
personal experience preparing a benchmark suite
and several updates and personal benchmark
experience are presented in pdf format.


Smaller time is better, higher clock frequency is better.
time = 1 / frequency   T = 1/F   and  F = 1/T
1 nanosecond = 1 / 1 GHz
1 microsecond = 1 / 1 MHz

Definitions:
CPI    Clocks Per Instruction
MHz    Megahertz, millions of cycles per second
MIPS   Millions of Instructions Per Second = MHz / CPI
MOPS   Millions of Operations Per Second
MFLOPS Millions of Floating point Operations Per Second
MIOPS  Millions of Integer Operations Per Second  


Do not trust your computer's clock or the software
that reads and processes the time.

First: Test the wall clock time against your watch.

time_test.c
time_test.java
time_test.f90

   Click on above to see code.

The program displays 0, 5, 10, 15 ... at 0 seconds,
5 seconds, 10 seconds etc.

demonstrate time_test if possible



Note the use of <time.h> and 'time()'

Beware, midnight is zero seconds.
Then 60 sec/min * 60 min/hr * 24 hr/day = 86,400 sec/day
Just before midnight is 86,399 seconds.
Running a benchmark across midnight may give a negative time.


Then: Test CPU time, this should be just the time
used by the program that is running. With only
this program running, checking against your watch
should work. On a busy day on GL this could take 10 seconds
to give the first 5 second printout. This would need 16 students
running compute intensive programs.

time_cpu.c

  Click on above to see code.

The program displays 0, 5, 10, 15 ... at 0 seconds,
5 seconds, 10 seconds etc.

Note the use of <time.h> and 
  '(double)clock()/(double)CLOCKS_PER_SEC'

I have found one machine with the constant
CLOCKS_PER_SEC completely wrong and
another machine with a value 64 that should
have been 100. A computer used for real time
applications could have a value of 1,000,000
or more.

A computer benchmark will typically be some code that is executed
and the running time measured. 

A few simple rules about benchmarks:

1) Do not believe or trust any person, any company, any data.

2) Expect the same code to give different times on:
   different operating systems,
   different compilers,
   different computers from various manufacturers
             (IBM, Sun, Intel, AMD) even at same clock speed,
             (IBM Power fastest, AMD next fastest with same memory, cache)
   different languages, even for line by line translation.

3) If you want to measure optimization, turn it on,
   otherwise prevent all optimization.
             (Most compilers provide optimization choices)
             (Add code to prevent inlining of functions, force store)

4) You will probably be using your computer's clock to measure time.
   Test that the clock is giving valid results for the language
   you are using. The constant CLOCKS_PER_SEC in the "C" header
   file  time.h  has been found to be wrong.
   One manufacturer put a capacitor across the clock circuitry
   on a motherboard and all time measurements were half the
   correct time. See sample test below.

5) For measuring short times you will need to use the
   "double difference method". This method can be used to
   measure the time of a single instruction. This method
   should be used for any benchmark where one iteration of
   the code runs in less than a second. See sample test below.

6) Some methods of measuring time on a computer are only
   accurate to one second. Generally run enough iterations of
   your code in loops to get a ten second measurement.
   Some computers provide a real time clock as accurate as
   one microsecond, others one millisecond and some poorer than
   a fiftieth of a second.

7) Turn off all networking and stop all software that might run
   periodically. If possible, run in single user mode. You want to
   measure your code, not a virus checker or operating system.
   I once did measurement on a Sun computer running Solaris. It 
   seemed to slow down periodically. I found that the operating
   system periodically checked to see if any disk files needed
   to be written.

8) If you are interested in how fast your application might run
   on a new computer, find reputable benchmarks that are for
   similar applications. I do a lot of numerical computation, thus
   all my benchmarks are heavily floating point. You may be
   more interested in disk performance or network performance.

9) Do not run on all-zero data. Some compilers are very smart and
   may precompute your result without running your code.
   Be sure to use every result. Compilers do "dead code elimination"
   that checks for code where the results are not used and just
   does not produce instructions for that "dead code." An "if" test
   or printing out the result is typically sufficient. For vectors
   and arrays, usually printing out one element is sufficient.

10) It helps to be paranoid. Check that you get the same results
    by running n iterations, then 2n iterations. If the time did
    not double, you do not have a stable measurement. Run 4n and 8n
    and check again. It may not be your benchmark code, it may be
    an operating system activity.

11) Do not run a benchmark across midnight. Most computers reset
    the seconds to zero at midnight.

12) Keep values of time as double precision numbers.

13) Given an algorithm where you can predict the time increase
    as the size of data increases: e.g. FFT is order  n log2 n,
    multiplying a matrix by a matrix is order n^3, expect
    non uniform results for some values of n.

    Consider the case where all your code and all your data fit
    in the level one caches. This will be the fastest.

    Consider when your data is much larger than the level one cache
    yet fits in the level two cache. You are now measuring the
    performance of the level two cache.

    Consider when your data fits in RAM but is much larger than
    your level two (or three) cache. You are now measuring the speed
    of your code running in RAM.

    Consider when your data is much larger than your RAM, you are
    now running in virtual memory from your disk drive. This will
    be very slow and you are measuring disk performance.


The "Double Difference Method" tries to get accurate measurement
for very small times. The code to time a single floating point
add instruction is shown below. The principle is:

  measure time, t1

  run a test harness with loops that has everything except the code
  that you want to time. Count the number of executions as a check.

  measure time, t2

  measure time, t3

  run exactly the same code from the test harness with only the
  feature you want to measure added. Count number of executions.

  measure time, t4

  check that the number of executions is the same.
  check that  t2-t1 was more than 10 seconds

  the time for the feature you wanted to measure is

  t5 = ((t4 - t3) - (t2 - t1))/ number of executions

  basically measured time minus test harness time divided by the
  number of executions.

 /* time_fadd.c  try to measure time of double  A = A + B;         */
 /*              roughly time of one floating point add            */
 /*              using double difference and minimum and stability */

 #include <time.h>
 #include <stdio.h>
 #include <math.h>

 #define dabs(a) ((a)<0.0?(-(a)):(a))
 void do_count(int * count_check, int rep, double * B);

 /* main: estimate the time of one  A = A + B  using the double difference */
 /*       method.  A harness of outer*rep calls to do_count is timed with  */
 /*       and without the add; the difference divided by the number of     */
 /*       executions is the reported fadd time.  rep is doubled until the  */
 /*       measured run takes at least t_min seconds, then the same rep is  */
 /*       repeated until two successive results agree within Q percent or  */
 /*       the allowed number of tries is used up.                          */
 /*       argc and argv are not used.  Always returns 0.                   */
 int main(int argc, char * argv[])
 {
   double t1, t2, t3, t4, tmeas, t_run, t_min, t_prev, ts, tavg;
   double A, B, Q;
   int stable;
   int have_pair;   /* set once ts and tavg hold a real comparison */
   int i, j;
   int count_check, outer;
   int rep, min_rep, rep_used;

   (void)argc;
   (void)argv;

   t_min = 10.0;    /* 10.0 seconds typical minimum measurement time */
   Q  = 5.0;        /* 5.0 typical approximate percentage stability */
   min_rep = 32;    /* minimum of 32 typical */
   outer = 100000;  /* some big number */

   /* defined values for everything the final report reads, the loop */
   /* can end before a pair of qualifying measurements exists        */
   tmeas = 0.0;
   t_run = 0.0;
   ts = 0.0;
   tavg = 0.0;
   have_pair = 0;
   count_check = 0;
   rep_used = min_rep;

   printf("time_fadd.c \n");
   printf("min time %g seconds, min stability %g percent, outer loop=%d\n",
          t_min, Q, outer);


   stable = 5; /* max tries */
   t_prev = 0.0;
   for(rep=min_rep; rep<100000; rep=rep+rep) /* increase until good measure */
   {
     rep_used = rep; /* the rep actually run, loop exit may change rep */
     A = 0.0;
     B = 0.1;
     t1 = (double)clock()/(double)CLOCKS_PER_SEC;
     for(i=0; i<outer; i++) /* outer control loop */
     {
       count_check = 0;
       for(j=0; j<rep; j++)   /* inner control loop */
       {
          do_count(&count_check, rep, &B);
       }
     }
     t2 = (double)clock()/(double)CLOCKS_PER_SEC;
     if(count_check != rep) printf("bad count_check_1 %d \n", count_check);

     A = 0.0;
     t3 = (double)clock()/(double)CLOCKS_PER_SEC;
     for(i=0; i<outer; i++) /* outer measurement loop */
     {
       count_check = 0;
       for(j=0; j<rep; j++)   /* inner measurement loop */
       {
         do_count(&count_check, rep, &B);
         A = A + B;   /* item being measured, approximately FADD time */
       }
     }
     t4 = (double)clock()/(double)CLOCKS_PER_SEC;
     if(count_check != rep) printf("bad count_check_2 %d \n", count_check);
     /* use the sum so the add being timed is not dead code */
     if(A <= 0.0) printf("bad A %g \n", A);

     t_run = t4-t3;             /* time of the run containing the add */
     tmeas = t_run - (t2-t1);   /* the double difference */
     printf("rep=%d, t measured=%g \n", rep, tmeas);

     if(t_run<t_min) continue; /* need more rep */

     printf("tmeas=%g, t_prev=%g, rep=%d \n", tmeas, t_prev, rep);
     if(t_prev==0.0)
     {
       t_prev = tmeas; /* first qualifying measurement, nothing to compare */
     }
     else /* compare to previous */
     {
       if(tmeas+t_prev > 0.0) /* relative difference needs a positive sum */
       {
         ts = 2.0*(dabs(tmeas-t_prev)/(tmeas+t_prev));
         tavg = 0.5*(tmeas+t_prev);
         have_pair = 1;
         if(100.0*ts < Q)  break; /* above minimum and stable */
       }
       t_prev = tmeas;
     }
     stable--;
     if(stable==0) break;
     rep = rep/2; /* hold rep constant, the loop step doubles it again */
   } /* end loop increasing rep */

   /* stable? and over minimum */
   if(stable==0) printf("rep=%d  unstable \n", rep_used);
   if(t_run<t_min) printf("time measured=%g, under minimum \n", t_run);
   if(!have_pair) tavg = tmeas; /* no pair to average, report last result */
   printf("raw time=%g, fadd time=%g, rep=%d, stable=%g%% \n\n", tmeas, 
          (tavg/(double)outer)/(double)rep_used, rep_used, 100.0*ts);
   return 0;
 } /* end time_fadd.c */

 /* do_count: add one to the caller's execution counter.               */
 /* Called from the timing loops to prevent dead code elimination.     */
 /*   count_check  counter to increment, must point to a valid int     */
 /*   rep          accepted but not used                               */
 /*   B            accepted but not used, could be changed here but    */
 /*                probably does not have to be                        */
 void do_count(int * count_check, int rep, double * B)
 {
   (void)rep;
   (void)B;
   *count_check += 1;
 }

time_fadd_sgi.out

    <- previous    index    next ->

Lecture 11 Privileged instructions

demonstrate time_test if possible

Other links

Go to top