<- previous index next ->
The Intel 80x86 processors have privilege levels.
There are instructions that can only be executed at the highest
privilege level, CPL = 0. This would be reserved for the
operating system in order to prevent the average user from
causing chaos. e.g. The average user could issue a HLT instruction
to halt the machine and thus every process would be dead.
Other CPL=0 only instructions include:
CLTS Clear Task-Switched flag in CR0
INVD Invalidate cache
INVLPG Invalidate translation lookaside buffer, TLB
WBINVD Write Back and Invalidate cache
It should be obvious that, when running a multiprocessing operating
system, there are many instructions that only the operating
system should use.
The operating system controls the resources of the computer,
including RAM, I/O and user processes. Some sample protections
are tested by the following sample programs:
A few simple tests to be sure protections are working.
These three programs result in segfault, intentionally.
safe_64.asm store into read only section
; safe_64.asm  for testing protections within sections
; Assemble:    nasm -f elf64 safe_64.asm
; Link:        gcc -o safe_64 safe_64.o
; Run:         ./safe_64
; Output:
;   it should stop with a system caught error
;
; Syntax: NASM (Intel operand order).  ABI: System V AMD64.
; Data is addressed RIP-relative and printf is called through the PLT,
; so the object links whether or not gcc builds a position-independent
; executable.

        global  main                    ; the standard gcc entry point
        extern  printf                  ; the C function, to be called

        section .rodata                 ; read only data section, constants
a:      dq      5                       ; long int a=5;
fmt:    db      "Bad, still running",10,0

        section .text                   ; Code section. not writeable
main:                                   ; the program label for the entry point
        push    rbp                     ; set up stack frame (rsp 16-aligned for call)
        mov     rax, 0x789abcde
        mov     [rel a], rax            ; should be error, read only section !!!!!!!!!!

        ; Only reached if the store above was NOT stopped.
        lea     rdi, [rel fmt]          ; address of format string
        xor     eax, eax                ; al = 0: no vector register arguments
        call    printf wrt ..plt

        pop     rbp
        xor     eax, eax                ; normal, no error, return value
        ret                             ; return
safe1_64.asm store into code section
; safe1_64.asm for testing protections within sections
; Assemble:    nasm -f elf64 safe1_64.asm
; Link:        gcc -o safe1_64 safe1_64.o
; Run:         ./safe1_64
; Output:
;   it should stop with a system caught error
;
; Syntax: NASM (Intel operand order).  ABI: System V AMD64.
; Symbols are addressed RIP-relative and printf is called through the PLT,
; so the object links whether or not gcc builds a position-independent
; executable.

        global  main                    ; the standard gcc entry point
        extern  printf                  ; the C function, to be called

        section .rodata                 ; read only data section, constants
a:      dq      5                       ; long int a=5;
fmt:    db      "Bad, still running",10,0

        section .text                   ; Code section. not writeable
main:                                   ; the program label for the entry point
        push    rbp                     ; set up stack frame (rsp 16-aligned for call)
        mov     rax, 0x789abcde
        mov     [rel main], rax         ; should be error, can not change code .text !!!!!!

        ; Only reached if the store above was NOT stopped.
        lea     rdi, [rel fmt]          ; address of format string
        xor     eax, eax                ; al = 0: no vector register arguments
        call    printf wrt ..plt

        pop     rbp
        xor     eax, eax                ; normal, no error, return value
        ret                             ; return
safe2_64.asm jump (execute) data
; safe2_64.asm for testing protections within sections
; Assemble:    nasm -f elf64 safe2_64.asm
; Link:        gcc -o safe2_64 safe2_64.o
; Run:         ./safe2_64
; Output:
;   it should stop with a system caught error
;
; Syntax: NASM (Intel operand order).  ABI: System V AMD64.
; The format string is addressed RIP-relative and printf is called through
; the PLT, so the object links whether or not gcc builds a
; position-independent executable.

        global  main                    ; the standard gcc entry point
        extern  printf                  ; the C function, to be called

        section .rodata                 ; read only data section, constants
a:      dq      5                       ; long int a=5;
fmt:    db      "Bad, still running",10,0

        section .text                   ; Code section. not writeable
main:                                   ; the program label for the entry point
        push    rbp                     ; set up stack frame (rsp 16-aligned for call)
        mov     rax, 0x789abcde
        jmp     a                       ; should be error, can not execute data !!!!!!!!

        ; Not reached by fall-through: the jmp above is unconditional.
        lea     rdi, [rel fmt]          ; address of format string
        xor     eax, eax                ; al = 0: no vector register arguments
        call    printf wrt ..plt

        pop     rbp
        xor     eax, eax                ; normal, no error, return value
        ret                             ; return
A few simple tests to be sure privileged instructions can not execute.
priv_64.asm hlt instruction to halt computer
; priv_64.asm  for testing that average user
;              can not execute privileged instructions
; Assemble:    nasm -f elf64 priv_64.asm
; Link:        gcc -o priv_64 priv_64.o
; Run:         ./priv_64
; Output:
;   it should stop with a system caught error
;
; Syntax: NASM (Intel operand order).  ABI: System V AMD64.
; The format string lives in .rodata (it was previously emitted before any
; section directive), is addressed RIP-relative, and printf is called
; through the PLT so the object links whether or not gcc builds a
; position-independent executable.

        global  main                    ; the standard gcc entry point
        extern  printf                  ; the C function, to be called

        section .rodata                 ; read only data section, constants
fmt:    db      "bad! Still running",10,0 ; The printf format, "\n", NUL terminator

        section .text                   ; try to halt the computer
main:                                   ; the program label for the entry point
        push    rbp                     ; set up stack frame (rsp 16-aligned for call)
        hlt                             ; should be error, only allowed in CPL=0 !!!!!!!

        ; Only reached if hlt was NOT stopped.
        lea     rdi, [rel fmt]          ; address of format string
        xor     eax, eax                ; al = 0: no vector register arguments
        call    printf wrt ..plt

        pop     rbp
        xor     eax, eax                ; normal, no error, return value
        ret                             ; return
priv1_64.asm other privileged instructions
; priv1_64.asm for testing that average user
;              can not execute privileged instructions
; Assemble:    nasm -f elf64 priv1_64.asm
; Link:        gcc -o priv1_64 priv1_64.o
; Run:         ./priv1_64
; Output:
;   it should stop with a system caught error
;
; Syntax: NASM (Intel operand order).  ABI: System V AMD64.
; The format string lives in .rodata (it was previously emitted before any
; section directive), is addressed RIP-relative, and printf is called
; through the PLT so the object links whether or not gcc builds a
; position-independent executable.

        global  main                    ; the standard gcc entry point
        extern  printf                  ; the C function, to be called

        section .rodata                 ; read only data section, constants
fmt:    db      "bad! Still running",10,0 ; The printf format, "\n", NUL terminator

        section .text                   ; try other privileged instructions
main:                                   ; the program label for the entry point
        push    rbp                     ; set up stack frame (rsp 16-aligned for call)
        clts                            ; should be error, only allowed in CPL=0 !!!!!!!
        wbinvd                          ; never gets to these, also error

        ; Only reached if the privileged instructions were NOT stopped.
        lea     rdi, [rel fmt]          ; address of format string
        xor     eax, eax                ; al = 0: no vector register arguments
        call    printf wrt ..plt

        pop     rbp
        xor     eax, eax                ; normal, no error, return value
        ret                             ; return
In order to allow the user some access, controlled access, to
system resources, an interface to the operating system, or kernel,
is provided. You will see in the next lecture that some BIOS
functions are also provided as Linux kernel calls.
Need for speed: Some Brief History:
The ISA card slots were replaced by PCI card slots that
are replaced by external USB devices. The
serial port for RS232 devices is replaced by the USB port.
Floppy disks are disappearing along with that connector on
the motherboard. RAM still uses DIMMs and the slots have
grown to handle 4, 8 and 16 gigabytes of memory. ATA hard
drives are replaced by SATA hard drives, 4TB becoming available.
Some rotating hard drives are being replaced by SSD, solid
state drives. The printer port will be going as will the
AGP graphics connector. That expensive graphics card you
bought will probably not work in your new computer.
A standard engineering statement is:
Fast, Cheap, Reliable - pick any two.
The best method of measuring a computer's performance
is to use benchmarks. Some suggestions from my
personal experience preparing a benchmark suite
and several updates and personal benchmark
experience are presented in pdf format.
Smaller time is better, higher clock frequency is better.
time = 1 / frequency T = 1/F and F = 1/T
1 nanosecond = 1 / 1 GHz
1 microsecond = 1 / 1 MHz
Definitions:
CPI Clocks Per Instruction
MHz Megahertz, millions of cycles per second
MIPS Millions of Instructions Per Second = MHz / CPI
MOPS Millions of Operations Per Second
MFLOPS Millions of Floating point Operations Per Second
MIOPS Millions of Integer Operations Per Second
Do not trust your computer's clock or the software
that reads and processes the time.
First: Test the wall clock time against your watch.
time_test.c
time_test.java
time_test.f90
Click on above to see code.
The program displays 0, 5, 10, 15 ... at 0 seconds,
5 seconds, 10 seconds etc.
demonstrate time_test if possible
Note the use of <time.h> and 'time()'
Beware, midnight is zero seconds.
Then 60 sec/min * 60 min/hr * 24 hr/day = 86,400 sec/day
Just before midnight is 86,399 seconds.
Running a benchmark across midnight may give a negative time.
Then: Test CPU time, this should be just the time
used by the program that is running. With only
this program running, checking against your watch
should work. On a busy day on GL this could take 10 seconds
to give the first 5 second printout. This would need 16 students
running compute intensive programs.
time_cpu.c
Click on above to see code.
The program displays 0, 5, 10, 15 ... at 0 seconds,
5 seconds, 10 seconds etc.
Note the use of <time.h> and
'(double)clock()/(double)CLOCKS_PER_SEC'
I have found one machine with the constant
CLOCKS_PER_SEC completely wrong and
another machine with a value 64 that should
have been 100. A computer used for real time
applications could have a value of 1,000,000
or more.
A computer benchmark will typically be some code that is executed
and the running time measured.
A few simple rules about benchmarks:
1) Do not believe or trust any person, any company, any data.
2) Expect the same code to give different times on:
different operating systems,
different compilers,
different computers from various manufacturers
(IBM, Sun, Intel, AMD) even at same clock speed,
(IBM Power fastest, AMD next fastest with same memory, cache)
different languages, even for line by line translation.
3) If you want to measure optimization, turn it on,
otherwise prevent all optimization.
(Most compilers provide optimization choices)
(Add code to prevent inlining of functions, force store)
4) You will probably be using your computer's clock to measure time.
Test that the clock is giving valid results for the language
you are using. The constant CLOCKS_PER_SEC in the "C" header
file time.h has been found to be wrong.
One manufacturer put a capacitor across the clock circuitry
on a motherboard and all time measurements were half the
correct time. See sample test below.
5) For measuring short times you will need to use the
"double difference method". This method can be used to
measure the time of a single instruction. This method
should be used for any benchmark where one iteration of
the code runs in less than a second. See sample test below.
6) Some methods of measuring time on a computer are only
accurate to one second. Generally run enough iterations of
your code in loops to get a ten second measurement.
Some computers provide a real time clock as accurate as
one microsecond, others one millisecond and some poorer than
a fiftieth of a second.
7) Turn off all networking and stop all software that might run
periodically. If possible, run in single user mode. You want to
measure your code, not a virus checker or operating system.
I once did measurement on a Sun computer running Solaris. It
seemed to slow down periodically. I found that the operating
system periodically checked to see if any disk files needed
to be written.
8) If you are interested in how fast your application might run
on a new computer, find reputable benchmarks that are for
similar applications. I do a lot of numerical computation, thus
all my benchmarks are heavily floating point. You may be
more interested in disk performance or network performance.
9) Do not run on all-zero data. Some compilers are very smart and
may precompute your result without running your code.
Be sure to use every result. Compilers do "dead code elimination"
that checks for code where the results are not used and just
does not produce instructions for that "dead code." An "if" test
or printing out the result is typically sufficient. For vectors
and arrays, usually printing out one element is sufficient.
10) It helps to be paranoid. Check that you get the same results
by running n iterations, then 2n iterations. If the time did
not double, you do not have a stable measurement. Run 4n and 8n
and check again. It may not be your benchmark code, it may be
an operating system activity.
11) Do not run a benchmark across midnight. Most computers reset
the seconds to zero at midnight.
12) Keep values of time as double precision numbers.
13) Given an algorithm where you can predict the time increase
as the size of data increases: e.g. FFT is order n log2 n,
multiplying a matrix by a matrix is order n^3, expect
non uniform results for some values of n.
Consider the case where all your code and all your data fit
in the level one caches. This will be the fastest.
Consider when your data is much larger than the level one cache
yet fits in the level two cache. You are now measuring the
performance of the level two cache.
Consider when your data fits in RAM but is much larger than
your level two (or three) cache. You are now measuring the speed
of your code running in RAM.
Consider when your data is much larger than your RAM, you are
now running in virtual memory from your disk drive. This will
be very slow and you are measuring disk performance.
The "Double Difference Method" tries to get accurate measurement
for very small times. The code to time a single floating point
add instruction is shown below. The principle is:
measure time, t1
run a test harness with loops that has everything except the code
that you want to time. Count the number of executions as a check.
measure time, t2
measure time, t3
run exactly the same code from the test harness with only the
feature you want to measure added. Count number of executions.
measure time, t4
check that the number of executions is the same.
check that t2-t1 was more than 10 seconds
the time for the feature you wanted to measure is
t5 = ((t4 - t3) - (t2 - t1))/ number of executions
basically measured time minus test harness time divided by the
number of executions.
/* time_fadd.c try to measure time of double A = A + B; */
/* roughly time of one floating point add */
/* using double difference and minimum and stability */
#include <time.h>
#include <stdio.h>
 #include <math.h>
#define dabs(a) ((a)<0.0?(-(a)):(a))
void do_count(int * count_check, int rep, double * B);

/*
 * main: estimate the time of one "A = A + B" with the double difference
 * method.
 *
 * For each repetition count rep (starting at min_rep and doubling) the same
 * nested loops are timed twice with clock(): once containing only the
 * do_count() call (t2-t1) and once with the add appended (t4-t3).  The
 * difference of the two is the time attributed to outer*rep adds.
 *
 * Once a run lasts at least t_min seconds, rep is held constant and the
 * measurement repeated until two consecutive results agree to within
 * Q percent, or until the allowed number of tries is used up.
 *
 * Returns 0.  All results are written to stdout.
 */
int main(int argc, char * argv[])
{
  double t1, t2, t3, t4, tmeas, t_run, t_min, t_prev, ts, tavg;
  double A, B, Q;
  int stable;
  int i, j;
  int count_check, outer;
  int rep, min_rep, rep_meas;

  (void)argc;                    /* command line is not used */
  (void)argv;

  t_min = 10.0;     /* 10.0 seconds typical minimum measurement time */
  Q = 5.0;          /* 5.0 typical approximate percentage stability */
  min_rep = 32;     /* minimum of 32 typical */
  outer = 100000;   /* some big number */

  /* Defined values for the final report even if the loop below never
     reaches the stability comparison. */
  ts = 0.0;
  tavg = 0.0;
  tmeas = 0.0;
  t_run = 0.0;
  rep_meas = min_rep;

  printf("time_fadd.c \n");
  printf("min time %g seconds, min stability %g percent, outer loop=%d\n",
         t_min, Q, outer);

  stable = 5;       /* max tries */
  t_prev = 0.0;     /* 0.0 means: no above-minimum measurement yet */

  for(rep=min_rep; rep<100000; rep=rep+rep) /* increase until good measure */
  {
    A = 0.0;
    B = 0.1;

    /* pass 1: test harness only */
    t1 = (double)clock()/(double)CLOCKS_PER_SEC;
    for(i=0; i<outer; i++) /* outer control loop */
    {
      count_check = 0;
      for(j=0; j<rep; j++) /* inner control loop */
      {
        do_count(&count_check, rep, &B);
      }
    }
    t2 = (double)clock()/(double)CLOCKS_PER_SEC;
    if(count_check != rep) printf("bad count_check_1 %d \n", count_check);

    /* pass 2: identical harness plus the item being measured */
    A = 0.0;
    t3 = (double)clock()/(double)CLOCKS_PER_SEC;
    for(i=0; i<outer; i++) /* outer measurement loop */
    {
      count_check = 0;
      for(j=0; j<rep; j++) /* inner measurement loop */
      {
        do_count(&count_check, rep, &B);
        A = A + B; /* item being measured, approximately FADD time */
      }
    }
    t4 = (double)clock()/(double)CLOCKS_PER_SEC;
    if(count_check != rep) printf("bad count_check_2 %d \n", count_check);

    /* Use the result so the add is not dead code.  A only accumulates
       positive B, so this never prints in a correct run. */
    if(A < 0.0) printf("bad A %g \n", A);

    t_run = t4-t3;               /* run time checked against t_min */
    tmeas = t_run - (t2-t1);     /* the double difference */
    rep_meas = rep;              /* rep that tmeas belongs to */
    tavg = tmeas;                /* fallback until two runs are averaged */
    printf("rep=%d, t measured=%g \n", rep, tmeas);

    if(t_run<t_min) continue;    /* need more rep */

    if(t_prev==0.0)
    {
      printf("tmeas=%g, t_prev=%g, rep=%d \n", tmeas, t_prev, rep);
      t_prev = tmeas;
    }
    else /* compare to previous */
    {
      printf("tmeas=%g, t_prev=%g, rep=%d \n", tmeas, t_prev, rep);
      ts = 2.0*(dabs(tmeas-t_prev)/(tmeas+t_prev)); /* relative difference */
      tavg = 0.5*(tmeas+t_prev);
      if(100.0*ts < Q) break; /* above minimum and stable */
      t_prev = tmeas;
    }
    stable--;
    if(stable==0) break;
    rep = rep/2; /* hold rep constant: undoes the doubling in the for() */
  } /* end loop increasing rep */

  /* stable? and over minimum */
  /* rep_meas, not rep, is reported: if the loop ran out of rep values,
     rep has already been doubled past the last value actually timed. */
  if(stable==0) printf("rep=%d unstable \n", rep_meas);
  /* same run-time criterion the loop uses for "need more rep" */
  if(t_run<t_min) printf("time measured=%g, under minimum \n", t_run);
  /* ts is a fraction, so scale by 100 to match the percent sign;
     "%%" prints a literal percent sign. */
  printf("raw time=%g, fadd time=%g, rep=%d, stable=%g%% \n\n", tmeas,
         (tavg/(double)outer)/(double)rep_meas, rep_meas, 100.0*ts);
  return 0;
} /* end time_fadd.c */
/* do_count: add one to the caller's counter on every call.
   It exists to prevent dead code elimination in the timing loops.
   rep and B are accepted but neither read nor modified. */
void do_count(int * count_check, int rep, double * B)
{
  (void)rep;
  (void)B;               /* B could be changed here, but need not be */
  *count_check += 1;
}
time_fadd_sgi.out
<- previous index next ->