diff options
-rw-r--r-- | COM.DAT | 11 | ||||
-rw-r--r-- | Changes | 42 | ||||
-rw-r--r-- | Makefile | 153 | ||||
-rw-r--r-- | NNET.DAT | 210 | ||||
-rw-r--r-- | README | 66 | ||||
-rw-r--r-- | README.motorola | 29 | ||||
-rw-r--r-- | README.nonlinux | 50 | ||||
-rw-r--r-- | README.submit | 33 | ||||
-rw-r--r-- | RESULTS | 138 | ||||
-rw-r--r-- | bdoc.txt | 2109 | ||||
-rw-r--r-- | debugbit.good.gz | bin | 0 -> 1019 bytes | |||
-rw-r--r-- | emfloat.c | 1343 | ||||
-rw-r--r-- | emfloat.h | 154 | ||||
-rwxr-xr-x | hardware | bin | 0 -> 17013 bytes | |||
-rw-r--r-- | hardware.c | 202 | ||||
-rw-r--r-- | hardware.h | 2 | ||||
-rw-r--r-- | hello.c | 2 | ||||
-rw-r--r-- | misc.c | 120 | ||||
-rw-r--r-- | misc.h | 41 | ||||
-rw-r--r-- | nbench0.c | 1174 | ||||
-rw-r--r-- | nbench0.h | 356 | ||||
-rw-r--r-- | nbench1.c | 4445 | ||||
-rw-r--r-- | nbench1.h | 428 | ||||
-rw-r--r-- | nmglobal.h | 519 | ||||
-rw-r--r-- | pointer.c | 6 | ||||
-rw-r--r-- | sysinfo.c.example | 10 | ||||
-rw-r--r-- | sysinfo.c.template | 10 | ||||
-rwxr-xr-x | sysinfo.sh | 78 | ||||
-rw-r--r-- | sysinfoc.c.example | 4 | ||||
-rw-r--r-- | sysinfoc.c.template | 4 | ||||
-rw-r--r-- | sysspec.c | 884 | ||||
-rw-r--r-- | sysspec.h | 168 | ||||
-rw-r--r-- | wordcat.h | 81 |
33 files changed, 12872 insertions, 0 deletions
@@ -0,0 +1,11 @@ +ALLSTATS=T +DONUMSORT=T +DOSTRINGSORT=T +DOBITFIELD=T +DOEMF=T +DOFOUR=T +DOASSIGN=T +DOIDEA=T +DOHUFF=T +DONNET=T +DOLU=T @@ -0,0 +1,42 @@ +This is about BYTE's beta version of the native-algorithm benchmark + +December 16, 1996: + +The source for DOS is obtainable at http://www.byte.com/bmark/bmark.htm +Linux adaptation written by Uwe F. Mayer <mayer@tux.org> + +February 7, 1997: + +added -DSOLARIS flag to support solaris + +November 11, 1997: + +added index split suggested by Andrew D. Balsa +re-baselined to a Linux machine +added checking of CPU-type at run-time (cpuinfo.c) +increased maximal number of loops in some tests +removed -DSOLARIS flag, works now automatically (this also removed the + compiler warnings about redefined types and leads to a 20% faster + code for "Bitfield" if compiled with -funroll-loops!) + +November 13-19, 1997: + +changed debugging information +changed random number generator to be always 32 bits even on 64 bit OSs +added data resets to Bitfield and Huffman +created this Changes file +added debug code for Bitfield + +December 6, 1997: + +got rid of cpuinfo.c +added a RESULTS file + +December 7, 1997: + +fixed the statistical analysis used to compute the confidence coefficient +fixed a bug in the DEBUG routine of "Assignment" + +December 11, 1997 +added some entries to RESULTS + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5045c77 --- /dev/null +++ b/Makefile @@ -0,0 +1,153 @@ +# Makefile for nbench, December 11, 1997, Uwe F. Mayer <mayer@tux.org> +# Updated February 18, 2003 + +default: nbench + +########################################################################## +# If you are using gcc-2.7.2.3 or earlier: +# The optimizer of gcc has a bug and in general you should not specify +# -funroll-loops together with -O (or -O2, -O3, etc.) +# This bug is supposed to be fixed with release 2.8 of gcc. 
+# +# This bug does NOT seem to have an effect on the correct compilation +# of this benchmark suite on my Linux box. However, it leads to +# the dreaded "internal compiler error" message on our alpha +# running DEC Unix 4.0b. The Linux-binary that was used to obtain +# the baseline results was nevertheless compiled with +# CFLAGS = -s -static -Wall -O3 -fomit-frame-pointer -funroll-loops +# +# You should leave -static in the CFLAGS so that your sysinfo can be +# compiled into the executable. + +CC = gcc + +# generic options for gcc +CFLAGS = -s -static -Wall -O3 + +# if your gcc lets you do it, then try this one +#CFLAGS = -s -static -Wall -O3 -fomit-frame-pointer -funroll-loops + +# for gcc on an older Pentium type processor you can try the following +#CFLAGS = -s -static -O3 -fomit-frame-pointer -Wall -m486 \ +# -fforce-addr -fforce-mem -falign-loops=2 -falign-functions=2 \ +# -falign-jumps=2 -funroll-loops + +# for a newer gcc on a newer Pentium type processor you can try the following +#CFLAGS = -s -static -O3 -fomit-frame-pointer -Wall -march=i686 \ +# -fforce-addr -fforce-mem -falign-loops=2 -falign-functions=2 \ +# -falign-jumps=2 -funroll-loops + +# for a newer gcc on an Athlon XP type processor you can try the following +#CFLAGS = -s -static -O3 -fomit-frame-pointer -Wall -march=athlon-xp \ +# -fforce-addr -fforce-mem -falign-loops=2 -falign-functions=2 \ +# -falign-jumps=2 -funroll-loops + +# For debugging using gcc +#CFLAGS = -g -O3 -Wall -DDEBUG + +########################################################################## +# For Linux machines with more than one binary format. +# The default binaries, depends on your system whether it's elf or aout. 
+MACHINE= +# a.out code for linux on an elf machine +#MACHINE= -bi486-linuxaout +# elf code for linux on an a.out machine +#MACHINE= -bi486-linuxelf +# if you want a different compiler version and different binaries, for example +#MACHINE= -V2.7.2 -bi486-linuxaout + +########################################################################## +# Read the file README.nonlinux if you are not using Linux + +# for DEC Unix using cc you can try +#CC = cc +#CFLAGS = -O3 +#LINKFLAGS = -s -non_shared + +# for SunOS using cc +#CC = cc +#CFLAGS = -O3 -s + +# for DEC Ultrix using cc +#CC = cc +#CFLAGS = -O2 +#LINKFLAGS = -s + +# for a Mac with OsX and the Darwin environment +#CC = cc +#CFLAGS = -O3 -DOSX + +# For debugging using cc +#CC = cc +#CFLAGS = -g -DDEBUG + +########################################################################## +# If your system does not understand the system command "uname -s -r" +# then comment this out + +# NO_UNAME= -DNO_UNAME + +########################################################################## +# For any Unix flavor you need -DLINUX +# You also need -DLINUX to get the new indices + +DEFINES= -DLINUX $(NO_UNAME) + +########################################################################## +# For LINUX-like systems with gcc +sysinfoc.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +sysinfo.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +########################################################################## +# For non-LINUX systems +# Edit the files sysinfo.c and sysinfoc.c to include your system information +# and take sysinfo.c and sysinfoc.c out of the dependencies for nbench0.o + +hardware.o: hardware.c hardware.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c hardware.c + +nbench0.o: nbench0.h nbench0.c nmglobal.h pointer.h hardware.h\ + Makefile sysinfo.c sysinfoc.c + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench0.c + +emfloat.o: emfloat.h emfloat.c nmglobal.h pointer.h 
Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c emfloat.c + +pointer.h: pointer Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -o pointer pointer.c + rm -f pointer.h + if [ "4" = `./pointer` ] ; then touch pointer.h ;\ + else echo "#define LONG64" >pointer.h ; fi + +misc.o: misc.h misc.c Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c misc.c + +nbench1.o: nbench1.h nbench1.c wordcat.h nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench1.c + +sysspec.o: sysspec.h sysspec.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c sysspec.c + +nbench: emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) $(LINKFLAGS)\ + emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o\ + -o nbench -lm + +########################################################################## + +clean: + - /bin/rm -f *.o *~ \#* core a.out hello sysinfo.c sysinfoc.c \ + bug pointer pointer.h debugbit.dat + +mrproper: clean + - /bin/rm -f nbench diff --git a/NNET.DAT b/NNET.DAT new file mode 100644 index 0000000..5711730 --- /dev/null +++ b/NNET.DAT @@ -0,0 +1,210 @@ +5 7 8 +26 +0 0 1 0 0 +0 1 0 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 0 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 0 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 1 0 0 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 0 1 0 1 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 0 0 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 1 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 0 0 0 +0 1 1 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 
+0 1 1 1 0 +0 1 0 0 1 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 0 1 0 +1 0 0 0 1 +1 0 0 1 0 +1 0 1 0 0 +1 1 0 0 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 0 1 0 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 1 1 0 0 +1 0 0 0 1 +1 1 0 1 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 1 0 1 +1 0 0 0 1 +1 1 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 1 1 +1 0 0 0 1 +0 1 0 0 1 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 1 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 1 0 0 0 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 0 1 1 +0 1 1 1 1 +0 1 0 1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 1 0 0 1 0 +0 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +0 1 1 1 0 +0 0 0 0 1 +0 0 0 0 1 +1 1 1 1 0 +0 1 0 1 0 0 1 1 +1 1 1 1 1 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 0 1 0 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +0 1 0 1 0 +0 1 0 1 0 1 1 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 +0 1 0 1 0 +1 0 0 0 1 +0 1 0 1 1 0 0 0 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 1 0 0 1 +1 1 1 1 1 +0 0 0 1 0 +0 0 0 1 0 +0 0 1 0 0 +0 1 0 0 0 +0 1 0 0 0 +1 1 1 1 1 +0 1 0 1 1 0 1 0 @@ -0,0 +1,66 @@ +February 18, 2003 +----------------- +Bug-fix release. + +December 9, 1997 +---------------- +This release is based on beta release 2 of BYTE Magazine's BYTEmark +benchmark program (previously known as BYTE's Native Mode +Benchmarks). This document covers the Native Mode (a.k.a. 
 Algorithm +Level) tests; benchmarks designed to expose the capabilities of a +system's CPU, FPU, and memory system. + +Running a "make" will create the binary if all goes well. It is called +"nbench" and performs a suite of 10 tests and compares the results to +a Dell Pentium 90 with 16 MB RAM and 256 KB L2 cache running MSDOS and +compiling with the Watcom 10.0 C/C++ compiler. If you define -DLINUX +during compilation (the default) then you also get a comparison to an +AMD K6/233 with 32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and +using a binary which was compiled with GNU gcc version 2.7.2.3 and GNU +libc-5.4.38. + +For more verbose output specify -v as an argument. + +The primary web site is: http://www.tux.org/~mayer/linux/bmark.html + +The port to Linux/Unix was done by Uwe F. Mayer <mayer@tux.org>. + +The index-split was done by Andrew D. Balsa, and reflects the +realization that memory management is important in CPU design. The +original tests have been left alone; however, the tests NUMERIC SORT, +FP EMULATION, IDEA, and HUFFMAN now constitute the integer-arithmetic +focused benchmark index, while the tests STRING SORT, BITFIELD, and +ASSIGNMENT make up the new memory index. + +The algorithms were not changed from the source which was obtained +from the BYTE web site at http://www.byte.com/bmark/bmark.htm on +December 14, 1996. However, the source was modified to work better +with 64-bit machines (in particular the random number generator was +modified to always work with 32 bits, no matter what kind of hardware +you run it on). Furthermore, for some of the algorithms additional +resets of the data were added to increase the consistency across +different hardware. Some extra debugging code was added, which has no +impact on normal runs. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run than on an +unloaded system. 
This is because the benchmark does some statistical +analysis to make sure that the reported results are statistically +significant, and an increased variation in individual runs requires +more runs to achieve the required statistical confidence. + +This is a single-threaded benchmark and is not designed to measure the +performance gain on multi-processor machines. + +For details and customization read bdoc.txt. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.motorola b/README.motorola new file mode 100644 index 0000000..223001b --- /dev/null +++ b/README.motorola @@ -0,0 +1,29 @@ +The information in this file is old and no longer valid. It seems that +the GNU C library has caught up with Motorola's libmoto, and now +performance is just as good (or better) without libmoto. I'll include +the old notice out of historical reasons only. Currently libmoto is +available at ftp://ftp.mcg.mot.com/pub/SPS/PowerPC/software/mklinux/libmoto/, +but this is subject to change and not under my control. + +February 18, 2003 +Uwe F. Mayer + +--------------------------------------------------------------------------- + +If you have a Motorola CPU or equivalent: + +When linked with the 'libmoto' (floating point library from Motorola) +the results you obtain are much better. 
(FPU index of 0.896 versus +1.910 in one example.) + +The Motorola math library is currently available at: +http://www.mot.com/SPS/PowerPC/support/rsw_customer_support/mklinux/libmoto/libmoto_reg_mkdev.html + +If you have a Motorola CPU and you submit a result then please let me +know whether you used libmoto or not. Please read the file README.submit. + +I do not have a Motorola CPU, and I can't help you with installing the +library either. + +December 3, 1997 +Uwe F. Mayer
\ No newline at end of file diff --git a/README.nonlinux b/README.nonlinux new file mode 100644 index 0000000..641fe09 --- /dev/null +++ b/README.nonlinux @@ -0,0 +1,50 @@ +December 3, 1993 +================ + +DEC Unix 4.0 or DEC OSF1 and gcc +-------------------------------- +Compiles cleanly if you don't use -funroll-loops with gcc-2.7.2.3 or earlier + +DEC UNIX 4.0 or DEC OSF1 and cc +------------------------------- +CC = cc +CFLAGS = -O3 +LINKFLAGS = -s -non_shared + +Compiles cleanly. + +SunOS and gcc +------------- +Compiles cleanly + +SunOS and cc +------------ +CC = cc +CFLAGS = -O3 -s + +Compiles with one warning during compilation of nbench1.c + +"/usr/ucbinclude/strings.h", line 48: warning: identifier redeclared: strlen + current : function() returning int + previous: function() returning uint : "/usr/include/string.h", line 98 + +HP-UX and gcc +------------- +Compiles with one warning during compilation of sysspec.c + +In file included from /usr/local/lib/gcc-lib/hppa1.1-hp-hpux9.05/2.7.2.1/include/malloc.h:9, + from sysspec.h:37, + from sysspec.c:37: +/usr/local/lib/gcc-lib/hppa1.1-hp-hpux9.05/2.7.2.1/include/sys/types.h:117: warning: empty declaration +/usr/local/lib/gcc-lib/hppa1.1-hp-hpux9.05/2.7.2.1/include/sys/types.h:118: warning: empty declaration + +DEC Ultrix and cc +----------------- +CC = cc +CFLAGS = -O2 +LINKFLAGS = -s + +Compiles with a warning about the correct usage of cut when running sysinfo.sh +cut: Usage: cut [-s] [-d<char>] {-c<list> | -f<list>} file ... +cut: Usage: cut [-s] [-d<char>] {-c<list> | -f<list>} file ... + diff --git a/README.submit b/README.submit new file mode 100644 index 0000000..0dd3138 --- /dev/null +++ b/README.submit @@ -0,0 +1,33 @@ +I plan on posting a digest of results in case people mail me any. 
+The URL will be linked to + +http://www.tux.org/~mayer/linux/bmark.html + +If you want to submit, then run the benchmark (use your own +compilation, I don't care with what flags or compiler, but I want all +numbers from a single benchmark run) and fill in the template as given +in the example below: + +CPU : AMD 5x86P75 (486DX4/133MHz) +L2 CACHE : 256 KB +OS : Linux 2.0.32 +C COMPILER : gcc 2.7.2.3 +LIBC : libc-5.4.38 +Pentium 90 INTEGER INDEX : 1.051 +Pentium 90 FLOATING-POINT INDEX : 0.450 +AMD K6/233 MEMORY INDEX : 0.337 +AMD K6/233 INTEGER INDEX : 0.238 +AMD K6/233 FLOATING-POINT INDEX : 0.230 + +Any other format is fine as long as it contains the same info (write +"unknown" or "?" for data you don't know). For example, you could just +cut the summary from the output of nbench and mail it together with +cache, CPU, and OS info in case it is not already present. Please do +not email me the complete output of nbench, or any other unnecessarily +long email, as this just eats up my hard-disk space. However, long +collections of results are of course welcome. + +Send your result to mayer@tux.org + +Uwe F. Mayer +February 18, 2003 @@ -0,0 +1,138 @@ +December 7, 1997 + +This file contains a few results so you may compare your machine. +If you read this much after December 1997 then the results herein +are probably obsolete. + +For a longer and hopefully more up-to-date list of results consult +http://www.tux.org/~mayer/linux/bmark.html +This web site, however, currently lists the old Pentium 90 indices! + +The indices below are with respect to the new AMD K6/233 baseline. + +OS : DEC Ultrix 4.4 +C compiler : cc +libc : unknown version +CPU : mips R6000 +L2 cache : ? 
+MEMORY INDEX : 0.029 +INTEGER INDEX : 0.046 +FLOATING-POINT INDEX: 0.077 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel 486DX2/66 MHz +L2 cache : 256 KB +MEMORY INDEX : 0.098 +INTEGER INDEX : 0.141 +FLOATING-POINT INDEX: 0.116 + +OS : LINUX 2.0.32 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : AMD 5x86P75 (486DX4/133MHz) +L2 cache : 256 KB +MEMORY INDEX : 0.234 +INTEGER INDEX : 0.286 +FLOATING-POINT INDEX: 0.249 + +OS : OSF1 V3.2 214 +C compiler : cc +libc : unknown version +CPU : 21064 alpha (DEC 3000 MODEL 300, year 1993) +L2 cache : 256 KB +MEMORY INDEX : 0.358 +INTEGER INDEX : 0.362 +FLOATING-POINT INDEX: 0.656 + +OS : HP-UX A.09.05 +C compiler : gcc version 2.7.2.1 +libc : unknown version +CPU : 9000/715 +L2 cache : ? +MEMORY INDEX : 0.208 +INTEGER INDEX : 0.369 +FLOATING-POINT INDEX: 0.516 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel Pentium 133 MHz +L2 cache : 512 KB +MEMORY INDEX : 0.383 +INTEGER INDEX : 0.444 +FLOATING-POINT INDEX: 0.632 + +OS : SunOS 5.5.1 +C compiler : cc +libc : unknown version +CPU : SUN-Ultra-Enterprise-2 sparc +L2 cache : ? 
+MEMORY INDEX : 0.417 +INTEGER INDEX : 0.546 +FLOATING-POINT INDEX: 1.028 + +OS : LINUX 2.0.29 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Cyrix 6x86L PR200+ (at 2 x 75 = 150 MHz) +L2 cache : 256 KB +MEMORY INDEX : 0.666 +INTEGER INDEX : 0.599 +FLOATING-POINT INDEX: 0.508 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel Pentium MMX 200 MHz +L2 cache : 512 KB +MEMORY INDEX : 0.601 +INTEGER INDEX : 0.636 +FLOATING-POINT INDEX: 0.970 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel 686 PentiumPro 200 MHz +L2 cache : 256 KB (internal) +MEMORY INDEX : 0.699 +INTEGER INDEX : 0.732 +FLOATING-POINT INDEX: 1.140 + +OS : LINUX 2.0.29 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Cyrix 6x86MX PR233 (at 2.5 x 75 = 187.5 MHz) +L2 cache : 512 KB +MEMORY INDEX : 0.861 +INTEGER INDEX : 0.773 +FLOATING-POINT INDEX: 0.730 + +OS : LINUX 2.0.32 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : AMD K6/233 +L2 cache : 512 KB +MEMORY INDEX : 1.000 +INTEGER INDEX : 1.000 +FLOATING-POINT INDEX: 1.000 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel 686 Pentium II 300 MHz +L2 cache : 512 KB +MEMORY INDEX : 1.255 +INTEGER INDEX : 1.093 +FLOATING-POINT INDEX: 1.842 + +OS : DEC UNIX 4.0b 564 +C compiler : cc +libc : unknown version +CPU : 21164 Alpha 300 MHz (dual CPU) +L2 cache : 96 KB +L3 cache : 4 MB per CPU +MEMORY INDEX : 0.973 +INTEGER INDEX : 1.124 +FLOATING-POINT INDEX: 3.237 diff --git a/bdoc.txt b/bdoc.txt new file mode 100644 index 0000000..e557bb0 --- /dev/null +++ b/bdoc.txt @@ -0,0 +1,2109 @@ +http://www.byte.com/bmark/bmark.htm +---------------------------------------------------------------------------- + +BYTEmark + +---------------------------------------------------------------------------- + +This is release 2 of BYTE Magazine's BYTEmark benchmark program (previously +known as BYTE's Native Mode 
Benchmarks). This document covers the Native +Mode (a.k.a. Algorithm Level) tests; benchmarks designed to expose the +capabilities of a system's CPU, FPU, and memory system. Another group of +benchmarks within the BYTEmark suite includes the Application Simulation +Benchmarks. They are detailed in a separate document. [NOTE: The +documentation for the Application simulation benchmarks should appear before +the end of March, 95. -- RG]. + +The Tests + +The Native Mode portion of the BYTEmark consists of a number of well-known +algorithms; some BYTE has used before in earlier versions of the benchmark, +others are new. The complete suite consists of 10 tests: + +Numeric sort - Sorts an array of 32-bit integers. + +String sort - Sorts an array of strings of arbitrary length. + +Bitfield - Executes a variety of bit manipulation functions. + +Emulated floating-point - A small software floating-point package. + +Fourier coefficients - A numerical analysis routine for calculating series +approximations of waveforms. + +Assignment algorithm - A well-known task allocation algorithm. + +Huffman compression - A well-known text and graphics compression algorithm. + +IDEA encryption - A relatively new block cipher algorithm. + +Neural Net - A small but functional back-propagation network simulator. + +LU Decomposition - A robust algorithm for solving linear equations. + +A more complete description of each test can be found in later sections of +this document. + +BYTE built the BYTEmark with the multiplatform world foremost in mind. There +were, of course, other considerations that we kept high on the list: + +Real-world algorithms. The algorithms should actually do something. Previous +benchmarks often moved gobs of bytes from one point to another, added or +subtracted piles and piles of numbers, or (in some cases) actually executed +NOP instructions. We should not belittle those tests of yesterday, they had +their place. 
However, we think it better that tests be based on activities +that are more complex in nature. + +Easy to port. All the benchmarks are written in "vanilla" ANSI C. This +provides us with the best chance of moving them quickly and accurately to +new processors and operating systems as they appear. It also simplifies +maintenance. + +This means that as new 64-bit (and, perhaps, 128-bit) processors appear, the +benchmarks can test them as soon as a compiler is available. + +Comprehensive. The algorithms were derived from a variety of sources. Some +are routines that BYTE had been using for some time. Others are routines +derived from well-known texts in the computer science world. Furthermore, +the algorithms differ in structure. Some simply "walk" sequentially through +one-dimensional arrays. Others build and manipulate two-dimensional arrays. +Finally, some benchmarks are "integer" tests, while others exercise the +floating-point coprocessor (if one is available). + +Scalable. We wanted these benchmarks to be useful across as wide a variety +of systems as possible. We also wanted to give them a lifetime beyond the +next wave of new processors. + +To that end, we incorporated "dynamic workload adjustment." A complete +description of this appears in a later section. In a nutshell, this allows +the tests to "expand or contract" depending on the capabilities of the +system under test, all the while providing consistent results so that fair +and accurate comparisons are possible. + +Honesty In Advertising + +We'd be lying if we said that the BYTEmark was all the benchmarking that +anyone would ever need to run on a system. It would be equally inaccurate to +suggest that the tests are completely free of inadequacies. There are many +things the tests do not do, there are shortcomings, and there are problems. + +BYTE will continue to improve the BYTEmark. 
The source code is freely +available, and we encourage vendors and users to examine the routines and +provide us with their feedback. In this way, we assure fairness, +comprehensiveness, and accuracy. + +Still, as we mentioned, there are some shortcomings. Here are those we +consider the most significant. Keep them in mind as you examine the results +of the benchmarks now and in the future. + +At the mercy of C compilers. Being written in ANSI C, the benchmark program +is highly portable. This is a reflection of the "world we live in." If this +were a one-processor world, we might stand a chance at hand-crafting a +benchmark in assembly language. (At one time, that's exactly what BYTE did.) +Not today, no way. + +The upshot is that the benchmarks must be compiled. For broadest coverage, +we selected ANSI C. And when they're compiled, the resulting executable's +performance can be highly dependent on the capabilities of the C compiler. +Today's benchmark results can be blown out of the water tomorrow if someone +new enters the scene with an optimizing strategy that outperforms existing +competition. + +This concern is not easily waved off. It will require you to keep careful +track of compiler version and optimization switches. As BYTE builds its +database of benchmark results, version number and switch setting will become +an integral part of that data. This will be true for published information +as well, so that you can make comparisons fairly and accurately. BYTE will +control the distribution of test results so that all relevant compiler +information is attached to the data. + +As a faint justification -- for those who think this situation results in +"polluted" tests -- we should point out that we are in the same boat as all +the other developers (at least, all those using C compilers -- and that's +quite a sizeable group). If the only C compilers for a given system happen +to be poor ones, everyone suffers. 
It's a fact that a given platform's +ultimate potential depends as much on the development software available as +on the technical achievements of the hardware design. + +It's just CPU and FPU. It's very tempting to try to capture the performance +of a machine in a single number. That has never been possible -- though it's +been tried a lot -- and the gap between that ideal and reality will forever +widen. + +These benchmarks are meant to expose the theoretical upper limit of the CPU, +FPU, and memory architecture of a system. They cannot measure video, disk, +or network throughput (those are the domains of a different set of +benchmarks). You should, therefore, use the results of these tests as part, +not all, of any evaluation of a system. + +Single threaded. Currently, each benchmark test uses only a single execution +thread. It's unlikely that you'll find any modern operating system that does +not have some multitasking component. How a system "scales" as more tasks +are run simultaneously is an effect that the current benchmarks cannot +explore. + +BYTE is working on a future version of the tests that will solve this +problem. + +The tests are synthetic. This quite reasonable argument is based on the fact +that people don't run benchmarks for a living, they run applications. +Consequently, the only true measure of a system is how well it performs +whatever applications you will be running. This, in fact, is the philosophy +behind the BAPCo benchmarks. + +This is not a point with which we would disagree. BYTE regularly makes use +of a variety of application benchmarks. None of this suggests, however, that +the BYTEmark benchmarks serve no purpose. + +BYTEmark's results should be used as predictors. They can be moved to a new +platform long before native applications will be ported. The BYTEmark +benchmarks will therefore provide an early look at the potential of the +machine. 
Additionally, the BYTEmark permits you to "home in" on an aspect of +the overall architecture. How well does the system perform when executing +floating-point computations? Does its memory architecture help or hinder the +management of memory buffers that may fall on arbitrary address boundaries? +How does the cache work with a program whose memory access favors moving +randomly through memory as opposed to moving sequentially through memory? + +The answers to these questions can give you a good idea of how well a system +would support a particular class of applications. Only a synthetic benchmark +can give the narrow view necessary to find the answers. + +Dynamic Workloads + +Our long history of benchmarking has taught us one thing above all others: +Tomorrow's system will go faster than today's by an amount exceeding your +wildest guess -- and then some. Dealing with this can become an unending +race. + +It goes like this: You design a benchmark algorithm, you specify its +parameters (how big the array is, how many loops, etc.), you run it on +today's latest super-microcomputer, collect your data, and go home. A new +machine arrives the next day, you run your benchmark, and discover that the +test executes so quickly that the resolution of the clock routine you're +using can't keep up with it (i.e., the test is over and done before the +system clock even has a chance to tick). + +If you modify your routine, the figures you collected yesterday are no good. +If you create a better clock routine by sneaking down into the system +hardware, you can kiss portability goodbye. + +The BYTEmark benchmarks solve this problem by a process we'll refer to as +"dynamic workload adjustment." In principle, it simply means that if the +test runs so fast that the system clock can't time it, the benchmark +increases the test workload -- and keeps increasing it -- until enough time +is consumed to gather reliable test results. + +Here's an example. 
+ +The BYTEmark benchmarks perform timing using a "stopwatch" paradigm. The +routine StartStopwatch() begins timing; StopStopwatch() ends timing and +reports the elapsed time in clock ticks. Now, "clock ticks" is a value that +varies from system to system. We'll presume that our test system provides +1000 clock ticks per second. (We'll also presume that the system actually +updates its clock 1000 times per second. Surprisingly, some systems don't do +that. One we know of will tell you that the clock provides 100 ticks per +second, but updates the clock in 5- or 6-tick increments. The resolution is +no better than somewhere around 1/18th of a second.) Here, when we say +"system" we mean not only the computer system, but the environment provided +by the C compiler. Interestingly, different C compilers for the same system +will report different clock ticks per second. + +Built into the benchmarks is a global variable called GLOBALMINTICKS. This +variable is the minimum number of clock ticks that the benchmark will allow +StopStopwatch() to report. + +Suppose you run the Numeric Sort benchmark. The benchmark program will +construct an array filled with random numbers, call StartStopwatch(), sort +the array, and call StopStopwatch(). If the time reported in StopStopwatch() +is less than GLOBALMINTICKS, then the benchmark will build two arrays, and +try again. If sorting two arrays took less time than GLOBALMINTICKS, the +process repeats with more arrays. + +This goes on until the benchmark makes enough work so that an interval +between StartStopwatch() and StopStopwatch() exceeds GLOBALMINTICKS. Once +that happens, the test is actually run, and scores are calculated. + +Notice that the benchmark didn't make bigger arrays, it made more arrays. +That's because the time taken by the sort test does not increase linearly as +the array grows, it increases by a factor of N*log(N) (where N is the size +of the array). + +This principle is applied to all the benchmark tests. 
A machine with a less +accurate clock may be forced to sort more arrays at a time, but the results +are given in arrays per second. In this way fast machines, slow machines, +machines with accurate clocks, machines with less accurate clocks, can all +be tested with the same code. + +Confidence Intervals + +Another built-in feature of the BYTEmark is a set of statistical-analysis +routines. Running benchmarks is one thing; the question arises as to how +many times a test should be run until you know you have a good sampling. +Also, can you determine whether the test is stable (i.e., do results vary +widely from one execution of the benchmark to the next)? + +The BYTEmark keeps score as follows: Each test (a test being a numeric +sort, a string sort, etc.) is run five times. These five scores are +averaged, the standard deviation is determined, and a 95% confidence +half-interval for the mean is calculated (using Student's t +distribution). This tells us that the true average lies -- with a 95% +probability -- within plus or minus the confidence half-interval of +the calculated average. If this half-interval is within 5% of the +calculated average, the benchmarking stops. Otherwise, a new test is +run and the calculations are repeated with all of the runs done so +far, including the new one. The benchmark proceeds this way up to a +total of 30 runs. If the length of the half-interval is still bigger +than 5% of the calculated average then a warning is issued that the +results might not be statistically certain before the average is +displayed. + +** Fixed a statistical bug here. Uwe F. Mayer + +The upshot is that, for each benchmark test, the true average is -- with a +95% level of confidence -- within 5% of the average reported. Here, the +"true average" is the average we would get were we able to run the tests +over and over again an infinite number of times.
+ +This specification ensures that the calculation of results is controlled; +that someone running the tests in California will use the same technique for +determining benchmark results as someone running the tests in New York. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run the benchmark suite +as compared to a run on an unloaded system. This is because the benchmark does +some statistical analysis to make sure that the reported results are +statistically significant (as explained above), and a high variation in +individual runs requires more runs to achieve the required statistical +confidence. + +*** added the last paragraph, Uwe F. Mayer + +Interpreting Results + +Of course, running the benchmarks can present you with a boatload of data. +It can get mystifying, and some of the more esoteric statistical information +is valuable only to a limited audience. The big question is: What does it +all mean? + +First, we should point out that the BYTEmark reports both "raw" and indexed +scores for each test. The raw score for a particular test amounts to the +"iterations per second" of that test. For example, the numeric sort test +reports as its raw score the number of arrays it was able to sort per +second. + +The indexed score is the raw score of the system under test divided by the +raw score obtained on the baseline machine. As of this release, the +baseline machine is a DELL 90 Mhz Pentium XPS/90 with 16 MB of RAM and 256K +of external processor cache. (The compiler used was the Watcom C/C++ 10.0 +compiler; optimizations set to "fastest possible code", 4-byte structure +alignment, Pentium code generation with Pentium register-based calling. The +operating system was MSDOS.) The indexed score serves to "normalize" the +raw scores, reducing their dynamic range and making them easier to +grasp.
Simply put, if your machine has an index score of 2.0 on the numeric +sort test, it performed that test twice as fast as this 90 Mhz Pentium. + +If you run all the tests (as you'll see, it is possible to perform "custom +runs", which execute only a subset of the tests) the BYTEmark will also +produce two overall index figures: Integer index and Floating-point index. +The Integer index is the geometric mean of those tests that involve only +integer processing -- numeric sort, string sort, bitfield, emulated +floating-point, assignment, Huffman, and IDEA -- while the Floating-point +index is the geometric mean of those tests that require the floating-point +coprocessor -- Fourier, neural net, and LU decomposition. You can use these +scores to get a general feel for the performance of the machine under test +as compared to the baseline 90 Mhz Pentium. + +The Linux/Unix port has a second baseline machine, it is an AMD K6/233 with +32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and using GNU gcc +version 2.7.2.3 and libc-5.4.38. The integer index was split as suggested +by Andrew D. Balsa <andrewbalsa@usa.net>, and reflects the realization that +memory management is important in CPU design. The original tests have been +left alone, however, the geometric mean of the tests NUMERIC SORT, FP +EMULATION, IDEA, and HUFFMAN now constitutes the integer-arithmetic focused +benchmark index, while the geometric mean of the tests STRING SORT, +BITFIELD, and ASSIGNMENT makes up the new memory index. The floating point +index has been left alone, it is still the geometric mean of FOURIER, +NEURAL NET, and LU DECOMPOSITION. + +*** added the section on Linux, Uwe F. Mayer + +What follows is a list of the benchmarks and associated brief remarks that +describe what the tests do: What they exercise; what a "good" result or a +"bad" result means. 
Keep in mind that, in this expanding universe of faster +processors, bigger caches, more elaborate memory architectures, "good" and +"bad" are indeed relative terms. A good score on today's hot new processor +will be a bad score on tomorrow's hot new processor. + +These remarks are based on empirical data and profiling that we have done to +date. (NOTE: The profiling is limited to Intel and Motorola 68K on this +release. As more data is gathered, we will be refining this section. +3/14/95--RG) + +Benchmark Description + +Numeric sort Generic integer performance. Should + exercise non-sequential performance + of cache (or memory if cache is less + than 8K). Moves 32-bit longs at a + time, so 16-bit processors will be + at a disadvantage. + + + +String sort Tests memory-move performance. + Should exercise non-sequential + performance of cache, with added + burden that moves are byte-wide and + can occur on odd address boundaries. + May tax the performance of + cell-based processors that must + perform additional shift operations + to deal with bytes. + + + +Bitfield Exercises "bit twiddling" + performance. Travels through memory + in a somewhat sequential fashion; + different from sorts in that data is + merely altered in place. If + properly compiled, takes into + account 64-bit processors, which + should see a boost. + + + +Emulated F.P. Past experience has shown this test + to be a good measurement of overall + performance. + + + +Fourier Good measure of transcendental and + trigonometric performance of FPU. + Little array activity, so this test + should not be dependent on cache or + memory architecture. + + + +Assignment The test moves through large integer + arrays in both row-wise and + column-wise fashion. Cache/memory + with good sequential performance + should see a boost (memory is + altered in place -- no moving as in + a sort operation). Processing is + done in 32-bit chunks -- no + advantage given to 64-bit + processors.
+ + + +Huffman A combination of byte operations, + bit twiddling, and overall integer + manipulation. Should be a good + general measurement. + + + +IDEA Moves through data sequentially in + 16-bit chunks. Should provide a + good indication of raw speed. + + + +Neural Net Small-array floating-point test + heavily dependent on the exponential + function; less dependent on overall + FPU performance. Small arrays, so + cache/memory architecture should not + come into play. + + + +LU decomposition. A floating-point test that moves + through arrays in both row-wise and + column-wise fashion. Exercises only + fundamental math operations (+, -, + *, /). + +The Command File + +Purpose + +The BYTEmark program allows you to override many of its default parameters +using a command file. The command file also lets you request statistical +information, as well as specify an output file to hold the test results for +later use. + +You identify the command file using a command-line argument. E.g., + +C:NBENCH -cCOMFILE.DAT + +tells the benchmark program to read from COMFILE.DAT in the current +directory. + +The content of the command file is simply a series of parameter names and +values, each on a single line. The parameters control internal variables +that are either global in nature (i.e., they affect all tests in the +program) or are specific to a given benchmark test. + +The parameters are listed in a reference guide that follows, arranged in the +following groups: + +Global Parameters + +Numeric Sort + +String Sort + +Bitfield + +Emulated floating-point + +Fourier coefficients + +Assignment algorithm + +IDEA encryption + +Huffman compression + +Neural net + +LU decomposition + +As mentioned above, those items listed under "Global Parameters" affect all +tests; the rest deal with specific benchmarks. There is no required ordering +to parameters as they appear in the command file. You can specify them in +any sequence you wish.
+ +You should be judicious in your use of a command file. Some parameters will +override the "dynamic workload" adjustment that each test performs. Doing +this completely bypasses the benchmark code that is designed to produce an +accurate reading from your system clock. Other parameters will alter default +settings, yielding test results that cannot be compared with published +benchmark results. + +A Sample Command File + +Suppose you built a command file that contained the following: + +ALLSTATS=T + +CUSTOMRUN=T + +OUTFILE=D:\DATA.DAT + +DONUMSORT=T + +DOLU=T + +Here's what this file tells the benchmark program: + +ALLSTATS=T means that you've requested a "dump" of all the statistics the +test gathers. This includes not only the standard deviations of tests run, +it also produces test-specific information such as the number of arrays +built, the array size, etc. + +CUSTOMRUN=T tells the system that this is a custom run. Only tests +explicitly specified will be executed. + +OUTFILE=D:\DATA.DAT will write the output of the benchmark to the file +DATA.DAT on the root of the D: drive. (If DATA.DAT already exists, output +will be appended to the file.) + +DONUMSORT=T tells the system to run the numeric sort benchmark. (This was +necessary on account of the CUSTOMRUN=T line, above.) + +DOLU=T tells the system to run the LU decomposition benchmark. + +Command File Parameters Reference + +(NOTE: Altering some global parameters can invalidate results for comparison +purposes. Those parameters are indicated in the following section by a bold +asterisk (*). If you alter any parameters so indicated, you may NOT publish +the resulting data as BYTEmark scores.) + +Global Parameters + +GLOBALMINTICKS=<n> + +This overrides the default global_min_ticks value (defined in NBENCH1.H). +The global_min_ticks value is defined as the minimum number of clock ticks +per iteration of a particular benchmark. 
For example, if global_min_ticks is +set to 100 and the numeric sort benchmark is run, each iteration MUST take +at least 100 ticks, or the system will expand the work-per-iteration. + +MINSECONDS=<n> + +Sets the minimum number of seconds any particular test will run. This has +the effect of controlling the number of repetitions done. Default: 5. + +ALLSTATS=<T|F> + +Set this flag to T for a "dump" of all statistics. The information displayed +varies from test to test. Default: F. + +OUTFILE=<path> + +Specifies that output should go to the specified output file. Any test +results and statistical data displayed on-screen will also be written to the +file. If the file does not exist, it will be created; otherwise, new output +will be appended to an existing file. This allows you to "capture" several +runs into a single file for later review. + +Note: the path should not appear in quotes. For example, something like the +following would work: OUTFILE=C:\BENCH\DUMP.DAT + +CUSTOMRUN=<T|F> + +Set this flag to T for a custom run. A "custom run" means that the program +will run only the benchmark tests that you explicitly specify. So, use this +flag to run a subset of the tests. Default: F. + +Numeric Sort + +DONUMSORT=<T|F> + +Indicates whether to do the numeric sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NUMNUMARRAYS=<n> + +Indicates the number of numeric arrays the system will build. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +NUMARRAYSIZE=<n> + +Indicates the number of elements in each numeric array. Default is 8111 +entries. (NOTE: Altering this value will invalidate the test for comparison +purposes. The performance of the numeric sort test is not related to the +array size as a linear function; i.e., an array twice as big will not take +twice as long.
The relationship involves a logarithmic function.)* + +NUMMINSECONDS=<n> + +Overrides MINSECONDS for the numeric sort test. + +String Sort + +DOSTRINGSORT=<T|F> + +Indicates whether to do the string sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +STRARRAYSIZE=<n> + +Sets the size of the string array. Default is 8111. (NOTE: Altering this +value will invalidate the test for comparison purposes. The performance of +the string sort test is not related to the array size as a linear function; +i.e., an array twice as big will not take twice as long. The relationship +involves a logarithmic function.)* + +NUMSTRARRAYS=<n> + +Sets the number of string arrays that will be created to run the test. +Setting this value will override the program's "dynamic workload" adjustment +for this test.* + +STRMINSECONDS=<n> + +Overrides MINSECONDS for the string sort test. + +Bitfield + +DOBITFIELD=<T|F> + +Indicates whether to do the bitfield test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NUMBITOPS=<n> + +Sets the number of bitfield operations that will be performed. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +BITFIELDSIZE=<n> + +Sets the number of 32-bit elements in the bitfield arrays. The default value +is dependent on the size of a long as defined by the current compiler. For a +typical compiler that defines a long to be 32 bits, the default is 32768. +(NOTE: Altering this parameter will invalidate test results for comparison +purposes.)* + +BITMINSECONDS=<n> + +Overrides MINSECONDS for the bitfield test. + +Emulated floating-point + +DOEMF=<T|F> + +Indicates whether to do the emulated floating-point test. Default is T, +unless this is a custom run (CUSTOMRUN=T), in which case the default is F. + +EMFARRAYSIZE=<n> + +Sets the size (number of elements) of the emulated floating-point benchmark. +Default is 3000. 
The test builds three arrays, each of equal size. This +parameter sets the number of elements for EACH array. (NOTE: Altering this +parameter will invalidate test results for comparison purposes.)* + +EMFLOOPS=<n> + +Sets the number of loops per iteration of the floating-point test. Setting +this value will override the program's "dynamic workload" adjustment for +this test.* + +EMFMINSECONDS=<n> + +Overrides MINSECONDS for the emulated floating-point test. + +Fourier coefficients + +DOFOUR=<T|F> + +Indicates whether to do the Fourier test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +FOURASIZE=<n> + +Sets the size of the array for the Fourier test. This sets the number of +coefficients the test will derive. NOTE: Specifying this value will override +the system's "dynamic workload" adjustment for this test, and may make the +results invalid for comparison purposes.* + +FOURMINSECONDS=<n> + +Overrides MINSECONDS for the Fourier test. + +Assignment Algorithm + +DOASSIGN=<T|F> + +Indicates whether to do the assignment algorithm test. Default is T, unless +this is a custom run (CUSTOMRUN=T), in which case the default is F. + +ASSIGNARRAYS=<n> + +Indicates the number of arrays that will be built for the test. Specifying +this value will override the system's "dynamic workload" adjustment for this +test. (NOTE: The size of the arrays in the assignment algorithm is fixed at +101 x 101. Altering the array size requires adjusting global constants and +recompiling; to do so, however, would invalidate test results.)* + +ASSIGNMINSECONDS=<n> + +Overrides MINSECONDS for the assignment algorithm test. + +IDEA encryption + +DOIDEA=<T|F> + +Indicates whether to do the IDEA encryption test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +IDEAARRAYSIZE=<n> + +Sets the size of the plain-text character array that will be encrypted by the +test. Default is 4000. 
The benchmark actually builds 3 arrays: 1st +plain-text, encrypted version, and 2nd plain-text. The 2nd plain-text array is +the destination for the decryption process [part of the test]. All arrays +are set to the same size. (NOTE: Specifying this value will invalidate test +results for comparison purposes.)* + +IDEALOOPS=<n> + +Indicates the number of loops in the IDEA test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +IDEAMINSECONDS=<n> + +Overrides MINSECONDS for the IDEA test. + +Huffman compression + +DOHUFF=<T|F> + +Indicates whether to do the Huffman test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +HUFFARRAYSIZE=<n> + +Sets the size of the string buffer that will be compressed using the Huffman +test. The default is 5000. (NOTE: Altering this value will invalidate test +results for comparison purposes.)* + +HUFFLOOPS=<n> + +Sets the number of loops in the Huffman test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +HUFFMINSECONDS=<n> + +Overrides MINSECONDS for the Huffman test. + +Neural net + +DONNET=<T|F> + +Indicates whether to do the Neural Net test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NNETLOOPS=<n> + +Sets the number of loops in the Neural Net test. NOTE: Altering this value +overrides the benchmark's "dynamic workload" adjustment algorithm, and may +invalidate the results for comparison purposes.* + +NNETMINSECONDS=<n> + +Overrides MINSECONDS for the Neural Net test. + +LU decomposition + +DOLU=<T|F> + +Indicates whether to do the LU decomposition test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +LUNUMARRAYS=<n> + +Sets the number of arrays in each iteration of the LU decomposition test. 
+Specifying this value will override the system's "dynamic workload" +adjustment for this test.* + +LUMINSECONDS=<n> + +Overrides MINSECONDS for the LU decomposition test. + +Numeric Sort + +Description + +This benchmark is designed to explore how well the system sorts a numeric +array. In this case, a numeric array is a one-dimensional collection of +signed, 32-bit integers. The actual sorting is performed by a heapsort +algorithm (see the text box following for a description of the heapsort +algorithm). + +It's probably unnecessary to point out (but we'll do it anyway) that sorting +is a fundamental operation in computer application software. You'll likely +find sorting routines nestled deep inside a variety of applications; +everything from database systems to operating-systems kernels. + +The numeric sort benchmark reports the number of arrays it was able to sort +per second. The array size is set by a global constant (it can be overridden +by the command file -- see below). + +Analysis + +Optimized 486 code: Profiling of the numeric sort benchmark using Watcom's +profiler (Watcom C/C++ 10.0) indicates that the algorithm spends most of its +time in the numsift() function (specifically, about 90% of the benchmark's +time takes place in numsift()). Within numsift(), two if statements dominate +time spent: + +if(array[k]<array[k+1L]) and if(array[i]<array[k]) + +Both statements involve indexes into arrays, so it's likely the processor is +spending a lot of time resolving the array references. (Though both +statements involve "less-than" comparisons, we doubt that much time is +consumed in performing the signed compare operation.) Though the first +statement involves array elements that are adjacent to one another, the +second does not. In fact, the second statement will probably involve +elements that are far apart from one another during early passes through the +sifting process. 
We expect that systems whose caching system pre-fetches +contiguous elements (often in "burst" line fills) will not have any great +advantage over systems without pre-fetch mechanisms. + +Similar results were found when we profiled the numeric sort algorithm under +the Borland C/C++ compiler. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based; consequently, it does not allow for line-by-line analysis as does the +Watcom compiler's profiler. + +However, the CodeWarrior profiler does give us enough information to note +that NumSift() only accounts for about 28% of the time consumed by the +benchmark. The outer routine, NumHeapSort(), accounts for around 71% of the +time taken. It will require additional analysis to determine why the two +compilers -- Watcom and CodeWarrior -- divide the workload so differently. (It +may have something to do with compiler architecture, or the act of profiling +the code may produce results that are significantly different than how the +program runs under normal conditions, though that would lead one to wonder +what use profilers would be.) + +Porting Considerations + +The numeric sort routine should represent a trivial porting exercise. It is +not an overly large benchmark in terms of source code. Additionally, the +only external routines it calls on are for allocating and releasing memory, +and managing the stopwatch. + +The numeric sort benchmark depends on the following global definitions (note +that these may be overridden by the command file): + +NUMNUMARRAYS -- Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The numeric sort benchmark creates work for +itself by requiring the system to sort more and more arrays...not bigger and +bigger arrays. (The latter case would skew results, because the sorting time +for heapsort is N log2 N - e.g., doubling the array size does not double the +sort time.)
This constant sets the upper limit to the number of arrays the +system will build before it signals an error. The default value is 100, and +may be changed if your system exceeds this limit. + +NUMARRAYSIZE - Determines the size of each array built. It has been set to +8111L and should not be tampered with. The command file entry +NUMARRAYSIZE=<n> can be used to change this value, but +doing this will make your results incompatible with other runs of the +benchmark (since results will be skewed -- see preceding paragraph). + +To test for a correct execution of the numeric sort benchmark, #define the +DEBUG symbol. This will enable code that verifies that arrays are properly +sorted. You should run the benchmark program using a command file that has +only the numeric sort test enabled. If there is an error, the program will +display "SORT ERROR" (if this happens, it's possible that tons of "SORT +ERROR" messages will be emitted, so it's best not to redirect output to a +file), otherwise it will print "Numeric sort: OK" (also quite a few times). + +References + +Gonnet, G.H. 1984, Handbook of Algorithms and Data Structures (Reading, MA: +Addison-Wesley). + +Knuth, Donald E. 1968, Fundamental Algorithms, vol 1 of The Art of Computer +Programming (Reading, MA: Addison-Wesley). + +Press, William H., Flannery, Brian P., Teukolsky, Saul A., and Vetterling, +William T. 1989, Numerical Recipes in Pascal (Cambridge: Cambridge +University Press). + +Heapsort + +The heapsort algorithm is well-covered in a number of the popular +computer-science textbooks. In fact, it gets a pat on the back in Numerical +Recipes (Press et al.), where the authors write: + +Heapsort is our favorite sorting routine. It can be recommended +wholeheartedly for a variety of sorting applications. It is a true +"in-place" sort, requiring no auxiliary storage. + +Heapsort works by building the array into a kind of a queue called a heap.
+You can imagine this heap as being a form of in-memory binary tree. The +topmost (root) element of the tree is the element that -- were the array +sorted -- would be the largest element in the array. Sorting takes place by +first constructing the heap, then pulling the root off the tree, promoting +the next largest element to the root, pulling it off, and so on. (The +promotion process is known as "sifting up.") + +Heapsort executes in N log2 N time even in its worst case. Unlike some other +sorting algorithms, it does not benefit from a partially sorted array +(though Gonnet does refer to a variation of heapsort, called "smoothsort," +which does -- see references). + +String Sort + +Description + +This benchmark is designed to gauge how well the system moves bytes around. +By that we mean, how well the system can copy a string of bytes from one +location to another; source and destination being aligned to arbitrary +addresses. (This is unlike the numeric sort array, which moves bytes +longword-at-a-time.) The strings themselves are built so as to be of random +length, ranging from no fewer than 4 bytes and no greater than 80 bytes. The +mixture of random lengths means that processors will be forced to deal with +strings that begin and end on arbitrary address boundaries. + +The string sort benchmark uses the heapsort algorithm; this is the same +algorithm as is used in the numeric sort benchmark (see the sidebar on the +heapsort for a detailed description of the algorithm). + +Manipulation of the strings is actually handled by two arrays. One array +holds the strings themselves; the other is a pointers array. Each member of +the pointers array carries an offset that points into the string array, so +that the ith pointer carries the offset to the ith string. This allows the +benchmark to rapidly locate the position of the ith string. (The sorting +algorithm requires exchanges of items that might be "distant" from one +another in the array. 
It's critical that the routine be able to rapidly find +a string based on its indexed position in the array.) + +The string sort benchmark reports the number of string arrays it was able to +sort per second. The size of the array is set by a global constant. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): Profiling of the string sort +benchmark indicates that it spends most of its time in the C library routine +memmove(). Within that routine, most of the execution is consumed by a pair +of instructions: rep movsw and rep movsd. These are repeated string move -- +word width and repeated string move -- doubleword width, respectively. + +This is precisely where we want to see the time spent. It's interesting to +note that the memmove() of the particular compiler/profiler tested (Watcom +C/C++ 10.0) was "smart" enough to do most of the moving on word or +doubleword boundaries. The string sort benchmark specifically sets arbitrary +boundaries, so we'd expect to see lots of byte-wide moves. The "smart" +memmove() is able to move bytes only when it has to, and does the remainder +of the work via words and doublewords (which can move more bits at a time). + +680x0 Code (Macintosh CodeWarrior): Because CodeWarrior's profiler is +function based, it is impossible to get an idea of how much time the test +spends in library routines such as memmove(). Fortunately, as an artifact of +the early version of the benchmark, the string sort algorithm makes use of +the MoveMemory() routine in the sysspec.c file (system specific routines). +This call, on anything other than a 16-bit DOS system, calls memmove() +directly. Hence, we can get a good approximation of how much time is spent +moving bytes. + +The answer is that nearly 78% of the benchmark's time is consumed by +MoveMemory(), the rest being taken up by the other routines (the +str_is_less() routine, which performs string comparisons, takes about 7% of +the time). 
As above, we can guess that most of the benchmark's time is +dependent on the performance of the library's memmove() routine. + +Porting Considerations + +As with the numeric sort routine, the string sort benchmark should be simple +to port. Simpler, in fact. The string sort benchmark routine is not +dependent on any typedef that may change from machine to machine (unless a +char type is not 8 bits). + +The string sort benchmark depends on the following global definitions: + +NUMSTRARRAYS - Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The string sort benchmark creates work for +itself by requiring the system to sort more and more arrays, not bigger and +bigger arrays. (See section on Numeric Sort for an explanation.) This +constant sets the upper limit to the number of arrays the system will build +before it signals an error. The default value is 100, and may be changed if +your system exceeds this limit. + +STRARRAYSIZE - Sets the default size of the string arrays built. We say +"arrays" because, as with the numeric sort benchmark, the system adds work +not by expanding the size of the array, but by adding more arrays. This +value is set to 8111, and should not be modified, since results would not be +comparable with other runs of the same benchmark on other machines. + +To test for a correct execution of the string sort benchmark, #define +the DEBUG symbol. This will enable code that verifies the arrays are +properly sorted. Set up a command file that runs only the string sort, +and execute the benchmark program. If the routine is operating +properly, the benchmark will print "String sort: OK", this message is +printed quite often. Otherwise, the program will display "SORT ERROR" +for each pair of strings it finds out of order (which can be really +often). + +References + +See the references for the Numeric Sort benchmark. 
+ +Bitfield Operations + +Description + +The purpose of this benchmark is to explore how efficiently the system +executes operations that deal with "twiddling bits." The test is set up to +simulate a "bit map": a data structure used to keep track of storage usage. +(Don't confuse this meaning of "bitmap" with its use in describing a +graphics data structure.) + +Systems often use bit maps to keep an inventory of memory blocks or (more +frequently) disk blocks. In the case of a bit map that manages disk usage, +an operating system will set aside a buffer in memory so that each bit in +that buffer corresponds to a block on the disk drive. A 0 bit means that the +corresponding block is free; a 1 bit means the block is in use. Whenever a +file requests a new block of disk storage, the operating system searches the +bit map for the first 0 bit, sets the bit (to indicate that the block is now +spoken for), and returns the number of the corresponding disk block to the +requesting file. + +These types of operations are precisely what this test simulates. A block of +memory is allocated for the bit map. Another block of memory is +allocated, and set up to hold a series of "bit map commands". Each bitmap +command tells the simulation to do 1 of 3 things: + +1) Clear a series of consecutive bits, + +2) Set a series of consecutive bits, or + +3) Complement (1->0 and 0->1) a series of consecutive bits. + +The bit map command block is loaded with a set of random bit map commands +(each command covers a random number of bits), and the simulation routine steps +sequentially through the command block, grabbing a command and executing it. + +The bitfield benchmark reports the number of bits it was able to operate on +per second. The size of the bit map is constant; the bitfield operations +array is adjusted based on the capabilities of the processor. (See the +section describing the auto-adjust feature of the benchmarks.)
+ +Analysis + +Optimized 486 code: Using the Watcom C/C++ 10.0 profiler, the Bitfield +benchmark appears to spend all of its time in two routines: ToggleBitRun() +(74% of the time) and DoBitFieldIteration() (24% of the time). We say +"appears" because this is misleading, as we will explain. + +First, it is important to recall that the test performs one of three +operations for each run of bits (see above). The routine ToggleBitRun() +handles two of those three operations: setting a run of bits and clearing a +run of bits. An if() statement inside ToggleBitRun() decides which of the +two operations is performed. (Speed freaks will quite rightly point out that +this slows the entire algorithm. ToggleBitRun() is called by a switch() +statement which has already decided whether bits should be set or cleared; +it's a waste of time to have ToggleBitRun() have to make that decision yet +again.) + +DoBitFieldIteration() is the "outer" routine that calls ToggleBitRun(). +DoBitFieldIteration() also calls FlipBitRun(). This latter routine is the +one that performs the third bitfield operation: complementing a run of bits. +FlipBitRun() gets no "air time" at all (while DoBitFieldIteration() gets 24 +% of the time) simply because the compiler's optimizer recognizes that +FlipBitRun() is only called by DoBitFieldIteration(), and is called only +once. Consequently, the optimizer moves FlipBitRun() "inline", i.e., into +DoBitFieldIteration(). This removes an unnecessary call/return cycle (and is +probably part of the reason why the FlipBitRun() code gets 24% of the +algorithm's time, instead of something closer to 30% of its time.) + +Within the routines, those lines of code that actually do the shifting, the +and operations, and the or operations, consume time evenly. This should make +for a good test of a processor's "bit twiddling" capabilities. + +680x0 Code (Macintosh CodeWarrior): The CodeWarrior profiler is function +based. 
Consequently, it is impossible to produce a profile of machine +instruction execution time. We can, however, get a good picture of how the +algorithm divides its time among the various functions. + +Unlike the 486 compiler, the CodeWarrior compiler did not appear to collapse +the FlipBitRun() routine into the outer DoBitFieldIteration() routine. (We +don't know this for certain, of course. It's possible that the compiler +would have done this had we not been profiling.) + +In any case, the time spent in the two "core" routines of the bitfield test +are shown below: + +FlipBitRun() - 18031.2 microsecs (called 509 times) + +ToggleBitRun() - 50770.6 microsecs (called 1031 times) + +In terms of total time, FlipBitRun() takes about 35% of the time (it gets +about 33% of the calls). Remember, ToggleBitRun() is a single routine that +is called both to set and clear bits. Hence, ToggleBitRun() is called twice +as often as FlipBitRun(). + +We can conclude that time spent setting bits to 1, setting bits to 0, and +changing the state of bits, is about equal; the load is balanced close to +what we'd expect it to be, based on the structure of the algorithm. + +Porting Considerations + +The bitfield operations benchmark is dependent on the size of the long +datatype. On most systems, this is 32 bits. However, on some of the newer +RISC chips, a long can be 64 bits long. If your system does use 64-bit +longs, you'll need to #define the symbol LONG64. + +If you are unsure of the size of a long in your system (some C compiler +manuals make it difficult to discover), simply place an ALLSTATS=T line in +the command file and run the benchmarks. This will cause the benchmark +program to display (among other things) the size of the data types int, +short, and long in bytes. + +BITFARRAYSIZE - Sets the number of longs in the bit map array. This number +is fixed, and should not be altered. 
The bitfield test adjusts itself by +adding more bitfield commands (see above), not by creating a larger bit map. + +Currently, there is no code added to test for correct execution. If you are +concerned that your port was incorrect, you'll need to step through your +favorite debugger and verify execution against the original source code. + +** I added a resetting of the random number generator, and a resetting +** of the bitfield to each loop. Those operations are outside of the +** timed loop, and should add to make the benchmark more consistent. +** There also is now debugging information available. If you define +** DEBUG then the program will write a file named "debugbit.dat", +** which is the contents of the bitfield after the calibration loop of +** 30 operations. You can compare this file with the file +** "debugbit.good" that comes with the distribution. +** Uwe F. Mayer <mayer@tux.edu> + +References + +None. + +Emulated Floating-point + +Description + +The emulated floating-point benchmark includes routines that are similar to +those that would be executed whenever a system performs floating-point +operations in the absence of a coprocessor. In general, this amounts to a +mixture of integer instructions, including shift operations, integer +addition and subtraction, and bit testing (among others). + +The benchmark itself is remarkably simple. The test builds three +1-dimensional arrays and loads the first two up with random floating-point +numbers. The arrays are then partitioned into 4 equal-sized groups, and the +test proceeds by performing addition, subtraction, multiplication, and +division -- one operation on each group. (For example, for the addition +group, an element from the first array is added to the second array and the +result is placed in the third array.) + +Of course, most of the work takes place inside the routines that perform the +addition, subtraction, multiplication, and division. 
These routines operate +on a special data type (referred to as an InternalFPF number) that -- though +not strictly IEEE compliant -- carries all the necessary data fields to +support an IEEE-compatible floating-point system. Specifically, an +InternalFPF number is built up of the following fields: + +Type (indicates a NORMAL, SUBNORMAL, etc.) + +Mantissa sign + +Unbiased, signed 16-bit exponent + +4-word (16 bits per word) mantissa. + +The emulated floating-point test reports its results in number of loops per +second (where a "loop" is one pass through the arrays as described above). + +Finally, we are aware that this test could be on its way to becoming an +anachronism. A growing number of systems are appearing that have +coprocessors built into the main CPU. It's possible that floating-point +emulation will one day be a thing of the past. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): The algorithm's time is distributed +across a number of routines. The distribution is: + +ShiftMantLeft1() - 60% of the time + +ShiftMantRight1() - 17% of the time + +DivideInternalFPF() - 14% of the time + +MultiplyInternalFPF() - 5% of the time. + +The first two routines are similar to one another; both shift bits about in +a floating-point number's mantissa. It's reasonable that ShiftMantLeft1() +should take a larger share of the system's time; it is called as part of the +normalization process that concludes every emulated addition, subtraction, +multiplication, and division. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is +function-based; consequently, it isn't possible to get timing at the machine +instruction level. However, the output of CodeWarrior's profiler has +provided insight into the breakdown of time spent in various functions that +forces us to rethink our 486 code analysis.
+ +Analyzing what goes on inside the emulated floating-point tests is a tough +one to call because some of the routines that are part of the test are +called by the function that builds the arrays. Consequently, a quick look at +the profiler's output can be misleading; it's not obvious how much time a +particular routine is spending in the test and how much time that same +routine is spending setting up the test (an operation that does not get +timed). + +Specifically, the routine that loads up the arrays with test data calls +LongToInternalFPF() and DivideInternalFPF(). LongToInternalFPF() makes one +call to normalize() if the number is not a true zero. In turn, normalize() +makes an indeterminate number of calls to ShiftMantLeft1(), depending on the +structure of the mantissa being normalized. + +What's worse, DivideInternalFPF() makes all sorts of calls to all kinds of +important low-level routines such as Sub16Bits() and ShiftMantLeft1(). +Untangling the wiring of which routine is being called as part of the test, +and which is being called as part of the setup could probably be done with +the computer equivalent of detective work and spelunking, but in the +interest of time we'll opt for approximation. + +Here's a breakdown of some of the important routines and their times: + +AddSubInternalFPF() - 1003.9 microsecs (called 9024 times) + +MultiplyInternalFPF() - 20143 microsecs (called 5610 times) + +DivideInternalFPF() - 18820.9 microsecs (called 3366 times). + +The 3366 calls to DivideInternalFPF() are timed calls, not setup calls -- +the profiler at least gives outputs of separate calls made to the same +routine, so we can determine which call is being made by the benchmark, and +which is being made by the setup routine. It turns out that the setup +routine calls DivideInternalFPF() 30,000 times. + +Notice that though addition/subtraction are called most often, +multiplication next, then finally division; the time spent in each is the +reverse. 
Division takes the most time, then multiplication, finally +addition/subtraction. (There's probably some universal truth lurking here +somewhere, but we haven't found it yet.) + +Other routines, and their breakdown: + +Add16Bits() - 115.3 microsecs + +ShiftMantRight1() - 574.2 microsecs + +Sub16Bits() - 1762 microsecs + +StickySiftRightMant - 40.4 microsecs + +ShiftMantLeft1() - 17486.1 microsecs + +The times for the last three routines are suspect, since they are called by +DivideInternalFPF(), and a large portion of their time could be part of the +setup process. This is what leads us to question the results obtained in the +486 analysis, since it, too, is unable to determine precisely who is calling +whom. + +Porting Considerations + +Earlier versions of this benchmark were extremely sensitive to porting; +particularly to the "endianism" of the target system. We have tried to +eliminate many of these problems. The test is nonetheless more "sensitive" +to porting than most others. + +Pay close attention to the following defines and typedefs. They can be found +in the files EMFLOAT.H, NMGLOBAL.H, and NBENCH1.H: + +u8 - Stands for unsigned, 8-bit. Usually defined to be unsigned char. + +u16 - Stands for unsigned, 16-bit. Usually defined to be unsigned short. + +u32 - Stands for unsigned, 32-bit. Usually defined to be unsigned long. + +INTERNAL_FPF_PRECISION - Indicates the number of elements in the mantissa of +an InternalFPF number. Should be set to 4. + +The exponent field of an InternalFPF number is of type short. It should be +set to whatever minimal data type can hold a signed, 16-bit number. + +Other global definitions you will want to be aware of: + +CPUEMFLOATLOOPMAX - Sets the maximum number of loops the benchmark will +attempt before flagging an error. Each execution of a loop in the emulated +floating-point test is "non-destructive," since the test takes factors from +two arrays, operates on the factors, and places the result in a third array. 
+Consequently, the test makes more work for itself by increasing the number +of times it passes through the arrays (# of loops). If the system exceeds +the limit set by CPUEMFLOATLOOPMAX, it will signal an error. + +This value may be altered to suit your system; it will not affect the +benchmark results (unless you reduce it so much the system can never +generate enough loops to produce a good test run). + +EMFARRAYSIZE - Sets the size of the arrays to be used in the test. This +value is the number of entries (InternalFPF numbers) per array. Currently, +the number is fixed at 3000, and should not be altered. + +Currently, there is no means of testing correct execution of the benchmark +other than via debugger. There are routines available to decode the internal +floating point format and print out the numbers, but no formal correctness +test has been constructed. (This should be available soon. -- 3/14/95 RG) + +** It now prints out the operations of 8 of the entries used in the +** test. Assuming you leave EMFARRAYSIZE at 3000, your results should +** look like the ones below. The number in front of the colon is the +** index of the entry. +** +** 2: (-1.1160E 0) + (-4.5159E 0) = -5.6320E 0 +** 6: (-4.4507E -1) - (-8.2050E -1) = +3.7543E -1 +** 10: (+1.2465E 0) * (+7.4667E -1) = +9.3075E -1 +** 14: (-1.2781E 0) / (-1.7367E 0) = +7.3596E -1 +** 2986: (-7.0390E 0) * (-2.0752E 0) = +1.4607E 1 +** 2990: (+8.3753E -1) / (+2.3876E 1) = +3.5078E -2 +** 2994: (-1.1393E 0) + (-1.6080E 1) = -1.7219E 1 +** 2998: (+7.2450E 0) - (-8.2654E -1) = +8.0716E 0 +** +** Uwe F. Mayer <mayer@tux.edu> + +References + +Microprocessor Programming for Computer Hobbyists, Neill Graham, Tab Books, +Blue Ridge Summit, PA, 1977. + +Apple Numerics Manual, Second edition, Apple Computer, Addison-Wesley +Publishing Co., Reading, MA, 1988.
+ +Fourier Series + +Description + +This is a floating-point benchmark designed primarily to exercise the +trigonometric and transcendental functions of the system. It calculates the +first n Fourier coefficients of the function (x+1)^x on the interval [0,2]. In +this case, the function (x+1)^x is being treated as a cyclic waveform with a +period of 2. + +The Fourier coefficients, when applied as factors to a properly constructed +series of sine and cosine functions, allow you to approximate the original +waveform. (In fact, if you can calculate all the Fourier coefficients -- +there'll be an infinite number -- you can reconstruct the waveform exactly). +You have to calculate the coefficients via integration, and the algorithm +does this using a simple trapezoidal rule for its numeric integration +function. + +The upshot of all this is that it provides an exercise for the +floating-point routines that calculate sine, cosine, and raising a number to +a power. There are also some floating-point multiplications, divisions, +additions, and subtractions mixed in. + +The benchmark reports its results as the number of coefficients calculated +per second. + +As an additional note, we should point out that the performance of this +benchmark is heavily dependent on how well-built the compiler's math library +is. We have seen at least two cases where recompilation with new (and +improved!) math libraries has resulted in two-fold and five-fold +performance improvements. (Apparently, when a compiler gets moved to a new +platform, the trigonometric and transcendental functions in the math +libraries are among the last routines to be "hand optimized" for the new +platform.) About all we can say about this is that whenever you run this +test, verify that you have the latest and greatest math libraries. + +Analysis + +Optimized 486 code: The benchmark partitions its time almost evenly among +the modules pow387, exp386, and trig387; giving between 25% and 28% of its +time to each.
This is based on profiling with the Watcom compiler running +under Windows NT. These modules hold the routines that handle raising a +number to a power and performing trigonometric (sine and cosine) +calculations. For example, within trig387, time was nearly equally divided +between the routine that calculates sine and the routine that calculates +cosine. + +The remaining time (between 17% and 18%) was spent in the balance of the +test. We noticed that most of that time occurred in the routine +thefunction(). This is at the heart of the numerical integration routine the +benchmark uses. + +Consequently, this benchmark should be a good test of the exponential and +trigonometric capabilities of a processor. (Note that we recognize that the +performance also depends on how well the compiler's math library is built.) + +680x0 Code (Macintosh CodeWarrior): The CodeWarrior profiler is function +based, therefore it is impossible to get performance results for individual +machine instructions. The CodeWarrior compiler is also unable to tell us how +much time is spent within a given library routine; we can't see how much +time gets spent executing the sin(), cos(), or pow() functions (which, +unfortunately, was the whole idea behind the benchmark). + +About all we can glean from the results is that thefunction() takes about +74% of the time in the test (this is where the heavy math calculations take +place) while trapezoidintegrate() accounts for about 26% of the time on its +own. + +Porting Considerations + +Necessarily, this benchmark is at the mercy of the efficiency of the +floating-point support provided by whatever compiler you are using. It is +recommended that, if you are doing the port yourself, you contact the +designers of the compiler, and discuss with them what optimization switches +should be set to produce the fastest code. (This sounds simple; usually it's +not. Some systems let you decide between speed and true IEEE compliance.) 
+ +As far as global definitions go, this benchmark is happily free of them. All +the math is done using double data types. We have noticed that, on some Unix +systems, you must be careful to include the correct math libraries. +Typically, you'll discover this at link time. + +To test for correct execution of the benchmark: It's unlikely you'll need to +do this, since the algorithm is so cut-and-dried. Furthermore, there are no +explicit provisions made to verify the correctness. You can, however, either +dip into your favorite debugger, or alter the code to print out the contents +of the abase (which holds the A[i] terms) and bbase (which holds the B[i] +terms) arrays as they are being filled (see routine DoFPUTransIteration). +** This is exactly what I have done, it now prints out A[i] and B[i] data. +** Uwe F. Mayer <mayer@tux.edu> +Run the benchmark with a command file set to execute only the Fourier test, +and examine the contents of the arrays. The first 100 are listed below. + +A[i]= + 2.84 1.05 0.274 0.0824 0.0102 -0.024 -0.0426 -0.0536 -0.0605 -0.065 +-0.0679 -0.0698 -0.0709 -0.0715 -0.0717 -0.0715 -0.0711 -0.0704 +-0.0696 -0.0685 -0.0674 -0.0661 -0.0647 -0.0632 -0.0615 -0.0598 -0.058 +-0.0561 -0.0542 -0.0521 -0.0501 -0.0479 -0.0457 -0.0434 -0.0411 +-0.0387 -0.0363 -0.0338 -0.0313 -0.0288 -0.0262 -0.0236 -0.0209 +-0.0183 -0.0156 -0.0129 -0.0102 -0.00744 -0.0047 -0.00196 0.000794 +0.00355 0.0063 0.00905 0.0118 0.0145 0.0172 0.0199 0.0226 0.0253 +0.0279 0.0305 0.0331 0.0357 0.0382 0.0407 0.0431 0.0455 0.0479 0.0502 +0.0525 0.0547 0.0569 0.059 0.061 0.063 0.0649 0.0668 0.0686 0.0703 +0.072 0.0736 0.0751 0.0765 0.0779 0.0792 0.0804 0.0816 0.0826 0.0836 +0.0845 0.0853 0.0861 0.0867 0.0873 0.0877 0.0881 0.0884 0.0887 0.0888 + +B[i]= +(undefined) -1.88 -1.16 -0.806 -0.61 -0.487 -0.402 -0.34 -0.293 -0.255 +-0.224 -0.199 -0.177 -0.158 -0.141 -0.126 -0.113 -0.101 -0.0901 +-0.0802 -0.071 -0.0625 -0.0546 -0.0473 -0.0404 -0.034 -0.0279 -0.0222 +-0.0168 -0.0117 -0.00693 
-0.00238 0.00193 0.00601 0.00988 0.0135 0.017 +0.0203 0.0234 0.0263 0.0291 0.0317 0.0341 0.0364 0.0385 0.0405 0.0424 +0.0441 0.0457 0.0471 0.0484 0.0496 0.0507 0.0516 0.0525 0.0532 0.0538 +0.0543 0.0546 0.0549 0.055 0.0551 0.055 0.0549 0.0546 0.0543 0.0538 +0.0533 0.0527 0.052 0.0512 0.0503 0.0493 0.0483 0.0472 0.046 0.0447 +0.0434 0.042 0.0405 0.039 0.0374 0.0358 0.0341 0.0323 0.0305 0.0287 +0.0268 0.0249 0.023 0.021 0.019 0.0169 0.0149 0.0128 0.0107 0.00857 +0.00644 0.0043 0.00215 + +Note that there is no B[0] coefficient. If the above numbers are in the +arrays shown, you can feel pretty confident that the benchmark is working +properly. + +References + +Engineering and Scientific Computations in Pascal, Lawrence P. Huelsman, +Harper & Row, New York, 1986. + +Assignment Algorithm + +Description + +This test is built on an algorithm with direct application to the business +world. The assignment algorithm solves the following problem: Say you have X +machines and Y jobs. Any of the machines can do any of the jobs; however, the +machines are sufficiently different so that the cost of doing a particular +job can vary depending on what machine does it. Furthermore, the jobs are +sufficiently different that the cost varies depending on which job a given +machine does. You therefore construct a matrix; machines are the rows, jobs +are the columns, and the [i,j] element of the array is the cost of doing the +jth job on the ith machine. How can you assign the jobs so that the cost of +completing them all is minimal? (This also assumes that one machine does one +job.) + +Did you get that? + +The assignment algorithm benchmark is largely a test of how well the +processor handles problems built around array manipulation. It is not a +floating-point test; the "cost matrix" built by the algorithm is simply a 2D +array of long integers. This benchmark considers an iteration to be a run of +the assignment algorithm on a 101 x 101 - element matrix.
It reports its +results in iterations per second. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): There are numerous loops within the +assignment algorithm. The development system we were using (Watcom C/C++ +10.0) appears to have a fine time unrolling many of them. Consequently, it +is difficult to pin down the execution impact of single lines (as in, for +example, the numeric sort benchmark). + +On the level of functions, the benchmark spends around 70% of its time in +the routine first_assignments(). This is where a) lone zeros in rows and +columns are found and selected, and b) a choice is made between duplicate +zeros. Around 23% of the time is spent in the second_assignments() routine +where (if first_assignments() fails) the matrix is partitioned into smaller +submatrices. + +Overall, we did a tally of instruction mix execution. The approximate +breakdowns are: + +move - 38% + +conditional jump - 12% + +unconditional jump - 11% + +comparison - 14% + +math/logical/shift - 24% + +Many of the move instructions that appeared to consume the most +time were referencing items on the local stack frame. This required an +indirect reference through EBP, plus a constant offset to resolve the +address. + +This should be a good exercise of a cache, since operations in the +first_assignments() routine require both row-wise and column-wise movement +through the array. Note that the routine could be made more "severe" by +changing the assignedtableau[][] array to an array of unsigned char -- +forcing fetches on byte boundaries. + +680x0 Code (CodeWarrior): The CodeWarrior profiler is function-based. +Consequently, it's not possible to determine what's going on at the machine +instruction level. We can, however, get a good idea of how much time the +algorithm spends in each routine.
The important routines are broken down as +follows: + +calc_minimum_costs() - approximately 0.3% of the time + +(250 microsecs) + +first_assignments() - approximately 79% of the time + +(96284.6 microsecs) + +second_assignments() - approximately 19% of the time + +(22758 microsecs) + +These times are approximate; some time is spent in the Assignment() routine +itself. + +These figures are reasonably close to those of the 486, at least in terms of +the mixture of time spent in a particular routine. Hence, this should still +be a good test of system cache (as described in the preceding section), +given the behavior of the first_assignments() routine. + +Porting Considerations + +The assignment algorithm test is purely an integer benchmark, and requires +no special data types that might be affected by ports to different +architectures. There are only two global constants that affect the +algorithm: + +ASSIGNROWS and ASSIGNCOLS - These set the size of the assignment array. Both +are defined to be 101 (so, the array that is benchmarked is a 101 x 101 +-element array of longs). These values should not be altered. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, set up a command file that executes only the assignment +algorithm, and run the benchmark. (You may want to pipe the output through a +paging filter, like the more program.) The act of defining DEBUG will enable +a section of code that displays the assigned columns on a per-row basis. 
If +the benchmark is working properly, the numbers to be displayed +should be: + +R000: 056 R001: 066 R002: 052 R003: 065 R004: 043 R005: 023 R006: 016 +R007: 077 R008: 095 R009: 004 R010: 064 R011: 076 R012: 078 R013: 091 +R014: 013 R015: 029 R016: 044 R017: 014 R018: 041 R019: 042 R020: 020 +R021: 071 R022: 024 R023: 017 R024: 055 R025: 040 R026: 070 R027: 025 +R028: 031 R029: 019 R030: 073 R031: 002 R032: 047 R033: 009 R034: 035 +R035: 045 R036: 005 R037: 063 R038: 081 R039: 039 R040: 087 R041: 008 +R042: 053 R043: 093 R044: 049 R045: 092 R046: 061 R047: 046 R048: 026 +R049: 034 R050: 088 R051: 000 R052: 028 R053: 018 R054: 072 R055: 021 +R056: 037 R057: 082 R058: 006 R059: 058 R060: 096 R061: 068 R062: 069 +R063: 054 R064: 057 R065: 086 R066: 097 R067: 084 R068: 099 R069: 051 +R070: 098 R071: 003 R072: 074 R073: 062 R074: 080 R075: 033 R076: 011 +R077: 094 R078: 012 R079: 050 R080: 010 R081: 038 R082: 089 R083: 059 +R084: 022 R085: 079 R086: 015 R087: 007 R088: 075 R089: 083 R090: 060 +R091: 048 R092: 032 R093: 067 R094: 001 R095: 030 R096: 027 R097: 085 +R098: 090 R099: 036 R100: 100 + +These are the column choices for each row made by the algorithm. If +you see these numbers displayed, the algorithm is working correctly. + +*** The original debugging information was incorrect, as it not only +*** display the chosen columns, but also displayed eliminated columns. +*** Changed to show all 101 entries. Uwe F. Mayer <mayer@tux.edu> + +References + +Quantitative Decision Making for Business, Gordon, Pressman, and Cohn, +Prentice-Hall, Englewood Cliffs, NJ, 1990. + +Quantitative Decision Making, Guiseppi A. Forgionne, Wadsworth Publishing +Co., California, 1986. + +Huffman Compression + +Description + +This is a compression algorithm that -- while helpful for some time as a +text compression technique -- has since fallen out of fashion on account of +the superior performance by algorithms such as LZW compression. 
It is, +however, still used in some graphics file formats in one form or another. + +The benchmark consists of three parts: + +Building a "Huffman Tree" (explained below), + +Compression, and + +Decompression. + +A "Huffman Tree" is a special data structure that guides the compression and +decompression processes. If you were to diagram one, it would look like a +large binary tree (i.e., two branches per each node). Describing its +function in detail is beyond the scope of this paper (see the references for +more information). We should, however, point out that the tree is built from +the "bottom up"; and the procedure for constructing it requires that the +algorithm scan the uncompressed buffer, building a frequency table for all +the characters appearing in the buffer. (This version of the Huffman +algorithm compresses byte-at-a-time, though there's no reason why the same +principle could not be applied to tokens larger than one byte.) + +Once the tree is built, text compression is relatively straightforward. The +algorithm fetches a character from the uncompressed buffer, navigates the +tree based on the character's value, and produces a bit stream that is +concatenated to the compressed buffer. Decompression is the reverse of that +process. (We recognize that we are simplifying the algorithm. Again, we +recommend you check the references.) + +The Huffman Compression benchmark considers an iteration to be the three +operations described above, performed on an uncompressed text buffer of 5000 +bytes. It reports its results in iterations per second. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): The Huffman compression algorithm -- +tree building, compression, and decompression -- is written as a single, +large routine: DoHuffIteration(). All the benchmark's time is spent within +that routine. + +Components of DoHuffIteration() that consume the most time are those that +perform the compression and decompression . 
+ +The code for performing the compression spends most of its time (accounting +for about 13%) constructing the bit string for a character that is being +compressed. It does this by seeking up the tree from a leaf, emitting 1's +and 0's in the process, until it reaches the root. The stream of 1's and 0's +are loaded into a character array; the algorithm then walks "backward" +through the array, setting (or clearing) bits in the compression buffer as +it goes. + +Similarly, the decompression portion takes about 12% of the time as the +algorithm pulls bits out of the compressed buffer -- using them to navigate +the Huffman tree -- and reconstructs the original text. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based. Consequently, it's impossible to get performance scores for +individual machine instructions. Furthermore, as mentioned above, the +Huffman compression algorithm is written as a monolithic routine. This makes +the results from the CodeWarrior profiler all the more sparse. + +We can at least point out that the lowmost routines (GetCompBit() and +SetCompBit()) that read and write individual bits, though called nearly 13 +million times each, account for only 0.7% and 0.3% of the total time, +respectively. + +Porting Considerations + +The Huffman algorithm relies on no special data types. It should port +readily. Global constants of interest include: + +EXCLUDED - This is a large, positive value. Currently it is set to 32000, +and should be left alone. Basically, this is a token that the system uses to +indicate an excluded character (one that does not appear in the plain-text). +It is set to a ridiculously high value that will never appear in the +pointers of the tree during normal construction. + +MAXHUFFLOOPS - This is another one of those "governor" constants. The +Huffman benchmark creates more work for itself by doing multiple +compression/decompression loops. 
This constant sets the maximum number of +loops it will attempt per iteration before it gives up. Currently, it is set +to 50000. Though it is unlikely you'll ever need to modify this value, you +can increase it if your machine is too fast for the adjustment algorithm. Do +not reduce the number. + +HUFFARRAYSIZE - This value sets the size of the plain-text array to be +compressed. You can override this value with the command file to see how +well your machine performs for larger or smaller arrays. The subsequent +results, however, are invalid for comparison with other systems. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, build a command file that executes only the Huffman compression +algorithm, and run the benchmark. Defining DEBUG will enable a section of +code that verifies the decompression as it takes place (i.e., the routine +compares -- character at a time -- the uncompressed data with the original +plain-text). If there's an error, the program will repeatedly display: "Error +at textoffset xxx". + +** If everything is correct it will emit quite a few "Huffman: OK" messages. +** +** I added a resetting of the random number generator, outside of the +** timed loop, and a resetting of the Huffman tree, inside of the +** timed loop. That should help to make the benchmark more consistent. +** The program originally reset only half of the tree, which led +** to runtime errors on some systems. The effect on the benchmark +** should be negligible, and in fact comes out as being of the order +** of less than 1% on my test system. +** Uwe F. Mayer <mayer@tux.edu> + +References + +Data Compression: Methods and Theory, James A. Storer, Computer Science +Press, Rockville, MD, 1988. + +An Introduction to Text Processing, Peter D. Smith, MIT Press, Cambridge, +MA, 1990.
+ +IDEA Encryption + +Description + +This is another benchmark based on a "higher-level" algorithm; "higher- +level" in the sense that it is more complex than a sort or a search +operation. + +Security -- and, therefore, cryptography -- are becoming increasingly +important issues in the computer realm. It's likely that more and more +machines will be running routines like the IDEA encryption algorithm. (IDEA +is an acronym for the International Data Encryption Algorithm.) + +A good description of the algorithm (and, in fact, the reference we used to +create the source code for the test) can be found in Bruce Schneier's +exhaustive exploration of encryption, "Applied Cryptography" (see +references). To quote Mr. Schneier: "In my opinion, it [IDEA] is the best +and most secure block algorithm available to the public at this time." + +IDEA is a symmetrical, block cipher algorithm. Symmetrical means that the +same routine used to encrypt the data also decrypts the data. A block cipher +works on the plain-text (the message to be encrypted) in fixed, discrete +chunks. In the case of IDEA, the algorithm encrypts and decrypts 64 bits at +a time. + +As pointed out in Schneier's book, there are three operations that the IDEA +uses to do its work: + +XOR (exclusive-or) + +Addition modulo 2^16 (ignoring overflow) + +Multiplication modulo 2^16+1 (ignoring overflow). + +IDEA requires a key of 128 bits. However, keys and blocks are further +subdivided into 16-bit chunks, so that any given operation within the IDEA +encryption is performed on 16-bit quantities. (This is one of the many +advantages of the algorithm, it is efficient even on 16-bit processors.) + +The IDEA benchmark considers an "iteration" to be an encryption and +decryption of a buffer of 4000 bytes. 
The test actually builds 3 buffers: +The first to hold the original plain-text, the second to hold the encrypted +text, and the third to hold the decrypted text (the contents of which should +match that of the first buffer). It reports its results in iterations per +second. + +Analysis + +Optimized 486 code: The algorithm actually spends most of its time (nearly +75%) within the mul() routine, which performs the multiplication modulo +2^16+1. This is a super-simple routine, consisting primarily of if +statements, shifts, and additions. + +The remaining time (around 24%) is spent in the balance of the cipher_idea() +routine. (Note that cipher_idea() calls the mul() routine frequently; so, +the 24% is comprised of the other lines of cipher_idea()). cipher_idea() is +littered with simple pointer-fetch-and-increment operations, some addition, +and some exclusive-or operations. + +Note that IDEA's exercise of system capabilities probably doesn't extend +beyond testing simple integer math operations. Since the buffer size is set +to 4000 bytes, the test will run entirely in processor cache on most +systems. Even the cache won't get a heavy "internal" workout, since the +algorithm proceeds sequentially through each buffer from lower to higher +addresses. + +680x0 code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based; consequently, it is impossible to determine execution profiles for +individual machine instructions. We can, however, get an idea of how much +time is spent in each routine. + +As with Huffman compression, the IDEA algorithm is written monolithically -- +a single, large routine does most of the work. However, a special +multiplication routine, mul(), is frequently called within each +encryption/decryption iteration (see above). + +In this instance, the results for the 68K system diverge widely from those +of the 486 system. 
The CodeWarrior profiler shows the mul() routine as +taking only 4% of the total time in the benchmark, even though it is called +over 20 million times. The outer routine is called 600,000 times, and +accounts for about 96% of the whole program's entire time. + +Porting Considerations + +Since IDEA does its work in 16-bit units, it is particularly important that +u16 be defined to whatever datatype provides an unsigned 16-bit integer on +the test platform. Usually, unsigned short works for this. (You can verify +the size of a short by running the benchmarks with a command file that +includes ALLSTATS=T as one of the commands. This will cause the benchmark +program to display a message that tells the size of the int, short, and long +data-types in bytes.) + +Also, the mul() routine in IDEA requires the u32 datatype to define an +unsigned 32-bit integer. In most cases, unsigned long works. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, build a command file that executes only the IDEA algorithm, and +run the benchmark. Defining DEBUG will enable a section of code that +compares the original plain-text with the output of the test. (Remember, the +benchmark performs both encryption and decryption.) If the algorithm has +failed, the output will not match the input, and you'll see "IDEA Error" +messages all over your display. + +References + +Applied Cryptography: Protocols, Algorithms, and Source Code in C, Bruce +Schneier, John Wiley & Sons, Inc., New York, 1994. + +Neural Net + +Description + +The Neural Net simulation benchmark is based on a simple back-propagation +neural network presented by Maureen Caudill as part of a BYTE article that +appeared in the October, 1991 issue (see "Expert Networks" in that issue). +The network involved is a simple 3-layer (input neurodes, middle-layer +neurodes, and output neurodes) network that accepts a number of 5 x 7 input +patterns and produce a single 8-bit output pattern. 
+ +The test involves sending the network an input pattern that is the 5 x 7 +"image" of a character (1's and 0's -- 1's representing lit pixels, 0's +representing unlit pixels), and teaching it the 8-bit ASCII code for the +character. + +A thorough description of how the back propagation algorithm works is beyond +the scope of this paper. We recommend you search through the references +given at the end of this paper, particularly Ms. Caudill's article, for +detailed discussion. In brief, the benchmark is primarily an exercise in +floating-point operations, with some frequent use of the exp() function. It +also performs a great deal of array references, though the arrays in use are +well under 300 elements each (and less than 100 in most cases). + +The Neural Net benchmark considers an iteration to be a single learning +cycle. (A "learning cycle" is defined as the time it takes the network to be +able to associate all input patterns to the correct output patterns within a +specified tolerance.) It reports its results in iterations per second. + +Analysis + +Optimized 486 code: The forward pass of the network (i.e., calculating +outputs from inputs) utilize a sigmoid function. This function has, at its +heart, a call to the exp() library routine. A small but non-negligible +amount of time is spent in that function (a little over 5% for the 486 +system we tested). + +The learning portion of the network benchmark depends on the derivative of +the sigmoid function, which turns out to require only multiplications and +subtractions. Consequently, each learning pass exercises only simple +floating-point operations. + +If we divide the time spent in the test into two parts -- forward pass and +backward pass (the latter being the learning pass) -- then the test appears +to spend the greatest part of its time in the learning phase. In fact, most +time is spent in the adjust_mid_wts() routine. 
This is the part of the +routine that alters the weights on the middle layer neurodes. (It accounts +for over 40% of the benchmark's time.) + +680x0 Code (Macintosh CodeWarrior): Though CodeWarrior's profiler is +function based, the neural net benchmark is highly modular. We can therefore +get a good breakdown of routine usage: + +worst_pass_error() - 304 microsecs (called 4680 times) + +adjust_mid_wts() - 83277 microsecs (called 46800 times) + +adjust_out_wts() - 17394 microsecs (called 46800 times) + +do_mid_error() - 11512 microsecs (called 46800 times) + +do_out_error() - 3002 microsecs (called 46800 times) + +do_mid_forward() - 49559 microsecs (called 46800 times) + +do_out_forward() - 20634 microsecs (called 46800 times) + +Again, most time was spent in adjust_mid_wts() (as on the 486), accounting +for almost twice as much time as do_mid_forward(). + +Porting Consideration + +The Neural Net benchmark is not dependent on any special data types. There +are a number of global variables and arrays that should not be altered in +any way. Most importantly, the #defines found in NBENCH1.H under the Neural +Net section should not be changed. These control not only the number of +neurodes in each layer; they also include constants that govern the learning +processes. + +Other globals to be aware of: + +MAXNNETLOOPS - This constant simply sets the upper limit on the number of +training loops the test will permit per iteration. The Neural Net benchmark +adjusts its workload by re-teaching itself over and over (each time it +begins a new training session, the network is "cleared" -- loaded with +random values). It is unlikely you will ever need to modify this constant. + +inpath - This string pointer is set to the path from which the neural net's +input data is read. It is currently hardwired to "NNET.DAT". You shouldn't +have to change this name, unless your file system requires directory +information as part of the path. 
+ +Note that the Neural Net benchmark is the only test that requires an +external data file. The contents of the file are listed in an attachment to +this paper. You should use the attachment to reconstruct the file should it +become lost or corrupted. Any changes to the file will invalidate the test +results. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, build a command file that executes only the Neural Net test, and +run the benchmark. Defining DEBUG will enable a section of code that +displays how many passes through the learning process were required for the +net to learn. It should learn in 780 passes. + +References + +"Expert Networks," Maureen Caudill, BYTE Magazine, October, 1991. + +Simulating Neural Networks, Norbert Hoffmann, Verlag Vieweg, Wiesbaden, +1994. + +Signal and Image Processing with Neural Networks, Timothy Masters, John +Wiley and Sons, New York, 1994. + +Introduction to Neural Networks, Jeannette Stanley, California Scientific +Software, CA, 1989. + +LU Decomposition + +Description + +LU Decomposition is an algorithm that can be used as the heart of a program +for solving linear equations. Suppose you have a matrix A. LU Decomposition +determines the matrices L and U such that + +L . U = A + +where L is a lower triangular matrix and U is an upper triangular matrix. (A +lower triangular matrix has nonzero elements only on the main diagonal and +below. An upper triangular matrix has nonzero elements only on the main +diagonal and above.) + +Without going into the mathematical details too deeply, having the L and U +matrices makes the solution of linear equations (i.e., equations of the form +A . x = b) quite easy. It turns out that you can also use LU decomposition +to determine matrix inverses and determinants. 
+ +The algorithm used in the benchmarks was derived from Numerical Recipes in +Pascal (there is a C version of the book, which we did not have on hand), a +book we heartily recommend to anyone serious about mathematical and +scientific computing. The authors are approving of LU decomposition as a +means of solving linear equations, pointing out that their version (which +makes use of what we would have to call "Crout's method with partial +implicit pivoting") is a factor of 3 better than one of their Gauss-Jordan +routines, a factor of 1.5 better than another. They go on to demonstrate the +use of LU decomposition for iterative improvement of linear equation +solutions. + +The benchmark begins by creating a "solvable" linear system. This is easily +done by loading up the column vector b with random integers, then +initializing A with an identity matrix. The equations are then "scrambled" +by either multiplying a row by a constant, or adding one row to another. The +scrambled matrices are handed to the LU algorithm. + +The LU Decomposition benchmark considers a single iteration to be the +solution of one set of equations (the size of A is fixed at 101 x 101 +elements). It reports its results in iterations per second. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): The entire algorithm consists of two +parts: the LU decomposition itself, and the back substitution algorithm that +builds the solution vector. The majority of the algorithm's time takes place +within the former; the algorithm that builds the L and U matrices (this +takes place in routine ludcmp()). + +Within ludcmp(), there are two extremely tight for loops forming the heart +of Crout's algorithm that consume the majority of the time. 
The loops are +"tight" in that they each consist of only one line of code; in both cases, +the line of code is a "multiply and accumulate" operation (actually, it's +sort of a multiply and de-accumulate, since the result of the multiplication +is subtracted, not added). + +In both cases, the items multiplied are elements from the A array; and one +factor's row index is varying more rapidly, while another factor's column +index is varying more rapidly. + +Note that this is a good overall test of floating-point operations within +matrices. Most of the math is floating-point; primarily additions, +subtractions, and multiplications (only a few divisions). + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based. It is therefore impossible to determine execution profiles at the +machine-code level. The profiler does, however, allow us to determine how +much time the benchmark spends in each routine. This breakdown is as +follows: + +lusolve() - 3.4 microsecs (about 0% of the time) + +lubksb() 1198 microsec (about 2% of the time) + +ludcmp() - 63171 microsec (about 91% of the time) + +The above percentages are for the whole program. Consequently, as a portion +of actual benchmark time, the amount attributed to each will be slightly +larger (though the proportions will remain the same). + +Since ludcmp() performs the actual LU decomposition, this is exactly where +we'd want the benchmark to spend its time. The lubksb() routine calls +ludcmp(), using the resulting matrix to "back-solve" the linear equation. + +Porting Considerations + +The LU Decomposition routine requires no special data types, and is immune +to byte ordering. It does make use of a typedef (LUdblptr) that includes an +embedded union; this allows the benchmark to "coerce" a pointer to double +into a pointer to a 2D array of double. This arrangement has not caused +problems with the compilers we have tested to date. 
+ +Other constants and globals to be aware of: + +LUARRAYROWS and LUARRAYCOLS - These constants set the size of the +coefficient matrix, A. They cannot be altered by command file. In fact, you +shouldn't alter them at all, or your results will be invalid. Currently, +they are both set to 101. + +MAXLUARRAYS - This is another "governor" constant. The algorithm performs +dynamic workload adjustment by building more and more arrays to solve per +timing round. This sets the maximum upper limit of arrays that it will +build. Currently, it is set to 1000, which should be more than enough for +the reasonable future (1000 arrays of 101 x 101 floating-point doubles would +require somewhere around 80 megabytes of RAM -- and that's not counting the +column vectors). + +To test for correct execution of the benchmark: Currently, there is no +simple technique for doing this. You can, however, either use your favorite +debugger (or embed a printf() statement) at the conclusion of the lubksb() +routine. When this routine concludes, the array b will hold the solution +vector. These items will be stored as floating-point doubles, and the first +14 are (with rounding): + +46 20 23 22 85 86 97 95 8 89 75 67 6 86 + +If you find these numbers as the first 14 in the array b[], then you're +virtually guaranteed that the algorithm is working correctly. + +*** The above is not correct, as the initial matrix is not the identity, +*** but a matrix with random nonzero entries on the diagonal (they have +*** altered the algorithm since they wrote the documentation). +*** I changed the output of the debugging routine, it now prints first +*** what the array b should hold (as righthand side divided by diagonal +*** entry), and then it prints what the array b does hold after the +*** decomposition has been done to compute the solution of the system. If +*** you get the same, then fine. +*** And, by the way, my original right hand sides are +*** 46 23 85 97 8 75 6 81 88 76 6 84 31 53 2 ... 
+*** and the diagonal entries are +*** 520 922 186 495 89 267 786 571 175 600 738 321 897 541 859 ... +*** You notice that one has every other number of the original sequence. +*** This is due to BYTE's change of the algorithm, as they now also use the +*** random number generator to generate the diagonal elements. +*** Here is the complete set of data: +*** 46/520=0.09 23/922=0.02 85/186=0.46 97/495=0.20 8/89=0.09 +*** 75/267=0.28 6/786=0.01 81/571=0.14 88/175=0.50 76/600=0.13 +*** 6/738=0.01 84/321=0.26 31/897=0.03 53/541=0.10 2/859=0.00 +*** 86/92=0.93 51/121=0.42 29/248=0.12 51/789=0.06 84/6=14.00 +*** 21/180=0.12 33/48=0.69 2/899=0.00 12/820=0.01 69/372=0.19 +*** 59/809=0.07 74/18=4.11 40/788=0.05 39/56=0.70 86/91=0.95 +*** 33/878=0.04 82/165=0.50 42/561=0.07 8/274=0.03 84/694=0.12 +*** 32/352=0.09 25/969=0.03 59/816=0.07 33/112=0.29 5/125=0.04 +*** 89/740=0.12 7/223=0.03 54/994=0.05 33/80=0.41 55/676=0.08 +*** 6/524=0.01 36/544=0.07 21/160=0.13 58/596=0.10 15/717=0.02 +*** 84/311=0.27 98/530=0.18 46/713=0.06 41/233=0.18 73/640=0.11 +*** 40/343=0.12 72/586=0.12 100/965=0.10 59/764=0.08 37/866=0.04 +*** 27/682=0.04 3/652=0.00 41/352=0.12 87/786=0.11 45/79=0.57 +*** 83/761=0.11 41/817=0.05 46/209=0.22 78/930=0.08 85/210=0.40 +*** 80/756=0.11 18/931=0.02 30/669=0.04 47/127=0.37 85/891=0.10 +*** 66/364=0.18 83/955=0.09 58/637=0.09 58/778=0.07 82/288=0.28 +*** 42/540=0.08 76/290=0.26 59/36=1.64 29/463=0.06 63/476=0.13 +*** 6/340=0.02 73/341=0.21 59/737=0.08 81/492=0.16 98/443=0.22 +*** 58/32=1.81 53/562=0.09 54/263=0.21 46/367=0.13 58/390=0.15 +*** 96/845=0.11 30/746=0.04 2/687=0.00 28/849=0.03 84/180=0.47 +*** 85/382=0.22 +*** Uwe F. Mayer <mayer@tux.edu> + +References + +Numerical Recipes in Pascal: The Art of Scientific Computing, Press, +Flannery, Teukolsky, Vetterling, Cambridge University Press, New York, 1989. 
diff --git a/debugbit.good.gz b/debugbit.good.gz Binary files differnew file mode 100644 index 0000000..fdc893e --- /dev/null +++ b/debugbit.good.gz diff --git a/emfloat.c b/emfloat.c new file mode 100644 index 0000000..5e73890 --- /dev/null +++ b/emfloat.c @@ -0,0 +1,1343 @@ +/* +** emfloat.c +** Source for emulated floating-point routines. +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine. +** +** Created: +** Last update: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + + +#include <stdio.h> +#include <string.h> +#include "nmglobal.h" +#include "emfloat.h" + +/* +** Floating-point emulator. +** These routines are only "sort of" IEEE-compliant. All work is +** done using an internal representation. Also, the routines do +** not check for many of the exceptions that might occur. +** Still, the external formats produced are IEEE-compatible, +** with the restriction that they presume a low-endian machine +** (though the endianism will not effect the performance). +** +** Some code here was based on work done by Steve Snelgrove of +** Orem, UT. Other code comes from routines presented in +** the long-ago book: "Microprocessor Programming for +** Computer Hobbyists" by Neill Graham. 
+*/ + +/************************** +** SetupCPUEmFloatArrays ** +*************************** +** Set up the arrays that will be used in the emulated +** floating-point tests. +** This is done by loading abase and bbase elements with +** random numbers. We use our long-to-floating point +** routine to set them up. +** NOTE: We really don't need the pointer to cbase...cbase +** is overwritten in the benchmark. +*/ +void SetupCPUEmFloatArrays(InternalFPF *abase, + InternalFPF *bbase, + InternalFPF *cbase, + ulong arraysize) +{ +ulong i; +InternalFPF locFPF1,locFPF2; +/* +** Reset random number generator so things repeat. Inserted by Uwe F. Mayer. +*/ +extern int32 randnum(int32 lngval); +randnum((int32)13); + +for(i=0;i<arraysize;i++) +{/* LongToInternalFPF(randwc(50000L),&locFPF1); */ + Int32ToInternalFPF(randwc((int32)50000),&locFPF1); + /* LongToInternalFPF(randwc(50000L)+1L,&locFPF2); */ + Int32ToInternalFPF(randwc((int32)50000)+(int32)1,&locFPF2); + DivideInternalFPF(&locFPF1,&locFPF2,abase+i); + /* LongToInternalFPF(randwc(50000L)+1L,&locFPF2); */ + Int32ToInternalFPF(randwc((int32)50000)+(int32)1,&locFPF2); + DivideInternalFPF(&locFPF1,&locFPF2,bbase+i); +} +return; +} + +/*********************** +** DoEmFloatIteration ** +************************ +** Perform an iteration of the emulated floating-point +** benchmark. Note that "an iteration" can involve multiple +** loops through the benchmark. 
+*/ +ulong DoEmFloatIteration(InternalFPF *abase, + InternalFPF *bbase, + InternalFPF *cbase, + ulong arraysize, ulong loops) +{ +ulong elapsed; /* For the stopwatch */ +/* Operation selector for element i, indexed by i % 16: 0=add, 1=subtract, 2=multiply, 3=divide */ +static uchar jtable[16] = {0,0,0,0,1,1,1,1,2,2,2,2,2,3,3,3}; +ulong i; +#ifdef DEBUG +int number_of_loops; +#endif +/* +** Begin timing +*/ +elapsed=StartStopwatch(); +#ifdef DEBUG +number_of_loops=loops-1; /* the index of the first loop we run */ +#endif + +/* +** Each pass through the array performs operations in +** the following ratios: +** 4 adds, 4 subtracts, 5 multiplies, 3 divides +** (adds and subtracts being nearly the same operation) +** For every element i the result of abase[i] op bbase[i] is +** written to cbase[i]. +*/ +while(loops--) +{ + for(i=0;i<arraysize;i++) + switch(jtable[i % 16]) + { + case 0: /* Add */ + AddSubInternalFPF(0,abase+i, + bbase+i, + cbase+i); + break; + case 1: /* Subtract */ + AddSubInternalFPF(1,abase+i, + bbase+i, + cbase+i); + break; + case 2: /* Multiply */ + MultiplyInternalFPF(abase+i, + bbase+i, + cbase+i); + break; + case 3: /* Divide */ + DivideInternalFPF(abase+i, + bbase+i, + cbase+i); + break; + } +#ifdef DEBUG +{ + /* + ** On the first pass only, print eight sample results: four near + ** the start of the arrays and four near the end. + ** NOTE(review): the tail indices are arraysize-14 .. arraysize-2, + ** so this presumes arraysize >= 14 -- confirm against callers. + */ + ulong j[8]; /* we test 8 entries */ + int k; + ulong i; + char buffer[1024]; + if (number_of_loops==loops) /* the first loop */ + { + j[0]=(ulong)2; + j[1]=(ulong)6; + j[2]=(ulong)10; + j[3]=(ulong)14; + j[4]=(ulong)(arraysize-14); + j[5]=(ulong)(arraysize-10); + j[6]=(ulong)(arraysize-6); + j[7]=(ulong)(arraysize-2); + for(k=0;k<8;k++){ + i=j[k]; + InternalFPFToString(buffer,abase+i); + /* i is unsigned: print it with %lu and an explicit cast */ + printf("%6lu: (%s) ",(unsigned long)i,buffer); + switch(jtable[i % 16]) + { + case 0: strcpy(buffer,"+"); break; + case 1: strcpy(buffer,"-"); break; + case 2: strcpy(buffer,"*"); break; + case 3: strcpy(buffer,"/"); break; + } + printf("%s ",buffer); + InternalFPFToString(buffer,bbase+i); + printf("(%s) = ",buffer); + InternalFPFToString(buffer,cbase+i); + printf("%s\n",buffer); + } + } +} +#endif +} +/* The return value is whatever StopStopwatch() reports for the timed region */ +return(StopStopwatch(elapsed)); +} + +/*********************** +** SetInternalFPFZero ** +************************ +** Set an internal 
floating-point-format number to zero. +** sign determines the sign of the zero. +** The type becomes IFPF_IS_ZERO, the exponent MIN_EXP, and every +** mantissa word is cleared. +*/ +static void SetInternalFPFZero(InternalFPF *dest, + uchar sign) +{ +int i; /* Index */ + +dest->type=IFPF_IS_ZERO; +dest->sign=sign; +dest->exp=MIN_EXP; +for(i=0;i<INTERNAL_FPF_PRECISION;i++) + dest->mantissa[i]=0; +return; +} + +/*************************** +** SetInternalFPFInfinity ** +**************************** +** Set an internal floating-point-format number to infinity. +** This can happen if the exponent exceeds MAX_EXP. +** As above, sign picks the sign of infinity. +** The mantissa is cleared and the type becomes IFPF_IS_INFINITY. +** NOTE(review): the exponent is set to MIN_EXP here, not MAX_EXP -- +** confirm that this is intended. +*/ +static void SetInternalFPFInfinity(InternalFPF *dest, + uchar sign) +{ +int i; /* Index */ + +dest->type=IFPF_IS_INFINITY; +dest->sign=sign; +dest->exp=MIN_EXP; +for(i=0;i<INTERNAL_FPF_PRECISION;i++) + dest->mantissa[i]=0; +return; +} + +/********************** +** SetInternalFPFNaN ** +*********************** +** Set an internal floating-point-format number to NaN +** (not a number). Note that we "emulate" an 80x87 as far +** as the mantissa bits go. +** The sign is forced to 1, the exponent to MAX_EXP, and only +** mantissa word 0 is nonzero (0x4000). +*/ +static void SetInternalFPFNaN(InternalFPF *dest) +{ +int i; /* Index */ + +dest->type=IFPF_IS_NAN; +dest->exp=MAX_EXP; +dest->sign=1; +dest->mantissa[0]=0x4000; +for(i=1;i<INTERNAL_FPF_PRECISION;i++) + dest->mantissa[i]=0; + +return; +} + +/******************* +** IsMantissaZero ** +******************** +** Pass this routine a pointer to an internal floating point format +** number's mantissa. It checks for an all-zero mantissa. +** Returns 1 if every mantissa word is zero, 0 otherwise. +*/ +static int IsMantissaZero(u16 *mant) +{ +int i; /* Index */ +int n; /* Bitwise OR of all mantissa words */ + +n=0; +for(i=0;i<INTERNAL_FPF_PRECISION;i++) + n|=mant[i]; + +return(!n); +} + +/************** +** Add16Bits ** +*************** +** Add b, c, and carry. Result in a. New carry in carry. +*/ +static void Add16Bits(u16 *carry, + u16 *a, + u16 b, + u16 c) +{ +u32 accum; /* Accumulator */ + +/* +** Do the work in the 32-bit accumulator so we can return +** the carry. 
+*/ +accum=(u32)b; +accum+=(u32)c; +accum+=(u32)*carry; +*carry=(u16)((accum & 0x00010000) ? 1 : 0); /* New carry */ +*a=(u16)(accum & 0xFFFF); /* Result is lo 16 bits */ +return; +} + +/************** +** Sub16Bits ** +*************** +** Additive inverse of above: computes b - c - borrow. +** Result in a. New borrow (0 or 1) in borrow. +*/ +static void Sub16Bits(u16 *borrow, + u16 *a, + u16 b, + u16 c) +{ +u32 accum; /* Accumulator */ + +/* +** The subtraction is done in the unsigned 32-bit accumulator. If the +** true result is negative the accumulator wraps around, which sets +** bit 16; that bit is reported as the new borrow. +*/ +accum=(u32)b; +accum-=(u32)c; +accum-=(u32)*borrow; +*borrow=(u16)((accum & 0x00010000) ? 1 : 0); /* New borrow */ +*a=(u16)(accum & 0xFFFF); /* Result is lo 16 bits */ +return; +} + +/******************* +** ShiftMantLeft1 ** +******************** +** Shift a vector of 16-bit numbers left 1 bit. Also provides +** a carry bit, which is shifted in at the beginning, and +** shifted out at the end. +** Words are processed from the last index down to index 0. On return +** *carry is zero or nonzero (0x8000), not necessarily 1, so callers +** should only test it for truth. +*/ +static void ShiftMantLeft1(u16 *carry, + u16 *mantissa) +{ +int i; /* Index */ +int new_carry; +u16 accum; /* Temporary holding place */ + +for(i=INTERNAL_FPF_PRECISION-1;i>=0;i--) +{ accum=mantissa[i]; + new_carry=accum & 0x8000; /* Get new carry */ + accum=accum<<1; /* Do the shift */ + if(*carry) + accum|=1; /* Insert previous carry */ + *carry=new_carry; + mantissa[i]=accum; /* Return shifted value */ +} +return; +} + +/******************** +** ShiftMantRight1 ** +********************* +** Shift a mantissa right by 1 bit. Provides carry, as +** above. Words are processed from index 0 upward; the incoming +** carry enters bit 15 of word 0 and the bit shifted out of the +** last word is returned in *carry. +*/ +static void ShiftMantRight1(u16 *carry, + u16 *mantissa) +{ +int i; /* Index */ +int new_carry; +u16 accum; /* Temporary holding place */ + +for(i=0;i<INTERNAL_FPF_PRECISION;i++) +{ accum=mantissa[i]; + new_carry=accum & 1; /* Get new carry */ + accum=accum>>1; + if(*carry) + accum|=0x8000; /* Insert previous carry */ + *carry=new_carry; + mantissa[i]=accum; +} +return; +} + + +/***************************** +** StickyShiftRightMant ** +****************************** +** This is a shift right of the mantissa with a "sticky bit". +** I.E., if a carry of 1 is shifted out of the least significant +** bit, the least significant bit is set to 1. 
+*/ +static void StickyShiftRightMant(InternalFPF *ptr, + int amount) +{ +int i; /* Index */ +u16 carry; /* Bit shifted out by each 1-bit step */ +u16 *mantissa; + +mantissa=ptr->mantissa; + +if(ptr->type!=IFPF_IS_ZERO) /* Don't bother shifting a zero */ +{ + /* + ** If the amount of shifting will shift everything + ** out of existence, then just clear the whole mantissa + ** and set the lowmost bit to 1. + */ + if(amount>=INTERNAL_FPF_PRECISION * 16) + { + for(i=0;i<INTERNAL_FPF_PRECISION-1;i++) + mantissa[i]=0; + mantissa[INTERNAL_FPF_PRECISION-1]=1; + } + else + /* Shift one bit at a time so every lost 1 bit can be made sticky */ + for(i=0;i<amount;i++) + { + carry=0; + ShiftMantRight1(&carry,mantissa); + if(carry) + mantissa[INTERNAL_FPF_PRECISION-1] |= 1; + } +} +return; +} + + +/************************************************** +** POST ARITHMETIC PROCESSING ** +** (NORMALIZE, ROUND, OVERFLOW, AND UNDERFLOW) ** +**************************************************/ + +/************** +** normalize ** +*************** +** Normalize an internal-representation number. Normalization +** discards empty most-significant bits. +** NOTE: the loop below only stops once bit 15 of mantissa word 0 is +** set. With an all-zero mantissa that never happens (a zero carry is +** shifted in on every pass), so the caller must rule out a zero +** mantissa before calling this routine. +*/ +static void normalize(InternalFPF *ptr) +{ +u16 carry; + +/* +** As long as there's a highmost 0 bit, shift the significand +** left 1 bit. Each time you do this, though, you've +** gotta decrement the exponent. +*/ +while ((ptr->mantissa[0] & 0x8000) == 0) +{ + carry = 0; + ShiftMantLeft1(&carry, ptr->mantissa); + ptr->exp--; +} +return; +} + +/**************** +** denormalize ** +***************** +** Denormalize an internal-representation number. This means +** shifting it right until its exponent is equivalent to +** minimum_exponent. (You have to do this often in order +** to perform additions and subtractions). 
+*/ +static void denormalize(InternalFPF *ptr, + int minimum_exponent) +{ +long exponent_difference; + +/* Diagnostic only: a message is printed and processing continues */ +if (IsMantissaZero(ptr->mantissa)) +{ + printf("Error: zero significand in denormalize\n"); +} + +/* Nothing is done unless the exponent lies below minimum_exponent */ +exponent_difference = ptr->exp-minimum_exponent; +if (exponent_difference < 0) +{ + /* + ** The number is subnormal + */ + exponent_difference = -exponent_difference; + if (exponent_difference >= (INTERNAL_FPF_PRECISION * 16)) + { + /* Underflow: every mantissa bit would be shifted out */ + SetInternalFPFZero(ptr, ptr->sign); + } + else + { + /* Raise the exponent and shift the mantissa right by the same count */ + ptr->exp+=exponent_difference; + StickyShiftRightMant(ptr, exponent_difference); + } +} +return; +} + + +/********************* +** RoundInternalFPF ** +********************** +** Round an internal-representation number. +** The kind of rounding we do here is simplest...referred to as +** "chop". "Extraneous" rightmost bits are simply hacked off. +** Only numbers of type IFPF_IS_NORMAL or IFPF_IS_SUBNORMAL are +** touched; all other types are left unchanged. +*/ +void RoundInternalFPF(InternalFPF *ptr) +{ +/* int i; */ + +if (ptr->type == IFPF_IS_NORMAL || + ptr->type == IFPF_IS_SUBNORMAL) +{ + denormalize(ptr, MIN_EXP); + /* denormalize() may have turned the number into a zero on underflow */ + if (ptr->type != IFPF_IS_ZERO) + { + + /* clear the extraneous bits: the low 3 bits of mantissa word 3 */ + ptr->mantissa[3] &= 0xfff8; +/* for (i=4; i<INTERNAL_FPF_PRECISION; i++) + { + ptr->mantissa[i] = 0; + } +*/ + /* + ** Check for overflow + */ +/* Does not do anything as ptr->exp is a short and MAX_EXP=37268 + if (ptr->exp > MAX_EXP) + { + SetInternalFPFInfinity(ptr, ptr->sign); + } +*/ + } +} +return; +} + +/******************************************************* +** ARITHMETIC OPERATIONS ON INTERNAL REPRESENTATION ** +*******************************************************/ + +/*************** +** choose_nan ** +**************** +** Called by routines that are forced to perform math on +** a pair of NaN's. This routine "selects" which NaN is +** to be returned. +*/ +static void choose_nan(InternalFPF *x, + InternalFPF *y, + InternalFPF *z, + int intel_flag) +{ +int i; + +/* +** Compare the two mantissas, +** return the larger. 
Note that we will be emulating +** an 80387 in this operation. +*/ +for (i=0; i<INTERNAL_FPF_PRECISION; i++) +{ + if (x->mantissa[i] > y->mantissa[i]) + { + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + return; + } + if (x->mantissa[i] < y->mantissa[i]) + { + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + return; + } +} + +/* +** They are equal +*/ +if (!intel_flag) + /* if the operation is addition */ + memmove((void *)x,(void *)z,sizeof(InternalFPF)); +else + /* if the operation is multiplication */ + memmove((void *)y,(void *)z,sizeof(InternalFPF)); +return; +} + + +/********************** +** AddSubInternalFPF ** +*********************** +** Adding or subtracting internal-representation numbers. +** Internal-representation numbers pointed to by x and y are +** added/subtracted and the result returned in z. +*/ +static void AddSubInternalFPF(uchar operation, + InternalFPF *x, + InternalFPF *y, + InternalFPF *z) +{ +int exponent_difference; +u16 borrow; +u16 carry; +int i; +InternalFPF locx,locy; /* Needed since we alter them */ + +/* +** Following big switch statement handles the +** various combinations of operand types. 
+*/ +switch ((x->type * IFPF_TYPE_COUNT) + y->type) +{ +case ZERO_ZERO: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + if (x->sign ^ y->sign ^ operation) + { + z->sign = 0; /* positive */ + } + break; + +case NAN_ZERO: +case NAN_SUBNORMAL: +case NAN_NORMAL: +case NAN_INFINITY: +case SUBNORMAL_ZERO: +case NORMAL_ZERO: +case INFINITY_ZERO: +case INFINITY_SUBNORMAL: +case INFINITY_NORMAL: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + break; + + +case ZERO_NAN: +case SUBNORMAL_NAN: +case NORMAL_NAN: +case INFINITY_NAN: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + break; + +case ZERO_SUBNORMAL: +case ZERO_NORMAL: +case ZERO_INFINITY: +case SUBNORMAL_INFINITY: +case NORMAL_INFINITY: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + z->sign ^= operation; + break; + +case SUBNORMAL_SUBNORMAL: +case SUBNORMAL_NORMAL: +case NORMAL_SUBNORMAL: +case NORMAL_NORMAL: + /* + ** Copy x and y to locals, since we may have + ** to alter them. + */ + memmove((void *)&locx,(void *)x,sizeof(InternalFPF)); + memmove((void *)&locy,(void *)y,sizeof(InternalFPF)); + + /* compute sum/difference */ + exponent_difference = locx.exp-locy.exp; + if (exponent_difference == 0) + { + /* + ** locx.exp == locy.exp + ** so, no shifting required + */ + if (locx.type == IFPF_IS_SUBNORMAL || + locy.type == IFPF_IS_SUBNORMAL) + z->type = IFPF_IS_SUBNORMAL; + else + z->type = IFPF_IS_NORMAL; + + /* + ** Assume that locx.mantissa > locy.mantissa + */ + z->sign = locx.sign; + z->exp= locx.exp; + } + else + if (exponent_difference > 0) + { + /* + ** locx.exp > locy.exp + */ + StickyShiftRightMant(&locy, + exponent_difference); + z->type = locx.type; + z->sign = locx.sign; + z->exp = locx.exp; + } + else /* if (exponent_difference < 0) */ + { + /* + ** locx.exp < locy.exp + */ + StickyShiftRightMant(&locx, + -exponent_difference); + z->type = locy.type; + z->sign = locy.sign ^ operation; + z->exp = locy.exp; + } + + if (locx.sign ^ locy.sign ^ operation) + { + /* + ** Signs are 
different, subtract mantissas + */ + borrow = 0; + for (i=(INTERNAL_FPF_PRECISION-1); i>=0; i--) + Sub16Bits(&borrow, + &z->mantissa[i], + locx.mantissa[i], + locy.mantissa[i]); + + if (borrow) + { + /* The y->mantissa was larger than the + ** x->mantissa leaving a negative + ** result. Change the result back to + ** an unsigned number and flip the + ** sign flag. + */ + z->sign = locy.sign ^ operation; + borrow = 0; + for (i=(INTERNAL_FPF_PRECISION-1); i>=0; i--) + { + Sub16Bits(&borrow, + &z->mantissa[i], + 0, + z->mantissa[i]); + } + } + else + { + /* The assumption made above + ** (i.e. x->mantissa >= y->mantissa) + ** was correct. Therefore, do nothing. + ** z->sign = x->sign; + */ + } + + if (IsMantissaZero(z->mantissa)) + { + z->type = IFPF_IS_ZERO; + z->sign = 0; /* positive */ + } + else + if (locx.type == IFPF_IS_NORMAL || + locy.type == IFPF_IS_NORMAL) + { + normalize(z); + } + } + else + { + /* signs are the same, add mantissas */ + carry = 0; + for (i=(INTERNAL_FPF_PRECISION-1); i>=0; i--) + { + Add16Bits(&carry, + &z->mantissa[i], + locx.mantissa[i], + locy.mantissa[i]); + } + + if (carry) + { + z->exp++; + carry=0; + ShiftMantRight1(&carry,z->mantissa); + z->mantissa[0] |= 0x8000; + z->type = IFPF_IS_NORMAL; + } + else + if (z->mantissa[0] & 0x8000) + z->type = IFPF_IS_NORMAL; + } + break; + +case INFINITY_INFINITY: + SetInternalFPFNaN(z); + break; + +case NAN_NAN: + choose_nan(x, y, z, 1); + break; +} + +/* +** All the math is done; time to round. +*/ +RoundInternalFPF(z); +return; +} + + +/************************ +** MultiplyInternalFPF ** +************************* +** Two internal-representation numbers x and y are multiplied; the +** result is returned in z. 
+*/ +static void MultiplyInternalFPF(InternalFPF *x, + InternalFPF *y, + InternalFPF *z) +{ +int i; +int j; +u16 carry; +u16 extra_bits[INTERNAL_FPF_PRECISION]; +InternalFPF locy; /* Needed since this will be altered */ +/* +** As in the preceding function, this large switch +** statement selects among the many combinations +** of operands. +*/ +switch ((x->type * IFPF_TYPE_COUNT) + y->type) +{ +case INFINITY_SUBNORMAL: +case INFINITY_NORMAL: +case INFINITY_INFINITY: +case ZERO_ZERO: +case ZERO_SUBNORMAL: +case ZERO_NORMAL: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + z->sign ^= y->sign; + break; + +case SUBNORMAL_INFINITY: +case NORMAL_INFINITY: +case SUBNORMAL_ZERO: +case NORMAL_ZERO: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + z->sign ^= x->sign; + break; + +case ZERO_INFINITY: +case INFINITY_ZERO: + SetInternalFPFNaN(z); + break; + +case NAN_ZERO: +case NAN_SUBNORMAL: +case NAN_NORMAL: +case NAN_INFINITY: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + break; + +case ZERO_NAN: +case SUBNORMAL_NAN: +case NORMAL_NAN: +case INFINITY_NAN: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + break; + + +case SUBNORMAL_SUBNORMAL: +case SUBNORMAL_NORMAL: +case NORMAL_SUBNORMAL: +case NORMAL_NORMAL: + /* + ** Make a local copy of the y number, since we will be + ** altering it in the process of multiplying. 
+ */ + memmove((void *)&locy,(void *)y,sizeof(InternalFPF)); + + /* + ** Check for unnormal zero arguments + */ + if (IsMantissaZero(x->mantissa) || IsMantissaZero(y->mantissa)) + SetInternalFPFInfinity(z, 0); + + /* + ** Initialize the result + */ + if (x->type == IFPF_IS_SUBNORMAL || + y->type == IFPF_IS_SUBNORMAL) + z->type = IFPF_IS_SUBNORMAL; + else + z->type = IFPF_IS_NORMAL; + + z->sign = x->sign ^ y->sign; + z->exp = x->exp + y->exp ; + for (i=0; i<INTERNAL_FPF_PRECISION; i++) + { + z->mantissa[i] = 0; + extra_bits[i] = 0; + } + + for (i=0; i<(INTERNAL_FPF_PRECISION*16); i++) + { + /* + ** Get rightmost bit of the multiplier + */ + carry = 0; + ShiftMantRight1(&carry, locy.mantissa); + if (carry) + { + /* + ** Add the multiplicand to the product + */ + carry = 0; + for (j=(INTERNAL_FPF_PRECISION-1); j>=0; j--) + Add16Bits(&carry, + &z->mantissa[j], + z->mantissa[j], + x->mantissa[j]); + } + else + { + carry = 0; + } + + /* + ** Shift the product right. Overflow bits get + ** shifted into extra_bits. We'll use it later + ** to help with the "sticky" bit. + */ + ShiftMantRight1(&carry, z->mantissa); + ShiftMantRight1(&carry, extra_bits); + } + + /* + ** Normalize + ** Note that we use a "special" normalization routine + ** because we need to use the extra bits. (These are + ** bits that may have been shifted off the bottom that + ** we want to reclaim...if we can. + */ + while ((z->mantissa[0] & 0x8000) == 0) + { + carry = 0; + ShiftMantLeft1(&carry, extra_bits); + ShiftMantLeft1(&carry, z->mantissa); + z->exp--; + } + + /* + ** Set the sticky bit if any bits set in extra bits. + */ + if (IsMantissaZero(extra_bits)) + { + z->mantissa[INTERNAL_FPF_PRECISION-1] |= 1; + } + break; + +case NAN_NAN: + choose_nan(x, y, z, 0); + break; +} + +/* +** All math done...do rounding. +*/ +RoundInternalFPF(z); +return; +} + + +/********************** +** DivideInternalFPF ** +*********************** +** Divide internal FPF number x by y. Return result in z. 
+*/ +static void DivideInternalFPF(InternalFPF *x, + InternalFPF *y, + InternalFPF *z) +{ +int i; +int j; +u16 carry; +u16 extra_bits[INTERNAL_FPF_PRECISION]; +InternalFPF locx; /* Local for x number */ + +/* +** As with preceding function, the following switch +** statement selects among the various possible +** operands. +*/ +switch ((x->type * IFPF_TYPE_COUNT) + y->type) +{ +case ZERO_ZERO: +case INFINITY_INFINITY: + SetInternalFPFNaN(z); + break; + +case ZERO_SUBNORMAL: +case ZERO_NORMAL: + if (IsMantissaZero(y->mantissa)) + { + SetInternalFPFNaN(z); + break; + } + +case ZERO_INFINITY: +case SUBNORMAL_INFINITY: +case NORMAL_INFINITY: + SetInternalFPFZero(z, x->sign ^ y->sign); + break; + +case SUBNORMAL_ZERO: +case NORMAL_ZERO: + if (IsMantissaZero(x->mantissa)) + { + SetInternalFPFNaN(z); + break; + } + +case INFINITY_ZERO: +case INFINITY_SUBNORMAL: +case INFINITY_NORMAL: + SetInternalFPFInfinity(z, 0); + z->sign = x->sign ^ y->sign; + break; + +case NAN_ZERO: +case NAN_SUBNORMAL: +case NAN_NORMAL: +case NAN_INFINITY: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + break; + +case ZERO_NAN: +case SUBNORMAL_NAN: +case NORMAL_NAN: +case INFINITY_NAN: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + break; + +case SUBNORMAL_SUBNORMAL: +case NORMAL_SUBNORMAL: +case SUBNORMAL_NORMAL: +case NORMAL_NORMAL: + /* + ** Make local copy of x number, since we'll be + ** altering it in the process of dividing. 
+ */ + memmove((void *)&locx,(void *)x,sizeof(InternalFPF)); + + /* + ** Check for unnormal zero arguments + */ + if (IsMantissaZero(locx.mantissa)) + { + if (IsMantissaZero(y->mantissa)) + SetInternalFPFNaN(z); + else + SetInternalFPFZero(z, 0); + break; + } + if (IsMantissaZero(y->mantissa)) + { + SetInternalFPFInfinity(z, 0); + break; + } + + /* + ** Initialize the result + */ + z->type = x->type; + z->sign = x->sign ^ y->sign; + z->exp = x->exp - y->exp + + ((INTERNAL_FPF_PRECISION * 16 * 2)); + for (i=0; i<INTERNAL_FPF_PRECISION; i++) + { + z->mantissa[i] = 0; + extra_bits[i] = 0; + } + + while ((z->mantissa[0] & 0x8000) == 0) + { + carry = 0; + ShiftMantLeft1(&carry, locx.mantissa); + ShiftMantLeft1(&carry, extra_bits); + + /* + ** Time to subtract yet? + */ + if (carry == 0) + for (j=0; j<INTERNAL_FPF_PRECISION; j++) + { + if (y->mantissa[j] > extra_bits[j]) + { + carry = 0; + goto no_subtract; + } + if (y->mantissa[j] < extra_bits[j]) + break; + } + /* + ** Divisor (y) <= dividend (x), subtract + */ + carry = 0; + for (j=(INTERNAL_FPF_PRECISION-1); j>=0; j--) + Sub16Bits(&carry, + &extra_bits[j], + extra_bits[j], + y->mantissa[j]); + carry = 1; /* 1 shifted into quotient */ + no_subtract: + ShiftMantLeft1(&carry, z->mantissa); + z->exp--; + } + break; + +case NAN_NAN: + choose_nan(x, y, z, 0); + break; +} + +/* +** Math complete...do rounding +*/ +RoundInternalFPF(z); +} + +/********************** +** LongToInternalFPF ** +** Int32ToInternalFPF ** +*********************** +** Convert a signed (long) 32-bit integer into an internal FPF number. +*/ +/* static void LongToInternalFPF(long mylong, */ +static void Int32ToInternalFPF(int32 mylong, + InternalFPF *dest) +{ +int i; /* Index */ +u16 myword; /* Used to hold converted stuff */ +/* +** Save the sign and get the absolute value. This will help us +** with 64-bit machines, since we use only the lower 32 +** bits just in case. (No longer necessary after we use int32.) 
+*/ +/* if(mylong<0L) */ +if(mylong<(int32)0) +{ dest->sign=1; + mylong=(int32)0-mylong; +} +else + dest->sign=0; +/* +** Prepare the destination floating point number +*/ +dest->type=IFPF_IS_NORMAL; +for(i=0;i<INTERNAL_FPF_PRECISION;i++) + dest->mantissa[i]=0; + +/* +** See if we've got a zero. If so, make the resultant FP +** number a true zero and go home. +*/ +if(mylong==0) +{ dest->type=IFPF_IS_ZERO; + dest->exp=0; + return; +} + +/* +** Not a true zero. Set the exponent to 32 (internal FPFs have +** no bias) and load the low and high words into their proper +** locations in the mantissa. Then normalize. The action of +** normalizing slides the mantissa bits into place and sets +** up the exponent properly. +*/ +dest->exp=32; +myword=(u16)((mylong >> 16) & 0xFFFFL); +dest->mantissa[0]=myword; +myword=(u16)(mylong & 0xFFFFL); +dest->mantissa[1]=myword; +normalize(dest); +return; +} + +#ifdef DEBUG +/************************ +** InternalFPFToString ** +************************* +** FOR DEBUG PURPOSES +** This routine converts an internal floating point representation +** number to a string. Used in debugging the package. +** Returns length of converted number. +** NOTE: dest must point to a buffer big enough to hold the +** result. Also, this routine does append a null (an effect +** of using the sprintf() function). It also returns +** a length count. +** NOTE: This routine returns 5 significant digits. Thats +** about all I feel safe with, given the method of +** conversion. It should be more than enough for programmers +** to determine whether the package is properly ported. 
+*/ +static int InternalFPFToString(char *dest, + InternalFPF *src) +{ +InternalFPF locFPFNum; /* Local for src (will be altered) */ +InternalFPF IFPF10; /* Floating-point 10 */ +InternalFPF IFPFComp; /* For doing comparisons */ +int msign; /* Holding for mantissa sign */ +int expcount; /* Exponent counter */ +int ccount; /* Character counter */ +int i,j,k; /* Index */ +u16 carryaccum; /* Carry accumulator */ +u16 mycarry; /* Local for carry */ + +/* +** Check first for the simple things...Nan, Infinity, Zero. +** If found, copy the proper string in and go home. +*/ +switch(src->type) +{ + case IFPF_IS_NAN: + memcpy(dest,"NaN",3); + return(3); + + case IFPF_IS_INFINITY: + if(src->sign==0) + memcpy(dest,"+Inf",4); + else + memcpy(dest,"-Inf",4); + return(4); + + case IFPF_IS_ZERO: + if(src->sign==0) + memcpy(dest,"+0",2); + else + memcpy(dest,"-0",2); + return(2); +} + +/* +** Move the internal number into our local holding area, since +** we'll be altering it to print it out. +*/ +memcpy((void *)&locFPFNum,(void *)src,sizeof(InternalFPF)); + +/* +** Set up a floating-point 10...which we'll use a lot in a minute. +*/ +/* LongToInternalFPF(10L,&IFPF10); */ +Int32ToInternalFPF((int32)10,&IFPF10); + +/* +** Save the mantissa sign and make it positive. +*/ +msign=src->sign; + +/* src->sign=0 */ /* bug, fixed Nov. 13, 1997 */ +(&locFPFNum)->sign=0; + +expcount=0; /* Init exponent counter */ + +/* +** See if the number is less than 10. If so, multiply +** the number repeatedly by 10 until it's not. For each +** multiplication, decrement a counter so we can keep track +** of the exponent. +*/ + +while(1) +{ AddSubInternalFPF(1,&locFPFNum,&IFPF10,&IFPFComp); + if(IFPFComp.sign==0) break; + MultiplyInternalFPF(&locFPFNum,&IFPF10,&IFPFComp); + expcount--; + memcpy((void *)&locFPFNum,(void *)&IFPFComp,sizeof(InternalFPF)); +} +/* +** Do the reverse of the above. As long as the number is +** greater than or equal to 10, divide it by 10. 
Increment the +** exponent counter for each multiplication. +*/ + +while(1) +{ + AddSubInternalFPF(1,&locFPFNum,&IFPF10,&IFPFComp); + if(IFPFComp.sign!=0) break; + DivideInternalFPF(&locFPFNum,&IFPF10,&IFPFComp); + expcount++; + memcpy((void *)&locFPFNum,(void *)&IFPFComp,sizeof(InternalFPF)); +} + +/* +** About time to start storing things. First, store the +** mantissa sign. +*/ +ccount=1; /* Init character counter */ +if(msign==0) + *dest++='+'; +else + *dest++='-'; + +/* +** At this point we know that the number is in the range +** 10 > n >=1. We need to "strip digits" out of the +** mantissa. We do this by treating the mantissa as +** an integer and multiplying by 10. (Not a floating-point +** 10, but an integer 10. Since this is debug code and we +** could care less about speed, we'll do it the stupid +** way and simply add the number to itself 10 times. +** Anything that makes it to the left of the implied binary point +** gets stripped off and emitted. We'll do this for +** 5 significant digits (which should be enough to +** verify things). +*/ +/* +** Re-position radix point +*/ +carryaccum=0; +while(locFPFNum.exp>0) +{ + mycarry=0; + ShiftMantLeft1(&mycarry,locFPFNum.mantissa); + carryaccum=(carryaccum<<1); + if(mycarry) carryaccum++; + locFPFNum.exp--; +} + +while(locFPFNum.exp<0) +{ + mycarry=0; + ShiftMantRight1(&mycarry,locFPFNum.mantissa); + locFPFNum.exp++; +} + +for(i=0;i<6;i++) + if(i==1) + { /* Emit decimal point */ + *dest++='.'; + ccount++; + } + else + { /* Emit a digit */ + *dest++=('0'+carryaccum); + ccount++; + + carryaccum=0; + memcpy((void *)&IFPF10, + (void *)&locFPFNum, + sizeof(InternalFPF)); + + /* Do multiply via repeated adds */ + for(j=0;j<9;j++) + { + mycarry=0; + for(k=(INTERNAL_FPF_PRECISION-1);k>=0;k--) + Add16Bits(&mycarry,&(IFPFComp.mantissa[k]), + locFPFNum.mantissa[k], + IFPF10.mantissa[k]); + carryaccum+=mycarry ? 
1 : 0; + memcpy((void *)&locFPFNum, + (void *)&IFPFComp, + sizeof(InternalFPF)); + } + } + +/* +** Now move the 'E', the exponent sign, and the exponent +** into the string. +*/ +*dest++='E'; + +/* sprint is supposed to return an integer, but it caused problems on SunOS + * with the native cc. Hence we force it. + * Uwe F. Mayer + */ +ccount+=(int)sprintf(dest,"%4d",expcount); + +/* +** All done, go home. +*/ +return(ccount); + +} + +#endif diff --git a/emfloat.h b/emfloat.h new file mode 100644 index 0000000..41cc6d9 --- /dev/null +++ b/emfloat.h @@ -0,0 +1,154 @@ + +/* +** emfloat.h +** Header for emfloat.c +** +** BYTEmark (tm) +** BYTE Magazine's Native Mode benchmarks +** Rick Grehan, BYTE Magazine +** +** Create: +** Revision: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +#include <stdio.h> + +/* Is this a 64 bit architecture? If so, this will define LONG64 */ +/* Uwe F. 
Mayer 15 November 1997 */ +#include "pointer.h" + +/* +** DEFINES +*/ +#define u8 unsigned char +#define u16 unsigned short +#ifdef LONG64 +#define u32 unsigned int +#else +#define u32 unsigned long +#endif +#define uchar unsigned char +#define ulong unsigned long + +#define MAX_EXP 32767L +#define MIN_EXP (-32767L) + +#define IFPF_IS_ZERO 0 +#define IFPF_IS_SUBNORMAL 1 +#define IFPF_IS_NORMAL 2 +#define IFPF_IS_INFINITY 3 +#define IFPF_IS_NAN 4 +#define IFPF_TYPE_COUNT 5 + +#define ZERO_ZERO 0 +#define ZERO_SUBNORMAL 1 +#define ZERO_NORMAL 2 +#define ZERO_INFINITY 3 +#define ZERO_NAN 4 + +#define SUBNORMAL_ZERO 5 +#define SUBNORMAL_SUBNORMAL 6 +#define SUBNORMAL_NORMAL 7 +#define SUBNORMAL_INFINITY 8 +#define SUBNORMAL_NAN 9 + +#define NORMAL_ZERO 10 +#define NORMAL_SUBNORMAL 11 +#define NORMAL_NORMAL 12 +#define NORMAL_INFINITY 13 +#define NORMAL_NAN 14 + +#define INFINITY_ZERO 15 +#define INFINITY_SUBNORMAL 16 +#define INFINITY_NORMAL 17 +#define INFINITY_INFINITY 18 +#define INFINITY_NAN 19 + +#define NAN_ZERO 20 +#define NAN_SUBNORMAL 21 +#define NAN_NORMAL 22 +#define NAN_INFINITY 23 +#define NAN_NAN 24 +#define OPERAND_ZERO 0 +#define OPERAND_SUBNORMAL 1 +#define OPERAND_NORMAL 2 +#define OPERAND_INFINITY 3 +#define OPERAND_NAN 4 + +/* +** Following already defined in NMGLOBAL.H +** +#define INTERNAL_FPF_PRECISION 4 +*/ + +/* +** TYPEDEFS +*/ + +typedef struct +{ + u8 type; /* Indicates, NORMAL, SUBNORMAL, etc. 
*/ + u8 sign; /* Mantissa sign */ + short exp; /* Signed exponent...no bias */ + u16 mantissa[INTERNAL_FPF_PRECISION]; +} InternalFPF; + +/* +** PROTOTYPES +*/ +void SetupCPUEmFloatArrays(InternalFPF *abase, + InternalFPF *bbase, InternalFPF *cbase, ulong arraysize); +ulong DoEmFloatIteration(InternalFPF *abase, + InternalFPF *bbase, InternalFPF *cbase, + ulong arraysize, ulong loops); +static void SetInternalFPFZero(InternalFPF *dest, + uchar sign); +static void SetInternalFPFInfinity(InternalFPF *dest, + uchar sign); +static void SetInternalFPFNaN(InternalFPF *dest); +static int IsMantissaZero(u16 *mant); +static void Add16Bits(u16 *carry,u16 *a,u16 b,u16 c); +static void Sub16Bits(u16 *borrow,u16 *a,u16 b,u16 c); +static void ShiftMantLeft1(u16 *carry,u16 *mantissa); +static void ShiftMantRight1(u16 *carry,u16 *mantissa); +static void StickyShiftRightMant(InternalFPF *ptr,int amount); +static void normalize(InternalFPF *ptr); +static void denormalize(InternalFPF *ptr,int minimum_exponent); +void RoundInternalFPF(InternalFPF *ptr); +static void choose_nan(InternalFPF *x,InternalFPF *y,InternalFPF *z, + int intel_flag); +static void AddSubInternalFPF(uchar operation,InternalFPF *x, + InternalFPF *y,InternalFPF *z); +static void MultiplyInternalFPF(InternalFPF *x,InternalFPF *y, + InternalFPF *z); +static void DivideInternalFPF(InternalFPF *x,InternalFPF *y, + InternalFPF *z); +/* static void LongToInternalFPF(long mylong, */ +static void Int32ToInternalFPF(int32 mylong, + InternalFPF *dest); +#ifdef DEBUG +static int InternalFPFToString(char *dest, + InternalFPF *src); +#endif + +/* +** EXTERNALS +*/ +extern ulong StartStopwatch(); +extern ulong StopStopwatch(ulong elapsed); +/* extern long randwc(long num); */ +extern int32 randwc(int32 num); diff --git a/hardware b/hardware Binary files differnew file mode 100755 index 0000000..6fb3293 --- /dev/null +++ b/hardware diff --git a/hardware.c b/hardware.c new file mode 100644 index 0000000..4838b2f --- /dev/null +++ 
/* patch metadata: b/hardware.c @@ -0,0 +1,202 @@ */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define BUF_SIZ 1024

/******************
** output_string **
*******************
** Displays a string on the screen.  Also, if the flag
** write_to_file is set, outputs the string to the output file.
** Note, this routine presumes that you've included a carriage
** return at the end of the buffer.
** A NULL global_ofile is tolerated: the string is then only printed.
*/
static void output_string(const char *buffer, const int write_to_file,
                          FILE *global_ofile)
{
    printf("%s", buffer);
    if (write_to_file != 0 && global_ofile != NULL)
        fprintf(global_ofile, "%s", buffer);
}


/******************
** removeNewLine **
*******************
** Removes a trailing newline character if present
*/
static void removeNewLine(char *s)
{
    size_t len = strlen(s);

    if (len > 0 && s[len - 1] == '\n') {
        s[len - 1] = '\0';
    }
}


/***************
** runCommand **
****************
** Run the system command through a pipe and store the first line of
** its output (without the trailing newline) in result.  result is
** set to the empty string if the command cannot be started or
** produces no output.
** The pointer result must point to a pre-allocated array of at least BUF_SIZ
*/
static void runCommand(const char *command, char *result)
{
    FILE *pipe;

    pipe = popen(command, "r");
    if (pipe == NULL) {
        /* command failed */
        result[0] = '\0';
    } else {
        if (NULL == fgets(result, BUF_SIZ, pipe)) {
            /* command failed */
            result[0] = '\0';
        }
        pclose(pipe);
    }
    removeNewLine(result);
}


/*********************
** copyCpuInfoField **
**********************
** If line starts with key, copy the text that follows the key and
** its separators (blanks, tabs, colons) into dest, strip a trailing
** newline from dest and return 1.  dest keeps its previous content
** when nothing follows the separators.  Returns 0 if the line does
** not start with key.
** dest must point to an array of at least BUF_SIZ.
*/
static int copyCpuInfoField(const char *line, const char *key, char *dest)
{
    size_t keylen = strlen(key);
    const char *cp;

    if (strncmp(line, key, keylen) != 0)
        return 0;
    cp = line + keylen;
    while (*cp == ' ' || *cp == ':' || *cp == '\t')
        cp++;
    if (*cp != '\0')
        snprintf(dest, BUF_SIZ, "%s", cp);
    removeNewLine(dest);
    return 1;
}


/****************
** appendField **
*****************
** Append field to the string in dst, separated from what is already
** there by one blank.  Empty fields are ignored.  The result is
** truncated, never written past dstsize bytes.
*/
static void appendField(char *dst, size_t dstsize, const char *field)
{
    size_t used = strlen(dst);

    if (field[0] == '\0' || used + 1 >= dstsize)
        return;
    snprintf(dst + used, dstsize - used, "%s%s",
             (used > 0) ? " " : "", field);
}


/********************
** readProcCpuInfo **
*********************
** Reads and parses /proc/cpuinfo on a Linux system.
** model receives "<count> <vendor_id> <model name> <n>MHz" (the count
** is "Dual" for two processors, "<n> CPU" for more and omitted for
** one; missing parts are skipped); cache receives the "cache size"
** value.  Both are empty if the file cannot be opened.  When several
** processors are listed the values of the last one are reported.
** The pointers must point to pre-allocated arrays of at least BUF_SIZ;
** model is truncated to BUF_SIZ-1 characters if the parts do not fit.
*/
static void readProcCpuInfo(char *model, char *cache)
{
    FILE *info;
    int cpus = 0;
    char buffer[BUF_SIZ];
    char vendor_id[BUF_SIZ];
    char model_name[BUF_SIZ];
    char cpu_MHz[BUF_SIZ];
    char mhz_text[32];
    int i;
    float f;

    vendor_id[0] = model_name[0] = cpu_MHz[0] = model[0] = cache[0] = '\0';
    info = fopen("/proc/cpuinfo", "r");
    if (info == NULL)
        return;

    while (NULL != fgets(buffer, BUF_SIZ, info)) {
        if (!strncmp(buffer, "processor", 9)) {
            cpus++;
        } else if (copyCpuInfoField(buffer, "vendor_id", vendor_id)) {
            /* stored */
        } else if (copyCpuInfoField(buffer, "model name", model_name)) {
            /* stored */
        } else if (copyCpuInfoField(buffer, "cpu MHz", cpu_MHz)) {
            /* stored */
        } else if (copyCpuInfoField(buffer, "cache size", cache)) {
            /* stored */
        }
    }
    fclose(info);

    if (cpus > 1) {
        if (cpus == 2) {
            snprintf(model, BUF_SIZ, "%s", "Dual");
        } else {
            snprintf(model, BUF_SIZ, "%d CPU", cpus);
        }
    }
    appendField(model, BUF_SIZ, vendor_id);
    appendField(model, BUF_SIZ, model_name);
    if (cpu_MHz[0] != '\0') {
        /* report the clock rounded to whole MHz */
        f = atof(cpu_MHz);
        i = (int)(f + 0.5f);
        snprintf(mhz_text, sizeof(mhz_text), "%dMHz", i);
        appendField(model, BUF_SIZ, mhz_text);
    }
}


/*************
** hardware **
**************
** Runs the system command "uname -s -r" (unless NO_UNAME is defined)
** Reads /proc/cpuinfo if on a linux system
** Writes output: one line each for CPU, L2 cache and OS, printed and,
** if write_to_file is non-zero, also written to global_ofile.
*/
void hardware(const int write_to_file, FILE *global_ofile)
{
    /* room for the longest label plus a full BUF_SIZ value */
    char buffer[BUF_SIZ + 32];
    char os[BUF_SIZ];
    char model[BUF_SIZ];
    char cache[BUF_SIZ];
#ifdef NO_UNAME
    os[0] = '\0';
#else
    char os_command[] = "uname -s -r";

    runCommand(os_command, os);
#endif
    if (NULL != strstr(os, "Linux")) {
        readProcCpuInfo(model, cache);
    } else {
        model[0] = '\0';
        cache[0] = '\0';
    }
    snprintf(buffer, sizeof(buffer), "CPU : %s\n", model);
    output_string(buffer, write_to_file, global_ofile);
    snprintf(buffer, sizeof(buffer), "L2 Cache : %s\n", cache);
    output_string(buffer, write_to_file, global_ofile);
    snprintf(buffer, sizeof(buffer), "OS : %s\n", os);
    output_string(buffer, write_to_file, global_ofile);
}


/************************
** main for hardware.c **
*************************
** For testing of code only
** Should be commented out
**
** int main(int argc, char * argv[]) {
**   hardware(0, NULL);
**   return 0;
** }
*/

/* patch metadata: diff --git a/hardware.h b/hardware.h new file mode 100644 index 0000000..2a07934 --- /dev/null +++ b/hardware.h @@ -0,0 +1,2 @@ */
extern
void hardware(const int write_to_file, FILE *global_ofile);

/* patch metadata: @@ -0,0 +1,2 @@  (next file in the patch, presumably hello.c) */
#include <stdio.h>
int main () {printf("hello.\n");return(0);}

/* patch metadata: @@ -0,0 +1,120 @@ */

/*
** misc.c
** BYTEmark (tm)
** BYTE's Native Mode Benchmarks
** Rick Grehan, BYTE Magazine
** DISCLAIMER
** The source, executable, and documentation files that comprise
** the BYTEmark benchmarks are made available on an "as is" basis.
** This means that we at BYTE Magazine have made every reasonable
** effort to verify that the there are no errors in the source and
** executable code. We cannot, however, guarantee that the programs
** are error-free. Consequently, McGraw-HIll and BYTE Magazine make
** no claims in regard to the fitness of the source code, executable
** code, and documentation of the BYTEmark.
** Furthermore, BYTE Magazine, McGraw-Hill, and all employees
** of McGraw-Hill cannot be held responsible for any damages resulting
** from the use of this code or the results obtained from using
** this code.
*/
+*/ + +#include <stdio.h> +#include "misc.h" + +/*********************************************************** +** MISCELLANEOUS BUT OTHERWISE NECESSARY ROUTINES ** +***********************************************************/ + +/**************************** +** RANDOM NUMBER GENERATOR ** +***************************** +** This is a second-order linear congruential random number +** generator. Its advantage is (of course) that it can be +** seeded and will thus produce repeatable sequences of +** random numbers. +*/ + +/**************************** +* randwc() * +***************************** +** Returns signed long random modulo num. +*/ +/* +long randwc(long num) +{ + return(randnum(0L)%num); +} +*/ +/* +** Returns signed 32-bit random modulo num. +*/ +int32 randwc(int32 num) +{ + return(randnum((int32)0)%num); +} + +/*************************** +** abs_randwc() ** +**************************** +** Same as randwc(), only this routine returns only +** positive numbers. +*/ +/* +unsigned long abs_randwc(unsigned long num) +{ +long temp; + +temp=randwc(num); +if(temp<0) temp=0L-temp; + +return((unsigned long)temp); +} +*/ +u32 abs_randwc(u32 num) +{ +int32 temp; /* Temporary storage */ + +temp=randwc(num); +if(temp<0) temp=(int32)0-temp; + +return((u32)temp); +} + +/**************************** +* randnum() * +***************************** +** Second order linear congruential generator. +** Constants suggested by J. G. Skellam. +** If val==0, returns next member of sequence. +** val!=0, restart generator. 
+*/ +/* +long randnum(long lngval) +{ + register long interm; + static long randw[2] = { 13L , 117L }; + + if (lngval!=0L) + { randw[0]=13L; randw[1]=117L; } + + interm=(randw[0]*254754L+randw[1]*529562L)%999563L; + randw[1]=randw[0]; + randw[0]=interm; + return(interm); +} +*/ +int32 randnum(int32 lngval) +{ + register int32 interm; + static int32 randw[2] = { (int32)13 , (int32)117 }; + + if (lngval!=(int32)0) + { randw[0]=(int32)13; randw[1]=(int32)117; } + + interm=(randw[0]*(int32)254754+randw[1]*(int32)529562)%(int32)999563; + randw[1]=randw[0]; + randw[0]=interm; + return(interm); +} + @@ -0,0 +1,41 @@ +/* +** misc.h +** Header for misc.c +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. 
+*/ + +/************************ +** FUNCTION PROTOTYPES ** +************************/ + +/* +long randwc(long num); +unsigned long abs_randwc(unsigned long num); +long randnum(long lngval); +*/ + +#include "nmglobal.h" +int32 randwc(int32 num); +u32 abs_randwc(u32 num); +int32 randnum(int32 lngval); + + diff --git a/nbench0.c b/nbench0.c new file mode 100644 index 0000000..784b501 --- /dev/null +++ b/nbench0.c @@ -0,0 +1,1174 @@ + +/* +** nbench0.c +*/ + +/******************************************* +** BYTEmark (tm) ** +** BYTE MAGAZINE'S NATIVE MODE BENCHMARKS ** +** FOR CPU/FPU ** +** ver 2.0 ** +** Rick Grehan, BYTE Magazine ** +******************************************** +** NOTE: These benchmarks do NOT check for the presence +** of an FPU. You have to find that out manually. +** +** REVISION HISTORY FOR BENCHMARKS +** 9/94 -- First beta. --RG +** 12/94 -- Bug discovered in some of the integer routines +** (IDEA, Huffman,...). Routines were not accurately counting +** the number of loops. Fixed. --RG (Thanks to Steve A.) +** 12/94 -- Added routines to calculate and display index +** values. Indexes based on DELL XPS 90 (90 MHz Pentium). +** 1/95 -- Added Mac time manager routines for more accurate +** timing on Macintosh (said to be good to 20 usecs) -- RG +** 1/95 -- Re-did all the #defines so they made more +** sense. See NMGLOBAL.H -- RG +** 3/95 -- Fixed memory leak in LU decomposition. Did not +** invalidate previous results, just made it easier to run.--RG +** 3/95 -- Added TOOLHELP.DLL timing routine to Windows timer. --RG +** 10/95 -- Added memory array & alignment; moved memory +** allocation out of LU Decomposition -- RG +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. 
We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <string.h> +#include <time.h> +#include <math.h> +#include "nmglobal.h" +#include "nbench0.h" +#include "hardware.h" + +/************* +**** main **** +*************/ +#ifdef MAC +void main(void) +#else +int main(int argc, char *argv[]) +#endif +{ +int i; /* Index */ +time_t time_and_date; /* Self-explanatory */ +struct tm *loctime; +double bmean; /* Benchmark mean */ +double bstdev; /* Benchmark stdev */ +double lx_memindex; /* Linux memory index (mainly integer operations)*/ +double lx_intindex; /* Linux integer index */ +double lx_fpindex; /* Linux floating-point index */ +double intindex; /* Integer index */ +double fpindex; /* Floating-point index */ +ulong bnumrun; /* # of runs */ + +#ifdef MAC + MaxApplZone(); +#endif + +#ifdef MACTIMEMGR +/* Set up high res timer */ +MacHSTdelay=600*1000*1000; /* Delay is 10 minutes */ + +memset((char *)&myTMTask,0,sizeof(TMTask)); + +/* Prime and remove the task, calculating overhead */ +PrimeTime((QElemPtr)&myTMTask,-MacHSTdelay); +RmvTime((QElemPtr)&myTMTask); +MacHSTohead=MacHSTdelay+myTMTask.tmCount; +#endif + +#ifdef WIN31TIMER +/* Set up the size of the timer info structure */ +win31tinfo.dwSize=(DWORD)sizeof(TIMERINFO); +/* Load library */ +if((hThlp=LoadLibrary("TOOLHELP.DLL"))<32) +{ printf("Error loading TOOLHELP\n"); + exit(0); +} +if(!(lpfn=GetProcAddress(hThlp,"TimerCount"))) +{ printf("TOOLHELP error\n"); + exit(0); +} +#endif + +/* +** Set global parameters to default. 
+*/ +global_min_ticks=MINIMUM_TICKS; +global_min_seconds=MINIMUM_SECONDS; +global_allstats=0; +global_custrun=0; +global_align=8; +write_to_file=0; +lx_memindex=(double)1.0; /* set for geometric mean computations */ +lx_intindex=(double)1.0; +lx_fpindex=(double)1.0; +intindex=(double)1.0; +fpindex=(double)1.0; +mem_array_ents=0; /* Nothing in mem array */ + +/* +** We presume all tests will be run unless told +** otherwise +*/ +for(i=0;i<NUMTESTS;i++) + tests_to_do[i]=1; + +/* +** Initialize test data structures to default +** values. +*/ +set_request_secs(); /* Set all request_secs fields */ +global_numsortstruct.adjust=0; +global_numsortstruct.arraysize=NUMARRAYSIZE; + +global_strsortstruct.adjust=0; +global_strsortstruct.arraysize=STRINGARRAYSIZE; + +global_bitopstruct.adjust=0; +global_bitopstruct.bitfieldarraysize=BITFARRAYSIZE; + +global_emfloatstruct.adjust=0; +global_emfloatstruct.arraysize=EMFARRAYSIZE; + +global_fourierstruct.adjust=0; + +global_assignstruct.adjust=0; + +global_ideastruct.adjust=0; +global_ideastruct.arraysize=IDEAARRAYSIZE; + +global_huffstruct.adjust=0; +global_huffstruct.arraysize=HUFFARRAYSIZE; + +global_nnetstruct.adjust=0; + +global_lustruct.adjust=0; + +/* +** For Macintosh -- read the command line. +*/ +#ifdef MAC +UCommandLine(); +#endif + +/* +** Handle any command-line arguments. +*/ +if(argc>1) + for(i=1;i<argc;i++) + if(parse_arg(argv[i])==-1) + { display_help(argv[0]); + exit(0); + } +/* +** Output header +*/ +#ifdef LINUX +output_string("\nBYTEmark* Native Mode Benchmark ver. 2 (10/95)\n"); +output_string("Index-split by Andrew D. Balsa (11/97)\n"); +output_string("Linux/Unix* port by Uwe F. 
Mayer (12/96,11/97)\n"); +#else +output_string("BBBBBB YYY Y TTTTTTT EEEEEEE\n"); +output_string("BBB B YYY Y TTT EEE\n"); +output_string("BBB B YYY Y TTT EEE\n"); +output_string("BBBBBB YYY Y TTT EEEEEEE\n"); +output_string("BBB B YYY TTT EEE\n"); +output_string("BBB B YYY TTT EEE\n"); +output_string("BBBBBB YYY TTT EEEEEEE\n\n"); +output_string("\nBYTEmark (tm) Native Mode Benchmark ver. 2 (10/95)\n"); +#endif +/* +** See if the user wants all stats. Output heading info +** if so. +*/ +if(global_allstats) +{ + output_string("\n"); + output_string("============================== ALL STATISTICS ===============================\n"); + time(&time_and_date); + loctime=localtime(&time_and_date); + sprintf(buffer,"**Date and time of benchmark run: %s",asctime(loctime)); + output_string(buffer); + sprintf(buffer,"**Sizeof: char:%u short:%u int:%u long:%u u8:%u u16:%u u32:%u int32:%u\n", + (unsigned int)sizeof(char), + (unsigned int)sizeof(short), + (unsigned int)sizeof(int), + (unsigned int)sizeof(long), + (unsigned int)sizeof(u8), + (unsigned int)sizeof(u16), + (unsigned int)sizeof(u32), + (unsigned int)sizeof(int32)); + output_string(buffer); +#ifdef LINUX +#include "sysinfo.c" +#else + sprintf(buffer,"**%s\n",sysname); + output_string(buffer); + sprintf(buffer,"**%s\n",compilername); + output_string(buffer); + sprintf(buffer,"**%s\n",compilerversion); + output_string(buffer); +#endif + output_string("=============================================================================\n"); +} + +/* +** Execute the tests. +*/ +#ifdef LINUX +output_string("\nTEST : Iterations/sec. 
: Old Index : New Index\n"); +output_string(" : : Pentium 90* : AMD K6/233*\n"); +output_string("--------------------:------------------:-------------:------------\n"); +#endif + +for(i=0;i<NUMTESTS;i++) +{ + if(tests_to_do[i]) + { sprintf(buffer,"%s :",ftestnames[i]); + output_string(buffer); + if (0!=bench_with_confidence(i, + &bmean, + &bstdev, + &bnumrun)){ + output_string("\n** WARNING: The current test result is NOT 95 % statistically certain.\n"); + output_string("** WARNING: The variation among the individual results is too large.\n"); + output_string(" :"); + } +#ifdef LINUX + sprintf(buffer," %15.5g : %9.2f : %9.2f\n", + bmean,bmean/bindex[i],bmean/lx_bindex[i]); +#else + sprintf(buffer," Iterations/sec.: %13.2f Index: %6.2f\n", + bmean,bmean/bindex[i]); +#endif + output_string(buffer); + /* + ** Gather integer or FP indexes + */ + if((i==4)||(i==8)||(i==9)){ + /* FP index */ + fpindex=fpindex*(bmean/bindex[i]); + /* Linux FP index */ + lx_fpindex=lx_fpindex*(bmean/lx_bindex[i]); + } + else{ + /* Integer index */ + intindex=intindex*(bmean/bindex[i]); + if((i==0)||(i==3)||(i==6)||(i==7)) + /* Linux integer index */ + lx_intindex=lx_intindex*(bmean/lx_bindex[i]); + else + /* Linux memory index */ + lx_memindex=lx_memindex*(bmean/lx_bindex[i]); + } + + if(global_allstats) + { + sprintf(buffer," Absolute standard deviation: %g\n",bstdev); + output_string(buffer); + if (bmean>(double)1e-100){ + /* avoid division by zero */ + sprintf(buffer," Relative standard deviation: %g %%\n", + (double)100*bstdev/bmean); + output_string(buffer); + } + sprintf(buffer," Number of runs: %lu\n",bnumrun); + output_string(buffer); + show_stats(i); + sprintf(buffer,"Done with %s\n\n",ftestnames[i]); + output_string(buffer); + } + } +} +/* printf("...done...\n"); */ + +/* +** Output the total indexes +*/ +if(global_custrun==0) +{ + output_string("==========================ORIGINAL BYTEMARK RESULTS==========================\n"); + sprintf(buffer,"INTEGER INDEX : %.3f\n", + 
pow(intindex,(double).142857)); + output_string(buffer); + sprintf(buffer,"FLOATING-POINT INDEX: %.3f\n", + pow(fpindex,(double).33333)); + output_string(buffer); + output_string("Baseline (MSDOS*) : Pentium* 90, 256 KB L2-cache, Watcom* compiler 10.0\n"); +#ifdef LINUX + output_string("==============================LINUX DATA BELOW===============================\n"); + hardware(write_to_file, global_ofile); +#include "sysinfoc.c" + sprintf(buffer,"MEMORY INDEX : %.3f\n", + pow(lx_memindex,(double).3333333333)); + output_string(buffer); + sprintf(buffer,"INTEGER INDEX : %.3f\n", + pow(lx_intindex,(double).25)); + output_string(buffer); + sprintf(buffer,"FLOATING-POINT INDEX: %.3f\n", + pow(lx_fpindex,(double).3333333333)); + output_string(buffer); + output_string("Baseline (LINUX) : AMD K6/233*, 512 KB L2-cache, gcc 2.7.2.3, libc-5.4.38\n"); +#endif +output_string("* Trademarks are property of their respective holder.\n"); +} + +exit(0); +} + +/************** +** parse_arg ** +*************** +** Given a pointer to a string, we assume that's an argument. +** Parse that argument and act accordingly. +** Return 0 if ok, else return -1. +*/ +static int parse_arg(char *argptr) +{ +int i; /* Index */ +FILE *cfile; /* Command file identifier */ + +/* +** First character has got to be a hyphen. +*/ +if(*argptr++!='-') return(-1); + +/* +** Convert the rest of the argument to upper case +** so there's little chance of confusion. +*/ +for(i=0;i<strlen(argptr);i++) + argptr[i]=(char)toupper((int)argptr[i]); + +/* +** Next character picks the action. +*/ +switch(*argptr++) +{ + case '?': return(-1); /* Will display help */ + + case 'V': global_allstats=1; return(0); /* verbose mode */ + + case 'C': /* Command file name */ + /* + ** First try to open the file for reading. 
+ */ + cfile=fopen(argptr,"r"); + if(cfile==(FILE *)NULL) + { printf("**Error opening file: %s\n",argptr); + return(-1); + } + read_comfile(cfile); /* Read commands */ + fclose(cfile); + break; + default: + return(-1); +} +return(0); +} + +/******************* +** display_help() ** +******************** +** Display a help message showing argument requirements and such. +** Exit when you're done...I mean, REALLY exit. +*/ +void display_help(char *progname) +{ + printf("Usage: %s [-v] [-c<FILE>]\n",progname); + printf(" -v = verbose\n"); + printf(" -c = input parameters thru command file <FILE>\n"); + exit(0); +} + + +/***************** +** read_comfile ** +****************** +** Read the command file. Set global parameters as +** specified. This routine assumes that the command file +** is already open. +*/ +static void read_comfile(FILE *cfile) +{ +char inbuf[40]; +char *eptr; /* Offset to "=" sign */ +int i; /* Index */ + +/* +** Sit in a big loop, reading a line from the file at each +** pass. Terminate on EOF. +*/ +while(fgets(inbuf,39,cfile)!=(char *)NULL) +{ + /* Overwrite the CR character */ + if(strlen(inbuf)>0) + inbuf[strlen(inbuf)-1]='\0'; + + /* + ** Parse up to the "=" sign. If we don't find an + ** "=", then flag an error. + */ + if((eptr=strchr(inbuf,(int)'='))==(char *)NULL) + { printf("**COMMAND FILE ERROR at LINE:\n %s\n", + inbuf); + goto skipswitch; /* A GOTO!!!! */ + } + + /* + ** Insert a null where the "=" was, then convert + ** the substring to uppercase. That will enable + ** us to perform the match. + */ + *eptr++='\0'; + strtoupper((char *)&inbuf[0]); + i=MAXPARAM; + do { + if(strcmp(inbuf,paramnames[i])==0) + break; + } while(--i>=0); + + if(i<0) + { printf("**COMMAND FILE ERROR -- UNKNOWN PARAM: %s", + inbuf); + goto skipswitch; + } + + /* + ** Advance eptr to the next field...which should be + ** the value assigned to the parameter. 
+ */ + switch(i) + { + case PF_GMTICKS: /* GLOBALMINTICKS */ + global_min_ticks=(ulong)atol(eptr); + break; + + case PF_MINSECONDS: /* MINSECONDS */ + global_min_seconds=(ulong)atol(eptr); + set_request_secs(); + break; + + case PF_ALLSTATS: /* ALLSTATS */ + global_allstats=getflag(eptr); + break; + + case PF_OUTFILE: /* OUTFILE */ + strcpy(global_ofile_name,eptr); + global_ofile=fopen(global_ofile_name,"a"); + /* + ** Open the output file. + */ + if(global_ofile==(FILE *)NULL) + { printf("**Error opening output file: %s\n", + global_ofile_name); + ErrorExit(); + } + write_to_file=-1; + break; + + case PF_CUSTOMRUN: /* CUSTOMRUN */ + global_custrun=getflag(eptr); + for(i=0;i<NUMTESTS;i++) + tests_to_do[i]=1-global_custrun; + break; + + case PF_DONUM: /* DONUMSORT */ + tests_to_do[TF_NUMSORT]=getflag(eptr); + break; + + case PF_NUMNUMA: /* NUMNUMARRAYS */ + global_numsortstruct.numarrays= + (ushort)atoi(eptr); + global_numsortstruct.adjust=1; + break; + + case PF_NUMASIZE: /* NUMARRAYSIZE */ + global_numsortstruct.arraysize= + (ulong)atol(eptr); + break; + + case PF_NUMMINS: /* NUMMINSECONDS */ + global_numsortstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOSTR: /* DOSTRINGSORT */ + tests_to_do[TF_SSORT]=getflag(eptr); + break; + + case PF_STRASIZE: /* STRARRAYSIZE */ + global_strsortstruct.arraysize= + (ulong)atol(eptr); + break; + + case PF_NUMSTRA: /* NUMSTRARRAYS */ + global_strsortstruct.numarrays= + (ushort)atoi(eptr); + global_strsortstruct.adjust=1; + break; + + case PF_STRMINS: /* STRMINSECONDS */ + global_strsortstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOBITF: /* DOBITFIELD */ + tests_to_do[TF_BITOP]=getflag(eptr); + break; + + case PF_NUMBITOPS: /* NUMBITOPS */ + global_bitopstruct.bitoparraysize= + (ulong)atol(eptr); + global_bitopstruct.adjust=1; + break; + + case PF_BITFSIZE: /* BITFIELDSIZE */ + global_bitopstruct.bitfieldarraysize= + (ulong)atol(eptr); + break; + + case PF_BITMINS: /* BITMINSECONDS */ + 
global_bitopstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOEMF: /* DOEMF */ + tests_to_do[TF_FPEMU]=getflag(eptr); + break; + + case PF_EMFASIZE: /* EMFARRAYSIZE */ + global_emfloatstruct.arraysize= + (ulong)atol(eptr); + break; + + case PF_EMFLOOPS: /* EMFLOOPS */ + global_emfloatstruct.loops= + (ulong)atol(eptr); + break; + + case PF_EMFMINS: /* EMFMINSECOND */ + global_emfloatstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOFOUR: /* DOFOUR */ + tests_to_do[TF_FFPU]=getflag(eptr); + break; + + case PF_FOURASIZE: /* FOURASIZE */ + global_fourierstruct.arraysize= + (ulong)atol(eptr); + global_fourierstruct.adjust=1; + break; + + case PF_FOURMINS: /* FOURMINSECONDS */ + global_fourierstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOASSIGN: /* DOASSIGN */ + tests_to_do[TF_ASSIGN]=getflag(eptr); + break; + + case PF_AARRAYS: /* ASSIGNARRAYS */ + global_assignstruct.numarrays= + (ulong)atol(eptr); + break; + + case PF_ASSIGNMINS: /* ASSIGNMINSECONDS */ + global_assignstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOIDEA: /* DOIDEA */ + tests_to_do[TF_IDEA]=getflag(eptr); + break; + + case PF_IDEAASIZE: /* IDEAARRAYSIZE */ + global_ideastruct.arraysize= + (ulong)atol(eptr); + break; + + case PF_IDEALOOPS: /* IDEALOOPS */ + global_ideastruct.loops= + (ulong)atol(eptr); + break; + + case PF_IDEAMINS: /* IDEAMINSECONDS */ + global_ideastruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOHUFF: /* DOHUFF */ + tests_to_do[TF_HUFF]=getflag(eptr); + break; + + case PF_HUFFASIZE: /* HUFFARRAYSIZE */ + global_huffstruct.arraysize= + (ulong)atol(eptr); + break; + + case PF_HUFFLOOPS: /* HUFFLOOPS */ + global_huffstruct.loops= + (ulong)atol(eptr); + global_huffstruct.adjust=1; + break; + + case PF_HUFFMINS: /* HUFFMINSECONDS */ + global_huffstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DONNET: /* DONNET */ + tests_to_do[TF_NNET]=getflag(eptr); + break; + + case PF_NNETLOOPS: /* NNETLOOPS 
*/ + global_nnetstruct.loops= + (ulong)atol(eptr); + global_nnetstruct.adjust=1; + break; + + case PF_NNETMINS: /* NNETMINSECONDS */ + global_nnetstruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_DOLU: /* DOLU */ + tests_to_do[TF_LU]=getflag(eptr); + break; + + case PF_LUNARRAYS: /* LUNUMARRAYS */ + global_lustruct.numarrays= + (ulong)atol(eptr); + global_lustruct.adjust=1; + break; + + case PF_LUMINS: /* LUMINSECONDS */ + global_lustruct.request_secs= + (ulong)atol(eptr); + break; + + case PF_ALIGN: /* ALIGN */ + global_align=atoi(eptr); + break; + } +skipswitch: + continue; +} /* End while */ + +return; +} + +/************ +** getflag ** +************* +** Return 1 if cptr points to "T"; 0 otherwise. +*/ +static int getflag(char *cptr) +{ + if(toupper((int)*cptr)=='T') return(1); +return(0); +} + +/*************** +** strtoupper ** +**************** +** Convert's a string to upper case. The string is presumed +** to consist only of alphabetic characters, and to be terminated +** with a null. +*/ +static void strtoupper(char *s) +{ + +do { +/* +** Oddly enough, the following line did not work under THINK C. +** So, I modified it....hmmmm. --RG + *s++=(char)toupper((int)*s); +*/ + *s=(char)toupper((int)*s); + s++; +} while(*s!=(char)'\0'); +return; +} + +/********************* +** set_request_secs ** +********************** +** Set everyone's "request_secs" entry to whatever +** value is in global_min_secs. This is done +** at the beginning, and possibly later if the +** user redefines global_min_secs in the command file. 
+*/ +static void set_request_secs(void) +{ + +global_numsortstruct.request_secs=global_min_seconds; +global_strsortstruct.request_secs=global_min_seconds; +global_bitopstruct.request_secs=global_min_seconds; +global_emfloatstruct.request_secs=global_min_seconds; +global_fourierstruct.request_secs=global_min_seconds; +global_assignstruct.request_secs=global_min_seconds; +global_ideastruct.request_secs=global_min_seconds; +global_huffstruct.request_secs=global_min_seconds; +global_nnetstruct.request_secs=global_min_seconds; +global_lustruct.request_secs=global_min_seconds; + +return; +} + + +/************************** +** bench_with_confidence ** +*************************** +** Given a benchmark id that indicates a function, this routine +** repeatedly calls that benchmark, seeking to collect and replace +** scores to get 5 that meet the confidence criteria. +** +** The above is mathematically questionable, as the statistical theory +** depends on independent observations, and if we exchange data points +** depending on what we already have then this certainly violates +** independence of the observations. Hence I changed this so that at +** most 30 observations are done, but none are deleted as we go +** along. We simply do more runs and hope to get a big enough sample +** size so that things stabilize. Uwe F. Mayer +** +** Return 0 if ok, -1 if failure. Returns mean +** and std. deviation of results if successful. +*/ +static int bench_with_confidence(int fid, /* Function id */ + double *mean, /* Mean of scores */ + double *stdev, /* Standard deviation */ + ulong *numtries) /* # of attempts */ +{ +double myscores[30]; /* Need at least 5 scores, use at most 30 */ +double c_half_interval; /* Confidence half interval */ +int i; /* Index */ +/* double newscore; */ /* For improving confidence interval */ + +/* +** Get first 5 scores. Then begin confidence testing. 
+*/ +for (i=0;i<5;i++) +{ (*funcpointer[fid])(); + myscores[i]=getscore(fid); +#ifdef DEBUG + printf("score # %d = %g\n", i, myscores[i]); +#endif +} +*numtries=5; /* Show 5 attempts */ + +/* +** The system allows a maximum of 30 tries before it gives +** up. Since we've done 5 already, we'll allow 25 more. +*/ + +/* +** Enter loop to test for confidence criteria. +*/ +while(1) +{ + /* + ** Calculate confidence. Should always return 0. + */ + if (0!=calc_confidence(myscores, + *numtries, + &c_half_interval, + mean, + stdev)) return(-1); + + /* + ** Is the length of the half interval 5% or less of mean? + ** If so, we can go home. Otherwise, we have to continue. + */ + if(c_half_interval/ (*mean) <= (double)0.05) + break; + +#ifdef OLDCODE +#undef OLDCODE +#endif +#ifdef OLDCODE +/* this code is no longer valid, we now do not replace but add new scores */ +/* Uwe F. Mayer */ + /* + ** Go get a new score and see if it + ** improves existing scores. + */ + do { + if(*numtries==10) + return(-1); + (*funcpointer[fid])(); + *numtries+=1; + newscore=getscore(fid); + } while(seek_confidence(myscores,&newscore, + &c_half_interval,mean,stdev)==0); +#endif + /* We now simply add a new test run and hope that the runs + finally stabilize, Uwe F. Mayer */ + if(*numtries==30) return(-1); + (*funcpointer[fid])(); + myscores[*numtries]=getscore(fid); +#ifdef DEBUG + printf("score # %ld = %g\n", *numtries, myscores[*numtries]); +#endif + *numtries+=1; +} + +return(0); +} + +#ifdef OLDCODE +/* this procecdure is no longer needed, Uwe F. Mayer */ + /******************** + ** seek_confidence ** + ********************* + ** Pass this routine an array of 5 scores PLUS a new score. + ** This routine tries the new score in place of each of + ** the other five scores to determine if the new score, + ** when replacing one of the others, improves the confidence + ** half-interval. + ** Return 0 if failure. Original 5 scores unchanged. + ** Return -1 if success. 
Also returns new half-interval, + ** mean, and standard deviation of the sample. + */ + static int seek_confidence( double scores[5], + double *newscore, + double *c_half_interval, + double *smean, + double *sdev) + { + double sdev_to_beat; /* Original sdev to be beaten */ + double temp; /* For doing a swap */ + int is_beaten; /* Indicates original was beaten */ + int i; /* Index */ + + /* + ** First calculate original standard deviation + */ + calc_confidence(scores,c_half_interval,smean,sdev); + sdev_to_beat=*sdev; + is_beaten=-1; + + /* + ** Try to beat original score. We'll come out of this + ** loop with a flag. + */ + for(i=0;i<5;i++) + { + temp=scores[i]; + scores[i]=*newscore; + calc_confidence(scores,c_half_interval,smean,sdev); + scores[i]=temp; + if(sdev_to_beat>*sdev) + { is_beaten=i; + sdev_to_beat=*sdev; + } + } + + if(is_beaten!=-1) + { scores[is_beaten]=*newscore; + return(-1); + } + return(0); + } +#endif + +/******************** +** calc_confidence ** +********************* +** Given a set of numtries scores, calculate the confidence +** half-interval. We'll also return the sample mean and sample +** standard deviation. +** NOTE: This routines presumes a confidence of 95% and +** a confidence coefficient of .95 +** returns 0 if there is an error, otherwise -1 +*/ +static int calc_confidence(double scores[], /* Array of scores */ + int num_scores, /* number of scores in array */ + double *c_half_interval, /* Confidence half-int */ + double *smean, /* Standard mean */ + double *sdev) /* Sample stand dev */ +{ +/* Here is a list of the student-t distribution up to 29 degrees of + freedom. The value at 0 is bogus, as there is no value for zero + degrees of freedom. 
*/ +double student_t[30]={0.0 , 12.706 , 4.303 , 3.182 , 2.776 , 2.571 , + 2.447 , 2.365 , 2.306 , 2.262 , 2.228 , + 2.201 , 2.179 , 2.160 , 2.145 , 2.131 , + 2.120 , 2.110 , 2.101 , 2.093 , 2.086 , + 2.080 , 2.074 , 2.069 , 2.064 , 2.060 , + 2.056 , 2.052 , 2.048 , 2.045 }; +int i; /* Index */ +if ((num_scores<2) || (num_scores>30)) { + output_string("Internal error: calc_confidence called with an illegal number of scores\n"); + return(-1); +} +/* +** First calculate mean. +*/ +*smean=(double)0.0; +for(i=0;i<num_scores;i++){ + *smean+=scores[i]; +} +*smean/=(double)num_scores; + +/* Get standard deviation */ +*sdev=(double)0.0; +for(i=0;i<num_scores;i++) { + *sdev+=(scores[i]-(*smean))*(scores[i]-(*smean)); +} +*sdev/=(double)(num_scores-1); +*sdev=sqrt(*sdev); + +/* Now calculate the length of the confidence half-interval. For a +** confidence level of 95% our confidence coefficient gives us a +** multiplying factor of the upper .025 quartile of a t distribution +** with num_scores-1 degrees of freedom, and dividing by sqrt(number of +** observations). See any introduction to statistics. +*/ +*c_half_interval=student_t[num_scores-1] * (*sdev) / sqrt((double)num_scores); +return(0); +} + +/************* +** getscore ** +************** +** Return the score for a particular benchmark. +*/ +static double getscore(int fid) +{ + +/* +** Fid tells us the function. This is really a matter of +** doing the proper coercion. 
+*/ +switch(fid) +{ + case TF_NUMSORT: + return(global_numsortstruct.sortspersec); + case TF_SSORT: + return(global_strsortstruct.sortspersec); + case TF_BITOP: + return(global_bitopstruct.bitopspersec); + case TF_FPEMU: + return(global_emfloatstruct.emflops); + case TF_FFPU: + return(global_fourierstruct.fflops); + case TF_ASSIGN: + return(global_assignstruct.iterspersec); + case TF_IDEA: + return(global_ideastruct.iterspersec); + case TF_HUFF: + return(global_huffstruct.iterspersec); + case TF_NNET: + return(global_nnetstruct.iterspersec); + case TF_LU: + return(global_lustruct.iterspersec); +} +return((double)0.0); +} + +/****************** +** output_string ** +******************* +** Displays a string on the screen. Also, if the flag +** write_to_file is set, outputs the string to the output file. +** Note, this routine presumes that you've included a carriage +** return at the end of the buffer. +*/ +static void output_string(char *buffer) +{ + +printf("%s",buffer); +if(write_to_file!=0) + fprintf(global_ofile,"%s",buffer); +return; +} + +/*************** +** show_stats ** +**************** +** This routine displays statistics for a particular benchmark. +** The benchmark is identified by its id. 
+*/ +static void show_stats (int bid) +{ +char buffer[80]; /* Display buffer */ + +switch(bid) +{ + case TF_NUMSORT: /* Numeric sort */ + sprintf(buffer," Number of arrays: %d\n", + global_numsortstruct.numarrays); + output_string(buffer); + sprintf(buffer," Array size: %ld\n", + global_numsortstruct.arraysize); + output_string(buffer); + break; + + case TF_SSORT: /* String sort */ + sprintf(buffer," Number of arrays: %d\n", + global_strsortstruct.numarrays); + output_string(buffer); + sprintf(buffer," Array size: %ld\n", + global_strsortstruct.arraysize); + output_string(buffer); + break; + + case TF_BITOP: /* Bitmap operation */ + sprintf(buffer," Operations array size: %ld\n", + global_bitopstruct.bitoparraysize); + output_string(buffer); + sprintf(buffer," Bitfield array size: %ld\n", + global_bitopstruct.bitfieldarraysize); + output_string(buffer); + break; + + case TF_FPEMU: /* Floating-point emulation */ + sprintf(buffer," Number of loops: %lu\n", + global_emfloatstruct.loops); + output_string(buffer); + sprintf(buffer," Array size: %lu\n", + global_emfloatstruct.arraysize); + output_string(buffer); + break; + + case TF_FFPU: /* Fourier test */ + sprintf(buffer," Number of coefficients: %lu\n", + global_fourierstruct.arraysize); + output_string(buffer); + break; + + case TF_ASSIGN: + sprintf(buffer," Number of arrays: %lu\n", + global_assignstruct.numarrays); + output_string(buffer); + break; + + case TF_IDEA: + sprintf(buffer," Array size: %lu\n", + global_ideastruct.arraysize); + output_string(buffer); + sprintf(buffer," Number of loops: %lu\n", + global_ideastruct.loops); + output_string(buffer); + break; + + case TF_HUFF: + sprintf(buffer," Array size: %lu\n", + global_huffstruct.arraysize); + output_string(buffer); + sprintf(buffer," Number of loops: %lu\n", + global_huffstruct.loops); + output_string(buffer); + break; + + case TF_NNET: + sprintf(buffer," Number of loops: %lu\n", + global_nnetstruct.loops); + output_string(buffer); + break; + + case 
TF_LU: + sprintf(buffer," Number of arrays: %lu\n", + global_lustruct.numarrays); + output_string(buffer); + break; +} +return; +} + +/* +** Following code added for Mac stuff, so that we can emulate command +** lines. +*/ + +#ifdef MAC + +/***************** +** UCommandLine ** +****************** +** Reads in a command line, and sets up argc and argv appropriately. +** Note that this routine uses gets() to read in the line. This means +** you'd better not enter more than 128 characters on a command line, or +** things will overflow, and oh boy... +*/ +void UCommandLine(void) +{ +printf("Enter command line\n:"); +gets((char *)Uargbuff); +UParse(); +return; +} + +/*********** +** UParse ** +************ +** Parse the pseudo command-line. This code appeared as part of the +** Small-C library in Dr. Dobb's ToolBook of C. +** It expects the following globals: +** argc = arg count +** argv = Pointer to array of char pointers +** Uargbuff = Character array that holds the arguments. Should be 129 bytes long. +** Udummy1 = This is a 2-byte buffer that holds a "*", and acts as the first +** argument in the argument list. This maintains compatibility with other +** C's, though it does not provide access to the executable filename. +** This routine allows for up to 20 individual command-line arguments. +** Also note that this routine does NOT allow for redirection. +*/ +void UParse(void) +{ +unsigned char *ptr; + +argc=0; /* Start arg count */ +Udummy[0]='*'; /* Set dummy first argument */ +Udummy[1]='\0'; +argv[argc++]=(char *)Udummy; + +ptr=Uargbuff; /* Start pointer */ +while(*ptr) +{ + if(isspace(*ptr)) + { ++ptr; + continue; + } + if(argc<20) argv[argc++]=(char *)ptr; + ptr=UField(ptr); +} +return; +} +/*********** +** UField ** +************ +** Isolate the next command-line field. 
+*/ +unsigned char *UField(unsigned char *ptr) +{ +while(*ptr) +{ if(isspace(*ptr)) + { *ptr=(unsigned char)NULL; + return(++ptr); + } + ++ptr; +} +return(ptr); +} +#endif diff --git a/nbench0.h b/nbench0.h new file mode 100644 index 0000000..cef0928 --- /dev/null +++ b/nbench0.h @@ -0,0 +1,356 @@ +/* +** nbench0.h +** Header for nbench0.c +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** 10/95 - Added memory array & alignment -- RG +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/* +** Following should be modified accordingly per each +** compilation. +*/ +char *sysname="You can enter your system description in nbench0.h"; +char *compilername="It then will be printed here after you recompile"; +char *compilerversion="Have a nice day"; + +/* Parameter flags. Must coincide with parameter names array +** which appears below. 
*/ +#define PF_GMTICKS 0 /* GLOBALMINTICKS */ +#define PF_MINSECONDS 1 /* MINSECONDS */ +#define PF_ALLSTATS 2 /* ALLSTATS */ +#define PF_OUTFILE 3 /* OUTFILE */ +#define PF_CUSTOMRUN 4 /* CUSTOMRUN */ +#define PF_DONUM 5 /* DONUMSORT */ +#define PF_NUMNUMA 6 /* NUMNUMARRAYS */ +#define PF_NUMASIZE 7 /* NUMARRAYSIZE */ +#define PF_NUMMINS 8 /* NUMMINSECONDS */ +#define PF_DOSTR 9 /* DOSTRINGSORT */ +#define PF_STRASIZE 10 /* STRARRAYSIZE */ +#define PF_NUMSTRA 11 /* NUMSTRARRAYS */ +#define PF_STRMINS 12 /* STRMINSECONDS */ +#define PF_DOBITF 13 /* DOBITFIELD */ +#define PF_NUMBITOPS 14 /* NUMBITOPS */ +#define PF_BITFSIZE 15 /* BITFIELDSIZE */ +#define PF_BITMINS 16 /* BITMINSECONDS */ +#define PF_DOEMF 17 /* DOEMF */ +#define PF_EMFASIZE 18 /* EMFARRAYSIZE */ +#define PF_EMFLOOPS 19 /* EMFLOOPS */ +#define PF_EMFMINS 20 /* EMFMINSECOND */ +#define PF_DOFOUR 21 /* DOFOUR */ +#define PF_FOURASIZE 22 /* FOURASIZE */ +#define PF_FOURMINS 23 /* FOURMINSECONDS */ +#define PF_DOASSIGN 24 /* DOASSIGN */ +#define PF_AARRAYS 25 /* ASSIGNARRAYS */ +#define PF_ASSIGNMINS 26 /* ASSIGNMINSECONDS */ +#define PF_DOIDEA 27 /* DOIDEA */ +#define PF_IDEAASIZE 28 /* IDEAARRAYSIZE */ +#define PF_IDEALOOPS 29 /* IDEALOOPS */ +#define PF_IDEAMINS 30 /* IDEAMINSECONDS */ +#define PF_DOHUFF 31 /* DOHUFF */ +#define PF_HUFFASIZE 32 /* HUFFARRAYSIZE */ +#define PF_HUFFLOOPS 33 /* HUFFLOOPS */ +#define PF_HUFFMINS 34 /* HUFFMINSECONDS */ +#define PF_DONNET 35 /* DONNET */ +#define PF_NNETLOOPS 36 /* NNETLOOPS */ +#define PF_NNETMINS 37 /* NNETMINSECONDS */ +#define PF_DOLU 38 /* DOLU */ +#define PF_LUNARRAYS 39 /* LUNUMARRAYS */ +#define PF_LUMINS 40 /* LUMINSECONDS */ +#define PF_ALIGN 41 /* ALIGN */ + +#define MAXPARAM 41 + +/* Tests-to-do flags...must coincide with test. 
*/ +#define TF_NUMSORT 0 +#define TF_SSORT 1 +#define TF_BITOP 2 +#define TF_FPEMU 3 +#define TF_FFPU 4 +#define TF_ASSIGN 5 +#define TF_IDEA 6 +#define TF_HUFF 7 +#define TF_NNET 8 +#define TF_LU 9 + +#define NUMTESTS 10 + +/* +** GLOBALS +*/ + +#define BUF_SIZ 1024 + +/* +** Test names +*/ +char *ftestnames[] = { + "NUMERIC SORT ", + "STRING SORT ", + "BITFIELD ", + "FP EMULATION ", + "FOURIER ", + "ASSIGNMENT ", + "IDEA ", + "HUFFMAN ", + "NEURAL NET ", + "LU DECOMPOSITION" }; + +/* +** Indexes -- Baseline is DELL Pentium XP90 +** 11/28/94 +*/ +double bindex[] = { + 38.993, /* Numeric sort */ + 2.238, /* String sort */ + 5829704, /* Bitfield */ + 2.084, /* FP Emulation */ + 879.278, /* Fourier */ + .2628, /* Assignment */ + 65.382, /* IDEA */ + 36.062, /* Huffman */ + .6225, /* Neural Net */ + 19.3031 }; /* LU Decomposition */ + +/* +** Indices -- Baseline is a AMD K6-233, 32MB RAM (60ns SDRAM),512k L2 cache, +** Linux kernel 2.0.32, libc-5.4.38, gcc-2.7.2.3) +** Nov/30/97 +*/ +double lx_bindex[] = { + 118.73, /* Numeric sort */ + 14.459, /* String sort */ + 27910000, /* Bitfield */ + 9.0314, /* FP Emulation */ + 1565.5, /* Fourier */ + 1.0132, /* Assignment */ + 220.21, /* IDEA */ + 112.93, /* Huffman */ + 1.4799, /* Neural Net */ + 26.732}; /* LU Decomposition */ + +/* Parameter names */ +char *paramnames[]= { + "GLOBALMINTICKS", + "MINSECONDS", + "ALLSTATS", + "OUTFILE", + "CUSTOMRUN", + "DONUMSORT", + "NUMNUMARRAYS", + "NUMARRAYSIZE", + "NUMMINSECONDS", + "DOSTRINGSORT", + "STRARRAYSIZE", + "NUMSTRARRAYS", + "STRMINSECONDS", + "DOBITFIELD", + "NUMBITOPS", + "BITFIELDSIZE", + "BITMINSECONDS", + "DOEMF", + "EMFARRAYSIZE", + "EMFLOOPS", + "EMFMINSECONDS", + "DOFOUR", + "FOURSIZE", + "FOURMINSECONDS", + "DOASSIGN", + "ASSIGNARRAYS", + "ASSIGNMINSECONDS", + "DOIDEA", + "IDEARRAYSIZE", + "IDEALOOPS", + "IDEAMINSECONDS", + "DOHUFF", + "HUFARRAYSIZE", + "HUFFLOOPS", + "HUFFMINSECONDS", + "DONNET", + "NNETLOOPS", + "NNETMINSECONDS", + "DOLU", + "LUNUMARRAYS", + 
"LUMINSECONDS", + "ALIGN" }; + +/* +** Following array is a collection of flags indicating which +** tests to perform. +*/ +int tests_to_do[NUMTESTS]; + +/* +** Buffer for holding output text. +*/ +char buffer[BUF_SIZ]; + +/* +** Global parameters. +*/ +ulong global_min_ticks; /* Minimum ticks */ +ulong global_min_seconds; /* Minimum seconds tests run */ +int global_allstats; /* Statistics dump flag */ +char global_ofile_name[BUF_SIZ];/* Output file name */ +FILE *global_ofile; /* Output file */ +int global_custrun; /* Custom run flag */ +int write_to_file; /* Write output to file */ +int global_align; /* Memory alignment */ + +/* +** Following global is the memory array. This is used to store +** original and aligned (modified) memory addresses. +*/ +ulong mem_array[2][MEM_ARRAY_SIZE]; +int mem_array_ents; /* # of active entries */ + +/* +** Following are global structures, one built for +** each of the tests. +*/ +SortStruct global_numsortstruct; /* For numeric sort */ +SortStruct global_strsortstruct; /* For string sort */ +BitOpStruct global_bitopstruct; /* For bitfield operations */ +EmFloatStruct global_emfloatstruct; /* For emul. float. point */ +FourierStruct global_fourierstruct; /* For fourier test */ +AssignStruct global_assignstruct; /* For assignment algorithm */ +IDEAStruct global_ideastruct; /* For IDEA encryption */ +HuffStruct global_huffstruct; /* For Huffman compression */ +NNetStruct global_nnetstruct; /* For Neural Net */ +LUStruct global_lustruct; /* For LU decomposition */ + +/* +** The following array of function struct pointers lets +** us very rapidly map a function to its controlling +** data structure. NOTE: These must match the "TF_xxx" +** constants above. 
+*/ +void *global_fstruct[] = +{ (void *)&global_numsortstruct, + (void *)&global_strsortstruct, + (void *)&global_bitopstruct, + (void *)&global_emfloatstruct, + (void *)&global_fourierstruct, + (void *)&global_assignstruct, + (void *)&global_ideastruct, + (void *)&global_huffstruct, + (void *)&global_nnetstruct, + (void *)&global_lustruct }; + +/* +** Following globals added to support command line emulation on +** the Macintosh....which doesn't have command lines. +*/ +#ifdef MAC +int argc; /* Argument count */ +char *argv[20]; /* Argument vectors */ + +unsigned char Uargbuff[129]; /* Buffer holding arguments string */ +unsigned char Udummy[2]; /* Dummy buffer for first arg */ + +#endif + +#ifdef MACTIMEMGR +#include <Types.h> +#include <Timer.h> +/* +** Timer globals for Mac +*/ +struct TMTask myTMTask; +long MacHSTdelay,MacHSTohead; + +#endif + +/* +** Following globals used by Win 31 timing routines. +** NOTE: This requires the includes of the w31timer.asm +** file in your project!! 
+*/ +#ifdef WIN31TIMER +#include <windows.h> +#include <toolhelp.h> +extern TIMERINFO win31tinfo; +extern HANDLE hThlp; +extern FARPROC lpfn; +#endif + +/* +** PROTOTYPES +*/ +static int parse_arg(char *argptr); +static void display_help(char *progname); +static void read_comfile(FILE *cfile); +static int getflag(char *cptr); +static void strtoupper(char *s); +static void set_request_secs(void); +static int bench_with_confidence(int fid, + double *mean, double *stdev, ulong *numtries); +/* +static int seek_confidence(double scores[5], + double *newscore, double *c_half_interval, + double *smean,double *sdev); +*/ +static int calc_confidence(double scores[], + int num_scores, + double *c_half_interval,double *smean, + double *sdev); +static double getscore(int fid); +static void output_string(char *buffer); +static void show_stats(int bid); + +#ifdef MAC +void UCommandLine(void); +void UParse(void); +unsigned char *UField(unsigned char *ptr); +#endif + +/* +** EXTERNAL PROTOTYPES +*/ +extern void DoNumSort(void); /* From NBENCH1 */ +extern void DoStringSort(void); +extern void DoBitops(void); +extern void DoEmFloat(void); +extern void DoFourier(void); +extern void DoAssign(void); +extern void DoIDEA(void); +extern void DoHuffman(void); +extern void DoNNET(void); +extern void DoLU(void); + +extern void ErrorExit(void); /* From SYSSPEC */ + +/* +** Array of pointers to the benchmark functions. 
+*/ +void (*funcpointer[])(void) = +{ DoNumSort, + DoStringSort, + DoBitops, + DoEmFloat, + DoFourier, + DoAssign, + DoIDEA, + DoHuffman, + DoNNET, + DoLU }; + + diff --git a/nbench1.c b/nbench1.c new file mode 100644 index 0000000..05c35df --- /dev/null +++ b/nbench1.c @@ -0,0 +1,4445 @@ + +/* +** nbench1.c +*/ + +/******************************** +** BYTEmark (tm) ** +** BYTE NATIVE MODE BENCHMARKS ** +** VERSION 2 ** +** ** +** Included in this source ** +** file: ** +** Numeric Heapsort ** +** String Heapsort ** +** Bitfield test ** +** Floating point emulation ** +** Fourier coefficients ** +** Assignment algorithm ** +** IDEA Encyption ** +** Huffman compression ** +** Back prop. neural net ** +** LU Decomposition ** +** (linear equations) ** +** ---------- ** +** Rick Grehan, BYTE Magazine ** +********************************* +** +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** 10/95 - Removed allocation that was taking place inside +** the LU Decomposition benchmark. Though it didn't seem to +** make a difference on systems we ran it on, it nonetheless +** removes an operating system dependency that probably should +** not have been there. +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. 
+** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/* +** INCLUDES +*/ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <math.h> +#include "nmglobal.h" +#include "nbench1.h" +#include "wordcat.h" + +#ifdef DEBUG +static int numsort_status=0; +static int stringsort_status=0; +#endif + +/********************* +** NUMERIC HEAPSORT ** +********************** +** This test implements a heapsort algorithm, performed on an +** array of longs. +*/ + +/************** +** DoNumSort ** +*************** +** This routine performs the CPU numeric sort test. +** NOTE: Last version incorrectly stated that the routine +** returned result in # of longword sorted per second. +** Not so; the routine returns # of iterations per sec. +*/ + +void DoNumSort(void) +{ +SortStruct *numsortstruct; /* Local pointer to global struct */ +farlong *arraybase; /* Base pointers of array */ +long accumtime; /* Accumulated time */ +double iterations; /* Iteration counter */ +char *errorcontext; /* Error context string pointer */ +int systemerror; /* For holding error codes */ + +/* +** Link to global structure +*/ +numsortstruct=&global_numsortstruct; + +/* +** Set the error context string. +*/ +errorcontext="CPU:Numeric Sort"; + +/* +** See if we need to do self adjustment code. +*/ +if(numsortstruct->adjust==0) +{ + /* + ** Self-adjustment code. The system begins by sorting 1 + ** array. If it does that in no time, then two arrays + ** are built and sorted. This process continues until + ** enough arrays are built to handle the tolerance. 
+ */ + numsortstruct->numarrays=1; + while(1) + { + /* + ** Allocate space for arrays + */ + arraybase=(farlong *)AllocateMemory(sizeof(long) * + numsortstruct->numarrays * numsortstruct->arraysize, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } + + /* + ** Do an iteration of the numeric sort. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then allocate for more arrays and + ** try again. + */ + if(DoNumSortIteration(arraybase, + numsortstruct->arraysize, + numsortstruct->numarrays)>global_min_ticks) + break; /* We're ok...exit */ + + FreeMemory((farvoid *)arraybase,&systemerror); + if(numsortstruct->numarrays++>NUMNUMARRAYS) + { printf("CPU:NSORT -- NUMNUMARRAYS hit.\n"); + ErrorExit(); + } + } +} +else +{ /* + ** Allocate space for arrays + */ + arraybase=(farlong *)AllocateMemory(sizeof(long) * + numsortstruct->numarrays * numsortstruct->arraysize, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } + +} +/* +** All's well if we get here. Repeatedly perform sorts until the +** accumulated elapsed time is greater than # of seconds requested. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoNumSortIteration(arraybase, + numsortstruct->arraysize, + numsortstruct->numarrays); + iterations+=(double)1.0; +} while(TicksToSecs(accumtime)<numsortstruct->request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. 
+*/ +FreeMemory((farvoid *)arraybase,&systemerror); + +numsortstruct->sortspersec=iterations * + (double)numsortstruct->numarrays / TicksToFracSecs(accumtime); + +if(numsortstruct->adjust==0) + numsortstruct->adjust=1; + +#ifdef DEBUG +if (numsort_status==0) printf("Numeric sort: OK\n"); +numsort_status=0; +#endif +return; +} + +/*********************** +** DoNumSortIteration ** +************************ +** This routine executes one iteration of the numeric +** sort benchmark. It returns the number of ticks +** elapsed for the iteration. +*/ +static ulong DoNumSortIteration(farlong *arraybase, + ulong arraysize, + uint numarrays) +{ +ulong elapsed; /* Elapsed ticks */ +ulong i; +/* +** Load up the array with random numbers +*/ +LoadNumArrayWithRand(arraybase,arraysize,numarrays); + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Execute a heap of heapsorts +*/ +for(i=0;i<numarrays;i++) + NumHeapSort(arraybase+i*arraysize,0L,arraysize-1L); + +/* +** Get elapsed time +*/ +elapsed=StopStopwatch(elapsed); +#ifdef DEBUG +{ + for(i=0;i<arraysize-1;i++) + { /* + ** Compare to check for proper + ** sort. + */ + if(arraybase[i+1]<arraybase[i]) + { printf("Sort Error\n"); + numsort_status=1; + break; + } + } +} +#endif + +return(elapsed); +} + +/************************* +** LoadNumArrayWithRand ** +************************** +** Load up an array with random longs. +*/ +static void LoadNumArrayWithRand(farlong *array, /* Pointer to arrays */ + ulong arraysize, + uint numarrays) /* # of elements in array */ +{ +long i; /* Used for index */ +farlong *darray; /* Destination array pointer */ +/* +** Initialize the random number generator +*/ +/* randnum(13L); */ +randnum((int32)13); + +/* +** Load up first array with randoms +*/ +for(i=0L;i<arraysize;i++) + /* array[i]=randnum(0L); */ + array[i]=randnum((int32)0); + +/* +** Now, if there's more than one array to load, copy the +** first into each of the others. 
+*/ +darray=array; +while(--numarrays) +{ darray+=arraysize; + for(i=0L;i<arraysize;i++) + darray[i]=array[i]; +} + +return; +} + +/**************** +** NumHeapSort ** +***************** +** Pass this routine a pointer to an array of long +** integers. Also pass in minimum and maximum offsets. +** This routine performs a heap sort on that array. +*/ +static void NumHeapSort(farlong *array, + ulong bottom, /* Lower bound */ + ulong top) /* Upper bound */ +{ +ulong temp; /* Used to exchange elements */ +ulong i; /* Loop index */ + +/* +** First, build a heap in the array +*/ +for(i=(top/2L); i>0; --i) + NumSift(array,i,top); + +/* +** Repeatedly extract maximum from heap and place it at the +** end of the array. When we get done, we'll have a sorted +** array. +*/ +for(i=top; i>0; --i) +{ NumSift(array,bottom,i); + temp=*array; /* Perform exchange */ + *array=*(array+i); + *(array+i)=temp; +} +return; +} + +/************ +** NumSift ** +************* +** Peforms the sift operation on a numeric array, +** constructing a heap in the array. +*/ +static void NumSift(farlong *array, /* Array of numbers */ + ulong i, /* Minimum of array */ + ulong j) /* Maximum of array */ +{ +unsigned long k; +long temp; /* Used for exchange */ + +while((i+i)<=j) +{ + k=i+i; + if(k<j) + if(array[k]<array[k+1L]) + ++k; + if(array[i]<array[k]) + { + temp=array[k]; + array[k]=array[i]; + array[i]=temp; + i=k; + } + else + i=j+1; +} +return; +} + +/******************** +** STRING HEAPSORT ** +********************/ + +/***************** +** DoStringSort ** +****************** +** This routine performs the CPU string sort test. 
+** Arguments: +** requested_secs = # of seconds to execute test +** stringspersec = # of strings per second sorted (RETURNED) +*/ +void DoStringSort(void) +{ + +SortStruct *strsortstruct; /* Local for sort structure */ +faruchar *arraybase; /* Base pointer of char array */ +long accumtime; /* Accumulated time */ +double iterations; /* # of iterations */ +char *errorcontext; /* Error context string pointer */ +int systemerror; /* For holding error code */ + +/* +** Link to global structure +*/ +strsortstruct=&global_strsortstruct; + +/* +** Set the error context +*/ +errorcontext="CPU:String Sort"; + +/* +** See if we have to perform self-adjustment code +*/ +if(strsortstruct->adjust==0) +{ + /* + ** Initialize the number of arrays. + */ + strsortstruct->numarrays=1; + while(1) + { + /* + ** Allocate space for array. We'll add an extra 100 + ** bytes to protect memory as strings move around + ** (this can happen during string adjustment) + */ + arraybase=(faruchar *)AllocateMemory((strsortstruct->arraysize+100L) * + (long)strsortstruct->numarrays,&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + /* + ** Do an iteration of the string sort. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then de-allocate the array, reallocate a + ** an additional array, and try again. + */ + if(DoStringSortIteration(arraybase, + strsortstruct->numarrays, + strsortstruct->arraysize)>global_min_ticks) + break; /* We're ok...exit */ + + FreeMemory((farvoid *)arraybase,&systemerror); + strsortstruct->numarrays+=1; + } +} +else +{ + /* + ** We don't have to perform self adjustment code. + ** Simply allocate the space for the array. + */ + arraybase=(faruchar *)AllocateMemory((strsortstruct->arraysize+100L) * + (long)strsortstruct->numarrays,&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } +} +/* +** All's well if we get here. 
Repeatedly perform sorts until the +** accumulated elapsed time is greater than # of seconds requested. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoStringSortIteration(arraybase, + strsortstruct->numarrays, + strsortstruct->arraysize); + iterations+=(double)strsortstruct->numarrays; +} while(TicksToSecs(accumtime)<strsortstruct->request_secs); + +/* +** Clean up, calculate results, and go home. +** Set flag to show we don't need to rerun adjustment code. +*/ +FreeMemory((farvoid *)arraybase,&systemerror); +strsortstruct->sortspersec=iterations / (double)TicksToFracSecs(accumtime); +if(strsortstruct->adjust==0) + strsortstruct->adjust=1; +#ifdef DEBUG +if (stringsort_status==0) printf("String sort: OK\n"); +stringsort_status=0; +#endif +return; +} + +/************************** +** DoStringSortIteration ** +*************************** +** This routine executes one iteration of the string +** sort benchmark. It returns the number of ticks +** Note that this routine also builds the offset pointer +** array. +*/ +static ulong DoStringSortIteration(faruchar *arraybase, + uint numarrays,ulong arraysize) +{ +farulong *optrarray; /* Offset pointer array */ +unsigned long elapsed; /* Elapsed ticks */ +unsigned long nstrings; /* # of strings in array */ +int syserror; /* System error code */ +unsigned int i; /* Index */ +farulong *tempobase; /* Temporary offset pointer base */ +faruchar *tempsbase; /* Temporary string base pointer */ + +/* +** Load up the array(s) with random numbers +*/ +optrarray=LoadStringArray(arraybase,numarrays,&nstrings,arraysize); + +/* +** Set temp base pointers...they will be modified as the +** benchmark proceeds. 
+*/ +tempobase=optrarray; +tempsbase=arraybase; + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Execute heapsorts +*/ +for(i=0;i<numarrays;i++) +{ StrHeapSort(tempobase,tempsbase,nstrings,0L,nstrings-1); + tempobase+=nstrings; /* Advance base pointers */ + tempsbase+=arraysize+100; +} + +/* +** Record elapsed time +*/ +elapsed=StopStopwatch(elapsed); + +#ifdef DEBUG +{ + unsigned long i; + for(i=0;i<nstrings-1;i++) + { /* + ** Compare strings to check for proper + ** sort. + */ + if(str_is_less(optrarray,arraybase,nstrings,i+1,i)) + { printf("Sort Error\n"); + stringsort_status=1; + break; + } + } +} +#endif + +/* +** Release the offset pointer array built by +** LoadStringArray() +*/ +FreeMemory((farvoid *)optrarray,&syserror); + +/* +** Return elapsed ticks. +*/ +return(elapsed); +} + +/******************** +** LoadStringArray ** +********************* +** Initialize the string array with random strings of +** varying sizes. +** Returns the pointer to the offset pointer array. +** Note that since we're creating a number of arrays, this +** routine builds one array, then copies it into the others. +*/ +static farulong *LoadStringArray(faruchar *strarray, /* String array */ + uint numarrays, /* # of arrays */ + ulong *nstrings, /* # of strings */ + ulong arraysize) /* Size of array */ +{ +faruchar *tempsbase; /* Temporary string base pointer */ +farulong *optrarray; /* Local for pointer */ +farulong *tempobase; /* Temporary offset pointer base pointer */ +unsigned long curroffset; /* Current offset */ +int fullflag; /* Indicates full array */ +unsigned char stringlength; /* Length of string */ +unsigned char i; /* Index */ +unsigned long j; /* Another index */ +unsigned int k; /* Yet another index */ +unsigned int l; /* Ans still one more index */ +int systemerror; /* For holding error code */ + +/* +** Initialize random number generator. +*/ +/* randnum(13L); */ +randnum((int32)13); + +/* +** Start with no strings. 
Initialize our current offset pointer +** to 0. +*/ +*nstrings=0L; +curroffset=0L; +fullflag=0; + +do +{ + /* + ** Allocate a string with a random length no + ** shorter than 4 bytes and no longer than + ** 80 bytes. Note we have to also make sure + ** there's room in the array. + */ + /* stringlength=(unsigned char)((1+abs_randwc(76L)) & 0xFFL);*/ + stringlength=(unsigned char)((1+abs_randwc((int32)76)) & 0xFFL); + if((unsigned long)stringlength+curroffset+1L>=arraysize) + { stringlength=(unsigned char)((arraysize-curroffset-1L) & + 0xFF); + fullflag=1; /* Indicates a full */ + } + + /* + ** Store length at curroffset and advance current offset. + */ + *(strarray+curroffset)=stringlength; + curroffset++; + + /* + ** Fill up the rest of the string with random bytes. + */ + for(i=0;i<stringlength;i++) + { *(strarray+curroffset)= + /* (unsigned char)(abs_randwc((long)0xFE)); */ + (unsigned char)(abs_randwc((int32)0xFE)); + curroffset++; + } + + /* + ** Increment the # of strings counter. + */ + *nstrings+=1L; + +} while(fullflag==0); + +/* +** We now have initialized a single full array. If there +** is more than one array, copy the original into the +** others. +*/ +k=1; +tempsbase=strarray; +while(k<numarrays) +{ tempsbase+=arraysize+100; /* Set base */ + for(l=0;l<arraysize;l++) + tempsbase[l]=strarray[l]; + k++; +} + +/* +** Now the array is full, allocate enough space for an +** offset pointer array. +*/ +optrarray=(farulong *)AllocateMemory(*nstrings * sizeof(unsigned long) * + numarrays, + &systemerror); +if(systemerror) +{ ReportError("CPU:Stringsort",systemerror); + FreeMemory((void *)strarray,&systemerror); + ErrorExit(); +} + +/* +** Go through the newly-built string array, building +** offsets and putting them into the offset pointer +** array. 
+*/ +curroffset=0; +for(j=0;j<*nstrings;j++) +{ *(optrarray+j)=curroffset; + curroffset+=(unsigned long)(*(strarray+curroffset))+1L; +} + +/* +** As above, we've made one copy of the offset pointers, +** so duplicate this array in the remaining ones. +*/ +k=1; +tempobase=optrarray; +while(k<numarrays) +{ tempobase+=*nstrings; + for(l=0;l<*nstrings;l++) + tempobase[l]=optrarray[l]; + k++; +} + +/* +** All done...go home. Pass local pointer back. +*/ +return(optrarray); +} + +/************** +** stradjust ** +*************** +** Used by the string heap sort. Call this routine to adjust the +** string at offset i to length l. The members of the string array +** are moved accordingly and the length of the string at offset i +** is set to l. +*/ +static void stradjust(farulong *optrarray, /* Offset pointer array */ + faruchar *strarray, /* String array */ + ulong nstrings, /* # of strings */ + ulong i, /* Offset to adjust */ + uchar l) /* New length */ +{ +unsigned long nbytes; /* # of bytes to move */ +unsigned long j; /* Index */ +int direction; /* Direction indicator */ +unsigned char adjamount; /* Adjustment amount */ + +/* +** If new length is less than old length, the direction is +** down. If new length is greater than old length, the +** direction is up. +*/ +direction=(int)l - (int)*(strarray+*(optrarray+i)); +adjamount=(unsigned char)abs(direction); + +/* +** See if the adjustment is being made to the last +** string in the string array. If so, we don't have to +** do anything more than adjust the length field. +*/ +if(i==(nstrings-1L)) +{ *(strarray+*(optrarray+i))=l; + return; +} + +/* +** Calculate the total # of bytes in string array from +** location i+1 to end of array. Whether we're moving "up" or +** down, this is how many bytes we'll have to move. +*/ +nbytes=*(optrarray+nstrings-1L) + + (unsigned long)*(strarray+*(optrarray+nstrings-1L)) + 1L - + *(optrarray+i+1L); + +/* +** Calculate the source and the destination. Source is +** string position i+1. 
Destination is string position i+l +** (i+"ell"...don't confuse 1 and l). +** Hand this straight to memmove and let it handle the +** "overlap" problem. +*/ +MoveMemory((farvoid *)(strarray+*(optrarray+i)+l+1), + (farvoid *)(strarray+*(optrarray+i+1)), + (unsigned long)nbytes); + +/* +** We have to adjust the offset pointer array. +** This covers string i+1 to numstrings-1. +*/ +for(j=i+1;j<nstrings;j++) + if(direction<0) + *(optrarray+j)=*(optrarray+j)-adjamount; + else + *(optrarray+j)=*(optrarray+j)+adjamount; + +/* +** Store the new length and go home. +*/ +*(strarray+*(optrarray+i))=l; +return; +} + +/**************** +** strheapsort ** +***************** +** Pass this routine a pointer to an array of unsigned char. +** The array is presumed to hold strings occupying at most +** 80 bytes (counts a byte count). +** This routine also needs a pointer to an array of offsets +** which represent string locations in the array, and +** an unsigned long indicating the number of strings +** in the array. +*/ +static void StrHeapSort(farulong *optrarray, /* Offset pointers */ + faruchar *strarray, /* Strings array */ + ulong numstrings, /* # of strings in array */ + ulong bottom, /* Region to sort...bottom */ + ulong top) /* Region to sort...top */ +{ +unsigned char temp[80]; /* Used to exchange elements */ +unsigned char tlen; /* Temp to hold length */ +unsigned long i; /* Loop index */ + + +/* +** Build a heap in the array +*/ +for(i=(top/2L); i>0; --i) + strsift(optrarray,strarray,numstrings,i,top); + +/* +** Repeatedly extract maximum from heap and place it at the +** end of the array. When we get done, we'll have a sorted +** array. 
+*/ +for(i=top; i>0; --i) +{ + strsift(optrarray,strarray,numstrings,0,i); + + /* temp = string[0] */ + tlen=*strarray; + MoveMemory((farvoid *)&temp[0], /* Perform exchange */ + (farvoid *)strarray, + (unsigned long)(tlen+1)); + + + /* string[0]=string[i] */ + tlen=*(strarray+*(optrarray+i)); + stradjust(optrarray,strarray,numstrings,0,tlen); + MoveMemory((farvoid *)strarray, + (farvoid *)(strarray+*(optrarray+i)), + (unsigned long)(tlen+1)); + + /* string[i]=temp */ + tlen=temp[0]; + stradjust(optrarray,strarray,numstrings,i,tlen); + MoveMemory((farvoid *)(strarray+*(optrarray+i)), + (farvoid *)&temp[0], + (unsigned long)(tlen+1)); + +} +return; +} + +/**************** +** str_is_less ** +***************** +** Pass this function: +** 1) A pointer to an array of offset pointers +** 2) A pointer to a string array +** 3) The number of elements in the string array +** 4) Offsets to two strings (a & b) +** This function returns TRUE if string a is < string b. +*/ +static int str_is_less(farulong *optrarray, /* Offset pointers */ + faruchar *strarray, /* String array */ + ulong numstrings, /* # of strings */ + ulong a, ulong b) /* Offsets */ +{ +int slen; /* String length */ + +/* +** Determine which string has the minimum length. Use that +** to call strncmp(). If they match up to that point, the +** string with the longer length wins. +*/ +slen=(int)*(strarray+*(optrarray+a)); +if(slen > (int)*(strarray+*(optrarray+b))) + slen=(int)*(strarray+*(optrarray+b)); + +slen=strncmp((char *)(strarray+*(optrarray+a)), + (char *)(strarray+*(optrarray+b)),slen); + +if(slen==0) +{ + /* + ** They match. Return true if the length of a + ** is greater than the length of b. 
+ */ + if(*(strarray+*(optrarray+a)) > + *(strarray+*(optrarray+b))) + return(TRUE); + return(FALSE); +} + +if(slen<0) return(TRUE); /* a is strictly less than b */ + +return(FALSE); /* Only other possibility */ +} + +/************ +** strsift ** +************* +** Pass this function: +** 1) A pointer to an array of offset pointers +** 2) A pointer to a string array +** 3) The number of elements in the string array +** 4) Offset within which to sort. +** Sift the array within the bounds of those offsets (thus +** building a heap). +*/ +static void strsift(farulong *optrarray, /* Offset pointers */ + faruchar *strarray, /* String array */ + ulong numstrings, /* # of strings */ + ulong i, ulong j) /* Offsets */ +{ +unsigned long k; /* Temporaries */ +unsigned char temp[80]; +unsigned char tlen; /* For string lengths */ + + +while((i+i)<=j) +{ + k=i+i; + if(k<j) + if(str_is_less(optrarray,strarray,numstrings,k,k+1L)) + ++k; + if(str_is_less(optrarray,strarray,numstrings,i,k)) + { + /* temp=string[k] */ + tlen=*(strarray+*(optrarray+k)); + MoveMemory((farvoid *)&temp[0], + (farvoid *)(strarray+*(optrarray+k)), + (unsigned long)(tlen+1)); + + /* string[k]=string[i] */ + tlen=*(strarray+*(optrarray+i)); + stradjust(optrarray,strarray,numstrings,k,tlen); + MoveMemory((farvoid *)(strarray+*(optrarray+k)), + (farvoid *)(strarray+*(optrarray+i)), + (unsigned long)(tlen+1)); + + /* string[i]=temp */ + tlen=temp[0]; + stradjust(optrarray,strarray,numstrings,i,tlen); + MoveMemory((farvoid *)(strarray+*(optrarray+i)), + (farvoid *)&temp[0], + (unsigned long)(tlen+1)); + i=k; + } + else + i=j+1; +} +return; +} + +/************************ +** BITFIELD OPERATIONS ** +*************************/ + +/************* +** DoBitops ** +************** +** Perform the bit operations test portion of the CPU +** benchmark. Returns the iterations per second. 
+*/ +void DoBitops(void) +{ +BitOpStruct *locbitopstruct; /* Local bitop structure */ +farulong *bitarraybase; /* Base of bitmap array */ +farulong *bitoparraybase; /* Base of bitmap operations array */ +ulong nbitops; /* # of bitfield operations */ +ulong accumtime; /* Accumulated time in ticks */ +double iterations; /* # of iterations */ +char *errorcontext; /* Error context string */ +int systemerror; /* For holding error codes */ +int ticks; + +/* +** Link to global structure. +*/ +locbitopstruct=&global_bitopstruct; + +/* +** Set the error context. +*/ +errorcontext="CPU:Bitfields"; + +/* +** See if we need to run adjustment code. +*/ +if(locbitopstruct->adjust==0) +{ + bitarraybase=(farulong *)AllocateMemory(locbitopstruct->bitfieldarraysize * + sizeof(ulong),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + /* + ** Initialize bitfield operations array to [2,30] elements + */ + locbitopstruct->bitoparraysize=30L; + + while(1) + { + /* + ** Allocate space for operations array + */ + bitoparraybase=(farulong *)AllocateMemory(locbitopstruct->bitoparraysize*2L* + sizeof(ulong), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)bitarraybase,&systemerror); + ErrorExit(); + } + /* + ** Do an iteration of the bitmap test. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then de-allocate the array, reallocate a + ** larger version, and try again. 
+ */ + ticks=DoBitfieldIteration(bitarraybase, + bitoparraybase, + locbitopstruct->bitoparraysize, + &nbitops); +#ifdef DEBUG +#ifdef LINUX + if (locbitopstruct->bitoparraysize==30L){ + /* this is the first loop, write a debug file */ + FILE *file; + unsigned long *running_base; /* same as farulong */ + long counter; + file=fopen("debugbit.dat","w"); + running_base=bitarraybase; + for (counter=0;counter<(long)(locbitopstruct->bitfieldarraysize);counter++){ +#ifdef LONG64 + fprintf(file,"%08X",(unsigned int)(*running_base&0xFFFFFFFFL)); + fprintf(file,"%08X",(unsigned int)((*running_base>>32)&0xFFFFFFFFL)); + if ((counter+1)%4==0) fprintf(file,"\n"); +#else + fprintf(file,"%08lX",*running_base); + if ((counter+1)%8==0) fprintf(file,"\n"); +#endif + running_base=running_base+1; + } + fclose(file); + printf("\nWrote the file debugbit.dat, you may want to compare it to debugbit.good\n"); + } +#endif +#endif + + if (ticks>global_min_ticks) break; /* We're ok...exit */ + + FreeMemory((farvoid *)bitoparraybase,&systemerror); + locbitopstruct->bitoparraysize+=100L; + } +} +else +{ + /* + ** Don't need to do self adjustment, just allocate + ** the array space. + */ + bitarraybase=(farulong *)AllocateMemory(locbitopstruct->bitfieldarraysize * + sizeof(ulong),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + bitoparraybase=(farulong *)AllocateMemory(locbitopstruct->bitoparraysize*2L* + sizeof(ulong), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)bitarraybase,&systemerror); + ErrorExit(); + } +} + +/* +** All's well if we get here. Repeatedly perform bitops until the +** accumulated elapsed time is greater than # of seconds requested. 
+*/ +accumtime=0L; +iterations=(double)0.0; +do { + accumtime+=DoBitfieldIteration(bitarraybase, + bitoparraybase, + locbitopstruct->bitoparraysize,&nbitops); + iterations+=(double)nbitops; +} while(TicksToSecs(accumtime)<locbitopstruct->request_secs); + +/* +** Clean up, calculate results, and go home. +** Also, set adjustment flag to show that we don't have +** to do self adjusting in the future. +*/ +FreeMemory((farvoid *)bitarraybase,&systemerror); +FreeMemory((farvoid *)bitoparraybase,&systemerror); +locbitopstruct->bitopspersec=iterations /TicksToFracSecs(accumtime); +if(locbitopstruct->adjust==0) + locbitopstruct->adjust=1; + +return; +} + +/************************ +** DoBitfieldIteration ** +************************* +** Perform a single iteration of the bitfield benchmark. +** Return the # of ticks accumulated by the operation. +*/ +static ulong DoBitfieldIteration(farulong *bitarraybase, + farulong *bitoparraybase, + long bitoparraysize, + ulong *nbitops) +{ +long i; /* Index */ +ulong bitoffset; /* Offset into bitmap */ +ulong elapsed; /* Time to execute */ +/* +** Clear # bitops counter +*/ +*nbitops=0L; + +/* +** Construct a set of bitmap offsets and run lengths. +** The offset can be any random number from 0 to the +** size of the bitmap (in bits). The run length can +** be any random number from 1 to the number of bits +** between the offset and the end of the bitmap. +** Note that the bitmap has 8192 * 32 bits in it. +** (262,144 bits) +*/ +/* +** Reset random number generator so things repeat. +** Also reset the bit array we work on. +** added by Uwe F. 
Mayer +*/ +randnum((int32)13); +for (i=0;i<global_bitopstruct.bitfieldarraysize;i++) +{ +#ifdef LONG64 + *(bitarraybase+i)=(ulong)0x5555555555555555; +#else + *(bitarraybase+i)=(ulong)0x55555555; +#endif +} +randnum((int32)13); +/* end of addition of code */ + +for (i=0;i<bitoparraysize;i++) +{ + /* First item is offset */ + /* *(bitoparraybase+i+i)=bitoffset=abs_randwc(262140L); */ + *(bitoparraybase+i+i)=bitoffset=abs_randwc((int32)262140); + + /* Next item is run length */ + /* *nbitops+=*(bitoparraybase+i+i+1L)=abs_randwc(262140L-bitoffset);*/ + *nbitops+=*(bitoparraybase+i+i+1L)=abs_randwc((int32)262140-bitoffset); +} + +/* +** Array of offset and lengths built...do an iteration of +** the test. +** Start the stopwatch. +*/ +elapsed=StartStopwatch(); + +/* +** Loop through array off offset/run length pairs. +** Execute operation based on modulus of index. +*/ +for(i=0;i<bitoparraysize;i++) +{ + switch(i % 3) + { + + case 0: /* Set run of bits */ + ToggleBitRun(bitarraybase, + *(bitoparraybase+i+i), + *(bitoparraybase+i+i+1), + 1); + break; + + case 1: /* Clear run of bits */ + ToggleBitRun(bitarraybase, + *(bitoparraybase+i+i), + *(bitoparraybase+i+i+1), + 0); + break; + + case 2: /* Complement run of bits */ + FlipBitRun(bitarraybase, + *(bitoparraybase+i+i), + *(bitoparraybase+i+i+1)); + break; + } +} + +/* +** Return elapsed time +*/ +return(StopStopwatch(elapsed)); +} + + +/***************************** +** ToggleBitRun * +****************************** +** Set or clear a run of nbits starting at +** bit_addr in bitmap. 
+*/ +static void ToggleBitRun(farulong *bitmap, /* Bitmap */ + ulong bit_addr, /* Address of bits to set */ + ulong nbits, /* # of bits to set/clr */ + uint val) /* 1 or 0 */ +{ +unsigned long bindex; /* Index into array */ +unsigned long bitnumb; /* Bit number */ + +while(nbits--) +{ +#ifdef LONG64 + bindex=bit_addr>>6; /* Index is number /64 */ + bitnumb=bit_addr % 64; /* Bit number in word */ +#else + bindex=bit_addr>>5; /* Index is number /32 */ + bitnumb=bit_addr % 32; /* bit number in word */ +#endif + if(val) + bitmap[bindex]|=(1L<<bitnumb); + else + bitmap[bindex]&=~(1L<<bitnumb); + bit_addr++; +} +return; +} + +/*************** +** FlipBitRun ** +**************** +** Complements a run of bits. +*/ +static void FlipBitRun(farulong *bitmap, /* Bit map */ + ulong bit_addr, /* Bit address */ + ulong nbits) /* # of bits to flip */ +{ +unsigned long bindex; /* Index into array */ +unsigned long bitnumb; /* Bit number */ + +while(nbits--) +{ +#ifdef LONG64 + bindex=bit_addr>>6; /* Index is number /64 */ + bitnumb=bit_addr % 64; /* Bit number in longword */ +#else + bindex=bit_addr>>5; /* Index is number /32 */ + bitnumb=bit_addr % 32; /* Bit number in longword */ +#endif + bitmap[bindex]^=(1L<<bitnumb); + bit_addr++; +} + +return; +} + +/***************************** +** FLOATING-POINT EMULATION ** +*****************************/ + +/************** +** DoEmFloat ** +*************** +** Perform the floating-point emulation routines portion of the +** CPU benchmark. Returns the operations per second. 
+*/ +void DoEmFloat(void) +{ +EmFloatStruct *locemfloatstruct; /* Local structure */ +InternalFPF *abase; /* Base of A array */ +InternalFPF *bbase; /* Base of B array */ +InternalFPF *cbase; /* Base of C array */ +ulong accumtime; /* Accumulated time in ticks */ +double iterations; /* # of iterations */ +ulong tickcount; /* # of ticks */ +char *errorcontext; /* Error context string pointer */ +int systemerror; /* For holding error code */ +ulong loops; /* # of loops */ + +/* +** Link to global structure +*/ +locemfloatstruct=&global_emfloatstruct; + +/* +** Set the error context +*/ +errorcontext="CPU:Floating Emulation"; + + +/* +** Test the emulation routines. +*/ +#ifdef DEBUG +#endif + +abase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF), + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + ErrorExit(); +} + +bbase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF), + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)abase,&systemerror); + ErrorExit(); +} + +cbase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF), + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + ErrorExit(); +} + +/* +** Set up the arrays +*/ +SetupCPUEmFloatArrays(abase,bbase,cbase,locemfloatstruct->arraysize); + +/* +** See if we need to do self-adjusting code. +*/ +if(locemfloatstruct->adjust==0) +{ + locemfloatstruct->loops=0; + + /* + ** Do an iteration of the tests. If the elapsed time is + ** less than minimum, increase the loop count and try + ** again. 
+ */ + for(loops=1;loops<CPUEMFLOATLOOPMAX;loops+=loops) + { tickcount=DoEmFloatIteration(abase,bbase,cbase, + locemfloatstruct->arraysize, + loops); + if(tickcount>global_min_ticks) + { locemfloatstruct->loops=loops; + break; + } + } +} + +/* +** Verify that selft adjustment code worked. +*/ +if(locemfloatstruct->loops==0) +{ printf("CPU:EMFPU -- CMPUEMFLOATLOOPMAX limit hit\n"); + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + FreeMemory((farvoid *)cbase,&systemerror); + ErrorExit(); +} + +/* +** All's well if we get here. Repeatedly perform floating +** tests until the accumulated time is greater than the +** # of seconds requested. +** Each iteration performs arraysize * 3 operations. +*/ +accumtime=0L; +iterations=(double)0.0; +do { + accumtime+=DoEmFloatIteration(abase,bbase,cbase, + locemfloatstruct->arraysize, + locemfloatstruct->loops); + iterations+=(double)1.0; +} while(TicksToSecs(accumtime)<locemfloatstruct->request_secs); + + +/* +** Clean up, calculate results, and go home. +** Also, indicate that adjustment is done. +*/ +FreeMemory((farvoid *)abase,&systemerror); +FreeMemory((farvoid *)bbase,&systemerror); +FreeMemory((farvoid *)cbase,&systemerror); + +locemfloatstruct->emflops=(iterations*(double)locemfloatstruct->loops)/ + (double)TicksToFracSecs(accumtime); +if(locemfloatstruct->adjust==0) + locemfloatstruct->adjust=1; + +#ifdef DEBUG +printf("----------------------------------------------------------------------------\n"); +#endif +return; +} + +/************************* +** FOURIER COEFFICIENTS ** +*************************/ + +/************** +** DoFourier ** +*************** +** Perform the transcendental/trigonometric portion of the +** benchmark. This benchmark calculates the first n +** fourier coefficients of the function (x+1)^x defined +** on the interval 0,2. 
+*/ +void DoFourier(void) +{ +FourierStruct *locfourierstruct; /* Local fourier struct */ +fardouble *abase; /* Base of A[] coefficients array */ +fardouble *bbase; /* Base of B[] coefficients array */ +unsigned long accumtime; /* Accumulated time in ticks */ +double iterations; /* # of iterations */ +char *errorcontext; /* Error context string pointer */ +int systemerror; /* For error code */ + +/* +** Link to global structure +*/ +locfourierstruct=&global_fourierstruct; + +/* +** Set error context string +*/ +errorcontext="FPU:Transcendental"; + +/* +** See if we need to do self-adjustment code. +*/ +if(locfourierstruct->adjust==0) +{ + locfourierstruct->arraysize=100L; /* Start at 100 elements */ + while(1) + { + + abase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + bbase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((void *)abase,&systemerror); + ErrorExit(); + } + /* + ** Do an iteration of the tests. If the elapsed time is + ** less than or equal to the permitted minimum, re-allocate + ** larger arrays and try again. + */ + if(DoFPUTransIteration(abase,bbase, + locfourierstruct->arraysize)>global_min_ticks) + break; /* We're ok...exit */ + + /* + ** Make bigger arrays and try again. + */ + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + locfourierstruct->arraysize+=50L; + } +} +else +{ /* + ** Don't need self-adjustment. Just allocate the + ** arrays, and go. 
+ */ + abase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + bbase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((void *)abase,&systemerror); + ErrorExit(); + } +} +/* +** All's well if we get here. Repeatedly perform integration +** tests until the accumulated time is greater than the +** # of seconds requested. +*/ +accumtime=0L; +iterations=(double)0.0; +do { + accumtime+=DoFPUTransIteration(abase,bbase,locfourierstruct->arraysize); + iterations+=(double)locfourierstruct->arraysize*(double)2.0-(double)1.0; +} while(TicksToSecs(accumtime)<locfourierstruct->request_secs); + + +/* +** Clean up, calculate results, and go home. +** Also set adjustment flag to indicate no adjust code needed. +*/ +FreeMemory((farvoid *)abase,&systemerror); +FreeMemory((farvoid *)bbase,&systemerror); + +locfourierstruct->fflops=iterations/(double)TicksToFracSecs(accumtime); + +if(locfourierstruct->adjust==0) + locfourierstruct->adjust=1; + +return; +} + +/************************ +** DoFPUTransIteration ** +************************* +** Perform an iteration of the FPU Transcendental/trigonometric +** benchmark. Here, an iteration consists of calculating the +** first n fourier coefficients of the function (x+1)^x on +** the interval 0,2. n is given by arraysize. +** NOTE: The # of integration steps is fixed at +** 200. +*/ +static ulong DoFPUTransIteration(fardouble *abase, /* A coeffs. */ + fardouble *bbase, /* B coeffs. */ + ulong arraysize) /* # of coeffs */ +{ +double omega; /* Fundamental frequency */ +unsigned long i; /* Index */ +unsigned long elapsed; /* Elapsed time */ + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Calculate the fourier series. Begin by +** calculating A[0]. 
+*/ + +*abase=TrapezoidIntegrate((double)0.0, + (double)2.0, + 200, + (double)0.0, /* No omega * n needed */ + 0 )/(double)2.0; + +/* +** Calculate the fundamental frequency. +** ( 2 * pi ) / period...and since the period +** is 2, omega is simply pi. +*/ +omega=(double)3.1415926535897932; + +for(i=1;i<arraysize;i++) +{ + + /* + ** Calculate A[i] terms. Note, once again, that we + ** can ignore the 2/period term outside the integral + ** since the period is 2 and the term cancels itself + ** out. + */ + *(abase+i)=TrapezoidIntegrate((double)0.0, + (double)2.0, + 200, + omega * (double)i, + 1); + + /* + ** Calculate the B[i] terms. + */ + *(bbase+i)=TrapezoidIntegrate((double)0.0, + (double)2.0, + 200, + omega * (double)i, + 2); + +} +#ifdef DEBUG +{ + int i; + printf("\nA[i]=\n"); + for (i=0;i<arraysize;i++) printf("%7.3g ",abase[i]); + printf("\nB[i]=\n(undefined) "); + for (i=1;i<arraysize;i++) printf("%7.3g ",bbase[i]); +} +#endif +/* +** All done, stop the stopwatch +*/ +return(StopStopwatch(elapsed)); +} + +/*********************** +** TrapezoidIntegrate ** +************************ +** Perform a simple trapezoid integration on the +** function (x+1)**x. +** x0,x1 set the lower and upper bounds of the +** integration. +** nsteps indicates # of trapezoidal sections +** omegan is the fundamental frequency times +** the series member # +** select = 0 for the A[0] term, 1 for cosine terms, and +** 2 for sine terms. +** Returns the value. +*/ +static double TrapezoidIntegrate( double x0, /* Lower bound */ + double x1, /* Upper bound */ + int nsteps, /* # of steps */ + double omegan, /* omega * n */ + int select) +{ +double x; /* Independent variable */ +double dx; /* Stepsize */ +double rvalue; /* Return value */ + + +/* +** Initialize independent variable +*/ +x=x0; + +/* +** Calculate stepsize +*/ +dx=(x1 - x0) / (double)nsteps; + +/* +** Initialize the return value. 
+*/ +rvalue=thefunction(x0,omegan,select)/(double)2.0; + +/* +** Compute the other terms of the integral. +*/ +if(nsteps!=1) +{ --nsteps; /* Already done 1 step */ + while(--nsteps ) + { + x+=dx; + rvalue+=thefunction(x,omegan,select); + } +} +/* +** Finish computation +*/ +rvalue=(rvalue+thefunction(x1,omegan,select)/(double)2.0)*dx; + +return(rvalue); +} + +/**************** +** thefunction ** +***************** +** This routine selects the function to be used +** in the Trapezoid integration. +** x is the independent variable +** omegan is omega * n +** select chooses which of the sine/cosine functions +** are used. note the special case for select=0. +*/ +static double thefunction(double x, /* Independent variable */ + double omegan, /* Omega * term */ + int select) /* Choose term */ +{ + +/* +** Use select to pick which function we call. +*/ +switch(select) +{ + case 0: return(pow(x+(double)1.0,x)); + + case 1: return(pow(x+(double)1.0,x) * cos(omegan * x)); + + case 2: return(pow(x+(double)1.0,x) * sin(omegan * x)); +} + +/* +** We should never reach this point, but the following +** keeps compilers from issuing a warning message. +*/ +return(0.0); +} + +/************************* +** ASSIGNMENT ALGORITHM ** +*************************/ + +/************* +** DoAssign ** +************** +** Perform an assignment algorithm. +** The algorithm was adapted from the step by step guide found +** in "Quantitative Decision Making for Business" (Gordon, +** Pressman, and Cohn; Prentice-Hall) +** +** +** NOTES: +** 1. Even though the algorithm distinguishes between +** ASSIGNROWS and ASSIGNCOLS, as though the two might +** be different, it does presume a square matrix. +** I.E., ASSIGNROWS and ASSIGNCOLS must be the same. +** This makes for some algorithmically-correct but +** probably non-optimal constructs. 
+** +*/ +void DoAssign(void) +{ +AssignStruct *locassignstruct; /* Local structure ptr */ +farlong *arraybase; +char *errorcontext; +int systemerror; +ulong accumtime; +double iterations; + +/* +** Link to global structure +*/ +locassignstruct=&global_assignstruct; + +/* +** Set the error context string. +*/ +errorcontext="CPU:Assignment"; + +/* +** See if we need to do self adjustment code. +*/ +if(locassignstruct->adjust==0) +{ + /* + ** Self-adjustment code. The system begins by working on 1 + ** array. If it does that in no time, then two arrays + ** are built. This process continues until + ** enough arrays are built to handle the tolerance. + */ + locassignstruct->numarrays=1; + while(1) + { + /* + ** Allocate space for arrays + */ + arraybase=(farlong *) AllocateMemory(sizeof(long)* + ASSIGNROWS*ASSIGNCOLS*locassignstruct->numarrays, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } + + /* + ** Do an iteration of the assignment alg. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then allocate for more arrays and + ** try again. + */ + if(DoAssignIteration(arraybase, + locassignstruct->numarrays)>global_min_ticks) + break; /* We're ok...exit */ + + FreeMemory((farvoid *)arraybase, &systemerror); + locassignstruct->numarrays++; + } +} +else +{ /* + ** Allocate space for arrays + */ + arraybase=(farlong *)AllocateMemory(sizeof(long)* + ASSIGNROWS*ASSIGNCOLS*locassignstruct->numarrays, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } +} + +/* +** All's well if we get here. Do the tests. 
+*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoAssignIteration(arraybase, + locassignstruct->numarrays); + iterations+=(double)1.0; +} while(TicksToSecs(accumtime)<locassignstruct->request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +FreeMemory((farvoid *)arraybase,&systemerror); + +locassignstruct->iterspersec=iterations * + (double)locassignstruct->numarrays / TicksToFracSecs(accumtime); + +if(locassignstruct->adjust==0) + locassignstruct->adjust=1; + +return; + +} + +/********************** +** DoAssignIteration ** +*********************** +** This routine executes one iteration of the assignment test. +** It returns the number of ticks elapsed in the iteration. +*/ +static ulong DoAssignIteration(farlong *arraybase, + ulong numarrays) +{ +longptr abase; /* local pointer */ +ulong elapsed; /* Elapsed ticks */ +ulong i; + +/* +** Set up local pointer +*/ +abase.ptrs.p=arraybase; + +/* +** Load up the arrays with a random table. +*/ +LoadAssignArrayWithRand(arraybase,numarrays); + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Execute assignment algorithms +*/ +for(i=0;i<numarrays;i++) +{ /* abase.ptrs.p+=i*ASSIGNROWS*ASSIGNCOLS; */ + /* Fixed by Eike Dierks */ + Assignment(*abase.ptrs.ap); + abase.ptrs.p+=ASSIGNROWS*ASSIGNCOLS; +} + +/* +** Get elapsed time +*/ +return(StopStopwatch(elapsed)); +} + +/**************************** +** LoadAssignArrayWithRand ** +***************************** +** Load the assignment arrays with random numbers. All positive. +** These numbers represent costs. +*/ +static void LoadAssignArrayWithRand(farlong *arraybase, + ulong numarrays) +{ +longptr abase,abase1; /* Local for array pointer */ +ulong i; + +/* +** Set local array pointer +*/ +abase.ptrs.p=arraybase; +abase1.ptrs.p=arraybase; + +/* +** Set up the first array. Then just copy it into the +** others. 
+*/ +LoadAssign(*(abase.ptrs.ap)); +if(numarrays>1) + for(i=1;i<numarrays;i++) + { /* abase1.ptrs.p+=i*ASSIGNROWS*ASSIGNCOLS; */ + /* Fixed by Eike Dierks */ + abase1.ptrs.p+=ASSIGNROWS*ASSIGNCOLS; + CopyToAssign(*(abase.ptrs.ap),*(abase1.ptrs.ap)); + } + +return; +} + +/*************** +** LoadAssign ** +**************** +** The array given by arraybase is loaded with positive random +** numbers. Elements in the array are capped at 5,000,000. +*/ +static void LoadAssign(farlong arraybase[][ASSIGNCOLS]) +{ +ushort i,j; + +/* +** Reset random number generator so things repeat. +*/ +/* randnum(13L); */ +randnum((int32)13); + +for(i=0;i<ASSIGNROWS;i++) + for(j=0;j<ASSIGNROWS;j++){ + /* arraybase[i][j]=abs_randwc(5000000L);*/ + arraybase[i][j]=abs_randwc((int32)5000000); + } + +return; +} + +/***************** +** CopyToAssign ** +****************** +** Copy the contents of one array to another. This is called by +** the routine that builds the initial array, and is used to copy +** the contents of the intial array into all following arrays. +*/ +static void CopyToAssign(farlong arrayfrom[ASSIGNROWS][ASSIGNCOLS], + farlong arrayto[ASSIGNROWS][ASSIGNCOLS]) +{ +ushort i,j; + +for(i=0;i<ASSIGNROWS;i++) + for(j=0;j<ASSIGNCOLS;j++) + arrayto[i][j]=arrayfrom[i][j]; + +return; +} + +/*************** +** Assignment ** +***************/ +static void Assignment(farlong arraybase[][ASSIGNCOLS]) +{ +short assignedtableau[ASSIGNROWS][ASSIGNCOLS]; + +/* +** First, calculate minimum costs +*/ +calc_minimum_costs(arraybase); + +/* +** Repeat following until the number of rows selected +** equals the number of rows in the tableau. 
+*/ +while(first_assignments(arraybase,assignedtableau)!=ASSIGNROWS) +{ second_assignments(arraybase,assignedtableau); +} + +#ifdef DEBUG +{ + int i,j; + printf("\nColumn choices for each row\n"); + for(i=0;i<ASSIGNROWS;i++) + { + printf("R%03d: ",i); + for(j=0;j<ASSIGNCOLS;j++) + if(assignedtableau[i][j]==1) + printf("%03d ",j); + } +} +#endif + +return; +} + +/*********************** +** calc_minimum_costs ** +************************ +** Revise the tableau by calculating the minimum costs on a +** row and column basis. These minima are subtracted from +** their rows and columns, creating a new tableau. +*/ +static void calc_minimum_costs(long tableau[][ASSIGNCOLS]) +{ +ushort i,j; /* Index variables */ +long currentmin; /* Current minimum */ +/* +** Determine minimum costs on row basis. This is done by +** subtracting -- on a row-per-row basis -- the minum value +** for that row. +*/ +for(i=0;i<ASSIGNROWS;i++) +{ + currentmin=MAXPOSLONG; /* Initialize minimum */ + for(j=0;j<ASSIGNCOLS;j++) + if(tableau[i][j]<currentmin) + currentmin=tableau[i][j]; + + for(j=0;j<ASSIGNCOLS;j++) + tableau[i][j]-=currentmin; +} + +/* +** Determine minimum cost on a column basis. This works +** just as above, only now we step through the array +** column-wise +*/ +for(j=0;j<ASSIGNCOLS;j++) +{ + currentmin=MAXPOSLONG; /* Initialize minimum */ + for(i=0;i<ASSIGNROWS;i++) + if(tableau[i][j]<currentmin) + currentmin=tableau[i][j]; + + /* + ** Here, we'll take the trouble to see if the current + ** minimum is zero. This is likely worth it, since the + ** preceding loop will have created at least one zero in + ** each row. We can save ourselves a few iterations. + */ + if(currentmin!=0) + for(i=0;i<ASSIGNROWS;i++) + tableau[i][j]-=currentmin; +} + +return; +} + +/********************** +** first_assignments ** +*********************** +** Do first assignments. +** The assignedtableau[] array holds a set of values that +** indicate the assignment of a value, or its elimination. 
+** The values are: +** 0 = Item is neither assigned nor eliminated. +** 1 = Item is assigned +** 2 = Item is eliminated +** Returns the number of selections made. If this equals +** the number of rows, then an optimum has been determined. +*/ +static int first_assignments(long tableau[][ASSIGNCOLS], + short assignedtableau[][ASSIGNCOLS]) +{ +ushort i,j,k; /* Index variables */ +ushort numassigns; /* # of assignments */ +ushort totnumassigns; /* Total # of assignments */ +ushort numzeros; /* # of zeros in row */ +int selected=0; /* Flag used to indicate selection */ + +/* +** Clear the assignedtableau, setting all members to show that +** no one is yet assigned, eliminated, or anything. +*/ +for(i=0;i<ASSIGNROWS;i++) + for(j=0;j<ASSIGNCOLS;j++) + assignedtableau[i][j]=0; + +totnumassigns=0; +do { + numassigns=0; + /* + ** Step through rows. For each one that is not currently + ** assigned, see if the row has only one zero in it. If so, + ** mark that as an assigned row/col. Eliminate other zeros + ** in the same column. + */ + for(i=0;i<ASSIGNROWS;i++) + { numzeros=0; + for(j=0;j<ASSIGNCOLS;j++) + if(tableau[i][j]==0L) + if(assignedtableau[i][j]==0) + { numzeros++; + selected=j; + } + if(numzeros==1) + { numassigns++; + totnumassigns++; + assignedtableau[i][selected]=1; + for(k=0;k<ASSIGNROWS;k++) + if((k!=i) && + (tableau[k][selected]==0)) + assignedtableau[k][selected]=2; + } + } + /* + ** Step through columns, doing same as above. Now, be careful + ** of items in the other rows of a selected column. + */ + for(j=0;j<ASSIGNCOLS;j++) + { numzeros=0; + for(i=0;i<ASSIGNROWS;i++) + if(tableau[i][j]==0L) + if(assignedtableau[i][j]==0) + { numzeros++; + selected=i; + } + if(numzeros==1) + { numassigns++; + totnumassigns++; + assignedtableau[selected][j]=1; + for(k=0;k<ASSIGNCOLS;k++) + if((k!=j) && + (tableau[selected][k]==0)) + assignedtableau[selected][k]=2; + } + } + /* + ** Repeat until no more assignments to be made. 
+ */ +} while(numassigns!=0); + +/* +** See if we can leave at this point. +*/ +if(totnumassigns==ASSIGNROWS) return(totnumassigns); + +/* +** Now step through the array by row. If you find any unassigned +** zeros, pick the first in the row. Eliminate all zeros from +** that same row & column. This occurs if there are multiple optima... +** possibly. +*/ +for(i=0;i<ASSIGNROWS;i++) +{ selected=-1; + for(j=0;j<ASSIGNCOLS;j++) + if((tableau[i][j]==0L) && + (assignedtableau[i][j]==0)) + { selected=j; + break; + } + if(selected!=-1) + { assignedtableau[i][selected]=1; + totnumassigns++; + for(k=0;k<ASSIGNCOLS;k++) + if((k!=selected) && + (tableau[i][k]==0L)) + assignedtableau[i][k]=2; + for(k=0;k<ASSIGNROWS;k++) + if((k!=i) && + (tableau[k][selected]==0L)) + assignedtableau[k][selected]=2; + } +} + +return(totnumassigns); +} + +/*********************** +** second_assignments ** +************************ +** This section of the algorithm creates the revised +** tableau, and is difficult to explain. I suggest you +** refer to the algorithm's source, mentioned in comments +** toward the beginning of the program. +*/ +static void second_assignments(long tableau[][ASSIGNCOLS], + short assignedtableau[][ASSIGNCOLS]) +{ +int i,j; /* Indexes */ +short linesrow[ASSIGNROWS]; +short linescol[ASSIGNCOLS]; +long smallest; /* Holds smallest value */ +ushort numassigns; /* Number of assignments */ +ushort newrows; /* New rows to be considered */ +/* +** Clear the linesrow and linescol arrays. +*/ +for(i=0;i<ASSIGNROWS;i++) + linesrow[i]=0; +for(i=0;i<ASSIGNCOLS;i++) + linescol[i]=0; + +/* +** Scan rows, flag each row that has no assignment in it. +*/ +for(i=0;i<ASSIGNROWS;i++) +{ numassigns=0; + for(j=0;j<ASSIGNCOLS;j++) + if(assignedtableau[i][j]==1) + { numassigns++; + break; + } + if(numassigns==0) linesrow[i]=1; +} + +do { + + newrows=0; + /* + ** For each row checked above, scan for any zeros. If found, + ** check the associated column. 
+ */ + for(i=0;i<ASSIGNROWS;i++) + { if(linesrow[i]==1) + for(j=0;j<ASSIGNCOLS;j++) + if(tableau[i][j]==0) + linescol[j]=1; + } + + /* + ** Now scan checked columns. If any contain assigned zeros, check + ** the associated row. + */ + for(j=0;j<ASSIGNCOLS;j++) + if(linescol[j]==1) + for(i=0;i<ASSIGNROWS;i++) + if((assignedtableau[i][j]==1) && + (linesrow[i]!=1)) + { + linesrow[i]=1; + newrows++; + } +} while(newrows!=0); + +/* +** linesrow[n]==0 indicate rows covered by imaginary line +** linescol[n]==1 indicate cols covered by imaginary line +** For all cells not covered by imaginary lines, determine smallest +** value. +*/ +smallest=MAXPOSLONG; +for(i=0;i<ASSIGNROWS;i++) + if(linesrow[i]!=0) + for(j=0;j<ASSIGNCOLS;j++) + if(linescol[j]!=1) + if(tableau[i][j]<smallest) + smallest=tableau[i][j]; + +/* +** Subtract smallest from all cells in the above set. +*/ +for(i=0;i<ASSIGNROWS;i++) + if(linesrow[i]!=0) + for(j=0;j<ASSIGNCOLS;j++) + if(linescol[j]!=1) + tableau[i][j]-=smallest; + +/* +** Add smallest to all cells covered by two lines. +*/ +for(i=0;i<ASSIGNROWS;i++) + if(linesrow[i]==0) + for(j=0;j<ASSIGNCOLS;j++) + if(linescol[j]==1) + tableau[i][j]+=smallest; + +return; +} + +/******************** +** IDEA Encryption ** +********************* +** IDEA - International Data Encryption Algorithm. +** Based on code presented in Applied Cryptography by Bruce Schneier. +** Which was based on code developed by Xuejia Lai and James L. Massey. +** Other modifications made by Colin Plumb. +** +*/ + +/*********** +** DoIDEA ** +************ +** Perform IDEA encryption. Note that we time encryption & decryption +** time as being a single loop. 
+*/ +void DoIDEA(void) +{ +IDEAStruct *locideastruct; /* Loc pointer to global structure */ +int i; +IDEAkey Z,DK; +u16 userkey[8]; +ulong accumtime; +double iterations; +char *errorcontext; +int systemerror; +faruchar *plain1; /* First plaintext buffer */ +faruchar *crypt1; /* Encryption buffer */ +faruchar *plain2; /* Second plaintext buffer */ + +/* +** Link to global data +*/ +locideastruct=&global_ideastruct; + +/* +** Set error context +*/ +errorcontext="CPU:IDEA"; + +/* +** Re-init random-number generator. +*/ +/* randnum(3L); */ +randnum((int32)3); + +/* +** Build an encryption/decryption key +*/ +for (i=0;i<8;i++) + /* userkey[i]=(u16)(abs_randwc(60000L) & 0xFFFF); */ + userkey[i]=(u16)(abs_randwc((int32)60000) & 0xFFFF); +for(i=0;i<KEYLEN;i++) + Z[i]=0; + +/* +** Compute encryption/decryption subkeys +*/ +en_key_idea(userkey,Z); +de_key_idea(Z,DK); + +/* +** Allocate memory for buffers. We'll make 3, called plain1, +** crypt1, and plain2. It works like this: +** plain1 >>encrypt>> crypt1 >>decrypt>> plain2. +** So, plain1 and plain2 should match. +** Also, fill up plain1 with sample text. +*/ +plain1=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror); +if(systemerror) +{ + ReportError(errorcontext,systemerror); + ErrorExit(); +} + +crypt1=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror); +if(systemerror) +{ + ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)plain1,&systemerror); + ErrorExit(); +} + +plain2=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror); +if(systemerror) +{ + ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)plain1,&systemerror); + FreeMemory((farvoid *)crypt1,&systemerror); + ErrorExit(); +} +/* +** Note that we build the "plaintext" by simply loading +** the array up with random numbers. +*/ +for(i=0;i<locideastruct->arraysize;i++) + plain1[i]=(uchar)(abs_randwc(255) & 0xFF); + +/* +** See if we need to perform self adjustment loop. 
+*/ +if(locideastruct->adjust==0) +{ + /* + ** Do self-adjustment. This involves initializing the + ** # of loops and increasing the loop count until we + ** get a number of loops that we can use. + */ + for(locideastruct->loops=100L; + locideastruct->loops<MAXIDEALOOPS; + locideastruct->loops+=10L) + if(DoIDEAIteration(plain1,crypt1,plain2, + locideastruct->arraysize, + locideastruct->loops, + Z,DK)>global_min_ticks) break; +} + +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoIDEAIteration(plain1,crypt1,plain2, + locideastruct->arraysize, + locideastruct->loops,Z,DK); + iterations+=(double)locideastruct->loops; +} while(TicksToSecs(accumtime)<locideastruct->request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +FreeMemory((farvoid *)plain1,&systemerror); +FreeMemory((farvoid *)crypt1,&systemerror); +FreeMemory((farvoid *)plain2,&systemerror); +locideastruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(locideastruct->adjust==0) + locideastruct->adjust=1; + +return; + +} + +/******************** +** DoIDEAIteration ** +********************* +** Execute a single iteration of the IDEA encryption algorithm. +** Actually, a single iteration is one encryption and one +** decryption. +*/ +static ulong DoIDEAIteration(faruchar *plain1, + faruchar *crypt1, + faruchar *plain2, + ulong arraysize, + ulong nloops, + IDEAkey Z, + IDEAkey DK) +{ +register ulong i; +register ulong j; +ulong elapsed; +#ifdef DEBUG +int status=0; +#endif + +/* +** Start the stopwatch. +*/ +elapsed=StartStopwatch(); + +/* +** Do everything for nloops. 
+*/ +for(i=0;i<nloops;i++) +{ + for(j=0;j<arraysize;j+=(sizeof(u16)*4)) + cipher_idea((u16 *)(plain1+j),(u16 *)(crypt1+j),Z); /* Encrypt */ + + for(j=0;j<arraysize;j+=(sizeof(u16)*4)) + cipher_idea((u16 *)(crypt1+j),(u16 *)(plain2+j),DK); /* Decrypt */ +} + +#ifdef DEBUG +for(j=0;j<arraysize;j++) + if(*(plain1+j)!=*(plain2+j)){ + printf("IDEA Error! \n"); + status=1; + } +if (status==0) printf("IDEA: OK\n"); +#endif + +/* +** Get elapsed time. +*/ +return(StopStopwatch(elapsed)); +} + +/******** +** mul ** +********* +** Performs multiplication, modulo (2**16)+1. This code is structured +** on the assumption that untaken branches are cheaper than taken +** branches, and that the compiler doesn't schedule branches. +*/ +static u16 mul(register u16 a, register u16 b) +{ +register u32 p; +if(a) +{ if(b) + { p=(u32)(a*b); + b=low16(p); + a=(u16)(p>>16); + return(b-a+(b<a)); + } + else + return(1-a); +} +else + return(1-b); +} + +/******** +** inv ** +********* +** Compute multiplicative inverse of x, modulo (2**16)+1 +** using Euclid's GCD algorithm. It is unrolled twice +** to avoid swapping the meaning of the registers. And +** some subtracts are changed to adds. +*/ +static u16 inv(u16 x) +{ +u16 t0, t1; +u16 q, y; + +if(x<=1) + return(x); /* 0 and 1 are self-inverse */ +t1=0x10001 / x; +y=0x10001 % x; +if(y==1) + return(low16(1-t1)); +t0=1; +do { + q=x/y; + x=x%y; + t0+=q*t1; + if(x==1) return(t0); + q=y/x; + y=y%x; + t1+=q*t0; +} while(y!=1); +return(low16(1-t1)); +} + +/**************** +** en_key_idea ** +***************** +** Compute IDEA encryption subkeys Z +*/ +static void en_key_idea(u16 *userkey, u16 *Z) +{ +int i,j; + +/* +** shifts +*/ +for(j=0;j<8;j++) + Z[j]=*userkey++; +for(i=0;j<KEYLEN;j++) +{ i++; + Z[i+7]=(Z[i&7]<<9)| (Z[(i+1) & 7] >> 7); + Z+=i&8; + i&=7; +} +return; +} + +/**************** +** de_key_idea ** +***************** +** Compute IDEA decryption subkeys DK from encryption +** subkeys Z. 
+*/ +static void de_key_idea(IDEAkey Z, IDEAkey DK) +{ +IDEAkey TT; +int j; +u16 t1, t2, t3; +u16 *p; +p=(u16 *)(TT+KEYLEN); + +t1=inv(*Z++); +t2=-*Z++; +t3=-*Z++; +*--p=inv(*Z++); +*--p=t3; +*--p=t2; +*--p=t1; + +for(j=1;j<ROUNDS;j++) +{ t1=*Z++; + *--p=*Z++; + *--p=t1; + t1=inv(*Z++); + t2=-*Z++; + t3=-*Z++; + *--p=inv(*Z++); + *--p=t2; + *--p=t3; + *--p=t1; +} +t1=*Z++; +*--p=*Z++; +*--p=t1; +t1=inv(*Z++); +t2=-*Z++; +t3=-*Z++; +*--p=inv(*Z++); +*--p=t3; +*--p=t2; +*--p=t1; +/* +** Copy and destroy temp copy +*/ +for(j=0,p=TT;j<KEYLEN;j++) +{ *DK++=*p; + *p++=0; +} + +return; +} + +/* +** MUL(x,y) +** This #define creates a macro that computes x=x*y modulo 0x10001. +** Requires temps t16 and t32. Also requires y to be strictly 16 +** bits. Here, I am using the simplest form. May not be the +** fastest. -- RG +*/ +/* #define MUL(x,y) (x=mul(low16(x),y)) */ + +/**************** +** cipher_idea ** +***************** +** IDEA encryption/decryption algorithm. +*/ +static void cipher_idea(u16 in[4], + u16 out[4], + register IDEAkey Z) +{ +register u16 x1, x2, x3, x4, t1, t2; +/* register u16 t16; +register u16 t32; */ +int r=ROUNDS; + +x1=*in++; +x2=*in++; +x3=*in++; +x4=*in; + +do { + MUL(x1,*Z++); + x2+=*Z++; + x3+=*Z++; + MUL(x4,*Z++); + + t2=x1^x3; + MUL(t2,*Z++); + t1=t2+(x2^x4); + MUL(t1,*Z++); + t2=t1+t2; + + x1^=t1; + x4^=t2; + + t2^=x2; + x2=x3^t1; + x3=t2; +} while(--r); +MUL(x1,*Z++); +*out++=x1; +*out++=x3+*Z++; +*out++=x2+*Z++; +MUL(x4,*Z); +*out=x4; +return; +} + +/************************ +** HUFFMAN COMPRESSION ** +************************/ + +/************** +** DoHuffman ** +*************** +** Execute a huffman compression on a block of plaintext. +** Note that (as with IDEA encryption) an iteration of the +** Huffman test includes a compression AND a decompression. +** Also, the compression cycle includes building the +** Huffman tree. 
+*/ +void DoHuffman(void) +{ +HuffStruct *lochuffstruct; /* Loc pointer to global data */ +char *errorcontext; +int systemerror; +ulong accumtime; +double iterations; +farchar *comparray; +farchar *decomparray; +farchar *plaintext; + +/* +** Link to global data +*/ +lochuffstruct=&global_huffstruct; + +/* +** Set error context. +*/ +errorcontext="CPU:Huffman"; + +/* +** Allocate memory for the plaintext and the compressed text. +** We'll be really pessimistic here, and allocate equal amounts +** for both (though we know...well, we PRESUME) the compressed +** stuff will take less than the plain stuff. +** Also note that we'll build a 3rd buffer to decompress +** into, and we preallocate space for the huffman tree. +** (We presume that the Huffman tree will grow no larger +** than 512 bytes. This is actually a super-conservative +** estimate...but, who cares?) +*/ +plaintext=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + ErrorExit(); +} +comparray=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory(plaintext,&systemerror); + ErrorExit(); +} +decomparray=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory(plaintext,&systemerror); + FreeMemory(comparray,&systemerror); + ErrorExit(); +} + +hufftree=(huff_node *)AllocateMemory(sizeof(huff_node) * 512, + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory(plaintext,&systemerror); + FreeMemory(comparray,&systemerror); + FreeMemory(decomparray,&systemerror); + ErrorExit(); +} + +/* +** Build the plaintext buffer. Since we want this to +** actually be able to compress, we'll use the +** wordcatalog to build the plaintext stuff. +*/ +/* +** Reset random number generator so things repeat. +** added by Uwe F. 
Mayer +*/ +randnum((int32)13); +create_text_block(plaintext,lochuffstruct->arraysize-1,(ushort)500); +plaintext[lochuffstruct->arraysize-1L]='\0'; +plaintextlen=lochuffstruct->arraysize; + +/* +** See if we need to perform self adjustment loop. +*/ +if(lochuffstruct->adjust==0) +{ + /* + ** Do self-adjustment. This involves initializing the + ** # of loops and increasing the loop count until we + ** get a number of loops that we can use. + */ + for(lochuffstruct->loops=100L; + lochuffstruct->loops<MAXHUFFLOOPS; + lochuffstruct->loops+=10L) + if(DoHuffIteration(plaintext, + comparray, + decomparray, + lochuffstruct->arraysize, + lochuffstruct->loops, + hufftree)>global_min_ticks) break; +} + +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoHuffIteration(plaintext, + comparray, + decomparray, + lochuffstruct->arraysize, + lochuffstruct->loops, + hufftree); + iterations+=(double)lochuffstruct->loops; +} while(TicksToSecs(accumtime)<lochuffstruct->request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +FreeMemory((farvoid *)plaintext,&systemerror); +FreeMemory((farvoid *)comparray,&systemerror); +FreeMemory((farvoid *)decomparray,&systemerror); +FreeMemory((farvoid *)hufftree,&systemerror); +lochuffstruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(lochuffstruct->adjust==0) + lochuffstruct->adjust=1; + +} + +/********************* +** create_text_line ** +********************** +** Create a random line of text, stored at *dt. The line may be +** no more than nchars long. 
+*/ +static void create_text_line(farchar *dt, + long nchars) +{ +long charssofar; /* # of characters so far */ +long tomove; /* # of characters to move */ +char myword[40]; /* Local buffer for words */ +farchar *wordptr; /* Pointer to word from catalog */ + +charssofar=0; + +do { +/* +** Grab a random word from the wordcatalog +*/ +/* wordptr=wordcatarray[abs_randwc((long)WORDCATSIZE)];*/ +wordptr=wordcatarray[abs_randwc((int32)WORDCATSIZE)]; +MoveMemory((farvoid *)myword, + (farvoid *)wordptr, + (unsigned long)strlen(wordptr)+1); + +/* +** Append a blank. +*/ +tomove=strlen(myword)+1; +myword[tomove-1]=' '; + +/* +** See how long it is. If its length+charssofar > nchars, we have +** to trim it. +*/ +if((tomove+charssofar)>nchars) + tomove=nchars-charssofar; +/* +** Attach the word to the current line. Increment counter. +*/ +MoveMemory((farvoid *)dt,(farvoid *)myword,(unsigned long)tomove); +charssofar+=tomove; +dt+=tomove; + +/* +** If we're done, bail out. Otherwise, go get another word. +*/ +} while(charssofar<nchars); + +return; +} + +/********************** +** create_text_block ** +*********************** +** Build a block of text randomly loaded with words. The words +** come from the wordcatalog (which must be loaded before you +** call this). +** *tb points to the memory where the text is to be built. +** tblen is the # of bytes to put into the text block +** maxlinlen is the maximum length of any line (line end indicated +** by a carriage return). +*/ +static void create_text_block(farchar *tb, + ulong tblen, + ushort maxlinlen) +{ +ulong bytessofar; /* # of bytes so far */ +ulong linelen; /* Line length */ + +bytessofar=0L; +do { + +/* +** Pick a random length for a line and fill the line. +** Make sure the line can fit (haven't exceeded tablen) and also +** make sure you leave room to append a carriage return. 
+*/ +linelen=abs_randwc(maxlinlen-6)+6; +if((linelen+bytessofar)>tblen) + linelen=tblen-bytessofar; + +if(linelen>1) +{ + create_text_line(tb,linelen); +} +tb+=linelen-1; /* Add the carriage return */ +*tb++='\n'; + +bytessofar+=linelen; + +} while(bytessofar<tblen); + +} + +/******************** +** DoHuffIteration ** +********************* +** Perform the huffman benchmark. This routine +** (a) Builds the huffman tree +** (b) Compresses the text +** (c) Decompresses the text and verifies correct decompression +*/ +static ulong DoHuffIteration(farchar *plaintext, + farchar *comparray, + farchar *decomparray, + ulong arraysize, + ulong nloops, + huff_node *hufftree) +{ +int i; /* Index */ +long j; /* Bigger index */ +int root; /* Pointer to huffman tree root */ +float lowfreq1, lowfreq2; /* Low frequency counters */ +int lowidx1, lowidx2; /* Indexes of low freq. elements */ +long bitoffset; /* Bit offset into text */ +long textoffset; /* Char offset into text */ +long maxbitoffset; /* Holds limit of bit offset */ +long bitstringlen; /* Length of bitstring */ +int c; /* Character from plaintext */ +char bitstring[30]; /* Holds bitstring */ +ulong elapsed; /* For stopwatch */ +#ifdef DEBUG +int status=0; +#endif + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Do everything for nloops +*/ +while(nloops--) +{ + +/* +** Calculate the frequency of each byte value. Store the +** results in what will become the "leaves" of the +** Huffman tree. Interior nodes will be built in those +** nodes greater than node #255. +*/ +for(i=0;i<256;i++) +{ + hufftree[i].freq=(float)0.0; + hufftree[i].c=(unsigned char)i; +} + +for(j=0;j<arraysize;j++) + hufftree[(int)plaintext[j]].freq+=(float)1.0; + +for(i=0;i<256;i++) + if(hufftree[i].freq != (float)0.0) + hufftree[i].freq/=(float)arraysize; + +/* Reset the second half of the tree. Otherwise the loop below that +** compares the frequencies up to index 512 makes no sense. 
Some +** systems automatically zero out memory upon allocation, others (like +** for example DEC Unix) do not. Depending on this the loop below gets +** different data and different run times. On our alpha the data that +** was arbitrarily assigned led to an underflow error at runtime. We +** use that zeroed-out bits are in fact 0 as a float. +** Uwe F. Mayer */ +bzero((char *)&(hufftree[256]),sizeof(huff_node)*256); +/* +** Build the huffman tree. First clear all the parent +** pointers and left/right pointers. Also, discard all +** nodes that have a frequency of true 0. */ +for(i=0;i<512;i++) +{ if(hufftree[i].freq==(float)0.0) + hufftree[i].parent=EXCLUDED; + else + hufftree[i].parent=hufftree[i].left=hufftree[i].right=-1; +} + +/* +** Go through the tree. Finding nodes of really low +** frequency. +*/ +root=255; /* Starting root node-1 */ +while(1) +{ + lowfreq1=(float)2.0; lowfreq2=(float)2.0; + lowidx1=-1; lowidx2=-1; + /* + ** Find first lowest frequency. + */ + for(i=0;i<=root;i++) + if(hufftree[i].parent<0) + if(hufftree[i].freq<lowfreq1) + { lowfreq1=hufftree[i].freq; + lowidx1=i; + } + + /* + ** Did we find a lowest value? If not, the + ** tree is done. + */ + if(lowidx1==-1) break; + + /* + ** Find next lowest frequency + */ + for(i=0;i<=root;i++) + if((hufftree[i].parent<0) && (i!=lowidx1)) + if(hufftree[i].freq<lowfreq2) + { lowfreq2=hufftree[i].freq; + lowidx2=i; + } + + /* + ** If we could only find one item, then that + ** item is surely the root, and (as above) the + ** tree is done. + */ + if(lowidx2==-1) break; + + /* + ** Attach the two new nodes to the current root, and + ** advance the current root. 
+ */ + root++; /* New root */ + hufftree[lowidx1].parent=root; + hufftree[lowidx2].parent=root; + hufftree[root].freq=lowfreq1+lowfreq2; + hufftree[root].left=lowidx1; + hufftree[root].right=lowidx2; + hufftree[root].parent=-2; /* Show root */ +} + +/* +** Huffman tree built...compress the plaintext +*/ +bitoffset=0L; /* Initialize bit offset */ +for(i=0;i<arraysize;i++) +{ + c=(int)plaintext[i]; /* Fetch character */ + /* + ** Build a bit string for byte c + */ + bitstringlen=0; + while(hufftree[c].parent!=-2) + { if(hufftree[hufftree[c].parent].left==c) + bitstring[bitstringlen]='0'; + else + bitstring[bitstringlen]='1'; + c=hufftree[c].parent; + bitstringlen++; + } + + /* + ** Step backwards through the bit string, setting + ** bits in the compressed array as you go. + */ + while(bitstringlen--) + { SetCompBit((u8 *)comparray,(u32)bitoffset,bitstring[bitstringlen]); + bitoffset++; + } +} + +/* +** Compression done. Perform de-compression. +*/ +maxbitoffset=bitoffset; +bitoffset=0; +textoffset=0; +do { + i=root; + while(hufftree[i].left!=-1) + { if(GetCompBit((u8 *)comparray,(u32)bitoffset)==0) + i=hufftree[i].left; + else + i=hufftree[i].right; + bitoffset++; + } + decomparray[textoffset]=hufftree[i].c; + +#ifdef DEBUG + if(hufftree[i].c != plaintext[textoffset]) + { + /* Show error */ + printf("Error at textoffset %ld\n",textoffset); + status=1; + } +#endif + textoffset++; +} while(bitoffset<maxbitoffset); + +} /* End the big while(nloops--) from above */ + +/* +** All done +*/ +#ifdef DEBUG + if (status==0) printf("Huffman: OK\n"); +#endif +return(StopStopwatch(elapsed)); +} + +/*************** +** SetCompBit ** +**************** +** Set a bit in the compression array. The value of the +** bit is set according to char bitchar. +*/ +static void SetCompBit(u8 *comparray, + u32 bitoffset, + char bitchar) +{ +u32 byteoffset; +int bitnumb; + +/* +** First calculate which element in the comparray to +** alter. and the bitnumber. 
+*/ +byteoffset=bitoffset>>3; +bitnumb=bitoffset % 8; + +/* +** Set or clear +*/ +if(bitchar=='1') + comparray[byteoffset]|=(1<<bitnumb); +else + comparray[byteoffset]&=~(1<<bitnumb); + +return; +} + +/*************** +** GetCompBit ** +**************** +** Return the bit value of a bit in the comparession array. +** Returns 0 if the bit is clear, nonzero otherwise. +*/ +static int GetCompBit(u8 *comparray, + u32 bitoffset) +{ +u32 byteoffset; +int bitnumb; + +/* +** Calculate byte offset and bit number. +*/ +byteoffset=bitoffset>>3; +bitnumb=bitoffset % 8; + +/* +** Fetch +*/ +return((1<<bitnumb) & comparray[byteoffset] ); +} + +/******************************** +** BACK PROPAGATION NEURAL NET ** +********************************* +** This code is a modified version of the code +** that was submitted to BYTE Magazine by +** Maureen Caudill. It accomanied an article +** that I CANNOT NOW RECALL. +** The author's original heading/comment was +** as follows: +** +** Backpropagation Network +** Written by Maureen Caudill +** in Think C 4.0 on a Macintosh +** +** (c) Maureen Caudill 1988-1991 +** This network will accept 5x7 input patterns +** and produce 8 bit output patterns. +** The source code may be copied or modified without restriction, +** but no fee may be charged for its use. +** +** ++++++++++++++ +** I have modified the code so that it will work +** on systems other than a Macintosh -- RG +*/ + +/*********** +** DoNNet ** +************ +** Perform the neural net benchmark. +** Note that this benchmark is one of the few that +** requires an input file. That file is "NNET.DAT" and +** should be on the local directory (from which the +** benchmark program in launched). 
+*/ +void DoNNET(void) +{ +NNetStruct *locnnetstruct; /* Local ptr to global data */ +char *errorcontext; +ulong accumtime; +double iterations; + +/* +** Link to global data +*/ +locnnetstruct=&global_nnetstruct; + +/* +** Set error context +*/ +errorcontext="CPU:NNET"; + +/* +** Init random number generator. +** NOTE: It is important that the random number generator +** be re-initialized for every pass through this test. +** The NNET algorithm uses the random number generator +** to initialize the net. Results are sensitive to +** the initial neural net state. +*/ +/* randnum(3L); */ +randnum((int32)3); + +/* +** Read in the input and output patterns. We'll do this +** only once here at the beginning. These values don't +** change once loaded. +*/ +if(read_data_file()!=0) + ErrorExit(); + + +/* +** See if we need to perform self adjustment loop. +*/ +if(locnnetstruct->adjust==0) +{ + /* + ** Do self-adjustment. This involves initializing the + ** # of loops and increasing the loop count until we + ** get a number of loops that we can use. + */ + for(locnnetstruct->loops=1L; + locnnetstruct->loops<MAXNNETLOOPS; + locnnetstruct->loops++) + { /*randnum(3L); */ + randnum((int32)3); + if(DoNNetIteration(locnnetstruct->loops) + >global_min_ticks) break; + } +} + +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + /* randnum(3L); */ /* Gotta do this for Neural Net */ + randnum((int32)3); /* Gotta do this for Neural Net */ + accumtime+=DoNNetIteration(locnnetstruct->loops); + iterations+=(double)locnnetstruct->loops; +} while(TicksToSecs(accumtime)<locnnetstruct->request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. 
+*/ +locnnetstruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(locnnetstruct->adjust==0) + locnnetstruct->adjust=1; + + +return; +} + +/******************** +** DoNNetIteration ** +********************* +** Do a single iteration of the neural net benchmark. +** By iteration, we mean a "learning" pass. +*/ +static ulong DoNNetIteration(ulong nloops) +{ +ulong elapsed; /* Elapsed time */ +int patt; + +/* +** Run nloops learning cycles. Notice that, counted with +** the learning cycle is the weight randomization and +** zeroing of changes. This should reduce clock jitter, +** since we don't have to stop and start the clock for +** each iteration. +*/ +elapsed=StartStopwatch(); +while(nloops--) +{ + randomize_wts(); + zero_changes(); + iteration_count=1; + learned = F; + numpasses = 0; + while (learned == F) + { + for (patt=0; patt<numpats; patt++) + { + worst_error = 0.0; /* reset this every pass through data */ + move_wt_changes(); /* move last pass's wt changes to momentum array */ + do_forward_pass(patt); + do_back_pass(patt); + iteration_count++; + } + numpasses ++; + learned = check_out_error(); + } +#ifdef DEBUG +printf("Learned in %d passes\n",numpasses); +#endif +} +return(StopStopwatch(elapsed)); +} + +/************************* +** do_mid_forward(patt) ** +************************** +** Process the middle layer's forward pass +** The activation of middle layer's neurode is the weighted +** sum of the inputs from the input pattern, with sigmoid +** function applied to the inputs. 
+**/ +static void do_mid_forward(int patt) +{ +double sum; +int neurode, i; + +for (neurode=0;neurode<MID_SIZE; neurode++) +{ + sum = 0.0; + for (i=0; i<IN_SIZE; i++) + { /* compute weighted sum of input signals */ + sum += mid_wts[neurode][i]*in_pats[patt][i]; + } + /* + ** apply sigmoid function f(x) = 1/(1+exp(-x)) to weighted sum + */ + sum = 1.0/(1.0+exp(-sum)); + mid_out[neurode] = sum; +} +return; +} + +/********************* +** do_out_forward() ** +********************** +** process the forward pass through the output layer +** The activation of the output layer is the weighted sum of +** the inputs (outputs from middle layer), modified by the +** sigmoid function. +**/ +static void do_out_forward() +{ +double sum; +int neurode, i; + +for (neurode=0; neurode<OUT_SIZE; neurode++) +{ + sum = 0.0; + for (i=0; i<MID_SIZE; i++) + { /* + ** compute weighted sum of input signals + ** from middle layer + */ + sum += out_wts[neurode][i]*mid_out[i]; + } + /* + ** Apply f(x) = 1/(1+exp(-x)) to weighted input + */ + sum = 1.0/(1.0+exp(-sum)); + out_out[neurode] = sum; +} +return; +} + +/************************* +** display_output(patt) ** +************************** +** Display the actual output vs. the desired output of the +** network. +** Once the training is complete, and the "learned" flag set +** to TRUE, then display_output sends its output to both +** the screen and to a text output file. +** +** NOTE: This routine has been disabled in the benchmark +** version. 
-- RG +**/ +/* +void display_output(int patt) +{ +int i; + + fprintf(outfile,"\n Iteration # %d",iteration_count); + fprintf(outfile,"\n Desired Output: "); + + for (i=0; i<OUT_SIZE; i++) + { + fprintf(outfile,"%6.3f ",out_pats[patt][i]); + } + fprintf(outfile,"\n Actual Output: "); + + for (i=0; i<OUT_SIZE; i++) + { + fprintf(outfile,"%6.3f ",out_out[i]); + } + fprintf(outfile,"\n"); + return; +} +*/ + +/********************** +** do_forward_pass() ** +*********************** +** control function for the forward pass through the network +** NOTE: I have disabled the call to display_output() in +** the benchmark version -- RG. +**/ +static void do_forward_pass(int patt) +{ +do_mid_forward(patt); /* process forward pass, middle layer */ +do_out_forward(); /* process forward pass, output layer */ +/* display_output(patt); ** display results of forward pass */ +return; +} + +/*********************** +** do_out_error(patt) ** +************************ +** Compute the error for the output layer neurodes. +** This is simply Desired - Actual. +**/ +static void do_out_error(int patt) +{ +int neurode; +double error,tot_error, sum; + +tot_error = 0.0; +sum = 0.0; +for (neurode=0; neurode<OUT_SIZE; neurode++) +{ + out_error[neurode] = out_pats[patt][neurode] - out_out[neurode]; + /* + ** while we're here, also compute magnitude + ** of total error and worst error in this pass. + ** We use these to decide if we are done yet. 
+ */ + error = out_error[neurode]; + if (error <0.0) + { + sum += -error; + if (-error > tot_error) + tot_error = -error; /* worst error this pattern */ + } + else + { + sum += error; + if (error > tot_error) + tot_error = error; /* worst error this pattern */ + } +} +avg_out_error[patt] = sum/OUT_SIZE; +tot_out_error[patt] = tot_error; +return; +} + +/*********************** +** worst_pass_error() ** +************************ +** Find the worst and average error in the pass and save it +**/ +static void worst_pass_error() +{ +double error,sum; + +int i; + +error = 0.0; +sum = 0.0; +for (i=0; i<numpats; i++) +{ + if (tot_out_error[i] > error) error = tot_out_error[i]; + sum += avg_out_error[i]; +} +worst_error = error; +average_error = sum/numpats; +return; +} + +/******************* +** do_mid_error() ** +******************** +** Compute the error for the middle layer neurodes +** This is based on the output errors computed above. +** Note that the derivative of the sigmoid f(x) is +** f'(x) = f(x)(1 - f(x)) +** Recall that f(x) is merely the output of the middle +** layer neurode on the forward pass. +**/ +static void do_mid_error() +{ +double sum; +int neurode, i; + +for (neurode=0; neurode<MID_SIZE; neurode++) +{ + sum = 0.0; + for (i=0; i<OUT_SIZE; i++) + sum += out_wts[i][neurode]*out_error[i]; + + /* + ** apply the derivative of the sigmoid here + ** Because of the choice of sigmoid f(I), the derivative + ** of the sigmoid is f'(I) = f(I)(1 - f(I)) + */ + mid_error[neurode] = mid_out[neurode]*(1-mid_out[neurode])*sum; +} +return; +} + +/********************* +** adjust_out_wts() ** +********************** +** Adjust the weights of the output layer. The error for +** the output layer has been previously propagated back to +** the middle layer. +** Use the Delta Rule with momentum term to adjust the weights. 
+**/ +static void adjust_out_wts() +{ +int weight, neurode; +double learn,delta,alph; + +learn = BETA; +alph = ALPHA; +for (neurode=0; neurode<OUT_SIZE; neurode++) +{ + for (weight=0; weight<MID_SIZE; weight++) + { + /* standard delta rule */ + delta = learn * out_error[neurode] * mid_out[weight]; + + /* now the momentum term */ + delta += alph * out_wt_change[neurode][weight]; + out_wts[neurode][weight] += delta; + + /* keep track of this pass's cum wt changes for next pass's momentum */ + out_wt_cum_change[neurode][weight] += delta; + } +} +return; +} + +/************************* +** adjust_mid_wts(patt) ** +************************** +** Adjust the middle layer weights using the previously computed +** errors. +** We use the Generalized Delta Rule with momentum term +**/ +static void adjust_mid_wts(int patt) +{ +int weight, neurode; +double learn,alph,delta; + +learn = BETA; +alph = ALPHA; +for (neurode=0; neurode<MID_SIZE; neurode++) +{ + for (weight=0; weight<IN_SIZE; weight++) + { + /* first the basic delta rule */ + delta = learn * mid_error[neurode] * in_pats[patt][weight]; + + /* with the momentum term */ + delta += alph * mid_wt_change[neurode][weight]; + mid_wts[neurode][weight] += delta; + + /* keep track of this pass's cum wt changes for next pass's momentum */ + mid_wt_cum_change[neurode][weight] += delta; + } +} +return; +} + +/******************* +** do_back_pass() ** +******************** +** Process the backward propagation of error through network. +**/ +void do_back_pass(int patt) +{ + +do_out_error(patt); +do_mid_error(); +adjust_out_wts(); +adjust_mid_wts(patt); + +return; +} + + +/********************** +** move_wt_changes() ** +*********************** +** Move the weight changes accumulated last pass into the wt-change +** array for use by the momentum term in this pass. Also zero out +** the accumulating arrays after the move. 
+**/ +static void move_wt_changes() +{ +int i,j; + +for (i = 0; i<MID_SIZE; i++) + for (j = 0; j<IN_SIZE; j++) + { + mid_wt_change[i][j] = mid_wt_cum_change[i][j]; + /* + ** Zero it out for next pass accumulation. + */ + mid_wt_cum_change[i][j] = 0.0; + } + +for (i = 0; i<OUT_SIZE; i++) + for (j=0; j<MID_SIZE; j++) + { + out_wt_change[i][j] = out_wt_cum_change[i][j]; + out_wt_cum_change[i][j] = 0.0; + } + +return; +} + +/********************** +** check_out_error() ** +*********************** +** Check to see if the error in the output layer is below +** MARGIN*OUT_SIZE for all output patterns. If so, then +** assume the network has learned acceptably well. This +** is simply an arbitrary measure of how well the network +** has learned -- many other standards are possible. +**/ +static int check_out_error() +{ +int result,i,error; + +result = T; +error = F; +worst_pass_error(); /* identify the worst error in this pass */ + +/* +#ifdef DEBUG +printf("\n Iteration # %d",iteration_count); +#endif +*/ +for (i=0; i<numpats; i++) +{ +/* printf("\n Error pattern %d: Worst: %8.3f; Average: %8.3f", + i+1,tot_out_error[i], avg_out_error[i]); + fprintf(outfile, + "\n Error pattern %d: Worst: %8.3f; Average: %8.3f", + i+1,tot_out_error[i]); +*/ + + if (worst_error >= STOP) result = F; + if (tot_out_error[i] >= 16.0) error = T; +} + +if (error == T) result = ERR; + + +#ifdef DEBUG +/* printf("\n Error this pass thru data: Worst: %8.3f; Average: %8.3f", + worst_error,average_error); +*/ +/* fprintf(outfile, + "\n Error this pass thru data: Worst: %8.3f; Average: %8.3f", + worst_error, average_error); */ +#endif + +return(result); +} + + +/******************* +** zero_changes() ** +******************** +** Zero out all the wt change arrays +**/ +static void zero_changes() +{ +int i,j; + +for (i = 0; i<MID_SIZE; i++) +{ + for (j=0; j<IN_SIZE; j++) + { + mid_wt_change[i][j] = 0.0; + mid_wt_cum_change[i][j] = 0.0; + } +} + +for (i = 0; i< OUT_SIZE; i++) +{ + for (j=0; j<MID_SIZE; 
j++) + { + out_wt_change[i][j] = 0.0; + out_wt_cum_change[i][j] = 0.0; + } +} +return; +} + + +/******************** +** randomize_wts() ** +********************* +** Intialize the weights in the middle and output layers to +** random values between -0.25..+0.25 +** Function rand() returns a value between 0 and 32767. +** +** NOTE: Had to make alterations to how the random numbers were +** created. -- RG. +**/ +static void randomize_wts() +{ +int neurode,i; +double value; + +/* +** Following not used int benchmark version -- RG +** +** printf("\n Please enter a random number seed (1..32767): "); +** scanf("%d", &i); +** srand(i); +*/ + +for (neurode = 0; neurode<MID_SIZE; neurode++) +{ + for(i=0; i<IN_SIZE; i++) + { + /* value=(double)abs_randwc(100000L); */ + value=(double)abs_randwc((int32)100000); + value=value/(double)100000.0 - (double) 0.5; + mid_wts[neurode][i] = value/2; + } +} +for (neurode=0; neurode<OUT_SIZE; neurode++) +{ + for(i=0; i<MID_SIZE; i++) + { + /* value=(double)abs_randwc(100000L); */ + value=(double)abs_randwc((int32)100000); + value=value/(double)10000.0 - (double) 0.5; + out_wts[neurode][i] = value/2; + } +} + +return; +} + + +/********************* +** read_data_file() ** +********************** +** Read in the input data file and store the patterns in +** in_pats and out_pats. +** The format for the data file is as follows: +** +** line# data expected +** ----- ------------------------------ +** 1 In-X-size,in-y-size,out-size +** 2 number of patterns in file +** 3 1st X row of 1st input pattern +** 4.. following rows of 1st input pattern pattern +** in-x+2 y-out pattern +** 1st X row of 2nd pattern +** etc. +** +** Each row of data is separated by commas or spaces. +** The data is expected to be ascii text corresponding to +** either a +1 or a 0. 
+** +** Sample input for a 1-pattern file (The comments to the +** right may NOT be in the file unless more sophisticated +** parsing of the input is done.): +** +** 5,7,8 input is 5x7 grid, output is 8 bits +** 1 one pattern in file +** 0,1,1,1,0 beginning of pattern for "O" +** 1,0,0,0,1 +** 1,0,0,0,1 +** 1,0,0,0,1 +** 1,0,0,0,1 +** 1,0,0,0,0 +** 0,1,1,1,0 +** 0,1,0,0,1,1,1,1 ASCII code for "O" -- 0100 1111 +** +** Clearly, this simple scheme can be expanded or enhanced +** any way you like. +** +** Returns -1 if any file error occurred, otherwise 0. +**/ +static int read_data_file() +{ +FILE *infile; + +int xinsize,yinsize,youtsize; +int patt, element, i, row; +int vals_read; +int val1,val2,val3,val4,val5,val6,val7,val8; + +/* printf("\n Opening and retrieving data from file."); */ + +infile = fopen(inpath, "r"); +if (infile == NULL) +{ + printf("\n CPU:NNET--error in opening file!"); + return -1 ; +} +vals_read =fscanf(infile,"%d %d %d",&xinsize,&yinsize,&youtsize); +if (vals_read != 3) +{ + printf("\n CPU:NNET -- Should read 3 items in line one; did read %d",vals_read); + return -1; +} +vals_read=fscanf(infile,"%d",&numpats); +if (vals_read !=1) +{ + printf("\n CPU:NNET -- Should read 1 item in line 2; did read %d",vals_read); + return -1; +} +if (numpats > MAXPATS) + numpats = MAXPATS; + +for (patt=0; patt<numpats; patt++) +{ + element = 0; + for (row = 0; row<yinsize; row++) + { + vals_read = fscanf(infile,"%d %d %d %d %d", + &val1, &val2, &val3, &val4, &val5); + if (vals_read != 5) + { + printf ("\n CPU:NNET -- failure in reading input!"); + return -1; + } + element=row*xinsize; + + in_pats[patt][element] = (double) val1; element++; + in_pats[patt][element] = (double) val2; element++; + in_pats[patt][element] = (double) val3; element++; + in_pats[patt][element] = (double) val4; element++; + in_pats[patt][element] = (double) val5; element++; + } + for (i=0;i<IN_SIZE; i++) + { + if (in_pats[patt][i] >= 0.9) + in_pats[patt][i] = 0.9; + if (in_pats[patt][i] <= 
0.1) + in_pats[patt][i] = 0.1; + } + element = 0; + vals_read = fscanf(infile,"%d %d %d %d %d %d %d %d", + &val1, &val2, &val3, &val4, &val5, &val6, &val7, &val8); + + out_pats[patt][element] = (double) val1; element++; + out_pats[patt][element] = (double) val2; element++; + out_pats[patt][element] = (double) val3; element++; + out_pats[patt][element] = (double) val4; element++; + out_pats[patt][element] = (double) val5; element++; + out_pats[patt][element] = (double) val6; element++; + out_pats[patt][element] = (double) val7; element++; + out_pats[patt][element] = (double) val8; element++; +} + +/* printf("\n Closing the input file now. "); */ + +fclose(infile); +return(0); +} + +/********************* +** initialize_net() ** +********************** +** Do all the initialization stuff before beginning +*/ +/* +static int initialize_net() +{ +int err_code; + +randomize_wts(); +zero_changes(); +err_code = read_data_file(); +iteration_count = 1; +return(err_code); +} +*/ + +/********************** +** display_mid_wts() ** +*********************** +** Display the weights on the middle layer neurodes +** NOTE: This routine is not used in the benchmark +** test -- RG +**/ +/* static void display_mid_wts() +{ +int neurode, weight, row, col; + +fprintf(outfile,"\n Weights of Middle Layer neurodes:"); + +for (neurode=0; neurode<MID_SIZE; neurode++) +{ + fprintf(outfile,"\n Mid Neurode # %d",neurode); + for (row=0; row<IN_Y_SIZE; row++) + { + fprintf(outfile,"\n "); + for (col=0; col<IN_X_SIZE; col++) + { + weight = IN_X_SIZE * row + col; + fprintf(outfile," %8.3f ", mid_wts[neurode][weight]); + } + } +} +return; +} +*/ +/********************** +** display_out_wts() ** +*********************** +** Display the weights on the output layer neurodes +** NOTE: This code is not used in the benchmark +** test -- RG +*/ +/* void display_out_wts() +{ +int neurode, weight; + + fprintf(outfile,"\n Weights of Output Layer neurodes:"); + + for (neurode=0; neurode<OUT_SIZE; neurode++) + 
{ + fprintf(outfile,"\n Out Neurode # %d \n",neurode); + for (weight=0; weight<MID_SIZE; weight++) + { + fprintf(outfile," %8.3f ", out_wts[neurode][weight]); + } + } + return; +} +*/ + +/*********************** +** LU DECOMPOSITION ** +** (Linear Equations) ** +************************ +** These routines come from "Numerical Recipes in Pascal". +** Note that, as in the assignment algorithm, though we +** separately define LUARRAYROWS and LUARRAYCOLS, the two +** must be the same value (this routine depends on a square +** matrix). +*/ + +/********* +** DoLU ** +********** +** Perform the LU decomposition benchmark. +*/ +void DoLU(void) +{ +LUStruct *loclustruct; /* Local pointer to global data */ +char *errorcontext; +int systemerror; +fardouble *a; +fardouble *b; +fardouble *abase; +fardouble *bbase; +LUdblptr ptra; +int n; +int i; +ulong accumtime; +double iterations; + +/* +** Link to global data +*/ +loclustruct=&global_lustruct; + +/* +** Set error context. +*/ +errorcontext="FPU:LU"; + +/* +** Our first step is to build a "solvable" problem. This +** will become the "seed" set that all others will be +** derived from. (I.E., we'll simply copy these arrays +** into the others. +*/ +a=(fardouble *)AllocateMemory(sizeof(double) * LUARRAYCOLS * LUARRAYROWS, + &systemerror); +b=(fardouble *)AllocateMemory(sizeof(double) * LUARRAYROWS, + &systemerror); +n=LUARRAYROWS; + +/* +** We need to allocate a temp vector that is used by the LU +** algorithm. This removes the allocation routine from the +** timing. +*/ +LUtempvv=(fardouble *)AllocateMemory(sizeof(double)*LUARRAYROWS, + &systemerror); + +/* +** Build a problem to be solved. +*/ +ptra.ptrs.p=a; /* Gotta coerce linear array to 2D array */ +build_problem(*ptra.ptrs.ap,n,b); + +/* +** Now that we have a problem built, see if we need to do +** auto-adjust. If so, repeatedly call the DoLUIteration routine, +** increasing the number of solutions per iteration as you go. 
+*/ +if(loclustruct->adjust==0) +{ + loclustruct->numarrays=0; + for(i=1;i<=MAXLUARRAYS;i++) + { + abase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYCOLS*LUARRAYROWS*(i+1),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + LUFreeMem(a,b,(fardouble *)NULL,(fardouble *)NULL); + ErrorExit(); + } + bbase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYROWS*(i+1),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + LUFreeMem(a,b,abase,(fardouble *)NULL); + ErrorExit(); + } + if(DoLUIteration(a,b,abase,bbase,i)>global_min_ticks) + { loclustruct->numarrays=i; + break; + } + /* + ** Not enough arrays...free them all and try again + */ + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + } + /* + ** Were we able to do it? + */ + if(loclustruct->numarrays==0) + { printf("FPU:LU -- Array limit reached\n"); + LUFreeMem(a,b,abase,bbase); + ErrorExit(); + } +} +else +{ /* + ** Don't need to adjust -- just allocate the proper + ** number of arrays and proceed. + */ + abase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYCOLS*LUARRAYROWS*loclustruct->numarrays, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + LUFreeMem(a,b,(fardouble *)NULL,(fardouble *)NULL); + ErrorExit(); + } + bbase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYROWS*loclustruct->numarrays,&systemerror); + if(systemerror) + { + ReportError(errorcontext,systemerror); + LUFreeMem(a,b,abase,(fardouble *)NULL); + ErrorExit(); + } +} +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoLUIteration(a,b,abase,bbase, + loclustruct->numarrays); + iterations+=(double)loclustruct->numarrays; +} while(TicksToSecs(accumtime)<loclustruct->request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. 
+*/ +loclustruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(loclustruct->adjust==0) + loclustruct->adjust=1; + +LUFreeMem(a,b,abase,bbase); +return; +} + +/************** +** LUFreeMem ** +*************** +** Release memory associated with LU benchmark. +*/ +static void LUFreeMem(fardouble *a, fardouble *b, + fardouble *abase,fardouble *bbase) +{ +int systemerror; + +FreeMemory((farvoid *)a,&systemerror); +FreeMemory((farvoid *)b,&systemerror); +FreeMemory((farvoid *)LUtempvv,&systemerror); + +if(abase!=(fardouble *)NULL) FreeMemory((farvoid *)abase,&systemerror); +if(bbase!=(fardouble *)NULL) FreeMemory((farvoid *)bbase,&systemerror); +return; +} + +/****************** +** DoLUIteration ** +******************* +** Perform an iteration of the LU decomposition benchmark. +** An iteration refers to the repeated solution of several +** identical matrices. +*/ +static ulong DoLUIteration(fardouble *a,fardouble *b, + fardouble *abase, fardouble *bbase, + ulong numarrays) +{ +fardouble *locabase; +fardouble *locbbase; +LUdblptr ptra; /* For converting ptr to 2D array */ +ulong elapsed; +ulong j,i; /* Indexes */ + + +/* +** Move the seed arrays (a & b) into the destination +** arrays; +*/ +for(j=0;j<numarrays;j++) +{ locabase=abase+j*LUARRAYROWS*LUARRAYCOLS; + locbbase=bbase+j*LUARRAYROWS; + for(i=0;i<LUARRAYROWS*LUARRAYCOLS;i++) + *(locabase+i)=*(a+i); + for(i=0;i<LUARRAYROWS;i++) + *(locbbase+i)=*(b+i); +} + +/* +** Do test...begin timing. +*/ +elapsed=StartStopwatch(); +for(i=0;i<numarrays;i++) +{ locabase=abase+i*LUARRAYROWS*LUARRAYCOLS; + locbbase=bbase+i*LUARRAYROWS; + ptra.ptrs.p=locabase; + lusolve(*ptra.ptrs.ap,LUARRAYROWS,locbbase); +} + +return(StopStopwatch(elapsed)); +} + +/****************** +** build_problem ** +******************* +** Constructs a solvable set of linear equations. It does this by +** creating an identity matrix, then loading the solution vector +** with random numbers. 
After that, the identity matrix and +** solution vector are randomly "scrambled". Scrambling is +** done by (a) randomly selecting a row and multiplying that +** row by a random number and (b) adding one randomly-selected +** row to another. +*/ +static void build_problem(double a[][LUARRAYCOLS], + int n, + double b[LUARRAYROWS]) +{ +long i,j,k,k1; /* Indexes */ +double rcon; /* Random constant */ + +/* +** Reset random number generator +*/ +/* randnum(13L); */ +randnum((int32)13); + +/* +** Build an identity matrix. +** We'll also use this as a chance to load the solution +** vector. +*/ +for(i=0;i<n;i++) +{ /* b[i]=(double)(abs_randwc(100L)+1L); */ + b[i]=(double)(abs_randwc((int32)100)+(int32)1); + for(j=0;j<n;j++) + if(i==j) + /* a[i][j]=(double)(abs_randwc(1000L)+1L); */ + a[i][j]=(double)(abs_randwc((int32)1000)+(int32)1); + else + a[i][j]=(double)0.0; +} + +#ifdef DEBUG +printf("Problem:\n"); +for(i=0;i<n;i++) +{ +/* + for(j=0;j<n;j++) + printf("%6.2f ",a[i][j]); +*/ + printf("%.0f/%.0f=%.2f\t",b[i],a[i][i],b[i]/a[i][i]); +/* + printf("\n"); +*/ +} +#endif + +/* +** Scramble. Do this 8n times. See comment above for +** a description of the scrambling process. +*/ + +for(i=0;i<8*n;i++) +{ + /* + ** Pick a row and a random constant. Multiply + ** all elements in the row by the constant. + */ + /* k=abs_randwc((long)n); + rcon=(double)(abs_randwc(20L)+1L); + for(j=0;j<n;j++) + a[k][j]=a[k][j]*rcon; + b[k]=b[k]*rcon; +*/ + /* + ** Pick two random rows and add second to + ** first. Note that we also occasionally multiply + ** by minus 1 so that we get a subtraction operation. 
+ */ + /* k=abs_randwc((long)n); */ + /* k1=abs_randwc((long)n); */ + k=abs_randwc((int32)n); + k1=abs_randwc((int32)n); + if(k!=k1) + { + if(k<k1) rcon=(double)1.0; + else rcon=(double)-1.0; + for(j=0;j<n;j++) + a[k][j]+=a[k1][j]*rcon;; + b[k]+=b[k1]*rcon; + } +} + +return; +} + + +/*********** +** ludcmp ** +************ +** From the procedure of the same name in "Numerical Recipes in Pascal", +** by Press, Flannery, Tukolsky, and Vetterling. +** Given an nxn matrix a[], this routine replaces it by the LU +** decomposition of a rowwise permutation of itself. a[] and n +** are input. a[] is output, modified as follows: +** -- -- +** | b(1,1) b(1,2) b(1,3)... | +** | a(2,1) b(2,2) b(2,3)... | +** | a(3,1) a(3,2) b(3,3)... | +** | a(4,1) a(4,2) a(4,3)... | +** | ... | +** -- -- +** +** Where the b(i,j) elements form the upper triangular matrix of the +** LU decomposition, and the a(i,j) elements form the lower triangular +** elements. The LU decomposition is calculated so that we don't +** need to store the a(i,i) elements (which would have laid along the +** diagonal and would have all been 1). +** +** indx[] is an output vector that records the row permutation +** effected by the partial pivoting; d is output as +/-1 depending +** on whether the number of row interchanges was even or odd, +** respectively. +** Returns 0 if matrix singular, else returns 1. +*/ +static int ludcmp(double a[][LUARRAYCOLS], + int n, + int indx[], + int *d) +{ + +double big; /* Holds largest element value */ +double sum; +double dum; /* Holds dummy value */ +int i,j,k; /* Indexes */ +int imax=0; /* Holds max index value */ +double tiny; /* A really small number */ + +tiny=(double)1.0e-20; + +*d=1; /* No interchanges yet */ + +for(i=0;i<n;i++) +{ big=(double)0.0; + for(j=0;j<n;j++) + if((double)fabs(a[i][j]) > big) + big=fabs(a[i][j]); + /* Bail out on singular matrix */ + if(big==(double)0.0) return(0); + LUtempvv[i]=1.0/big; +} + +/* +** Crout's algorithm...loop over columns. 
+*/ +for(j=0;j<n;j++) +{ if(j!=0) + for(i=0;i<j;i++) + { sum=a[i][j]; + if(i!=0) + for(k=0;k<i;k++) + sum-=(a[i][k]*a[k][j]); + a[i][j]=sum; + } + big=(double)0.0; + for(i=j;i<n;i++) + { sum=a[i][j]; + if(j!=0) + for(k=0;k<j;k++) + sum-=a[i][k]*a[k][j]; + a[i][j]=sum; + dum=LUtempvv[i]*fabs(sum); + if(dum>=big) + { big=dum; + imax=i; + } + } + if(j!=imax) /* Interchange rows if necessary */ + { for(k=0;k<n;k++) + { dum=a[imax][k]; + a[imax][k]=a[j][k]; + a[j][k]=dum; + } + *d=-*d; /* Change parity of d */ + dum=LUtempvv[imax]; + LUtempvv[imax]=LUtempvv[j]; /* Don't forget scale factor */ + LUtempvv[j]=dum; + } + indx[j]=imax; + /* + ** If the pivot element is zero, the matrix is singular + ** (at least as far as the precision of the machine + ** is concerned.) We'll take the original author's + ** recommendation and replace 0.0 with "tiny". + */ + if(a[j][j]==(double)0.0) + a[j][j]=tiny; + + if(j!=(n-1)) + { dum=1.0/a[j][j]; + for(i=j+1;i<n;i++) + a[i][j]=a[i][j]*dum; + } +} + +return(1); +} + +/*********** +** lubksb ** +************ +** Also from "Numerical Recipes in Pascal". +** This routine solves the set of n linear equations A X = B. +** Here, a[][] is input, not as the matrix A, but as its +** LU decomposition, created by the routine ludcmp(). +** Indx[] is input as the permutation vector returned by ludcmp(). +** b[] is input as the right-hand side an returns the +** solution vector X. +** a[], n, and indx are not modified by this routine and +** can be left in place for different values of b[]. +** This routine takes into account the possibility that b will +** begin with many zero elements, so it is efficient for use in +** matrix inversion. +*/ +static void lubksb( double a[][LUARRAYCOLS], + int n, + int indx[LUARRAYROWS], + double b[LUARRAYROWS]) +{ + +int i,j; /* Indexes */ +int ip; /* "pointer" into indx */ +int ii; +double sum; + +/* +** When ii is set to a positive value, it will become +** the index of the first nonvanishing element of b[]. 
+** We now do the forward substitution. The only wrinkle +** is to unscramble the permutation as we go. +*/ +ii=-1; +for(i=0;i<n;i++) +{ ip=indx[i]; + sum=b[ip]; + b[ip]=b[i]; + if(ii!=-1) + for(j=ii;j<i;j++) + sum=sum-a[i][j]*b[j]; + else + /* + ** If a nonzero element is encountered, we have + ** to do the sums in the loop above. + */ + if(sum!=(double)0.0) + ii=i; + b[i]=sum; +} +/* +** Do backsubstitution +*/ +for(i=(n-1);i>=0;i--) +{ + sum=b[i]; + if(i!=(n-1)) + for(j=(i+1);j<n;j++) + sum=sum-a[i][j]*b[j]; + b[i]=sum/a[i][i]; +} +return; +} + +/************ +** lusolve ** +************* +** Solve a linear set of equations: A x = b +** Original matrix A will be destroyed by this operation. +** Returns 0 if matrix is singular, 1 otherwise. +*/ +static int lusolve(double a[][LUARRAYCOLS], + int n, + double b[LUARRAYROWS]) +{ +int indx[LUARRAYROWS]; +int d; +#ifdef DEBUG +int i,j; +#endif + +if(ludcmp(a,n,indx,&d)==0) return(0); + +/* Matrix not singular -- proceed */ +lubksb(a,n,indx,b); + +#ifdef DEBUG +printf("Solution:\n"); +for(i=0;i<n;i++) +{ + for(j=0;j<n;j++){ + /* + printf("%6.2f ",a[i][j]); + */ + } + printf("%6.2f\t",b[i]); + /* + printf("\n"); + */ +} +printf("\n"); +#endif + +return(1); +} diff --git a/nbench1.h b/nbench1.h new file mode 100644 index 0000000..13a5907 --- /dev/null +++ b/nbench1.h @@ -0,0 +1,428 @@ +/* +** nbench1.h +** Header for nbench1.c +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. 
Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/* +** DEFINES +*/ +/* #define DEBUG */ + +/* +** EXTERNALS +*/ +extern ulong global_min_ticks; + +extern SortStruct global_numsortstruct; +extern SortStruct global_strsortstruct; +extern BitOpStruct global_bitopstruct; +extern EmFloatStruct global_emfloatstruct; +extern FourierStruct global_fourierstruct; +extern AssignStruct global_assignstruct; +extern IDEAStruct global_ideastruct; +extern HuffStruct global_huffstruct; +extern NNetStruct global_nnetstruct; +extern LUStruct global_lustruct; + +/* External PROTOTYPES */ +/*extern unsigned long abs_randwc(unsigned long num);*/ /* From MISC */ +/*extern long randnum(long lngval);*/ +extern int32 randwc(int32 num); +extern u32 abs_randwc(u32 num); +extern int32 randnum(int32 lngval); + +extern farvoid *AllocateMemory(unsigned long nbytes, /* From SYSSPEC */ + int *errorcode); +extern void FreeMemory(farvoid *mempointer, + int *errorcode); +extern void MoveMemory(farvoid *destination, + farvoid *source, unsigned long nbytes); +extern void ReportError(char *context, int errorcode); +extern void ErrorExit(); +extern unsigned long StartStopwatch(); +extern unsigned long StopStopwatch(unsigned long startticks); +extern unsigned long TicksToSecs(unsigned long tickamount); +extern double TicksToFracSecs(unsigned long tickamount); + +/***************** +** NUMERIC SORT ** +*****************/ + +/* +** PROTOTYPES +*/ +void DoNumSort(void); +static ulong DoNumSortIteration(farlong *arraybase, + ulong arraysize, + uint numarrays); +static void LoadNumArrayWithRand(farlong *array, + ulong arraysize, + uint numarrays); +static void NumHeapSort(farlong 
*array, + ulong bottom, + ulong top); +static void NumSift(farlong *array, + ulong i, + ulong j); + + +/**************** +** STRING SORT ** +***************** +*/ + + +/* +** PROTOTYPES +*/ +void DoStringSort(void); +static ulong DoStringSortIteration(faruchar *arraybase, + uint numarrays, + ulong arraysize); +static farulong *LoadStringArray(faruchar *strarray, + uint numarrays, + ulong *strings, + ulong arraysize); +static void stradjust(farulong *optrarray, + faruchar *strarray, + ulong nstrings, + ulong i, + uchar l); +static void StrHeapSort(farulong *optrarray, + faruchar *strarray, + ulong numstrings, + ulong bottom, + ulong top); +static int str_is_less(farulong *optrarray, + faruchar *strarray, + ulong numstrings, + ulong a, + ulong b); +static void strsift(farulong *optrarray, + faruchar *strarray, + ulong numstrings, + ulong i, + ulong j); + +/************************ +** BITFIELD OPERATIONS ** +************************* +*/ + +/* +** PROTOTYPES +*/ +void DoBitops(void); +static ulong DoBitfieldIteration(farulong *bitarraybase, + farulong *bitoparraybase, + long bitoparraysize, + ulong *nbitops); +static void ToggleBitRun(farulong *bitmap, + ulong bit_addr, + ulong nbits, + uint val); +static void FlipBitRun(farulong *bitmap, + ulong bit_addr, + ulong nbits); + +/**************************** +** EMULATED FLOATING POINT ** +****************************/ +typedef struct +{ + u8 type; /* Indicates, NORMAL, SUBNORMAL, etc. 
*/ + u8 sign; /* Mantissa sign */ + short exp; /* Signed exponent...no bias */ + u16 mantissa[INTERNAL_FPF_PRECISION]; +} InternalFPF; + +/* +** PROTOTYPES +*/ +void DoEmFloat(void); + +/* +** EXTERNALS +*/ +extern void SetupCPUEmFloatArrays(InternalFPF *abase, + InternalFPF *bbase, InternalFPF *cbase, + ulong arraysize); +extern ulong DoEmFloatIteration(InternalFPF *abase, + InternalFPF *bbase, InternalFPF *cbase, + ulong arraysize, ulong loops); + +/************************* +** FOURIER COEFFICIENTS ** +*************************/ + +/* +** PROTOTYPES +*/ +void DoFourier(void); +static ulong DoFPUTransIteration(fardouble *abase, + fardouble *bbase, + ulong arraysize); +static double TrapezoidIntegrate(double x0, + double x1, + int nsteps, + double omegan, + int select); +static double thefunction(double x, + double omegan, + int select); + +/************************* +** ASSIGNMENT ALGORITHM ** +*************************/ + +/* +** DEFINES +*/ + +#define ASSIGNROWS 101L +#define ASSIGNCOLS 101L + +/* +** TYPEDEFS +*/ +typedef struct { + union { + long *p; + long (*ap)[ASSIGNROWS][ASSIGNCOLS]; + } ptrs; +} longptr; + +/* +** PROTOTYPES +*/ +void DoAssign(void); +static ulong DoAssignIteration(farlong *arraybase, + ulong numarrays); +static void LoadAssignArrayWithRand(farlong *arraybase, + ulong numarrays); +static void LoadAssign(farlong arraybase[][ASSIGNCOLS]); +static void CopyToAssign(farlong arrayfrom[][ASSIGNCOLS], + long arrayto[][ASSIGNCOLS]); +static void Assignment(farlong arraybase[][ASSIGNCOLS]); +static void calc_minimum_costs(long tableau[][ASSIGNCOLS]); +static int first_assignments(long tableau[][ASSIGNCOLS], + short assignedtableau[][ASSIGNCOLS]); +static void second_assignments(long tableau[][ASSIGNCOLS], + short assignedtableau[][ASSIGNCOLS]); + +/******************** +** IDEA ENCRYPTION ** +********************/ + +/* +** DEFINES +*/ +#define IDEAKEYSIZE 16 +#define IDEABLOCKSIZE 8 +#define ROUNDS 8 +#define KEYLEN (6*ROUNDS+4) + +/* +** MACROS 
+*/ +#define low16(x) ((x) & 0x0FFFF) +#define MUL(x,y) (x=mul(low16(x),y)) + + +typedef u16 IDEAkey[KEYLEN]; + +/* +** PROTOTYPES +*/ +void DoIDEA(void); +static ulong DoIDEAIteration(faruchar *plain1, + faruchar *crypt1, faruchar *plain2, + ulong arraysize, ulong nloops, + IDEAkey Z, IDEAkey DK); +static u16 mul(register u16 a, register u16 b); +static u16 inv(u16 x); +static void en_key_idea(u16 userkey[8], IDEAkey Z); +static void de_key_idea(IDEAkey Z, IDEAkey DK); +static void cipher_idea(u16 in[4], u16 out[4], IDEAkey Z); + +/************************ +** HUFFMAN COMPRESSION ** +************************/ + +/* +** DEFINES +*/ +#define EXCLUDED 32000L /* Big positive value */ + +/* +** TYPEDEFS +*/ +typedef struct { + uchar c; /* Byte value */ + float freq; /* Frequency */ + int parent; /* Parent node */ + int left; /* Left pointer = 0 */ + int right; /* Right pointer = 1 */ +} huff_node; + +/* +** GLOBALS +*/ +static huff_node *hufftree; /* The huffman tree */ +static long plaintextlen; /* Length of plaintext */ + +/* +** PROTOTYPES +*/ +void DoHuffman(); +static void create_text_line(farchar *dt,long nchars); +static void create_text_block(farchar *tb, ulong tblen, + ushort maxlinlen); +static ulong DoHuffIteration(farchar *plaintext, + farchar *comparray, farchar *decomparray, + ulong arraysize, ulong nloops, huff_node *hufftree); +static void SetCompBit(u8 *comparray, u32 bitoffset, char bitchar); +static int GetCompBit(u8 *comparray, u32 bitoffset); + +/******************************** +** BACK PROPAGATION NEURAL NET ** +********************************/ + +/* +** DEFINES +*/ +#define T 1 /* TRUE */ +#define F 0 /* FALSE */ +#define ERR -1 +#define MAXPATS 10 /* max number of patterns in data file */ +#define IN_X_SIZE 5 /* number of neurodes/row of input layer */ +#define IN_Y_SIZE 7 /* number of neurodes/col of input layer */ +#define IN_SIZE 35 /* equals IN_X_SIZE*IN_Y_SIZE */ +#define MID_SIZE 8 /* number of neurodes in middle layer */ +#define 
OUT_SIZE 8 /* number of neurodes in output layer */ +#define MARGIN 0.1 /* how near to 1,0 do we have to come to stop? */ +#define BETA 0.09 /* beta learning constant */ +#define ALPHA 0.09 /* momentum term constant */ +#define STOP 0.1 /* when worst_error less than STOP, training is done */ + +/* +** GLOBALS +*/ +double mid_wts[MID_SIZE][IN_SIZE]; /* middle layer weights */ +double out_wts[OUT_SIZE][MID_SIZE]; /* output layer weights */ +double mid_out[MID_SIZE]; /* middle layer output */ +double out_out[OUT_SIZE]; /* output layer output */ +double mid_error[MID_SIZE]; /* middle layer errors */ +double out_error[OUT_SIZE]; /* output layer errors */ +double mid_wt_change[MID_SIZE][IN_SIZE]; /* storage for last wt change */ +double out_wt_change[OUT_SIZE][MID_SIZE]; /* storage for last wt change */ +double in_pats[MAXPATS][IN_SIZE]; /* input patterns */ +double out_pats[MAXPATS][OUT_SIZE]; /* desired output patterns */ +double tot_out_error[MAXPATS]; /* measure of whether net is done */ +double out_wt_cum_change[OUT_SIZE][MID_SIZE]; /* accumulated wt changes */ +double mid_wt_cum_change[MID_SIZE][IN_SIZE]; /* accumulated wt changes */ + +double worst_error; /* worst error each pass through the data */ +double average_error; /* average error each pass through the data */ +double avg_out_error[MAXPATS]; /* average error each pattern */ + +int iteration_count; /* number of passes thru network so far */ +int numpats; /* number of patterns in data file */ +int numpasses; /* number of training passes through data file */ +int learned; /* flag--if TRUE, network has learned all patterns */ + +/* +** The Neural Net test requires an input data file. +** The name is specified here. 
+*/ +char *inpath="NNET.DAT"; + +/* +** PROTOTYPES +*/ +void DoNNET(void); +static ulong DoNNetIteration(ulong nloops); +static void do_mid_forward(int patt); +static void do_out_forward(); +void display_output(int patt); +static void do_forward_pass(int patt); +static void do_out_error(int patt); +static void worst_pass_error(); +static void do_mid_error(); +static void adjust_out_wts(); +static void adjust_mid_wts(); +static void do_back_pass(int patt); +static void move_wt_changes(); +static int check_out_error(); +static void zero_changes(); +static void randomize_wts(); +static int read_data_file(); +/* static int initialize_net(); */ + +/*********************** +** LU DECOMPOSITION ** +** (Linear Equations) ** +***********************/ + +/* +** DEFINES +*/ + +#define LUARRAYROWS 101L +#define LUARRAYCOLS 101L + +/* +** TYPEDEFS +*/ +typedef struct +{ union + { fardouble *p; + fardouble (*ap)[][LUARRAYCOLS]; + } ptrs; +} LUdblptr; + +/* +** GLOBALS +*/ +fardouble *LUtempvv; + +/* +** PROTOTYPES +*/ +void DoLU(void); +static void LUFreeMem(fardouble *a, fardouble *b, + fardouble *abase, fardouble *bbase); +static ulong DoLUIteration(fardouble *a, fardouble *b, + fardouble *abase, fardouble *bbase, + ulong numarrays); +static void build_problem( double a[][LUARRAYCOLS], + int n, double b[LUARRAYROWS]); +static int ludcmp(double a[][LUARRAYCOLS], + int n, int indx[], int *d); +static void lubksb(double a[][LUARRAYCOLS], + int n, int indx[LUARRAYROWS], + double b[LUARRAYROWS]); +static int lusolve(double a[][LUARRAYCOLS], + int n, double b[LUARRAYROWS]); + + diff --git a/nmglobal.h b/nmglobal.h new file mode 100644 index 0000000..2b57db5 --- /dev/null +++ b/nmglobal.h @@ -0,0 +1,519 @@ +/* +** nmglobal.h +** Global definitions for native mode benchmarks. 
+** +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** 10/95 - Added memory array & alignment -- RG +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/* is this a 64 bit architecture? If so, this will define LONG64 */ +#include "pointer.h" + +/* +** SYSTEM DEFINES +*/ + +/* +++ MEMORY +++ */ + +/* +** You must define ONLY ONE of the following identifiers +** to specify the mechanism for allocating memory: +** MALLOCMEM +** DOS16MEM +** MACMEM +*/ + +/* +** Define MALLOCMEM to use the standard malloc() call for +** memory. This is the default for most systems. +*/ +#define MALLOCMEM + +/* +** Define DOS16MEM if you're running in the old 16-bit segmented +** model. This enables some fruity memory management routines +** required for that model. NOT defining this assumes that +** you're running in an environment that allows malloc() to +** get > 64K chunks of memory. +*/ +/* #define DOS16MEM */ + +/* Define MACMEM to use the Mac's GetPtr call to allocate +** memory (instead of malloc()). +*/ +/* #define MACMEM */ + +/* +++ TIMING +++ */ +/* +** You must define ONLY ONE of the following identifiers to pick +** the timing routine used. 
+** CLOCKWCPS +** CLOCKWCT +** MACTIMEMGR +** WIN31TIMER +*/ + +/* +** Define CLOCKWCPS if you are using the clock() routine and the +** constant used as the divisor to determine seconds is +** CLOCKS_PER_SEC. This is the default in most cases. +*/ +#define CLOCKWCPS + +/* +** Define CLOCKWCT if you are using the clock() routine and the +** constant used as the divisor to determine seconds is CLK_TCK +*/ +/* #define CLOCKWCT */ + +/* +** Define MACTIMEMGR to use the Mac Time manager routines. +** You'll need to be running at least system 6.0.3 or +** better...extended time manager is recommended (system 7 or +** better). +*/ +/* #define MACTIMEMGR */ + +/* +** Define WIN31TIMER to user the timing routines in TOOLHELP.DLL. +** Gets accuracy down to the millisecond. +*/ +/* #define WIN31TIMER */ + +/* +++ MISCELLANEOUS +++ */ + +/* +** Define DOS16 if you'll be compiling under DOS in 16-bit +** (non DOS-extended) mode. This will enable proper definitions +** for the far*** typedefs +*/ +/* #define DOS16 */ + +/* +** Define MAC if you're compiling on a Macintosh. This +** does a number of things: +** includes unix.h +** Incorporates code to mimic the command line via either +** the console library (Symantec/Think) or the SIOUX +** library (Code Warrior). +*/ +/* #define MAC */ + +/* +** Define LONG64 if your compiler emits 64-bit longs. +** This is typically true of Alpha compilers on Unix +** systems...though, who knows, this may change in the +** future. I MOVED THIS DEFINTION INTO THE FILE pointer.h. DO NOT +** DEFINE IT HERE. IT WILL AUTOMATICALLY BE DEFINED IF NECESSARY. +** Uwe F. Mayer, Dec 15, 1996, Nov 15, 1997 +*/ +/* #define LONG64 */ + +/* +** Define MACCWPROF if you are profiling on the Mac using +** Code Warrior. This enables code that turns off the +** profiler in an evern of an error exit. 
+*/ +/* #define MACCWPROF */ + +#ifdef MAC +#include <unix.h> +#endif + +/* +** ERROR CODES +*/ +#define ERROR_MEMORY 1 +#define ERROR_MEMARRAY_FULL 2 +#define ERROR_MEMARRAY_NFOUND 3 +#define ERROR_FILECREATE 10 +#define ERROR_FILEREAD 11 +#define ERROR_FILEWRITE 12 +#define ERROR_FILEOPEN 13 +#define ERROR_FILESEEK 14 + +/* +** MINIMUM_TICKS +** +** This sets the default number of minimum ticks. +** It can, of course, be overridden by the input +** command file. +** This ultimately gets loaded into the variable +** global_min_ticks, which specifies the minimum +** number of ticks that must take place between +** a StartStopwatch() and StopStopwatch() call. +** The idea is to reduce error buildup. +*/ +#define MINIMUM_TICKS 60 + +/* +** MINIMUM_SECONDS +** +** Minimum number of seconds to run each test. +*/ +#define MINIMUM_SECONDS 5 + +/* +** MAXPOSLONG +** +** This is the maximum positive long. +*/ +#ifdef LONG64 +#define MAXPOSLONG 0x7FFFFFFFFFFFFFFFL +#else +#define MAXPOSLONG 0x7FFFFFFFL +#endif + +/* +** OTHER DEFINES +*/ +#ifndef MAC +#define TRUE 1 +#define FALSE 0 +#endif + +/* +** Memory array size. Used in SYSSPEC for keeping track +** of re-aligned memory. +*/ +#define MEM_ARRAY_SIZE 20 + +/* +** TYPEDEFS +*/ +#define ulong unsigned long +#define uchar unsigned char +#define uint unsigned int +#define ushort unsigned short +/* +typedef unsigned char uchar; +typedef unsigned int uint; +typedef unsigned short ushort; +typedef unsigned long ulong; +*/ +/* +** The 'farxxx' typedefs were added in deference to DOS, which +** requires far pointers to handle some of the bigger +** memory structures. 
Other systems will simply +** map 'farxxx' to 'xxx' +*/ +#ifdef DOS16 +typedef void huge farvoid; +typedef double huge fardouble; +typedef long huge farlong; +typedef unsigned long huge farulong; +typedef char huge farchar; +typedef unsigned char huge faruchar; + +#else + +typedef void farvoid; +typedef double fardouble; +typedef long farlong; +typedef unsigned long farulong; +typedef char farchar; +typedef unsigned char faruchar; + +#endif + +/* +** The following typedefs are used when element size +** is critical. You'll have to alter these for +** your specifical platform/compiler. +*/ +typedef unsigned char u8; /* Unsigned 8-bits */ +typedef unsigned short u16; /* Unsigned 16 bits */ +#ifdef LONG64 +typedef unsigned int u32; /* Unsigned 32 bits */ +typedef int int32; /* Signed 32 bit integer */ +#else +typedef unsigned long u32; /* Unsigned 32 bits */ +typedef long int32; /* Signed 32 bit integer */ +#endif + +/***************** +** NUMERIC SORT ** +*****************/ +/* +** DEFINES +*/ + +/* +** The following constant, NUMNUMARRAYS (no, it is not a +** Peter Sellers joke) is the maximum number of arrays +** that can be built by the numeric sorting benchmark +** before it gives up. This maximum is dependent on the +** amount of memory in the system. +*/ +/*#define NUMNUMARRAYS 1000*/ +#define NUMNUMARRAYS 10000 + +/* +** The following constant NUMARRAYSIZE determines the +** default # of elements in each numeric array. Ordinarily +** this is something you shouldn't fool with, though as +** with most of the constants here, it is adjustable. 
+*/ +#define NUMARRAYSIZE 8111L + + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of seconds requested */ + double sortspersec; /* # of sort iterations per sec */ + ushort numarrays; /* # of arrays */ + ulong arraysize; /* # of elements in array */ +} SortStruct; + +/**************** +** STRING SORT ** +***************** +** Note: The string sort benchmark uses the same structure to +** communicate parameters as does the numeric sort benchmark. +** (i.e., SortStruct...see above. +*/ + +/* +** DEFINES +*/ +/* +** The following constant STRINGARRAYSIZE determines +** the default # of bytes allocated to each string array. +** Though the actual size can be pre-set from the command +** file, this constant should be left unchanged. +*/ +#define STRINGARRAYSIZE 8111L + +/************************ +** BITFIELD OPERATIONS ** +************************* +*/ + +/* +** DEFINES +*/ + +/* +** Following field sets the size of the bitfield array (in longs). +*/ +#ifdef LONG64 +#define BITFARRAYSIZE 16384L +#else +#define BITFARRAYSIZE 32768L +#endif + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of seconds requested */ + double bitopspersec; /* # of bitfield ops per sec */ + ulong bitoparraysize; /* Total # of bitfield ops */ + ulong bitfieldarraysize; /* Bit field array size */ +} BitOpStruct; + +/**************************** +** EMULATED FLOATING POINT ** +****************************/ +/* +** DEFINES +*/ +#define INTERNAL_FPF_PRECISION 4 + +/* +** The following constant is the maximum number of loops +** of the emulated floating point test that the system +** will allow before flagging an error. This is not a +** critical constant, and can be altered if your system is +** a real barn-burner. 
+*/ +/*#define CPUEMFLOATLOOPMAX 50000L*/ +#define CPUEMFLOATLOOPMAX 500000L + +/* +** Set size of array +*/ +#define EMFARRAYSIZE 3000L + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of seconds requested */ + ulong arraysize; /* Size of array */ + ulong loops; /* Loops per iterations */ + double emflops; /* Results */ +} EmFloatStruct; + +/************************* +** FOURIER COEFFICIENTS ** +*************************/ + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of requested seconds */ + ulong arraysize; /* Size of coeff. arrays */ + double fflops; /* Results */ +} FourierStruct; + +/************************* +** ASSIGNMENT ALGORITHM ** +*************************/ + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong numarrays; /* # of arrays */ + double iterspersec; /* Results */ +} AssignStruct; + +/******************** +** IDEA ENCRYPTION ** +********************/ + +/* +** DEFINES +*/ +/* Following constant defines the max number of loops the +** system will attempt. Keeps things from going off into the +** weeds. */ +/*#define MAXIDEALOOPS 50000L*/ +#define MAXIDEALOOPS 500000L + +/* +** Following constant sets the size of the arrays. +** NOTE: For the IDEA algorithm to work properly, this +** number MUST be some multiple of 8. +*/ +#define IDEAARRAYSIZE 4000L + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong arraysize; /* Size of array */ + ulong loops; /* # of times to convert */ + double iterspersec; /* Results */ +} IDEAStruct; + + +/************************ +** HUFFMAN COMPRESSION ** +************************/ + +/* +** DEFINES +*/ +/* +** MAXHUFFLOOPS +** +** This constant specifies the maximum number of Huffman +** compression loops the system will try for. 
This keeps
+** the test from going off into the weeds. This is not
+** a critical constant, and can be increased if your
+** system is a real barn-burner.
+*/
+/*#define MAXHUFFLOOPS 50000L*/
+#define MAXHUFFLOOPS 500000L
+
+/*
+** Following constant sets the size of the arrays to
+** be compressed/uncompressed.
+*/
+#define HUFFARRAYSIZE 5000L
+
+/*
+** TYPEDEFS
+*/
+
+/*
+** HuffStruct
+** Parameter block for the Huffman compression test.
+*/
+typedef struct {
+    int adjust;             /* Set adjust code */
+    ulong request_secs;     /* Requested # of seconds */
+    ulong arraysize;        /* Size of array */
+    ulong loops;            /* # of times to compress/decompress */
+    double iterspersec;     /* Results */
+} HuffStruct;
+
+/********************************
+** BACK PROPAGATION NEURAL NET **
+********************************/
+
+/*
+** MAXNNETLOOPS
+**
+** This constant sets the max number of loops through the neural
+** net that the system will attempt before giving up. This
+** is not a critical constant. You can alter it if your system
+** has sufficient horsepower.
+*/
+/*#define MAXNNETLOOPS 50000L*/
+#define MAXNNETLOOPS 500000L
+
+/*
+** TYPEDEFS
+*/
+/*
+** NNetStruct
+** Parameter block for the back propagation neural net test.
+*/
+typedef struct {
+    int adjust;             /* Set adjust code */
+    ulong request_secs;     /* Requested # of seconds */
+    ulong loops;            /* # of times to learn */
+    double iterspersec;     /* Results */
+} NNetStruct;
+
+/***********************
+** LU DECOMPOSITION **
+** (Linear Equations) **
+***********************/
+
+/*
+** MAXLUARRAYS
+**
+** This sets the upper limit on the number of arrays
+** that the benchmark will attempt to build before
+** flagging an error. It is not a critical constant, and
+** may be increased if your system has the horsepower.
+*/
+/*#define MAXLUARRAYS 1000*/
+#define MAXLUARRAYS 10000
+
+/*
+** TYPEDEFS
+*/
+/*
+** LUStruct
+** Parameter block for the LU decomposition test.
+*/
+typedef struct {
+    int adjust;             /* Set adjust code */
+    ulong request_secs;     /* Requested # of seconds */
+    ulong numarrays;        /* # of arrays */
+    double iterspersec;     /* Results */
+} LUStruct;
+
diff --git a/pointer.c b/pointer.c
new file mode 100644
index 0000000..f4de577
--- /dev/null
+++ b/pointer.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+/* Prints sizeof(long), in bytes, as a decimal number with no trailing newline. */
+int main(){
+    printf("%d",(int)sizeof(long));
+    return(0);
+}
+
diff --git a/sysinfo.c.example b/sysinfo.c.example
new file mode 100644
index 0000000..db650f0
--- /dev/null
+++ b/sysinfo.c.example
@@ -0,0 +1,10 @@
+sprintf(buffer,"**System used for compilation:\n");
+output_string(buffer);
+sprintf(buffer,"**Linux mimi 2.0.31 #5 Thu Oct 23 10:02:08 CDT 1997 i486\n");
+output_string(buffer);
+sprintf(buffer,"**C compiler: gcc version 2.7.2.3\n");
+output_string(buffer);
+sprintf(buffer,"**libc: libc.so.5.4.38\n");
+output_string(buffer);
+sprintf(buffer,"**Date of compilation: Thu Nov 20 10:04:43 CST 1997\n");
+output_string(buffer);
diff --git a/sysinfo.c.template b/sysinfo.c.template
new file mode 100644
index 0000000..c1a986c
--- /dev/null
+++ b/sysinfo.c.template
@@ -0,0 +1,10 @@
+sprintf(buffer,"**System used for compilation:\n");
+output_string(buffer);
+sprintf(buffer,"**%SYSTEM%\n");
+output_string(buffer);
+sprintf(buffer,"**C compiler: %CCVERSION%\n");
+output_string(buffer);
+sprintf(buffer,"**libc: %LIBCVERSION%\n");
+output_string(buffer);
+sprintf(buffer,"**Date of compilation: %DATE%\n");
+output_string(buffer);
diff --git a/sysinfo.sh b/sysinfo.sh
new file mode 100755
index 0000000..57754fe
--- /dev/null
+++ b/sysinfo.sh
@@ -0,0 +1,78 @@
+#!/bin/sh
+
+# the arguments of this script are the compiler name and flags
+
+# try to solve a chicken-and-egg problem on SunOS
+# ucb's test program does not handle -L like the other test programs
+# let's try to find another implementation
+if test -x /bin/test; then
+    TEST=/bin/test;
+else
+    if
test -x /usr/bin/test; then
+        TEST=/usr/bin/test;
+    else
+        # cross your fingers that it's not like ucb test
+        TEST=test;
+    fi
+fi
+
+# compiler command line with the static-link flags removed
+compiler=`echo $* | sed -e 's/-static//g' -e 's/-Bstatic//g'`
+if $TEST `basename $1` = "gcc" && ($compiler -v) >/dev/null 2>&1 ; then
+# Cygwin writes more than one line with "version" in it
+    gccversion=`$compiler -v 2>&1 | sed -e "/version/!d" | tail -n 1`
+else
+    gccversion="$1"
+fi
+
+# build hello.c with the given flags and inspect it with ldd to find libc
+libcversion=""
+if ($* hello.c -o hello) >/dev/null 2>&1; then
+    ldd_output=`(ldd hello) 2>&1`
+    libcversion=`echo $ldd_output | sed -e 's/.*static.*/static/' \
+        -e 's/.*not a dynamic.*/static/'`
+    if $TEST "$libcversion" = "static" ; then
+        # statically linked: rebuild without the static flags and ask ldd again
+        if ($compiler hello.c -o hello) >/dev/null 2>&1; then
+            if (ldd hello) >/dev/null 2>/dev/null; then
+                libcversion=`(ldd hello) 2>&1`
+                libcversion=`echo $libcversion | sed -e '/libc/!d'\
+                    -e 's/^[ ]*//' \
+                    -e 's/.*=>[ ][ ]*\([^ ]*\).*/\1/'`
+                # remember the current directory
+                current=`pwd`
+                # follow symbolic links until a non-link name is reached
+                while $TEST -L "$libcversion" && ! $TEST "$libcversion" = "" ; do
+                    libcitself=`basename $libcversion`
+                    libpath=`echo $libcversion | sed -e "s/$libcitself$//"`
+                    if $TEST -d "$libpath" ; then
+                        cd $libpath
+                    fi
+                    if ls $libcitself >/dev/null 2>/dev/null ; then
+                        libcversion=`ls -l $libcitself | \
+                            sed -e 's/.*->[ ][ ]*\(.*\)$/\1/'`
+                    else
+                        # something must have gone wrong, let's bail out
+                        libcversion=""
+                    fi
+                done
+                # return to the current directory
+                cd $current
+            fi
+        fi
+    else
+        libcversion=""
+    fi
+fi
+
+rm -f sysinfo.crm sysinfoc.c hello
+
+# this bombs out on Ultrix which expect "cut -d"
+
+compsystem=`uname -a | cut -b 1-78`
+compdate=`date|cut -b1-55`
+
+# let's hope that ctrl-c is not part of any string here
+# this also will barf later if " is in any of the strings
+
+# NOTE(review): the sed expressions below show no visible delimiter between
+# pattern and replacement.  Given the "ctrl-c" remark above, the delimiter is
+# presumably a literal ctrl-C byte that is not visible in this text --
+# confirm against the original file before running this script.
+for i in sysinfo.c sysinfoc.c ; do
+    sed -e "s%CCVERSION%$gccversion" -e "s%LIBCVERSION%$libcversion"\
+        -e "s%SYSTEM%$compsystem" -e "s%DATE%$compdate"\
+        ${i}.template > $i
+done
diff --git a/sysinfoc.c.example b/sysinfoc.c.example
new file
mode 100644 index 0000000..7da71ac --- /dev/null +++ b/sysinfoc.c.example @@ -0,0 +1,4 @@ +sprintf(buffer,"C compiler : gcc version 2.7.2.3\n"); +output_string(buffer); +sprintf(buffer,"libc : libc.so.5.4.38\n"); +output_string(buffer); diff --git a/sysinfoc.c.template b/sysinfoc.c.template new file mode 100644 index 0000000..922a5de --- /dev/null +++ b/sysinfoc.c.template @@ -0,0 +1,4 @@ +sprintf(buffer,"C compiler : %CCVERSION%\n"); +output_string(buffer); +sprintf(buffer,"libc : %LIBCVERSION%\n"); +output_string(buffer); diff --git a/sysspec.c b/sysspec.c new file mode 100644 index 0000000..a97010d --- /dev/null +++ b/sysspec.c @@ -0,0 +1,884 @@ + +/* +** sysspec.c +** System-specific routines. +** +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/*********************************** +** SYSTEM-SPECIFIC ROUTINES ** +************************************ +** +** These are the routines that provide functions that are +** system-specific. If the benchmarks are to be ported +** to new hardware/new O.S., this is the first place to +** start. 
+*/
+#include "sysspec.h"
+
+#ifdef DOS16
+#include <io.h>
+#include <fcntl.h>
+#include <sys\stat.h>
+#endif
+/*********************************
+** MEMORY MANAGEMENT ROUTINES **
+*********************************/
+
+
+/****************************
+** AllocateMemory
+** This routine returns a void pointer to a memory
+** block. The size of the memory block is given in bytes
+** as the first argument. This routine also returns an
+** error code in the second argument.
+** 10/95 Update:
+** Added an associative array for memory alignment reasons.
+** mem_array[2][MEM_ARRAY_SIZE]
+** mem_array[0][n] = Actual address (from malloc)
+** mem_array[1][n] = Aligned address
+** Currently, mem_array[][] is only used if you use malloc;
+** it is not used for the 16-bit DOS and MAC versions.
+*/
+farvoid *AllocateMemory(unsigned long nbytes,   /* # of bytes to alloc */
+        int *errorcode)                         /* Returned error code */
+{
+#ifdef DOS16MEM
+union REGS registers;
+unsigned short nparas;            /* # of paragraphs */
+
+/*
+** Set # of paragraphs to nbytes/16 +1. The +1 is a
+** slop factor.
+*/
+nparas=(unsigned short)(nbytes/16L) + 1;
+
+/*
+** Set incoming registers.
+*/
+registers.h.ah=0x48;              /* Allocate memory */
+registers.x.bx=nparas;            /* # of paragraphs */
+
+
+/*
+** intdos() takes the addresses of the input and output register
+** sets; the same union is used for both, so the results
+** overwrite the inputs.
+*/
+intdos(&registers,&registers);    /* Call DOS */
+
+/*
+** See if things succeeded.
+*/
+if(registers.x.cflag)
+{   printf("error: %d Lgst: %d\n",registers.x.ax,registers.x.bx);
+    *errorcode=ERROR_MEMORY;
+    return((farvoid *)NULL);
+}
+
+/*
+** Create a farvoid pointer to return.
+*/
+*errorcode=0;
+return((farvoid *)MK_FP(registers.x.ax,0));
+
+#endif
+
+#ifdef MACMEM
+/*
+** For MAC CodeWarrior, we'll use the MacOS NewPtr call
+*/
+farvoid *returnval;
+returnval=(farvoid *)NewPtr((Size)nbytes);
+if(returnval==(farvoid *)NULL)
+    *errorcode=ERROR_MEMORY;
+else
+    *errorcode=0;
+return(returnval);
+#endif
+
+#ifdef MALLOCMEM
+/*
+** Everyone else, it's pretty straightforward, given
+** that you use a 32-bit compiler which treats size_t as
+** a 4-byte entity.
+**
+** The block is over-allocated by 2*global_align bytes so that
+** the returned pointer can be moved forward to the wanted
+** alignment.  The (true, adjusted) address pair is recorded with
+** AddMemArray() so that FreeMemory() can look up the pointer
+** that malloc() actually returned.
+*/
+farvoid *returnval;     /* Return value */
+ulong true_addr;        /* True address */
+ulong adj_addr;         /* Adjusted address */
+
+returnval=(farvoid *)malloc((size_t)(nbytes+2L*(long)global_align));
+if(returnval==(farvoid *)NULL)
+{
+    /*
+    ** Allocation failed.  Stop here: running the alignment code
+    ** on a NULL address would register a bogus entry in
+    ** mem_array and could hand back a non-NULL pointer (for
+    ** example (void *)1 when global_align is 1).
+    */
+    *errorcode=ERROR_MEMORY;
+    return((farvoid *)NULL);
+}
+*errorcode=0;
+
+/*
+** Check for alignment
+*/
+adj_addr=true_addr=(ulong)returnval;
+if(global_align==0)
+{
+    /* No alignment requested: true and adjusted address are the same */
+    if(AddMemArray(true_addr, adj_addr))
+        *errorcode=ERROR_MEMARRAY_FULL;
+    return(returnval);
+}
+
+if(global_align==1)
+{
+    /* Force an odd address */
+    if(true_addr%2==0) adj_addr++;
+}
+else
+{
+    /*
+    ** Advance to a multiple of global_align, then make sure the
+    ** result is NOT also a multiple of 2*global_align.
+    */
+    while(adj_addr%global_align!=0) ++adj_addr;
+    if(adj_addr%(global_align*2)==0) adj_addr+=global_align;
+}
+returnval=(void *)adj_addr;
+if(AddMemArray(true_addr,adj_addr))
+    *errorcode=ERROR_MEMARRAY_FULL;
+return(returnval);
+#endif
+
+}
+
+
+/****************************
+** FreeMemory
+** This is the reverse of AllocateMemory. The memory
+** block passed in is freed. Should an error occur,
+** that error is returned in errorcode.
+*/
+void FreeMemory(farvoid *mempointer,    /* Pointer to memory block */
+        int *errorcode)
+{
+#ifdef DOS16MEM
+/*
+** 16-bit DOS VERSION!!
+*/
+unsigned int segment;
+unsigned int offset;
+union REGS registers;
+struct SREGS sregisters;
+
+/*
+** First get the segment/offset of the farvoid pointer.
+*/
+segment=FP_SEG(mempointer);
+offset=FP_OFF(mempointer);
+
+/*
+** Align the segment properly. For as long as offset >= 16,
+** subtract 16 from offset and add 1 to segment.
+*/
+while(offset>=16)
+{   offset-=16;
+    segment++;
+}
+
+/*
+** Build the call to DOS
+*/
+registers.h.ah=0x49;        /* Free memory */
+sregisters.es=segment;
+
+/*
+** intdosx() takes the addresses of the input registers, the
+** output registers and the segment registers.
+*/
+intdosx(&registers,&registers,&sregisters);
+
+/*
+** Check for error
+*/
+if(registers.x.cflag)
+{   *errorcode=ERROR_MEMORY;
+    return;
+}
+
+*errorcode=0;
+return;
+#endif
+
+#ifdef MACMEM
+DisposPtr((Ptr)mempointer);
+*errorcode=0;
+return;
+#endif
+
+#ifdef MALLOCMEM
+ulong adj_addr, true_addr;
+
+/*
+** Locate item in memory array.  The caller holds the adjusted
+** (aligned) address; the address to free is the true one that
+** RemoveMemArray() hands back.
+*/
+adj_addr=(ulong)mempointer;
+if(RemoveMemArray(adj_addr, &true_addr))
+{   *errorcode=ERROR_MEMARRAY_NFOUND;
+    return;
+}
+mempointer=(void *)true_addr;
+free(mempointer);
+*errorcode=0;
+return;
+#endif
+}
+
+/****************************
+** MoveMemory
+** Moves nbytes bytes from source to destination. Handles overlap.
+** In most cases, this is just a memmove operation.
+** But, not in DOS....noooo....
+*/
+void MoveMemory( farvoid *destination,  /* Destination address */
+        farvoid *source,                /* Source address */
+        unsigned long nbytes)
+{
+
+/* +++16-bit DOS VERSION+++ */
+#ifdef DOS16MEM
+
+    FarDOSmemmove( destination, source, nbytes);
+
+#else
+
+memmove(destination, source, nbytes);
+
+#endif
+}
+
+#ifdef DOS16MEM
+
+/****************************
+** FarDOSmemmove
+** Performs the same function as memmove for DOS when
+** the arrays are defined with far pointers.
+*/
+void FarDOSmemmove(farvoid *destination,    /* Destination pointer */
+        farvoid *source,                    /* Source pointer */
+        unsigned long nbytes)               /* # of bytes to move */
+{
+unsigned char huge *uchsource;  /* Temp source */
+unsigned char huge *uchdest;    /* Temp destination */
+unsigned long saddr;            /* Source "true" address */
+unsigned long daddr;            /* Destination "true" address */
+
+
+/*
+** Get unsigned char pointer equivalents
+*/
+uchsource=(unsigned char huge *)source;
+uchdest=(unsigned char huge *)destination;
+
+/*
+** Calculate true address of source and destination and
+** compare.
+*/
+/*
+** Widen the segment BEFORE scaling it.  With 16-bit unsigned
+** arithmetic segment*16+offset wraps around, and the comparison
+** below could then pick the wrong copy direction.
+*/
+saddr=((unsigned long)FP_SEG(source)*16UL) + (unsigned long)FP_OFF(source);
+daddr=((unsigned long)FP_SEG(destination)*16UL) + (unsigned long)FP_OFF(destination);
+
+if(saddr > daddr)
+{
+    /*
+    ** Source is greater than destination.
+    ** Use a series of standard move operations.
+    ** We'll move 65535 bytes at a time.
+    */
+    while(nbytes>=65535L)
+    {   _fmemmove((farvoid *)uchdest,
+            (farvoid *)uchsource,
+            (size_t) 65535);
+        uchsource+=65535;       /* Advance pointers */
+        uchdest+=65535;
+        nbytes-=65535;
+    }
+
+    /*
+    ** Move remaining bytes
+    */
+    if(nbytes!=0L)
+        _fmemmove((farvoid *)uchdest,
+            (farvoid *)uchsource,
+            (size_t)(nbytes & 0xFFFF));
+
+}
+else
+{
+    /*
+    ** Destination is greater than (or equal to) source.
+    ** Advance pointers to the end of their
+    ** respective blocks.
+    */
+    uchsource+=nbytes;
+    uchdest+=nbytes;
+
+    /*
+    ** Again, move 65535 bytes at a time. However,
+    ** "back" the pointers up before doing the
+    ** move.
+    */
+    while(nbytes>=65535L)
+    {
+        uchsource-=65535;
+        uchdest-=65535;
+        _fmemmove((farvoid *)uchdest,
+            (farvoid *)uchsource,
+            (size_t) 65535);
+        nbytes-=65535;
+    }
+
+    /*
+    ** Move remaining bytes.
+    */
+    if(nbytes!=0L)
+    {   uchsource-=nbytes;
+        uchdest-=nbytes;
+        _fmemmove((farvoid *)uchdest,
+            (farvoid *)uchsource,
+            (size_t)(nbytes & 0xFFFF));
+    }
+}
+return;
+}
+#endif
+
+/***********************************
+** MEMORY ARRAY HANDLING ROUTINES **
+***********************************/
+/****************************
+** InitMemArray
+** Initialize the memory array. This simply amounts to
+** setting mem_array_ents to zero, indicating that there
+** isn't anything in the memory array.
+*/
+void InitMemArray(void)
+{
+mem_array_ents=0;
+return;
+}
+
+/***************************
+** AddMemArray
+** Add a pair of items to the memory array.
+** true_addr is the true address (mem_array[0][n])
+** adj_addr is the adjusted address (mem_array[1][n])
+** Returns 0 if ok
+** -1 if not enough room
+*/
+int AddMemArray(ulong true_addr,
+        ulong adj_addr)
+{
+/* Table already holds MEM_ARRAY_SIZE pairs */
+if(mem_array_ents>=MEM_ARRAY_SIZE)
+    return(-1);
+
+/* Append the pair in the next free slot */
+mem_array[0][mem_array_ents]=true_addr;
+mem_array[1][mem_array_ents]=adj_addr;
+mem_array_ents++;
+return(0);
+}
+
+/*************************
+** RemoveMemArray
+** Given an adjusted address value (mem_array[1][n]), locate
+** the entry and remove it from the mem_array.
+** Also returns the associated true address (through *true_addr).
+** Returns 0 if ok
+** -1 if not found.
+*/
+int RemoveMemArray(ulong adj_addr,ulong *true_addr)
+{
+int i,j;
+
+/* Locate the item in the array. */
+for(i=0;i<mem_array_ents;i++)
+    if(mem_array[1][i]==adj_addr)
+    {   /* Found it..bubble stuff down */
+        *true_addr=mem_array[0][i];
+        j=i;
+        /* Shift every later pair down one slot to close the gap */
+        while(j+1<mem_array_ents)
+        {   mem_array[0][j]=mem_array[0][j+1];
+            mem_array[1][j]=mem_array[1][j+1];
+            j++;
+        }
+        mem_array_ents--;
+        return(0);      /* Return if found */
+    }
+
+/* If we made it here...something's wrong...report failure to the caller */
+return(-1);
+}
+
+/**********************************
+** FILE HANDLING ROUTINES **
+**********************************/
+
+/****************************
+** CreateFile
+** This routine accepts a filename for an argument and
+** creates that file in the current directory (unless the
+** name contains a path that overrides the current directory).
+** Note that the routine does not OPEN the file.
+** If the file exists, it is truncated to length 0.
+*/
+void CreateFile(char *filename,
+        int *errorcode)
+{
+
+#ifdef DOS16
+/*
+** DOS VERSION!!
+*/
+int fhandle;        /* File handle used internally */
+
+fhandle=open(filename,O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
+
+if(fhandle==-1)
+    *errorcode=ERROR_FILECREATE;
+else
+    *errorcode=0;
+
+/*
+** Since all we're doing here is creating the file,
+** go ahead and close it.
+*/
+close(fhandle);
+
+return;
+#endif
+
+#ifdef LINUX
+FILE *fhandle;      /* File handle used internally */
+
+fhandle=fopen(filename,"w");
+
+/*
+** If the file could not be created there is no stream to
+** close; passing NULL to fclose() is undefined behavior, so
+** report the error and leave.
+*/
+if(fhandle==NULL)
+{   *errorcode=ERROR_FILECREATE;
+    return;
+}
+*errorcode=0;
+
+/*
+** Since all we're doing here is creating the file,
+** go ahead and close it.
+*/
+fclose(fhandle);
+
+return;
+#endif
+}
+
+/****************************
+** bmOpenFile
+** Opens the file given by fname, returning its handle.
+** If an error occurs, returns its code in errorcode.
+** The file is opened in read-write exclusive mode.
+*/
+#ifdef DOS16
+/*
+** DOS VERSION!!
+*/
+
+int bmOpenFile(char *fname,     /* File name */
+        int *errorcode)         /* Error code returned */
+{
+
+int fhandle;                    /* Returned file handle */
+
+fhandle=open(fname,O_BINARY | O_RDWR, S_IREAD | S_IWRITE);
+
+if(fhandle==-1)
+    *errorcode=ERROR_FILEOPEN;
+else
+    *errorcode=0;
+
+return(fhandle);
+}
+#endif
+
+
+#ifdef LINUX
+
+FILE *bmOpenFile(char *fname,   /* File name */
+        int *errorcode)         /* Error code returned */
+{
+
+FILE *fhandle;                  /* Returned file handle */
+
+/*
+** NOTE(review): "w+" truncates an existing file (or creates a
+** missing one), whereas the DOS version above opens with O_RDWR
+** only -- confirm that the truncation is intended.
+*/
+fhandle=fopen(fname,"w+");
+
+if(fhandle==NULL)
+    *errorcode=ERROR_FILEOPEN;
+else
+    *errorcode=0;
+
+return(fhandle);
+}
+#endif
+
+
+/****************************
+** CloseFile
+** Closes the file identified by fhandle.
+** A more innocuous routine there never was.
+** The result of the close is not checked; errorcode is always
+** set to 0.
+*/
+#ifdef DOS16
+/*
+** DOS VERSION!!!
+*/
+void CloseFile(int fhandle,     /* File handle */
+        int *errorcode)         /* Returned error code */
+{
+
+close(fhandle);
+*errorcode=0;
+return;
+}
+#endif
+#ifdef LINUX
+void CloseFile(FILE *fhandle,   /* File handle */
+        int *errorcode)         /* Returned error code */
+{
+fclose(fhandle);
+*errorcode=0;
+return;
+}
+#endif
+
+/****************************
+** readfile
+** Read bytes from an opened file. This routine
+** is a combination seek-and-read.
+** Note that this routine expects the offset to be from
+** the beginning of the file.
+*/
+#ifdef DOS16
+/*
+** DOS VERSION!!
+*/
+
+void readfile(int fhandle,          /* File handle */
+        unsigned long offset,       /* Offset into file */
+        unsigned long nbytes,       /* # of bytes to read */
+        void *buffer,               /* Buffer to read into */
+        int *errorcode)             /* Returned error code */
+{
+
+long newoffset;                     /* New offset by lseek */
+int readcode;                       /* Return code from read */
+
+/*
+** Presume success.
+*/
+*errorcode=0;
+
+/*
+** Seek to the proper offset.
+*/
+newoffset=lseek(fhandle,(long)offset,SEEK_SET);
+if(newoffset==-1L)
+{   *errorcode=ERROR_FILESEEK;
+    return;
+}
+
+/*
+** Do the read.  The count is masked to 16 bits before the cast
+** to unsigned.
+*/
+readcode=read(fhandle,buffer,(unsigned)(nbytes & 0xFFFF));
+if(readcode==-1)
+    *errorcode=ERROR_FILEREAD;
+
+return;
+}
+#endif
+#ifdef LINUX
+void readfile(FILE *fhandle,        /* File handle */
+        unsigned long offset,       /* Offset into file */
+        unsigned long nbytes,       /* # of bytes to read */
+        void *buffer,               /* Buffer to read into */
+        int *errorcode)             /* Returned error code */
+{
+
+size_t nelems;                      /* Expected return code from read */
+size_t readcode;                    /* Actual return code from read */
+
+/*
+** Presume success.
+*/
+*errorcode=0;
+
+/*
+** Seek to the proper offset.  fseek() returns zero on success
+** and a nonzero value on failure, so test against zero rather
+** than against -1.
+*/
+if(fseek(fhandle,(long)offset,SEEK_SET)!=0)
+{   *errorcode=ERROR_FILESEEK;
+    return;
+}
+
+/*
+** Do the read.  The full byte count is requested: masking it
+** to 16 bits (as the DOS version does) would silently shorten
+** any request of 64K or more and still report success.  A short
+** read is reported as ERROR_FILEREAD.
+*/
+nelems=(size_t)nbytes;
+readcode=fread(buffer,(size_t)1,nelems,fhandle);
+if(readcode!=nelems)
+    *errorcode=ERROR_FILEREAD;
+
+return;
+}
+#endif
+
+/****************************
+** writefile
+** writes bytes to an opened file. This routine is
+** a combination seek-and-write.
+** Note that this routine expects the offset to be from
+** the beginning of the file.
+*/
+#ifdef DOS16
+/*
+** DOS VERSION!!
+*/
+
+void writefile(int fhandle,         /* File handle */
+        unsigned long offset,       /* Offset into file */
+        unsigned long nbytes,       /* # of bytes to write */
+        void *buffer,               /* Buffer to write from */
+        int *errorcode)             /* Returned error code */
+{
+
+long newoffset;                     /* New offset by lseek */
+int writecode;                      /* Return code from write */
+
+/*
+** Presume success.
+*/
+*errorcode=0;
+
+/*
+** Seek to the proper offset.
+*/
+newoffset=lseek(fhandle,(long)offset,SEEK_SET);
+if(newoffset==-1L)
+{   *errorcode=ERROR_FILESEEK;
+    return;
+}
+
+/*
+** Do the write.  The count is masked to 16 bits before the cast
+** to unsigned.
+*/
+writecode=write(fhandle,buffer,(unsigned)(nbytes & 0xFFFF));
+if(writecode==-1)
+    *errorcode=ERROR_FILEWRITE;
+
+return;
+}
+#endif
+
+#ifdef LINUX
+
+void writefile(FILE *fhandle,       /* File handle */
+        unsigned long offset,       /* Offset into file */
+        unsigned long nbytes,       /* # of bytes to write */
+        void *buffer,               /* Buffer to write from */
+        int *errorcode)             /* Returned error code */
+{
+
+size_t nelems;                      /* Expected return code from write */
+size_t writecode;                   /* Actual return code from write */
+
+/*
+** Presume success.
+*/
+*errorcode=0;
+
+/*
+** Seek to the proper offset.  fseek() returns zero on success
+** and a nonzero value on failure, so test against zero rather
+** than against -1.
+*/
+if(fseek(fhandle,(long)offset,SEEK_SET)!=0)
+{   *errorcode=ERROR_FILESEEK;
+    return;
+}
+
+/*
+** Do the write.  The full byte count is written: masking it
+** to 16 bits (as the DOS version does) would silently shorten
+** any request of 64K or more.
+**
+** fwrite() returns the number of elements actually written, so
+** the error case is a count that DIFFERS from the request.  (The
+** test used to be inverted: it flagged every successful write
+** and let every short write through.)
+*/
+nelems=(size_t)nbytes;
+writecode=fwrite(buffer,(size_t)1,nelems,fhandle);
+if(writecode!=nelems)
+    *errorcode=ERROR_FILEWRITE;
+
+return;
+}
+#endif
+
+
+/********************************
+** ERROR HANDLING ROUTINES **
+********************************/
+
+/****************************
+** ReportError
+** Report error message condition.
+*/
+void ReportError(char *errorcontext,    /* Error context string */
+        int errorcode)                  /* Error code number */
+{
+
+/*
+** Display error context
+*/
+printf("ERROR CONDITION\nContext: %s\n",errorcontext);
+
+/*
+** Display code (note: no newline is written after the number)
+*/
+printf("Code: %d",errorcode);
+
+return;
+}
+
+/****************************
+** ErrorExit
+** Performs an exit from an error condition.
+** Terminates the program with exit status 1.
+*/
+void ErrorExit()
+{
+
+/*
+** For profiling on the Mac with MetroWerks -- 11/17/94 RG
+** Have to do this to turn off profiler.
+*/
+#ifdef MACCWPROF
+#if __profile__
+ProfilerTerm();
+#endif
+#endif
+
+/*
+** FOR NOW...SIMPLE EXIT
+*/
+exit(1);
+}
+
+/*****************************
+** STOPWATCH ROUTINES **
+*****************************/
+
+/****************************
+** StartStopwatch
+** Starts a software stopwatch. Returns the first value of
+** the stopwatch in ticks.
+*/
+unsigned long StartStopwatch()
+{
+#ifdef MACTIMEMGR
+/*
+** For Mac code warrior, use timer. In this case, what we return is really
+** a dummy value.
+*/
+InsTime((QElemPtr)&myTMTask);
+PrimeTime((QElemPtr)&myTMTask,-MacHSTdelay);
+return((unsigned long)1);
+#else
+#ifdef WIN31TIMER
+/*
+** Win 3.x timer returns a DWORD, which we coax into a long.
+*/
+_Call16(lpfn,"p",&win31tinfo);
+return((unsigned long)win31tinfo.dwmsSinceStart);
+#else
+/* Everybody else: the current clock() value is the start tick */
+return((unsigned long)clock());
+#endif
+#endif
+}
+
+/****************************
+** StopStopwatch
+** Stops the software stopwatch. Expects as an input argument
+** the stopwatch start time.
+*/
+unsigned long StopStopwatch(unsigned long startticks)
+{
+
+#ifdef MACTIMEMGR
+/*
+** For Mac code warrior...ignore startticks. Return val.
in microseconds
+*/
+RmvTime((QElemPtr)&myTMTask);
+return((unsigned long)(MacHSTdelay+myTMTask.tmCount-MacHSTohead));
+#else
+#ifdef WIN31TIMER
+_Call16(lpfn,"p",&win31tinfo);
+return((unsigned long)win31tinfo.dwmsSinceStart-startticks);
+#else
+/* Everybody else: elapsed ticks = current clock() value - start value */
+return((unsigned long)clock()-startticks);
+#endif
+#endif
+}
+
+/****************************
+** TicksToSecs
+** Converts ticks to seconds. Converts ticks to integer
+** seconds, discarding any fractional amount.
+** Each return statement sits inside its own #ifdef; if none of
+** CLOCKWCT, MACTIMEMGR, CLOCKWCPS or WIN31TIMER is defined the
+** function has no return statement at all.
+*/
+unsigned long TicksToSecs(unsigned long tickamount)
+{
+#ifdef CLOCKWCT
+return((unsigned long)(tickamount/CLK_TCK));
+#endif
+
+#ifdef MACTIMEMGR
+/* +++ MAC time manager version (using timer in microseconds) +++ */
+return((unsigned long)(tickamount/1000000));
+#endif
+
+#ifdef CLOCKWCPS
+/* Everybody else */
+return((unsigned long)(tickamount/CLOCKS_PER_SEC));
+#endif
+
+#ifdef WIN31TIMER
+/*
+** Original comment: "Each tick is 840 nanoseconds".
+** NOTE(review): the code divides by 1000 and the tick value comes
+** from win31tinfo.dwmsSinceStart, which looks like a millisecond
+** count -- confirm which unit is right.
+*/
+return((unsigned long)(tickamount/1000L));
+#endif
+
+}
+
+/****************************
+** TicksToFracSecs
+** Converts ticks to fractional seconds. In other words,
+** this returns the exact conversion from ticks to
+** seconds.
+*/
+/*
+** The divisor is chosen at compile time, using the same macros and
+** the same divisors as TicksToSecs; the division is done in double.
+*/
+double TicksToFracSecs(unsigned long tickamount)
+{
+#ifdef CLOCKWCT
+return((double)tickamount/(double)CLK_TCK);
+#endif
+
+#ifdef MACTIMEMGR
+/* +++ MAC time manager version +++ */
+return((double)tickamount/(double)1000000);
+#endif
+
+#ifdef CLOCKWCPS
+/* Everybody else */
+return((double)tickamount/(double)CLOCKS_PER_SEC);
+#endif
+
+#ifdef WIN31TIMER
+/*
+** Original comment: "Using 840 nanosecond ticks".
+** NOTE(review): the divisor of 1000 suggests millisecond ticks -- confirm.
+*/
+return((double)tickamount/(double)1000);
+#endif
+}
+
diff --git a/sysspec.h b/sysspec.h
new file mode 100644
index 0000000..ba57a96
--- /dev/null
+++ b/sysspec.h
@@ -0,0 +1,168 @@
+/*
+** sysspec.h
+** Header file for sysspec.c
+** BYTEmark (tm)
+** BYTE's Native Mode Benchmarks
+** Rick Grehan, BYTE Magazine
+**
+** Creation:
+** Revision: 3/95
+**
+** DISCLAIMER
+** The source, executable, and documentation files that comprise
+** the BYTEmark benchmarks are made available on an "as is" basis.
+** This means that we at BYTE Magazine have made every reasonable
+** effort to verify that the there are no errors in the source and
+** executable code. We cannot, however, guarantee that the programs
+** are error-free. Consequently, McGraw-HIll and BYTE Magazine make
+** no claims in regard to the fitness of the source code, executable
+** code, and documentation of the BYTEmark.
+** Furthermore, BYTE Magazine, McGraw-Hill, and all employees
+** of McGraw-Hill cannot be held responsible for any damages resulting
+** from the use of this code or the results obtained from using
+** this code.
+*/ + +/* +** Standard includes +*/ +#include <stdlib.h> +#include <stdio.h> +#include <time.h> +#include <string.h> + +#include "nmglobal.h" + +#if !defined(MAC) && !defined(OSX) +#include <malloc.h> +#endif + + +/* +** System-specific includes +*/ + +#ifdef DOS16MEM +#include "dos.h" +#endif + +/* #include "time.h" +#include "io.h" +#include "fcntl.h" +#include "sys\stat.h" */ +/* Removed for MSVC++ +#include "alloc.h" +*/ + +/* +** MAC Time Manager routines (from Code Warrior) +*/ +#ifdef MACTIMEMGR +#include <memory.h> +#include <lowmem.h> +#include <Types.h> +#include <Timer.h> +extern struct TMTask myTMTask; +extern long MacHSTdelay,MacHSTohead; +#endif + +/* +** Windows 3.1 timer defines +*/ +#ifdef WIN31TIMER +#include <windows.h> +#include <toolhelp.h> +TIMERINFO win31tinfo; +HANDLE hThlp; +FARPROC lpfn; +#endif + +/************** +** EXTERNALS ** +**************/ +extern ulong mem_array[2][MEM_ARRAY_SIZE]; +extern int mem_array_ents; +extern int global_align; + +/**************************** +** FUNCTION PROTOTYPES ** +****************************/ + +farvoid *AllocateMemory(unsigned long nbytes, + int *errorcode); + +void FreeMemory(farvoid *mempointer, + int *errorcode); + +void MoveMemory( farvoid *destination, + farvoid *source, + unsigned long nbytes); + +#ifdef DOS16MEM +void FarDOSmemmove(farvoid *destination, + farvoid *source, + unsigned long nbytes); +#endif + +void InitMemArray(void); + +int AddMemArray(ulong true_addr, ulong adj_addr); + +int RemoveMemArray(ulong adj_addr,ulong *true_addr); + +void ReportError(char *context, int errorcode); + +void ErrorExit(); + +void CreateFile(char *filename, + int *errorcode); + +#ifdef DOS16 +int bmOpenFile(char *fname, + int *errorcode); + +void CloseFile(int fhandle, + int *errorcode); + +void readfile(int fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); + +void writefile(int fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int 
*errorcode); +#endif + +#ifdef LINUX +FILE *bmOpenFile(char *fname, + int *errorcode); + +void CloseFile(FILE *fhandle, + int *errorcode); + +void readfile(FILE *fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); + +void writefile(FILE *fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); + +#endif + +unsigned long StartStopwatch(); + +unsigned long StopStopwatch(unsigned long startticks); + +unsigned long TicksToSecs(unsigned long tickamount); + +double TicksToFracSecs(unsigned long tickamount); + diff --git a/wordcat.h b/wordcat.h new file mode 100644 index 0000000..9f18b42 --- /dev/null +++ b/wordcat.h @@ -0,0 +1,81 @@ +/* +** wordcat.h +** Word catalog +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. 
+*/
+
+/*
+** Word catalog
+*/
+#define WORDCATSIZE 50
+
+/*
+** wordcatarray
+** Table of WORDCATSIZE words; the initializer below lists
+** exactly 50 strings ("that" appears twice).
+*/
+char *wordcatarray[WORDCATSIZE] =
+{   "Hello",
+    "He",
+    "Him",
+    "the",
+    "this",
+    "that",
+    "though",
+    "rough",
+    "cough",
+    "obviously",
+    "But",
+    "but",
+    "bye",
+    "begin",
+    "beginning",
+    "beginnings",
+    "of",
+    "our",
+    "ourselves",
+    "yourselves",
+    "to",
+    "together",
+    "togetherness",
+    "from",
+    "either",
+    "I",
+    "A",
+    "return",
+    "However",
+    "that",
+    "example",
+    "yet",
+    "quickly",
+    "all",
+    "if",
+    "were",
+    "includes",
+    "always",
+    "never",
+    "not",
+    "small",
+    "returns",
+    "set",
+    "basic",
+    "Entered",
+    "with",
+    "used",
+    "shown",
+    "you",
+    "know" }; |