From 91b4edf69e5adf9c40edcaff38b880218b7b0d9d Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Tue, 11 Nov 2008 21:27:09 +0000 Subject: Initial Import git-svn-id: svn://mattst88.com/svn/cleanbench/trunk@1 0d43b9a7-5ab2-4d7b-af9d-f64450cef757 --- COM.DAT | 11 + Changes | 42 + Makefile | 153 ++ NNET.DAT | 210 +++ README | 66 + README.motorola | 29 + README.nonlinux | 50 + README.submit | 33 + RESULTS | 138 ++ bdoc.txt | 2109 ++++++++++++++++++++++++ debugbit.good.gz | Bin 0 -> 1019 bytes emfloat.c | 1343 ++++++++++++++++ emfloat.h | 154 ++ hardware | Bin 0 -> 17013 bytes hardware.c | 202 +++ hardware.h | 2 + hello.c | 2 + misc.c | 120 ++ misc.h | 41 + nbench0.c | 1174 ++++++++++++++ nbench0.h | 356 +++++ nbench1.c | 4445 +++++++++++++++++++++++++++++++++++++++++++++++++++ nbench1.h | 428 +++++ nmglobal.h | 519 ++++++ pointer.c | 6 + sysinfo.c.example | 10 + sysinfo.c.template | 10 + sysinfo.sh | 78 + sysinfoc.c.example | 4 + sysinfoc.c.template | 4 + sysspec.c | 884 ++++++++++ sysspec.h | 168 ++ wordcat.h | 81 + 33 files changed, 12872 insertions(+) create mode 100644 COM.DAT create mode 100644 Changes create mode 100644 Makefile create mode 100644 NNET.DAT create mode 100644 README create mode 100644 README.motorola create mode 100644 README.nonlinux create mode 100644 README.submit create mode 100644 RESULTS create mode 100644 bdoc.txt create mode 100644 debugbit.good.gz create mode 100644 emfloat.c create mode 100644 emfloat.h create mode 100755 hardware create mode 100644 hardware.c create mode 100644 hardware.h create mode 100644 hello.c create mode 100644 misc.c create mode 100644 misc.h create mode 100644 nbench0.c create mode 100644 nbench0.h create mode 100644 nbench1.c create mode 100644 nbench1.h create mode 100644 nmglobal.h create mode 100644 pointer.c create mode 100644 sysinfo.c.example create mode 100644 sysinfo.c.template create mode 100755 sysinfo.sh create mode 100644 sysinfoc.c.example create mode 100644 sysinfoc.c.template create mode 100644 
sysspec.c create mode 100644 sysspec.h create mode 100644 wordcat.h diff --git a/COM.DAT b/COM.DAT new file mode 100644 index 0000000..8dee49c --- /dev/null +++ b/COM.DAT @@ -0,0 +1,11 @@ +ALLSTATS=T +DONUMSORT=T +DOSTRINGSORT=T +DOBITFIELD=T +DOEMF=T +DOFOUR=T +DOASSIGN=T +DOIDEA=T +DOHUFF=T +DONNET=T +DOLU=T diff --git a/Changes b/Changes new file mode 100644 index 0000000..111d8bd --- /dev/null +++ b/Changes @@ -0,0 +1,42 @@ +This is about BYTE's beta version of the native-algorithm benchmark + +December 16, 1996: + +The source for DOS is obtainable at http://www.byte.com/bmark/bmark.htm +Linux adaptation written by Uwe F. Mayer + +February 7, 1997: + +added -DSOLARIS flag to support solaris + +November 11, 1997: + +added index split suggested by Andrew D. Balsa +re-baselined to a Linux machine +added checking of CPU-type at run-time (cpuinfo.c) +increased maximal number of loops in some tests +removed -DSOLARIS flag, works now automatically (this also removed the + compiler warnings about redefined types and leads to a 20% faster + code for "Bitfield" if compiled with -funroll-loops!) + +November 13-19, 1997: + +changed debugging information +changed random number generator to be always 32 bits even on 64 bit OSs +added data resets to Bitfield and Huffman +created this Changes file +added debug code for Bitfield + +December 6, 1997: + +got rid of cpuinfo.c +added a RESULTS file + +December 7, 1997: + +fixed the statistical analysis used to compute the confidence coefficient +fixed a bug in the DEBUG routine of "Assignment" + +December 11, 1997 +added some entries to RESULTS + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5045c77 --- /dev/null +++ b/Makefile @@ -0,0 +1,153 @@ +# Makefile for nbench, December 11, 1997, Uwe F. 
Mayer +# Updated February 18, 2003 + +default: nbench + +########################################################################## +# If you are using gcc-2.7.2.3 or earlier: +# The optimizer of gcc has a bug and in general you should not specify +# -funroll-loops together with -O (or -O2, -O3, etc.) +# This bug is supposed to be fixed with release 2.8 of gcc. +# +# This bug does NOT seem to have an effect on the correct compilation +# of this benchmark suite on my Linux box. However, it leads to +# the dreaded "internal compiler error" message on our alpha +# running DEC Unix 4.0b. The Linux-binary that was used to obtain +# the baseline results was nevertheless compiled with +# CFLAGS = -s -static -Wall -O3 -fomit-frame-pointer -funroll-loops +# +# You should leave -static in the CFLAGS so that your sysinfo can be +# compiled into the executable. + +CC = gcc + +# generic options for gcc +CFLAGS = -s -static -Wall -O3 + +# if your gcc lets you do it, then try this one +#CFLAGS = -s -static -Wall -O3 -fomit-frame-pointer -funroll-loops + +# for gcc on an older Pentium type processor you can try the following +#CFLAGS = -s -static -O3 -fomit-frame-pointer -Wall -m486 \ +# -fforce-addr -fforce-mem -falign-loops=2 -falign-functions=2 \ +# -falign-jumps=2 -funroll-loops + +# for a newer gcc on a newer Pentium type processor you can try the following +#CFLAGS = -s -static -O3 -fomit-frame-pointer -Wall -march=i686 \ +# -fforce-addr -fforce-mem -falign-loops=2 -falign-functions=2 \ +# -falign-jumps=2 -funroll-loops + +# for a newer gcc on an Athlon XP type processor you can try the following +#CFLAGS = -s -static -O3 -fomit-frame-pointer -Wall -march=athlon-xp \ +# -fforce-addr -fforce-mem -falign-loops=2 -falign-functions=2 \ +# -falign-jumps=2 -funroll-loops + +# For debugging using gcc +#CFLAGS = -g -O3 -Wall -DDEBUG + +########################################################################## +# For Linux machines with more than one binary format. 
+# The default binaries, depends on your system whether it's elf or aout. +MACHINE= +# a.out code for linux on an elf machine +#MACHINE= -bi486-linuxaout +# elf code for linux on an a.out machine +#MACHINE= -bi486-linuxelf +# if you want a different compiler version and different binaries, for example +#MACHINE= -V2.7.2 -bi486-linuxaout + +########################################################################## +# Read the file README.nonlinux if you are not using Linux + +# for DEC Unix using cc you can try +#CC = cc +#CFLAGS = -O3 +#LINKFLAGS = -s -non_shared + +# for SunOS using cc +#CC = cc +#CFLAGS = -O3 -s + +# for DEC Ultrix using cc +#CC = cc +#CFLAGS = -O2 +#LINKFLAGS = -s + +# for a Mac with OsX and the Darwin environment +#CC = cc +#CFLAGS = -O3 -DOSX + +# For debugging using cc +#CC = cc +#CFLAGS = -g -DDEBUG + +########################################################################## +# If your system does not understand the system command "uname -s -r" +# then comment this out + +# NO_UNAME= -DNO_UNAME + +########################################################################## +# For any Unix flavor you need -DLINUX +# You also need -DLINUX to get the new indices + +DEFINES= -DLINUX $(NO_UNAME) + +########################################################################## +# For LINUX-like systems with gcc +sysinfoc.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +sysinfo.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +########################################################################## +# For non-LINUX systems +# Edit the files sysinfo.c and sysinfoc.c to include your system information +# and take sysinfo.c and sysinfoc.c out of the dependencies for nbench0.o + +hardware.o: hardware.c hardware.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c hardware.c + +nbench0.o: nbench0.h nbench0.c nmglobal.h pointer.h hardware.h\ + Makefile sysinfo.c sysinfoc.c + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + 
-c nbench0.c + +emfloat.o: emfloat.h emfloat.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c emfloat.c + +pointer.h: pointer Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -o pointer pointer.c + rm -f pointer.h + if [ "4" = `./pointer` ] ; then touch pointer.h ;\ + else echo "#define LONG64" >pointer.h ; fi + +misc.o: misc.h misc.c Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c misc.c + +nbench1.o: nbench1.h nbench1.c wordcat.h nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench1.c + +sysspec.o: sysspec.h sysspec.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c sysspec.c + +nbench: emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) $(LINKFLAGS)\ + emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o\ + -o nbench -lm + +########################################################################## + +clean: + - /bin/rm -f *.o *~ \#* core a.out hello sysinfo.c sysinfoc.c \ + bug pointer pointer.h debugbit.dat + +mrproper: clean + - /bin/rm -f nbench diff --git a/NNET.DAT b/NNET.DAT new file mode 100644 index 0000000..5711730 --- /dev/null +++ b/NNET.DAT @@ -0,0 +1,210 @@ +5 7 8 +26 +0 0 1 0 0 +0 1 0 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 0 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 0 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 1 0 0 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 0 1 0 1 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 0 0 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 1 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 0 0 0 
+0 1 1 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 1 1 0 +0 1 0 0 1 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 0 1 0 +1 0 0 0 1 +1 0 0 1 0 +1 0 1 0 0 +1 1 0 0 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 0 1 0 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 1 1 0 0 +1 0 0 0 1 +1 1 0 1 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 1 0 1 +1 0 0 0 1 +1 1 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 1 1 +1 0 0 0 1 +0 1 0 0 1 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 1 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 1 0 0 0 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 0 1 1 +0 1 1 1 1 +0 1 0 1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 1 0 0 1 0 +0 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +0 1 1 1 0 +0 0 0 0 1 +0 0 0 0 1 +1 1 1 1 0 +0 1 0 1 0 0 1 1 +1 1 1 1 1 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 0 1 0 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +0 1 0 1 0 +0 1 0 1 0 1 1 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 +0 1 0 1 0 +1 0 0 0 1 +0 1 0 1 1 0 0 0 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 1 0 0 1 +1 1 1 1 1 +0 0 0 1 0 +0 0 0 1 0 +0 0 1 0 0 +0 1 0 0 0 +0 1 0 0 0 +1 1 1 1 1 +0 1 0 1 1 0 1 0 diff --git a/README b/README new file mode 100644 index 0000000..6863d46 --- /dev/null +++ b/README @@ -0,0 +1,66 @@ +February 18, 2003 +----------------- +Bug-fix release. 
+ +December 9, 1997 +---------------- +This release is based on beta release 2 of BYTE Magazine's BYTEmark +benchmark program (previously known as BYTE's Native Mode +Benchmarks). This document covers the Native Mode (a.k.a. Algorithm +Level) tests; benchmarks designed to expose the capabilities of a +system's CPU, FPU, and memory system. + +Running a "make" will create the binary if all goes well. It is called +"nbench" and performs a suite of 10 tests and compares the results to +a Dell Pentium 90 with 16 MB RAM and 256 KB L2 cache running MSDOS and +compiling with the Watcom 10.0 C/C++ compiler. If you define -DLINUX +during compilation (the default) then you also get a comparison to an +AMD K6/233 with 32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and +using a binary which was compiled with GNU gcc version 2.7.2.3 and GNU +libc-5.4.38. + +For more verbose output specify -v as an argument. + +The primary web site is: http://www.tux.org/~mayer/linux/bmark.html + +The port to Linux/Unix was done by Uwe F. Mayer <mayer@tux.org>. + +The index-split was done by Andrew D. Balsa, and reflects the +realization that memory management is important in CPU design. The +original tests have been left alone; however, the tests NUMERIC SORT, +FP EMULATION, IDEA, and HUFFMAN now constitute the integer-arithmetic +focused benchmark index, while the tests STRING SORT, BITFIELD, and +ASSIGNMENT make up the new memory index. + +The algorithms were not changed from the source which was obtained +from the BYTE web site at http://www.byte.com/bmark/bmark.htm on +December 14, 1996. However, the source was modified to better work +with 64-bit machines (in particular the random number generator was +modified to always work with 32 bit, no matter what kind of hardware +you run it on). Furthermore, for some of the algorithms additional +resets of the data were added to increase the consistency across +different hardware. Some extra debugging code was added, which has no +impact on normal runs.
+ +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run than on an +unloaded system. This is because the benchmark does some statistical +analysis to make sure that the reported results are statistically +significant, and an increased variation in individual runs requires +more runs to achieve the required statistical confidence. + +This is a single-threaded benchmark and is not designed to measure the +performance gain on multi-processor machines. + +For details and customization read bdoc.txt. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.motorola b/README.motorola new file mode 100644 index 0000000..223001b --- /dev/null +++ b/README.motorola @@ -0,0 +1,29 @@ +The information in this file is old and no longer valid. It seems that +the GNU C library has caught up with Motorola's libmoto, and now +performance is just as good (or better) without libmoto. I'll include +the old notice out of historical reasons only. Currently libmoto is +available at ftp://ftp.mcg.mot.com/pub/SPS/PowerPC/software/mklinux/libmoto/, +but this is subject to change and not under my control. + +February 18, 2003 +Uwe F. 
Mayer + +--------------------------------------------------------------------------- + +If you have a Motorola CPU or equivalent: + +When linked with the 'libmoto' (floating point library from Motorola) +the results you obtain are much better. (FPU index of 0.896 versus +1.910 in one example.) + +The Motorola math library is currently available at: +http://www.mot.com/SPS/PowerPC/support/rsw_customer_support/mklinux/libmoto/libmoto_reg_mkdev.html + +If you have a Motorola CPU and you submit a result then please let me +know whether you used libmoto or not. Please read the file README.submit. + +I do not have a Motorola CPU, and I can't help you with installing the +library either. + +December 3, 1997 +Uwe F. Mayer \ No newline at end of file diff --git a/README.nonlinux b/README.nonlinux new file mode 100644 index 0000000..641fe09 --- /dev/null +++ b/README.nonlinux @@ -0,0 +1,50 @@ +December 3, 1993 +================ + +DEC Unix 4.0 or DEC OSF1 and gcc +-------------------------------- +Compiles cleanly if you don't use -funroll-loops with gcc-2.7.2.3 or earlier + +DEC UNIX 4.0 or DEC OSF1 and cc +------------------------------- +CC = cc +CFLAGS = -O3 +LINKFLAGS = -s -non_shared + +Compiles cleanly. 
+ +SunOS and gcc +------------- +Compiles cleanly + +SunOS and cc +------------ +CC = cc +CFLAGS = -O3 -s + +Compiles with one warning during compilation of nbench1.c + +"/usr/ucbinclude/strings.h", line 48: warning: identifier redeclared: strlen + current : function() returning int + previous: function() returning uint : "/usr/include/string.h", line 98 + +HP-UX and gcc +------------- +Compiles with one warning during compilation of sysspec.c + +In file included from /usr/local/lib/gcc-lib/hppa1.1-hp-hpux9.05/2.7.2.1/include/malloc.h:9, + from sysspec.h:37, + from sysspec.c:37: +/usr/local/lib/gcc-lib/hppa1.1-hp-hpux9.05/2.7.2.1/include/sys/types.h:117: warning: empty declaration +/usr/local/lib/gcc-lib/hppa1.1-hp-hpux9.05/2.7.2.1/include/sys/types.h:118: warning: empty declaration + +DEC Ultrix and cc +----------------- +CC = cc +CFLAGS = -O2 +LINKFLAGS = -s + +Compiles with a warning about the correct usage of cut when running sysinfo.sh +cut: Usage: cut [-s] [-d] {-c | -f} file ... +cut: Usage: cut [-s] [-d] {-c | -f} file ... + diff --git a/README.submit b/README.submit new file mode 100644 index 0000000..0dd3138 --- /dev/null +++ b/README.submit @@ -0,0 +1,33 @@ +I plan on posting a digest of results in case people mail me any. +The URL will be linked to + +http://www.tux.org/~mayer/linux/bmark.html + +If you want to submit, then run the benchmark (use your own +compilation, I don't care with what flags or compiler, but I want all +numbers from a single benchmark run) and fill in the template as given +in the example below: + +CPU : AMD 5x86P75 (486DX4/133MHz) +L2 CACHE : 256 KB +OS : Linux 2.0.32 +C COMPILER : gcc 2.7.2.3 +LIBC : libc-5.4.38 +Pentium 90 INTEGER INDEX : 1.051 +Pentium 90 FLOATING-POINT INDEX : 0.450 +AMD K6/233 MEMORY INDEX : 0.337 +AMD K6/233 INTEGER INDEX : 0.238 +AMD K6/233 FLOATING-POINT INDEX : 0.230 + +Any other format is fine as long as it contains the same info (write +"unknown" or "?" for data you don't know). 
For example, you could just +cut the summary from the output of nbench and mail it together with +cache, CPU, and OS info in case it is not already present. Please do +not email me the complete output of nbench, or any other unnecessarily +long email, as this just eats up my hard-disk space. However, long +collections of results are of course welcome. + +Send your result to mayer@tux.org + +Uwe F. Mayer +February 18, 2003 diff --git a/RESULTS b/RESULTS new file mode 100644 index 0000000..ccf2336 --- /dev/null +++ b/RESULTS @@ -0,0 +1,138 @@ +December 7, 1997 + +This file contains a few results so you may compare your machine. +If you read this much after December 1997 then the results herein +are probably obsolete. + +For a longer and hopefully more up-to-date list of results consult +http://www.tux.org/~mayer/linux/bmark.html +This web site, however, currently lists the old Pentium 90 indices! + +The indices below are with respect to the new AMD K6/233 baseline. + +OS : DEC Ultrix 4.4 +C compiler : cc +libc : unknown version +CPU : mips R6000 +L2 cache : ? +MEMORY INDEX : 0.029 +INTEGER INDEX : 0.046 +FLOATING-POINT INDEX: 0.077 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel 486DX2/66 MHz +L2 cache : 256 KB +MEMORY INDEX : 0.098 +INTEGER INDEX : 0.141 +FLOATING-POINT INDEX: 0.116 + +OS : LINUX 2.0.32 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : AMD 5x86P75 (486DX4/133MHz) +L2 cache : 256 KB +MEMORY INDEX : 0.234 +INTEGER INDEX : 0.286 +FLOATING-POINT INDEX: 0.249 + +OS : OSF1 V3.2 214 +C compiler : cc +libc : unknown version +CPU : 21064 alpha (DEC 3000 MODEL 300, year 1993) +L2 cache : 256 KB +MEMORY INDEX : 0.358 +INTEGER INDEX : 0.362 +FLOATING-POINT INDEX: 0.656 + +OS : HP-UX A.09.05 +C compiler : gcc version 2.7.2.1 +libc : unknown version +CPU : 9000/715 +L2 cache : ? 
+MEMORY INDEX : 0.208 +INTEGER INDEX : 0.369 +FLOATING-POINT INDEX: 0.516 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel Pentium 133 MHz +L2 cache : 512 KB +MEMORY INDEX : 0.383 +INTEGER INDEX : 0.444 +FLOATING-POINT INDEX: 0.632 + +OS : SunOS 5.5.1 +C compiler : cc +libc : unknown version +CPU : SUN-Ultra-Enterprise-2 sparc +L2 cache : ? +MEMORY INDEX : 0.417 +INTEGER INDEX : 0.546 +FLOATING-POINT INDEX: 1.028 + +OS : LINUX 2.0.29 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Cyrix 6x86L PR200+ (at 2 x 75 = 150 MHz) +L2 cache : 256 KB +MEMORY INDEX : 0.666 +INTEGER INDEX : 0.599 +FLOATING-POINT INDEX: 0.508 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel Pentium MMX 200 MHz +L2 cache : 512 KB +MEMORY INDEX : 0.601 +INTEGER INDEX : 0.636 +FLOATING-POINT INDEX: 0.970 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel 686 PentiumPro 200 MHz +L2 cache : 256 KB (internal) +MEMORY INDEX : 0.699 +INTEGER INDEX : 0.732 +FLOATING-POINT INDEX: 1.140 + +OS : LINUX 2.0.29 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Cyrix 6x86MX PR233 (at 2.5 x 75 = 187.5 MHz) +L2 cache : 512 KB +MEMORY INDEX : 0.861 +INTEGER INDEX : 0.773 +FLOATING-POINT INDEX: 0.730 + +OS : LINUX 2.0.32 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : AMD K6/233 +L2 cache : 512 KB +MEMORY INDEX : 1.000 +INTEGER INDEX : 1.000 +FLOATING-POINT INDEX: 1.000 + +OS : LINUX 2.0.31 +C compiler : gcc version 2.7.2.3 +libc : libc.so.5.4.38 +CPU : Intel 686 Pentium II 300 MHz +L2 cache : 512 KB +MEMORY INDEX : 1.255 +INTEGER INDEX : 1.093 +FLOATING-POINT INDEX: 1.842 + +OS : DEC UNIX 4.0b 564 +C compiler : cc +libc : unknown version +CPU : 21164 Alpha 300 MHz (dual CPU) +L2 cache : 96 KB +L3 cache : 4 MB per CPU +MEMORY INDEX : 0.973 +INTEGER INDEX : 1.124 +FLOATING-POINT INDEX: 3.237 diff --git a/bdoc.txt b/bdoc.txt new file mode 100644 
index 0000000..e557bb0 --- /dev/null +++ b/bdoc.txt @@ -0,0 +1,2109 @@ +http://www.byte.com/bmark/bmark.htm +---------------------------------------------------------------------------- + +BYTEmark + +---------------------------------------------------------------------------- + +This is release 2 of BYTE Magazine's BYTEmark benchmark program (previously +known as BYTE's Native Mode Benchmarks). This document covers the Native +Mode (a.k.a. Algorithm Level) tests; benchmarks designed to expose the +capabilities of a system's CPU, FPU, and memory system. Another group of +benchmarks within the BYTEmark suite includes the Application Simulation +Benchmarks. They are detailed in a separate document. [NOTE: The +documentation for the Application simulation benchmarks should appear before +the end of March, 95. -- RG]. + +The Tests + +The Native Mode portion of the BYTEmark consists of a number of well-known +algorithms; some BYTE has used before in earlier versions of the benchmark, +others are new. The complete suite consists of 10 tests: + +Numeric sort - Sorts an array of 32-bit integers. + +String sort - Sorts an array of strings of arbitrary length. + +Bitfield - Executes a variety of bit manipulation functions. + +Emulated floating-point - A small software floating-point package. + +Fourier coefficients - A numerical analysis routine for calculating series +approximations of waveforms. + +Assignment algorithm - A well-known task allocation algorithm. + +Huffman compression - A well-known text and graphics compression algorithm. + +IDEA encryption - A relatively new block cipher algorithm. + +Neural Net - A small but functional back-propagation network simulator. + +LU Decomposition - A robust algorithm for solving linear equations. + +A more complete description of each test can be found in later sections of +this document. + +BYTE built the BYTEmark with the multiplatform world foremost in mind. 
There +were, of course, other considerations that we kept high on the list: + +Real-world algorithms. The algorithms should actually do something. Previous +benchmarks often moved gobs of bytes from one point to another, added or +subtracted piles and piles of numbers, or (in some cases) actually executed +NOP instructions. We should not belittle those tests of yesterday, they had +their place. However, we think it better that tests be based on activities +that are more complex in nature. + +Easy to port. All the benchmarks are written in "vanilla" ANSI C. This +provides us with the best chance of moving them quickly and accurately to +new processors and operating systems as they appear. It also simplifies +maintenance. + +This means that as new 64-bit (and, perhaps, 128-bit) processors appear, the +benchmarks can test them as soon as a compiler is available. + +Comprehensive. The algorithms were derived from a variety of sources. Some +are routines that BYTE had been using for some time. Others are routines +derived from well-known texts in the computer science world. Furthermore, +the algorithms differ in structure. Some simply "walk" sequentially through +one-dimensional arrays. Others build and manipulate two-dimensional arrays. +Finally, some benchmarks are "integer" tests, while others exercise the +floating-point coprocessor (if one is available). + +Scalable. We wanted these benchmarks to be useful across as wide a variety +of systems as possible. We also wanted to give them a lifetime beyond the +next wave of new processors. + +To that end, we incorporated "dynamic workload adjustment." A complete +description of this appears in a later section. In a nutshell, this allows +the tests to "expand or contract" depending on the capabilities of the +system under test, all the while providing consistent results so that fair +and accurate comparisons are possible. 
+ +Honesty In Advertising + +We'd be lying if we said that the BYTEmark was all the benchmarking that +anyone would ever need to run on a system. It would be equally inaccurate to +suggest that the tests are completely free of inadequacies. There are many +things the tests do not do, there are shortcomings, and there are problems. + +BYTE will continue to improve the BYTEmark. The source code is freely +available, and we encourage vendors and users to examine the routines and +provide us with their feedback. In this way, we assure fairness, +comprehensiveness, and accuracy. + +Still, as we mentioned, there are some shortcomings. Here are those we +consider the most significant. Keep them in mind as you examine the results +of the benchmarks now and in the future. + +At the mercy of C compilers. Being written in ANSI C, the benchmark program +is highly portable. This is a reflection of the "world we live in." If this +were a one-processor world, we might stand a chance at hand-crafting a +benchmark in assembly language. (At one time, that's exactly what BYTE did.) +Not today, no way. + +The upshot is that the benchmarks must be compiled. For broadest coverage, +we selected ANSI C. And when they're compiled, the resulting executable's +performance can be highly dependent on the capabilities of the C compiler. +Today's benchmark results can be blown out of the water tomorrow if someone +new enters the scene with an optimizing strategy that outperforms existing +competition. + +This concern is not easily waved off. It will require you to keep careful +track of compiler version and optimization switches. As BYTE builds its +database of benchmark results, version number and switch setting will become +an integral part of that data. This will be true for published information +as well, so that you can make comparisons fairly and accurately. BYTE will +control the distribution of test results so that all relevant compiler +information is attached to the data. 
+ +As a faint justification -- for those who think this situation results in +"polluted" tests -- we should point out that we are in the same boat as all +the other developers (at least, all those using C compilers -- and that's +quite a sizeable group). If the only C compilers for a given system happen +to be poor ones, everyone suffers. It's a fact that a given platform's +ultimate potential depends as much on the development software available as +on the technical achievements of the hardware design. + +It's just CPU and FPU. It's very tempting to try to capture the performance +of a machine in a single number. That has never been possible -- though it's +been tried a lot -- and the gap between that ideal and reality will forever +widen. + +These benchmarks are meant to expose the theoretical upper limit of the CPU, +FPU, and memory architecture of a system. They cannot measure video, disk, +or network throughput (those are the domains of a different set of +benchmarks). You should, therefore, use the results of these tests as part, +not all, of any evaluation of a system. + +Single threaded. Currently, each benchmark test uses only a single execution +thread. It's unlikely that you'll find any modern operating system that does +not have some multitasking component. How a system "scales" as more tasks +are run simultaneously is an effect that the current benchmarks cannot +explore. + +BYTE is working on a future version of the tests that will solve this +problem. + +The tests are synthetic. This quite reasonable argument is based on the fact +that people don't run benchmarks for a living, they run applications. +Consequently, the only true measure of a system is how well it performs +whatever applications you will be running. This, in fact, is the philosophy +behind the BAPCo benchmarks. + +This is not a point with which we would disagree. BYTE regularly makes use +of a variety of application benchmarks. 
None of this suggests, however, that +the BYTEmark benchmarks serve no purpose. + +BYTEmark's results should be used as predictors. They can be moved to a new +platform long before native applications will be ported. The BYTEmark +benchmarks will therefore provide an early look at the potential of the +machine. Additionally, the BYTEmark permits you to "home in" on an aspect of +the overall architecture. How well does the system perform when executing +floating-point computations? Does its memory architecture help or hinder the +management of memory buffers that may fall on arbitrary address boundaries? +How does the cache work with a program whose memory access favors moving +randomly through memory as opposed to moving sequentially through memory? + +The answers to these questions can give you a good idea of how well a system +would support a particular class of applications. Only a synthetic benchmark +can give the narrow view necessary to find the answers. + +Dynamic Workloads + +Our long history of benchmarking has taught us one thing above all others: +Tomorrow's system will go faster than today's by an amount exceeding your +wildest guess -- and then some. Dealing with this can become an unending +race. + +It goes like this: You design a benchmark algorithm, you specify its +parameters (how big the array is, how many loops, etc.), you run it on +today's latest super-microcomputer, collect your data, and go home. A new +machine arrives the next day, you run your benchmark, and discover that the +test executes so quickly that the resolution of the clock routine you're +using can't keep up with it (i.e., the test is over and done before the +system clock even has a chance to tick). + +If you modify your routine, the figures you collected yesterday are no good. +If you create a better clock routine by sneaking down into the system +hardware, you can kiss portability goodbye. 
+ +The BYTEmark benchmarks solve this problem by a process we'll refer to as +"dynamic workload adjustment." In principle, it simply means that if the +test runs so fast that the system clock can't time it, the benchmark +increases the test workload -- and keeps increasing it -- until enough time +is consumed to gather reliable test results. + +Here's an example. + +The BYTEmark benchmarks perform timing using a "stopwatch" paradigm. The +routine StartStopwatch() begins timing; StopStopwatch() ends timing and +reports the elapsed time in clock ticks. Now, "clock ticks" is a value that +varies from system to system. We'll presume that our test system provides +1000 clock ticks per second. (We'll also presume that the system actually +updates its clock 1000 times per second. Surprisingly, some systems don't do +that. One we know of will tell you that the clock provides 100 ticks per +second, but updates the clock in 5- or 6-tick increments. The resolution is +no better than somewhere around 1/18th of a second.) Here, when we say +"system" we mean not only the computer system, but the environment provided +by the C compiler. Interestingly, different C compilers for the same system +will report different clock ticks per second. + +Built into the benchmarks is a global variable called GLOBALMINTICKS. This +variable is the minimum number of clock ticks that the benchmark will allow +StopStopwatch() to report. + +Suppose you run the Numeric Sort benchmark. The benchmark program will +construct an array filled with random numbers, call StartStopwatch(), sort +the array, and call StopStopwatch(). If the time reported in StopStopwatch() +is less than GLOBALMINTICKS, then the benchmark will build two arrays, and +try again. If sorting two arrays took less time than GLOBALMINTICKS, the +process repeats with more arrays. + +This goes on until the benchmark makes enough work so that an interval +between StartStopwatch() and StopStopwatch() exceeds GLOBALMINTICKS. 
Once +that happens, the test is actually run, and scores are calculated. + +Notice that the benchmark didn't make bigger arrays, it made more arrays. +That's because the time taken by the sort test does not increase linearly as +the array grows, it increases by a factor of N*log(N) (where N is the size +of the array). + +This principle is applied to all the benchmark tests. A machine with a less +accurate clock may be forced to sort more arrays at a time, but the results +are given in arrays per second. In this way fast machines, slow machines, +machines with accurate clocks, machines with less accurate clocks, can all +be tested with the same code. + +Confidence Intervals + +Another built-in feature of the BYTEmark is a set of statistical-analysis +routines. Running benchmarks is one thing; the question arises as to how +many times should a test be run until you know you have a good sampling. +Also, can you determine whether the test is stable (i.e., do results vary +widely from one execution of the benchmark to the next)? + +The BYTEmark keeps score as follows: Each test (a test being a numeric +sort, a string sort, etc.) is run five times. These five scores are +averaged, the standard deviation is determined, and a 95% confidence +half-interval for the mean is calculated (using the student t +distribution). This tells us that the true average lies -- with a 95% +probability -- within plus or minus the confidence half-interval of +the calculated average. If this half-interval is within 5% of the +calculated average, the benchmarking stops. Otherwise, a new test is +run and the calculations are repeated with all of the runs done so +far, including the new one. The benchmark proceeds this way up to a +total of 30 runs. If the length of the half-interval is still bigger +than 5% of the calculated average then a warning is issued that the +results might not be statistically certain before the average is +displayed. + +** Fixed a statistical bug here. Uwe F.
Mayer + +The upshot is that, for each benchmark test, the true average is -- with a +95% level of confidence -- within 5% of the average reported. Here, the +"true average" is the average we would get were we able to run the tests +over and over again an infinite number of times. + +This specification ensures that the calculation of results is controlled; +that someone running the tests in California will use the same technique for +determining benchmark results as someone running the tests in New York. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run the benchmark suite +as compared to a run on an unloaded system. This is because the benchmark does +some statistical analysis to make sure that the reported results are +statistically significant (as explained above), and a high variation in +individual runs requires more runs to achieve the required statistical +confidence. + +*** added the last paragraph, Uwe F. Mayer + +Interpreting Results + +Of course, running the benchmarks can present you with a boatload of data. +It can get mystifying, and some of the more esoteric statistical information +is valuable only to a limited audience. The big question is: What does it +all mean? + +First, we should point out that the BYTEmark reports both "raw" and indexed +scores for each test. The raw score for a particular test amounts to the +"iterations per second" of that test. For example, the numeric sort test +reports as its raw score the number of arrays it was able to sort per +second. + +The indexed score is the raw score of the system under test divided by the +raw score obtained on the baseline machine. As of this release, the +baseline machine is a DELL 90 Mhz Pentium XPS/90 with 16 MB of RAM and 256K +of external processor cache.
(The compiler used was the Watcom C/C++ 10.0 +compiler; optimizations set to "fastest possible code", 4-byte structure +alignment, Pentium code generation with Pentium register-based calling. The +operating system was MSDOS.) The indexed score serves to "normalize" the +raw scores, reducing their dynamic range and making them easier to +grasp. Simply put, if your machine has an index score of 2.0 on the numeric +sort test, it performed that test twice as fast as this 90 Mhz Pentium. + +If you run all the tests (as you'll see, it is possible to perform "custom +runs", which execute only a subset of the tests) the BYTEmark will also +produce two overall index figures: Integer index and Floating-point index. +The Integer index is the geometric mean of those tests that involve only +integer processing -- numeric sort, string sort, bitfield, emulated +floating-point, assignment, Huffman, and IDEA -- while the Floating-point +index is the geometric mean of those tests that require the floating-point +coprocessor -- Fourier, neural net, and LU decomposition. You can use these +scores to get a general feel for the performance of the machine under test +as compared to the baseline 90 Mhz Pentium. + +The Linux/Unix port has a second baseline machine; it is an AMD K6/233 with +32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and using GNU gcc +version 2.7.2.3 and libc-5.4.38. The integer index was split as suggested +by Andrew D. Balsa, and reflects the realization that +memory management is important in CPU design. The original tests have been +left alone; however, the geometric mean of the tests NUMERIC SORT, FP +EMULATION, IDEA, and HUFFMAN now constitutes the integer-arithmetic focused +benchmark index, while the geometric mean of the tests STRING SORT, +BITFIELD, and ASSIGNMENT makes up the new memory index. The floating point +index has been left alone; it is still the geometric mean of FOURIER, +NEURAL NET, and LU DECOMPOSITION.
+ +*** added the section on Linux, Uwe F. Mayer + +What follows is a list of the benchmarks and associated brief remarks that +describe what the tests do: What they exercise; what a "good" result or a +"bad" result means. Keep in mind that, in this expanding universe of faster +processors, bigger caches, more elaborate memory architectures, "good" and +"bad" are indeed relative terms. A good score on today's hot new processor +will be a bad score on tomorrow's hot new processor. + +These remarks are based on empirical data and profiling that we have done to +date. (NOTE: The profiling is limited to Intel and Motorola 68K on this +release. As more data is gathered, we will be refining this section. +3/14/95--RG) + +Benchmark Description + +Numeric sort Generic integer performance. Should + exercise non-sequential performance + of cache (or memory if cache is less + than 8K). Moves 32-bit longs at a + time, so 16-bit processors will be + at a disadvantage. + + + +String sort Tests memory-move performance. + Should exercise non-sequential + performance of cache, with added + burden that moves are byte-wide and + can occur on odd address boundaries. + May tax the performance of + cell-based processors that must + perform additional shift operations + to deal with bytes. + + + +Bitfield Exercises "bit twiddling" + performance. Travels through memory + in a somewhat sequential fashion; + different from sorts in that data is + merely altered in place. If + properly compiled, takes into + account 64-bit processors, which + should see a boost. + + + +Emulated F.P. Past experience has shown this test + to be a good measurement of overall + performance. + + + +Fourier Good measure of transcendental and + trigonometric performance of FPU. + Little array activity, so this test + should not be dependent on cache or + memory architecture. + + + +Assignment The test moves through large integer + arrays in both row-wise and + column-wise fashion.
Cache/memory + with good sequential performance + should see a boost (memory is + altered in place -- no moving as in + a sort operation). Processing is + done in 32-bit chunks -- no + advantage given to 64-bit + processors. + + + +Huffman A combination of byte operations, + bit twiddling, and overall integer + manipulation. Should be a good + general measurement. + + + +IDEA Moves through data sequentially in + 16-bit chunks. Should provide a + good indication of raw speed. + + + +Neural Net Small-array floating-point test + heavily dependent on the exponential + function; less dependent on overall + FPU performance. Small arrays, so + cache/memory architecture should not + come into play. + + + +LU decomposition. A floating-point test that moves + through arrays in both row-wise and + column-wise fashion. Exercises only + fundamental math operations (+, -, + *, /). + +The Command File + +Purpose + +The BYTEmark program allows you to override many of its default parameters +using a command file. The command file also lets you request statistical +information, as well as specify an output file to hold the test results for +later use. + +You identify the command file using a command-line argument. E.G., + +C:NBENCH -cCOMFILE.DAT + +tells the benchmark program to read from COMFILE.DAT in the current +directory. + +The content of the command file is simply a series of parameter names and +values, each on a single line. The parameters control internal variables +that are either global in nature (i.e., they affect all tests in the +program) or are specific to a given benchmark test.
+ +The parameters are listed in a reference guide that follows, arranged in the +following groups: + +Global Parameters + +Numeric Sort + +String Sort + +Bitfield + +Emulated floating-point + +Fourier coefficients + +Assignment algorithm + +IDEA encryption + +Huffman compression + +Neural net + +LU decomposition + +As mentioned above, those items listed under "Global Parameters" affect all +tests; the rest deal with specific benchmarks. There is no required ordering +to parameters as they appear in the command file. You can specify them in +any sequence you wish. + +You should be judicious in your use of a command file. Some parameters will +override the "dynamic workload" adjustment that each test performs. Doing +this completely bypasses the benchmark code that is designed to produce an +accurate reading from your system clock. Other parameters will alter default +settings, yielding test results that cannot be compared with published +benchmark results. + +A Sample Command File + +Suppose you built a command file that contained the following: + +ALLSTATS=T + +CUSTOMRUN=T + +OUTFILE=D:\DATA.DAT + +DONUMSORT=T + +DOLU=T + +Here's what this file tells the benchmark program: + +ALLSTATS=T means that you've requested a "dump" of all the statistics the +test gathers. This includes not only the standard deviations of tests run, +it also produces test-specific information such as the number of arrays +built, the array size, etc. + +CUSTOMRUN=T tells the system that this is a custom run. Only tests +explicitly specified will be executed. + +OUTFILE=D:\DATA.DAT will write the output of the benchmark to the file +DATA.DAT on the root of the D: drive. (If DATA.DAT already exists, output +will be appended to the file.) + +DONUMSORT=T tells the system to run the numeric sort benchmark. (This was +necessary on account of the CUSTOMRUN=T line, above.) + +DOLU=T tells the system to run the LU decomposition benchmark. 
+ +Command File Parameters Reference + +(NOTE: Altering some global parameters can invalidate results for comparison +purposes. Those parameters are indicated in the following section by a bold +asterisk (*). If you alter any parameters so indicated, you may NOT publish +the resulting data as BYTEmark scores.) + +Global Parameters + +GLOBALMINTICKS= + +This overrides the default global_min_ticks value (defined in NBENCH1.H). +The global_min_ticks value is defined as the minimum number of clock ticks +per iteration of a particular benchmark. For example, if global_min_ticks is +set to 100 and the numeric sort benchmark is run; each iteration MUST take +at least 100 ticks, or the system will expand the work-per-iteration. + +MINSECONDS= + +Sets the minimum number of seconds any particular test will run. This has +the effect of controlling the number of repetitions done. Default: 5. + +ALLSTATS= + +Set this flag to T for a "dump" of all statistics. The information displayed +varies from test to test. Default: F. + +OUTFILE= + +Specifies that output should go to the specified output file. Any test +results and statistical data displayed on-screen will also be written to the +file. If the file does not exist, it will be created; otherwise, new output +will be appended to an existing file. This allows you to "capture" several +runs into a single file for later review. + +Note: the path should not appear in quotes. For example, something like the +following would work: OUTFILE=C:\BENCH\DUMP.DAT + +CUSTOMRUN= + +Set this flag to T for a custom run. A "custom run" means that the program +will run only the benchmark tests that you explicitly specify. So, use this +flag to run a subset of the tests. Default: F. + +Numeric Sort + +DONUMSORT= + +Indicates whether to do the numeric sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case default is F. + +NUMNUMARRAYS= + +Indicates the number of numeric arrays the system will build. 
Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +NUMARRAYSIZE= + +Indicates the number of elements in each numeric array. Default is 8001 +entries. (NOTE: Altering this value will invalidate the test for comparison +purposes. The performance of the numeric sort test is not related to the +array size as a linear function; i.e., an array twice as big will not take +twice as long. The relationship involves a logarithmic function.)* + +NUMMINSECONDS= + +Overrides MINSECONDS for the numeric sort test. + +String Sort + +DOSTRINGSORT= + +Indicates whether to do the string sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +STRARRAYSIZE= + +Sets the size of the string array. Default is 8111. (NOTE: Altering this +value will invalidate the test for comparison purposes. The performance of +the string sort test is not related to the array size as a linear function; +i.e., an array twice as big will not take twice as long. The relationship +involves a logarithmic function.)* + +NUMSTRARRAYS= + +Sets the number of string arrays that will be created to run the test. +Setting this value will override the program's "dynamic workload" adjustment +for this test.* + +STRMINSECONDS= + +Overrides MINSECONDS for the string sort test. + +Bitfield + +DOBITFIELD= + +Indicates whether to do the bitfield test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NUMBITOPS= + +Sets the number of bitfield operations that will be performed. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +BITFIELDSIZE= + +Sets the number of 32-bit elements in the bitfield arrays. The default value +is dependent on the size of a long as defined by the current compiler. For a +typical compiler that defines a long to be 32 bits, the default is 32768. 
+(NOTE: Altering this parameter will invalidate test results for comparison +purposes.)* + +BITMINSECONDS= + +Overrides MINSECONDS for the bitfield test. + +Emulated floating-point + +DOEMF= + +Indicates whether to do the emulated floating-point test. Default is T, +unless this is a custom run (CUSTOMRUN=T), in which case the default is F. + +EMFARRAYSIZE= + +Sets the size (number of elements) of the emulated floating-point benchmark. +Default is 3000. The test builds three arrays, each of equal size. This +parameter sets the number of elements for EACH array. (NOTE: Altering this +parameter will invalidate test results for comparison purposes.)* + +EMFLOOPS= + +Sets the number of loops per iteration of the floating-point test. Setting +this value will override the program's "dynamic workload" adjustment for +this test.* + +EMFMINSECONDS= + +Overrides MINSECONDS for the emulated floating-point test. + +Fourier coefficients + +DOFOUR= + +Indicates whether to do the Fourier test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +FOURASIZE= + +Sets the size of the array for the Fourier test. This sets the number of +coefficients the test will derive. NOTE: Specifying this value will override +the system's "dynamic workload" adjustment for this test, and may make the +results invalid for comparison purposes.* + +FOURMINSECONDS= + +Overrides MINSECONDS for the Fourier test. + +Assignment Algorithm + +DOASSIGN= + +Indicates whether to do the assignment algorithm test. Default is T, unless +this is a custom run (CUSTOMRUN=T), in which case the default is F. + +ASSIGNARRAYS= + +Indicates the number of arrays that will be built for the test. Specifying +this value will override the system's "dynamic workload" adjustment for this +test. (NOTE: The size of the arrays in the assignment algorithm is fixed at +101 x 101. 
Altering the array size requires adjusting global constants and +recompiling; to do so, however, would invalidate test results.)* + +ASSIGNMINSECONDS= + +Overrides MINSECONDS for the assignment algorithm test. + +IDEA encryption + +DOIDEA= + +Indicates whether to do the IDEA encryption test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +IDEAARRAYSIZE= + +Sets the size of the plain-text character array that will be encrypted by the +test. Default is 4000. The benchmark actually builds 3 arrays: 1st +plain-text, encrypted version, and 2nd plain-text. The 2nd plain-text array is +the destination for the decryption process [part of the test]. All arrays +are set to the same size. (NOTE: Specifying this value will invalidate test +results for comparison purposes.)* + +IDEALOOPS= + +Indicates the number of loops in the IDEA test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +IDEAMINSECONDS= + +Overrides MINSECONDS for the IDEA test. + +Huffman compression + +DOHUFF= + +Indicates whether to do the Huffman test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +HUFFARRAYSIZE= + +Sets the size of the string buffer that will be compressed using the Huffman +test. The default is 5000. (NOTE: Altering this value will invalidate test +results for comparison purposes.)* + +HUFFLOOPS= + +Sets the number of loops in the Huffman test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +HUFFMINSECONDS= + +Overrides MINSECONDS for the Huffman test. + +Neural net + +DONNET= + +Indicates whether to do the Neural Net test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NNETLOOPS= + +Sets the number of loops in the Neural Net test. 
NOTE: Altering this value +overrides the benchmark's "dynamic workload" adjustment algorithm, and may +invalidate the results for comparison purposes.* + +NNETMINSECONDS= + +Overrides MINSECONDS for the Neural Net test. + +LU decomposition + +DOLU= + +Indicates whether to do the LU decomposition test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +LUNUMARRAYS= + +Sets the number of arrays in each iteration of the LU decomposition test. +Specifying this value will override the system's "dynamic workload" +adjustment for this test.* + +LUMINSECONDS= + +Overrides MINSECONDS for the LU decomposition test. + +Numeric Sort + +Description + +This benchmark is designed to explore how well the system sorts a numeric +array. In this case, a numeric array is a one-dimensional collection of +signed, 32-bit integers. The actual sorting is performed by a heapsort +algorithm (see the text box following for a description of the heapsort +algorithm). + +It's probably unnecessary to point out (but we'll do it anyway) that sorting +is a fundamental operation in computer application software. You'll likely +find sorting routines nestled deep inside a variety of applications; +everything from database systems to operating-systems kernels. + +The numeric sort benchmark reports the number of arrays it was able to sort +per second. The array size is set by a global constant (it can be overridden +by the command file -- see below). + +Analysis + +Optimized 486 code: Profiling of the numeric sort benchmark using Watcom's +profiler (Watcom C/C++ 10.0) indicates that the algorithm spends most of its +time in the numsift() function (specifically, about 90% of the benchmark's +time takes place in numsift()). 
Within numsift(), two if statements dominate +time spent: + +if(array[k] can be used to change this value, but results produced by +doing this will make your results incompatible with other runs of the +benchmark (since results will be skewed -- see preceding paragraph). + +To test for a correct execution of the numeric sort benchmark, #define the +DEBUG symbol. This will enable code that verifies that arrays are properly +sorted. You should run the benchmark program using a command file that has +only the numeric sort test enabled. If there is an error, the program will +display "SORT ERROR" (If this happens, it's possible that tons of "SORT +ERROR" messages will be emitted, so it's best not to redirect output to a +file), otherwise it will print "Numeric sort: OK" (also quite a few times). + +References + +Gonnet, G.H. 1984, Handbook of Algorithms and Data Structures (Reading, MA: +Addison-Wesley). + +Knuth, Donald E. 1968, Fundamental Algorithms, vol 1 of The Art of Computer +Programming (Reading, MA: Addison-Wesley). + +Press, William H., Flannery, Brian P., Teukolsky, Saul A., and Vetterling, +William T. 1989, Numerical Recipes in Pascal (Cambridge: Cambridge +University Press). + +Heapsort + +The heapsort algorithm is well-covered in a number of the popular +computer-science textbooks. In fact, it gets a pat on the back in Numerical +Recipes (Press et. al.), where the authors write: + +Heapsort is our favorite sorting routine. It can be recommended +wholeheartedly for a variety of sorting applications. It is a true +"in-place" sort, requiring no auxiliary storage. + +Heapsort works by building the array into a kind of a queue called a heap. +You can imagine this heap as being a form of in-memory binary tree. The +topmost (root) element of the tree is the element that -- were the array +sorted -- would be the largest element in the array. 
Sorting takes place by +first constructing the heap, then pulling the root off the tree, promoting +the next largest element to the root, pulling it off, and so on. (The +promotion process is known as "sifting up.") + +Heapsort executes in N log2 N time even in its worst case. Unlike some other +sorting algorithms, it does not benefit from a partially sorted array +(though Gonnet does refer to a variation of heapsort, called "smoothsort," +which does -- see references). + +String Sort + +Description + +This benchmark is designed to gauge how well the system moves bytes around. +By that we mean, how well the system can copy a string of bytes from one +location to another; source and destination being aligned to arbitrary +addresses. (This is unlike the numeric sort array, which moves bytes +longword-at-a-time.) The strings themselves are built so as to be of random +length, ranging from no fewer than 4 bytes and no greater than 80 bytes. The +mixture of random lengths means that processors will be forced to deal with +strings that begin and end on arbitrary address boundaries. + +The string sort benchmark uses the heapsort algorithm; this is the same +algorithm as is used in the numeric sort benchmark (see the sidebar on the +heapsort for a detailed description of the algorithm). + +Manipulation of the strings is actually handled by two arrays. One array +holds the strings themselves; the other is a pointers array. Each member of +the pointers array carries an offset that points into the string array, so +that the ith pointer carries the offset to the ith string. This allows the +benchmark to rapidly locate the position of the ith string. (The sorting +algorithm requires exchanges of items that might be "distant" from one +another in the array. It's critical that the routine be able to rapidly find +a string based on its indexed position in the array.) + +The string sort benchmark reports the number of string arrays it was able to +sort per second. 
The size of the array is set by a global constant. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): Profiling of the string sort +benchmark indicates that it spends most of its time in the C library routine +memmove(). Within that routine, most of the execution is consumed by a pair +of instructions: rep movsw and rep movsd. These are repeated string move -- +word width and repeated string move -- doubleword width, respectively. + +This is precisely where we want to see the time spent. It's interesting to +note that the memmove() of the particular compiler/profiler tested (Watcom +C/C++ 10.0) was "smart" enough to do most of the moving on word or +doubleword boundaries. The string sort benchmark specifically sets arbitrary +boundaries, so we'd expect to see lots of byte-wide moves. The "smart" +memmove() is able to move bytes only when it has to, and does the remainder +of the work via words and doublewords (which can move more bits at a time). + +680x0 Code (Macintosh CodeWarrior): Because CodeWarrior's profiler is +function based, it is impossible to get an idea of how much time the test +spends in library routines such as memmove(). Fortunately, as an artifact of +the early version of the benchmark, the string sort algorithm makes use of +the MoveMemory() routine in the sysspec.c file (system specific routines). +This call, on anything other than a 16-bit DOS system, calls memmove() +directly. Hence, we can get a good approximation of how much time is spent +moving bytes. + +The answer is that nearly 78% of the benchmark's time is consumed by +MoveMemory(), the rest being taken up by the other routines (the +str_is_less() routine, which performs string comparisons, takes about 7% of +the time). As above, we can guess that most of the benchmark's time is +dependent on the performance of the library's memmove() routine. + +Porting Considerations + +As with the numeric sort routine, the string sort benchmark should be simple +to port. Simpler, in fact. 
The string sort benchmark routine is not +dependent on any typedef that may change from machine to machine (unless a +char type is not 8 bits). + +The string sort benchmark depends on the following global definitions: + +NUMSTRARRAYS - Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The string sort benchmark creates work for +itself by requiring the system to sort more and more arrays, not bigger and +bigger arrays. (See section on Numeric Sort for an explanation.) This +constant sets the upper limit to the number of arrays the system will build +before it signals an error. The default value is 100, and may be changed if +your system exceeds this limit. + +STRARRAYSIZE - Sets the default size of the string arrays built. We say +"arrays" because, as with the numeric sort benchmark, the system adds work +not by expanding the size of the array, but by adding more arrays. This +value is set to 8111, and should not be modified, since results would not be +comparable with other runs of the same benchmark on other machines. + +To test for a correct execution of the string sort benchmark, #define +the DEBUG symbol. This will enable code that verifies the arrays are +properly sorted. Set up a command file that runs only the string sort, +and execute the benchmark program. If the routine is operating +properly, the benchmark will print "String sort: OK", this message is +printed quite often. Otherwise, the program will display "SORT ERROR" +for each pair of strings it finds out of order (which can be really +often). + +References + +See the references for the Numeric Sort benchmark. + +Bitfield Operations + +Description + +The purpose of this benchmark is to explore how efficiently the system +executes operations that deal with "twiddling bits." The test is set up to +simulate a "bit map"; a data structure used to keep track of storage usage. 
+(Don't confuse this meaning of "bitmap" with its use in describing a +graphics data structure.) + +Systems often use bit maps to keep an inventory of memory blocks or (more +frequently) disk blocks. In the case of a bit map that manages disk usage, +an operating system will set aside a buffer in memory so that each bit in +that buffer corresponds to a block on the disk drive. A 0 bit means that the +corresponding block is free; a 1 bit means the block is in use. Whenever a +file requests a new block of disk storage, the operating system searches the +bit map for the first 0 bit, sets the bit (to indicate that the block is now +spoken for), and returns the number of the corresponding disk block to the +requesting file. + +These types of operations are precisely what this test simulates. A block of +memory is allocated for the bit map. Another block of memory is +allocated, and set up to hold a series of "bit map commands". Each bitmap +command tells the simulation to do 1 of 3 things: + +1) Clear a series of consecutive bits, + +2) Set a series of consecutive bits, or + +3) Complement (1->0 and 0->1) a series of consecutive bits. + +The bit map command block is loaded with a set of random bit map commands +(each command covers a random number of bits), and the simulation routine steps +sequentially through the command block, grabbing a command and executing it. + +The bitfield benchmark reports the number of bits it was able to operate on +per second. The size of the bit map is constant; the bitfield operations +array is adjusted based on the capabilities of the processor. (See the +section describing the auto-adjust feature of the benchmarks.) + +Analysis + +Optimized 486 code: Using the Watcom C/C++ 10.0 profiler, the Bitfield +benchmark appears to spend all of its time in two routines: ToggleBitRun() +(74% of the time) and DoBitFieldIteration() (24% of the time). We say +"appears" because this is misleading, as we will explain.
+ +First, it is important to recall that the test performs one of three +operations for each run of bits (see above). The routine ToggleBitRun() +handles two of those three operations: setting a run of bits and clearing a +run of bits. An if() statement inside ToggleBitRun() decides which of the +two operations is performed. (Speed freaks will quite rightly point out that +this slows the entire algorithm. ToggleBitRun() is called by a switch() +statement which has already decided whether bits should be set or cleared; +it's a waste of time to have ToggleBitRun() have to make that decision yet +again.) + +DoBitFieldIteration() is the "outer" routine that calls ToggleBitRun(). +DoBitFieldIteration() also calls FlipBitRun(). This latter routine is the +one that performs the third bitfield operation: complementing a run of bits. +FlipBitRun() gets no "air time" at all (while DoBitFieldIteration() gets 24 +% of the time) simply because the compiler's optimizer recognizes that +FlipBitRun() is only called by DoBitFieldIteration(), and is called only +once. Consequently, the optimizer moves FlipBitRun() "inline", i.e., into +DoBitFieldIteration(). This removes an unnecessary call/return cycle (and is +probably part of the reason why the FlipBitRun() code gets 24% of the +algorithm's time, instead of something closer to 30% of its time.) + +Within the routines, those lines of code that actually do the shifting, the +and operations, and the or operations, consume time evenly. This should make +for a good test of a processor's "bit twiddling" capabilities. + +680x0 Code (Macintosh CodeWarrior): The CodeWarrior profiler is function +based. Consequently, it is impossible to produce a profile of machine +instruction execution time. We can, however, get a good picture of how the +algorithm divides its time among the various functions. 
+ +Unlike the 486 compiler, the CodeWarrior compiler did not appear to collapse +the FlipBitRun() routine into the outer DoBitFieldIteration() routine. (We +don't know this for certain, of course. It's possible that the compiler +would have done this had we not been profiling.) + +In any case, the time spent in the two "core" routines of the bitfield test +are shown below: + +FlipBitRun() - 18031.2 microsecs (called 509 times) + +ToggleBitRun() - 50770.6 microsecs (called 1031 times) + +In terms of total time, FlipBitRun() takes about 35% of the time (it gets +about 33% of the calls). Remember, ToggleBitRun() is a single routine that +is called both to set and clear bits. Hence, ToggleBitRun() is called twice +as often as FlipBitRun(). + +We can conclude that time spent setting bits to 1, setting bits to 0, and +changing the state of bits, is about equal; the load is balanced close to +what we'd expect it to be, based on the structure of the algorithm. + +Porting Considerations + +The bitfield operations benchmark is dependent on the size of the long +datatype. On most systems, this is 32 bits. However, on some of the newer +RISC chips, a long can be 64 bits long. If your system does use 64-bit +longs, you'll need to #define the symbol LONG64. + +If you are unsure of the size of a long in your system (some C compiler +manuals make it difficult to discover), simply place an ALLSTATS=T line in +the command file and run the benchmarks. This will cause the benchmark +program to display (among other things) the size of the data types int, +short, and long in bytes. + +BITFARRAYSIZE - Sets the number of longs in the bit map array. This number +is fixed, and should not be altered. The bitfield test adjusts itself by +adding more bitfield commands (see above), not by creating a larger bit map. + +Currently, there is no code added to test for correct execution. 
If you are +concerned that your port was incorrect, you'll need to step through your +favorite debugger and verify execution against the original source code. + +** I added a resetting of the random number generator, and a resetting +** of the bitfield to each loop. Those operations are outside of the +** timed loop, and should add to make the benchmark more consistent. +** There also is now debugging information available. If you define +** DEBUG then the program will write a file named "debugbit.dat", +** which is the contents of the bitfield after the calibration loop of +** 30 operations. You can compare this file with the file +** "debugbit.good" that comes with the distribution. +** Uwe F. Mayer + +References + +None. + +Emulated Floating-point + +Description + +The emulated floating-point benchmark includes routines that are similar to +those that would be executed whenever a system performs floating-point +operations in the absence of a coprocessor. In general, this amounts to a +mixture of integer instructions, including shift operations, integer +addition and subtraction, and bit testing (among others). + +The benchmark itself is remarkably simple. The test builds three +1-dimensional arrays and loads the first two up with random floating-point +numbers. The arrays are then partitioned into 4 equal-sized groups, and the +test proceeds by performing addition, subtraction, multiplication, and +division -- one operation on each group. (For example, for the addition +group, an element from the first array is added to the second array and the +result is placed in the third array.) + +Of course, most of the work takes place inside the routines that perform the +addition, subtraction, multiplication, and division. These routines operate +on a special data type (referred to as an InternalFPF number) that -- though +not strictly IEEE compliant -- carries all the necessary data fields to +support an IEEE-compatible floating-point system. 
Specifically, an +InternalFPF number is built up of the following fields: + +Type (indicates a NORMAL, SUBNORMAL, etc.) + +Mantissa sign + +Unbiased, signed 16-bit exponent + +4-word (16 bits per word) mantissa. + +The emulated floating-point test reports its results in number of loops per +second (where a "loop" is one pass through the arrays as described above). + +Finally, we are aware that this test could be on its way to becoming an +anachronism. A growing number of systems are appearing that have +coprocessors built into the main CPU. It's possible that floating-point +emulation will one day be a thing of the past. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): The algorithm's time is distributed +across a number of routines. The distribution is: + +ShiftMantLeft1() - 60% of the time + +ShiftMantRight1() - 17% of the time + +DivideInternalFPF() - 14% of the time + +MultiplyInternalFPF() - 5% of the time. + +The first two routines are similar to one another; both shift bits about in +a floating-point number's mantissa. It's reasonable that ShiftMantLeft1() +should take a larger share of the system's time; it is called as part of the +normalization process that concludes every emulated addition, subtraction, +multiplication, and division. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is +function-based; consequently, it isn't possible to get timing at the machine +instruction level. However, the output of CodeWarrior's profiler has +provided insight into the breakdown of time spent in various functions that +forces us to rethink our 486 code analysis. + +Analyzing what goes on inside the emulated floating-point tests is a tough +one to call because some of the routines that are part of the test are +called by the function that builds the arrays.
Consequently, a quick look at +the profiler's output can be misleading; it's not obvious how much time a +particular routine is spending in the test and how much time that same +routine is spending setting up the test (an operation that does not get +timed). + +Specifically, the routine that loads up the arrays with test data calls +LongToInternalFPF() and DivideInternalFPF(). LongToInternalFPF() makes one +call to normalize() if the number is not a true zero. In turn, normalize() +makes an indeterminate number of calls to ShiftMantLeft1(), depending on the +structure of the mantissa being normalized. + +What's worse, DivideInternalFPF() makes all sorts of calls to all kinds of +important low-level routines such as Sub16Bits() and ShiftMantLeft1(). +Untangling the wiring of which routine is being called as part of the test, +and which is being called as part of the setup could probably be done with +the computer equivalent of detective work and spelunking, but in the +interest of time we'll opt for approximation. + +Here's a breakdown of some of the important routines and their times: + +AddSubInternalFPF() - 1003.9 microsecs (called 9024 times) + +MultiplyInternalFPF() - 20143 microsecs (called 5610 times) + +DivideInternalFPF() - 18820.9 microsecs (called 3366 times). + +The 3366 calls to DivideInternalFPF() are timed calls, not setup calls -- +the profiler at least gives outputs of separate calls made to the same +routine, so we can determine which call is being made by the benchmark, and +which is being made by the setup routine. It turns out that the setup +routine calls DivideInternalFPF() 30,000 times. + +Notice that though addition/subtraction are called most often, +multiplication next, then finally division; the time spent in each is the +reverse. Division takes the most time, then multiplication, finally +addition/subtraction. (There's probably some universal truth lurking here +somewhere, but we haven't found it yet.) 
+ +Other routines, and their breakdown: + +Add16Bits() - 115.3 microsecs + +ShiftMantRight1() - 574.2 microsecs + +Sub16Bits() - 1762 microsecs + +StickyShiftRightMant() - 40.4 microsecs + +ShiftMantLeft1() - 17486.1 microsecs + +The times for the last three routines are suspect, since they are called by +DivideInternalFPF(), and a large portion of their time could be part of the +setup process. This is what leads us to question the results obtained in the +486 analysis, since it, too, is unable to determine precisely who is calling +whom. + +Porting Considerations + +Earlier versions of this benchmark were extremely sensitive to porting; +particularly to the "endianism" of the target system. We have tried to +eliminate many of these problems. The test is nonetheless more "sensitive" +to porting than most others. + +Pay close attention to the following defines and typedefs. They can be found +in the files EMFLOAT.H, NMGLOBAL.H, and NBENCH1.H: + +u8 - Stands for unsigned, 8-bit. Usually defined to be unsigned char. + +u16 - Stands for unsigned, 16-bit. Usually defined to be unsigned short. + +u32 - Stands for unsigned, 32-bit. Usually defined to be unsigned long. + +INTERNAL_FPF_PRECISION - Indicates the number of elements in the mantissa of +an InternalFPF number. Should be set to 4. + +The exponent field of an InternalFPF number is of type short. It should be +set to whatever minimal data type can hold a signed, 16-bit number. + +Other global definitions you will want to be aware of: + +CPUEMFLOATLOOPMAX - Sets the maximum number of loops the benchmark will +attempt before flagging an error. Each execution of a loop in the emulated +floating-point test is "non-destructive," since the test takes factors from +two arrays, operates on the factors, and places the result in a third array. +Consequently, the test makes more work for itself by increasing the number +of times it passes through the arrays (# of loops).
If the system exceeds +the limit set by CPUEMFLOATLOOPMAX, it will signal an error. + +This value may be altered to suit your system; it will not affect the +benchmark results (unless you reduce it so much the system can never +generate enough loops to produce a good test run). + +EMFARRAYSIZE - Sets the size of the arrays to be used in the test. This +value is the number of entries (InternalFPF numbers) per array. Currently, +the number is fixed at 3000, and should not be altered. + +Currently, there is no means of testing correct execution of the benchmark +other than via debugger. There are routines available to decode the internal +floating point format and print out the numbers, but no formal correctness +test has been constructed. (This should be available soon. -- 3/14/95 RG) + +** It now prints out the operations of 8 of the entries used in the +** test. Assuming you leave EMFARRAYSIZE at 3000, your results should +** look like the ones below. The number in front of the colon is the +** index of the entry. +** +** 2: (-1.1160E 0) + (-4.5159E 0) = -5.6320E 0 +** 6: (-4.4507E -1) - (-8.2050E -1) = +3.7543E -1 +** 10: (+1.2465E 0) * (+7.4667E -1) = +9.3075E -1 +** 14: (-1.2781E 0) / (-1.7367E 0) = +7.3596E -1 +** 2986: (-7.0390E 0) * (-2.0752E 0) = +1.4607E 1 +** 2990: (+8.3753E -1) / (+2.3876E 1) = +3.5078E -2 +** 2994: (-1.1393E 0) + (-1.6080E 1) = -1.7219E 1 +** 2998: (+7.2450E 0) - (-8.2654E -1) = +8.0716E 0 +** +** Uwe F. Mayer + +References + +Microprocessor Programming for Computer Hobbyists, Neill Graham, Tab Books, +Blue Ridge Summit, PA, 1977. + +Apple Numerics Manual, Second edition, Apple Computer, Addison-Wesley +Publishing Co., Reading, MA, 1988. + +Fourier Series + +Description + +This is a floating-point benchmark designed primarily to exercise the +trigonometric and transcendental functions of the system. It calculates the +first n Fourier coefficients of the function (x+1)^x on the interval [0,2].
In +this case, the function (x+1)^x is being treated as a cyclic waveform with a +period of 2. + +The Fourier coefficients, when applied as factors to a properly constructed +series of sine and cosine functions, allow you to approximate the original +waveform. (In fact, if you can calculate all the Fourier coefficients -- +there'll be an infinite number -- you can reconstruct the waveform exactly). +You have to calculate the coefficients via integration, and the algorithm +does this using a simple trapezoidal rule for its numeric integration +function. + +The upshot of all this is that it provides an exercise for the +floating-point routines that calculate sine, cosine, and raising a number to +a power. There are also some floating-point multiplications, divisions, +additions, and subtractions mixed in. + +The benchmark reports its results as the number of coefficients calculated +per second. + +As an additional note, we should point out that the performance of this +benchmark is heavily dependent on how well-built the compiler's math library +is. We have seen at least two cases where recompilation with new (and +improved!) math libraries has resulted in two-fold and five-fold +performance improvements. (Apparently, when a compiler gets moved to a new +platform, the trigonometric and transcendental functions in the math +libraries are among the last routines to be "hand optimized" for the new +platform.) About all we can say about this is that whenever you run this +test, verify that you have the latest and greatest math libraries. + +Analysis + +Optimized 486 code: The benchmark partitions its time almost evenly among +the modules pow387, exp386, and trig387; giving between 25% and 28% of its +time to each. This is based on profiling with the Watcom compiler running +under Windows NT. These modules hold the routines that handle raising a +number to a power and performing trigonometric (sine and cosine) +calculations.
For example, within trig387, time was nearly equally divided +between the routine that calculates sine and the routine that calculates +cosine. + +The remaining time (between 17% and 18%) was spent in the balance of the +test. We noticed that most of that time occurred in the routine +thefunction(). This is at the heart of the numerical integration routine the +benchmark uses. + +Consequently, this benchmark should be a good test of the exponential and +trigonometric capabilities of a processor. (Note that we recognize that the +performance also depends on how well the compiler's math library is built.) + +680x0 Code (Macintosh CodeWarrior): The CodeWarrior profiler is function +based, therefore it is impossible to get performance results for individual +machine instructions. The CodeWarrior compiler is also unable to tell us how +much time is spent within a given library routine; we can't see how much +time gets spent executing the sin(), cos(), or pow() functions (which, +unfortunately, was the whole idea behind the benchmark). + +About all we can glean from the results is that thefunction() takes about +74% of the time in the test (this is where the heavy math calculations take +place) while trapezoidintegrate() accounts for about 26% of the time on its +own. + +Porting Considerations + +Necessarily, this benchmark is at the mercy of the efficiency of the +floating-point support provided by whatever compiler you are using. It is +recommended that, if you are doing the port yourself, you contact the +designers of the compiler, and discuss with them what optimization switches +should be set to produce the fastest code. (This sounds simple; usually it's +not. Some systems let you decide between speed and true IEEE compliance.) + +As far as global definitions go, this benchmark is happily free of them. All +the math is done using double data types. We have noticed that, on some Unix +systems, you must be careful to include the correct math libraries. 
+Typically, you'll discover this at link time. + +To test for correct execution of the benchmark: It's unlikely you'll need to +do this, since the algorithm is so cut-and-dried. Furthermore, there are no +explicit provisions made to verify the correctness. You can, however, either +dip into your favorite debugger, or alter the code to print out the contents +of the abase (which holds the A[i] terms) and bbase (which holds the B[i] +terms) arrays as they are being filled (see routine DoFPUTransIteration). +** This is exactly what I have done, it now prints out A[i] and B[i] data. +** Uwe F. Mayer +Run the benchmark with a command file set to execute only the Fourier test, +and examine the contents of the arrays. The first 100 are listed below. + +A[i]= + 2.84 1.05 0.274 0.0824 0.0102 -0.024 -0.0426 -0.0536 -0.0605 -0.065 +-0.0679 -0.0698 -0.0709 -0.0715 -0.0717 -0.0715 -0.0711 -0.0704 +-0.0696 -0.0685 -0.0674 -0.0661 -0.0647 -0.0632 -0.0615 -0.0598 -0.058 +-0.0561 -0.0542 -0.0521 -0.0501 -0.0479 -0.0457 -0.0434 -0.0411 +-0.0387 -0.0363 -0.0338 -0.0313 -0.0288 -0.0262 -0.0236 -0.0209 +-0.0183 -0.0156 -0.0129 -0.0102 -0.00744 -0.0047 -0.00196 0.000794 +0.00355 0.0063 0.00905 0.0118 0.0145 0.0172 0.0199 0.0226 0.0253 +0.0279 0.0305 0.0331 0.0357 0.0382 0.0407 0.0431 0.0455 0.0479 0.0502 +0.0525 0.0547 0.0569 0.059 0.061 0.063 0.0649 0.0668 0.0686 0.0703 +0.072 0.0736 0.0751 0.0765 0.0779 0.0792 0.0804 0.0816 0.0826 0.0836 +0.0845 0.0853 0.0861 0.0867 0.0873 0.0877 0.0881 0.0884 0.0887 0.0888 + +B[i]= +(undefined) -1.88 -1.16 -0.806 -0.61 -0.487 -0.402 -0.34 -0.293 -0.255 +-0.224 -0.199 -0.177 -0.158 -0.141 -0.126 -0.113 -0.101 -0.0901 +-0.0802 -0.071 -0.0625 -0.0546 -0.0473 -0.0404 -0.034 -0.0279 -0.0222 +-0.0168 -0.0117 -0.00693 -0.00238 0.00193 0.00601 0.00988 0.0135 0.017 +0.0203 0.0234 0.0263 0.0291 0.0317 0.0341 0.0364 0.0385 0.0405 0.0424 +0.0441 0.0457 0.0471 0.0484 0.0496 0.0507 0.0516 0.0525 0.0532 0.0538 +0.0543 0.0546 0.0549 0.055 0.0551 0.055 0.0549 0.0546 
 0.0543 0.0538 +0.0533 0.0527 0.052 0.0512 0.0503 0.0493 0.0483 0.0472 0.046 0.0447 +0.0434 0.042 0.0405 0.039 0.0374 0.0358 0.0341 0.0323 0.0305 0.0287 +0.0268 0.0249 0.023 0.021 0.019 0.0169 0.0149 0.0128 0.0107 0.00857 +0.00644 0.0043 0.00215 + +Note that there is no B[0] coefficient. If the above numbers are in the +arrays shown, you can feel pretty confident that the benchmark is working +properly. + +References + +Engineering and Scientific Computations in Pascal, Lawrence P. Huelsman, +Harper & Row, New York, 1986. + +Assignment Algorithm + +Description + +This test is built on an algorithm with direct application to the business +world. The assignment algorithm solves the following problem: Say you have X +machines and Y jobs. Any of the machines can do any of the jobs; however, the +machines are sufficiently different so that the cost of doing a particular +job can vary depending what machine does it. Furthermore, the jobs are +sufficiently different that the cost varies depending on which job a given +machine does. You therefore construct a matrix; machines are the rows, jobs +are the columns, and the [i,j] element of the array is the cost of doing the +jth job on the ith machine. How can you assign the jobs so that the cost of +completing them all is minimal? (This also assumes that one machine does one +job.) + +Did you get that? + +The assignment algorithm benchmark is largely a test of how well the +processor handles problems built around array manipulation. It is not a +floating-point test; the "cost matrix" built by the algorithm is simply a 2D +array of long integers. This benchmark considers an iteration to be a run of +the assignment algorithm on a 101 x 101 - element matrix. It reports its +results in iterations per second. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): There are numerous loops within the +assignment algorithm. The development system we were using (Watcom C/C++ +10.0) appears to have a fine time unrolling many of them.
Consequently, it +is difficult to pin down the execution impact of single lines (as in, for +example, the numeric sort benchmark). + +On the level of functions, the benchmark spends around 70% of its time in +the routine first_assignments(). This is where a) lone zeros in rows and +columns are found and selected, and b) a choice is made between duplicate +zeros. Around 23% of the time is spent in the second_assignments() routine +where (if first_assignments() fails) the matrix is partitioned into smaller +submatrices. + +Overall, we did a tally of instruction mix execution. The approximate +breakdowns are: + +move - 38% + +conditional jump - 12% + +unconditional jump - 11% + +comparison - 14% + +math/logical/shift - 24% + +Many of the move instructions that appeared to consume the most amounts of +time were referencing items on the local stack frame. This required an +indirect reference through EBP, plus a constant offset to resolve the +address. + +This should be a good exercise of a cache, since operations in the +first_assignments() routine require both row-wise and column-wise movement +through the array. Note that the routine could be made more "severe" by +changing the assignedtableau[][] array to an array of unsigned char -- +forcing fetches on byte boundaries. + +680x0 Code (CodeWarrior): The CodeWarrior profiler is function-based. +Consequently, it's not possible to determine what's going on at the machine +instruction level. We can, however, get a good idea of how much time the +algorithm spends in each routine. The important routines are broken down as +follows: + +calc_minimum_costs() - approximately 0.3% of the time + +(250 microsecs) + +first_assignments() - approximately 79% of the time + +(96284.6 microsecs) + +second_assignments() - approximately 19% of the time + +(22758 microsecs) + +These times are approximate; some time is spent in the Assignment() routine +itself.
+ +These figures are reasonably close to those of the 486, at least in terms of +the mixture of time spent in a particular routine. Hence, this should still +be a good test of system cache (as described in the preceding section), +given the behavior of the first_assignments() routine. + +Porting Considerations + +The assignment algorithm test is purely an integer benchmark, and requires +no special data types that might be affected by ports to different +architectures. There are only two global constants that affect the +algorithm: + +ASSIGNROWS and ASSIGNCOLS - These set the size of the assignment array. Both +are defined to be 101 (so, the array that is benchmarked is a 101 x 101 +-element array of longs). These values should not be altered. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, set up a command file that executes only the assignment +algorithm, and run the benchmark. (You may want to pipe the output through a +paging filter, like the more program.) The act of defining DEBUG will enable +a section of code that displays the assigned columns on a per-row basis. 
If +the benchmark is working properly, the numbers to be displayed +should be: + +R000: 056 R001: 066 R002: 052 R003: 065 R004: 043 R005: 023 R006: 016 +R007: 077 R008: 095 R009: 004 R010: 064 R011: 076 R012: 078 R013: 091 +R014: 013 R015: 029 R016: 044 R017: 014 R018: 041 R019: 042 R020: 020 +R021: 071 R022: 024 R023: 017 R024: 055 R025: 040 R026: 070 R027: 025 +R028: 031 R029: 019 R030: 073 R031: 002 R032: 047 R033: 009 R034: 035 +R035: 045 R036: 005 R037: 063 R038: 081 R039: 039 R040: 087 R041: 008 +R042: 053 R043: 093 R044: 049 R045: 092 R046: 061 R047: 046 R048: 026 +R049: 034 R050: 088 R051: 000 R052: 028 R053: 018 R054: 072 R055: 021 +R056: 037 R057: 082 R058: 006 R059: 058 R060: 096 R061: 068 R062: 069 +R063: 054 R064: 057 R065: 086 R066: 097 R067: 084 R068: 099 R069: 051 +R070: 098 R071: 003 R072: 074 R073: 062 R074: 080 R075: 033 R076: 011 +R077: 094 R078: 012 R079: 050 R080: 010 R081: 038 R082: 089 R083: 059 +R084: 022 R085: 079 R086: 015 R087: 007 R088: 075 R089: 083 R090: 060 +R091: 048 R092: 032 R093: 067 R094: 001 R095: 030 R096: 027 R097: 085 +R098: 090 R099: 036 R100: 100 + +These are the column choices for each row made by the algorithm. If +you see these numbers displayed, the algorithm is working correctly. + +*** The original debugging information was incorrect, as it not only +*** displayed the chosen columns, but also displayed eliminated columns. +*** Changed to show all 101 entries. Uwe F. Mayer + +References + +Quantitative Decision Making for Business, Gordon, Pressman, and Cohn, +Prentice-Hall, Englewood Cliffs, NJ, 1990. + +Quantitative Decision Making, Guiseppi A. Forgionne, Wadsworth Publishing +Co., California, 1986. + +Huffman Compression + +Description + +This is a compression algorithm that -- while helpful for some time as a +text compression technique -- has since fallen out of fashion on account of +the superior performance by algorithms such as LZW compression.
It is, +however, still used in some graphics file formats in one form or another. + +The benchmark consists of three parts: + +Building a "Huffman Tree" (explained below), + +Compression, and + +Decompression. + +A "Huffman Tree" is a special data structure that guides the compression and +decompression processes. If you were to diagram one, it would look like a +large binary tree (i.e., two branches per each node). Describing its +function in detail is beyond the scope of this paper (see the references for +more information). We should, however, point out that the tree is built from +the "bottom up"; and the procedure for constructing it requires that the +algorithm scan the uncompressed buffer, building a frequency table for all +the characters appearing in the buffer. (This version of the Huffman +algorithm compresses byte-at-a-time, though there's no reason why the same +principle could not be applied to tokens larger than one byte.) + +Once the tree is built, text compression is relatively straightforward. The +algorithm fetches a character from the uncompressed buffer, navigates the +tree based on the character's value, and produces a bit stream that is +concatenated to the compressed buffer. Decompression is the reverse of that +process. (We recognize that we are simplifying the algorithm. Again, we +recommend you check the references.) + +The Huffman Compression benchmark considers an iteration to be the three +operations described above, performed on an uncompressed text buffer of 5000 +bytes. It reports its results in iterations per second. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): The Huffman compression algorithm -- +tree building, compression, and decompression -- is written as a single, +large routine: DoHuffIteration(). All the benchmark's time is spent within +that routine. + +Components of DoHuffIteration() that consume the most time are those that +perform the compression and decompression . 
+ +The code for performing the compression spends most of its time (accounting +for about 13%) constructing the bit string for a character that is being +compressed. It does this by seeking up the tree from a leaf, emitting 1's +and 0's in the process, until it reaches the root. The stream of 1's and 0's +are loaded into a character array; the algorithm then walks "backward" +through the array, setting (or clearing) bits in the compression buffer as +it goes. + +Similarly, the decompression portion takes about 12% of the time as the +algorithm pulls bits out of the compressed buffer -- using them to navigate +the Huffman tree -- and reconstructs the original text. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based. Consequently, it's impossible to get performance scores for +individual machine instructions. Furthermore, as mentioned above, the +Huffman compression algorithm is written as a monolithic routine. This makes +the results from the CodeWarrior profiler all the more sparse. + +We can at least point out that the lowmost routines (GetCompBit() and +SetCompBit()) that read and write individual bits, though called nearly 13 +million times each, account for only 0.7% and 0.3% of the total time, +respectively. + +Porting Considerations + +The Huffman algorithm relies on no special data types. It should port +readily. Global constants of interest include: + +EXCLUDED - This is a large, positive value. Currently it is set to 32000, +and should be left alone. Basically, this is a token that the system uses to +indicate an excluded character (one that does not appear in the plain-text). +It is set to a ridiculously high value that will never appear in the +pointers of the tree during normal construction. + +MAXHUFFLOOPS - This is another one of those "governor" constants. The +Huffman benchmark creates more work for itself by doing multiple +compression/decompression loops. 
This constant sets the maximum number of +loops it will attempt per iteration before it gives up. Currently, it is set +to 50000. Though it is unlikely you'll ever need to modify this value, you +can increase it if your machine is too fast for the adjustment algorithm. Do +not reduce the number. + +HUFFARRAYSIZE - This value sets the size of the plain-text array to be +compressed. You can override this value with the command file to see how +well your machine performs for larger or smaller arrays. The subsequent +results, however, are invalid for comparison with other systems. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, build a command file that executes only the Huffman compression +algorithm, and run the benchmark. Defining DEBUG will enable a section of +code that verifies the decompression as it takes place (i.e., the routine +compares -- character at a time -- the uncompressed data with the original +plain-text). If there's an error, the program will repeatedly display: "Error +at textoffset xxx". + +** If everything is correct it will emit quite a few "Huffman: OK" messages. +** +** I added a resetting of the random number generator, outside of the +** timed loop, and a resetting of the Huffman tree, inside of the +** timed loop. That should help to make the benchmark more consistent. +** The program did originally only reset half of the tree, which led +** to runtime errors on some systems. The effect on the benchmark +** should be negligible, and in fact comes out as being of the order +** of less than 1% on my test system. +** Uwe F. Mayer + +References + +Data Compression: Methods and Theory, James A. Storer, Computer Science +Press, Rockville, MD, 1988. + +An Introduction to Text Processing, Peter D. Smith, MIT Press, Cambridge, +MA, 1990.
+ +IDEA Encryption + +Description + +This is another benchmark based on a "higher-level" algorithm; "higher +-level" in the sense that it is more complex than a sort or a search +operation. + +Security -- and, therefore, cryptography -- are becoming increasingly +important issues in the computer realm. It's likely that more and more +machines will be running routines like the IDEA encryption algorithm. (IDEA +is an acronym for the International Data Encryption Algorithm.) + +A good description of the algorithm (and, in fact, the reference we used to +create the source code for the test) can be found in Bruce Schneier's +exhaustive exploration of encryption, "Applied Cryptography" (see +references). To quote Mr. Schneier: "In my opinion, it [IDEA] is the best +and most secure block algorithm available to the public at this time." + +IDEA is a symmetrical, block cipher algorithm. Symmetrical means that the +same routine used to encrypt the data also decrypts the data. A block cipher +works on the plain-text (the message to be encrypted) in fixed, discrete +chunks. In the case of IDEA, the algorithm encrypts and decrypts 64 bits at +a time. + +As pointed out in Schneier's book, there are three operations that the IDEA +uses to do its work: + +XOR (exclusive-or) + +Addition modulo 2^16 (ignoring overflow) + +Multiplication modulo 2^16+1 (ignoring overflow). + +IDEA requires a key of 128 bits. However, keys and blocks are further +subdivided into 16-bit chunks, so that any given operation within the IDEA +encryption is performed on 16-bit quantities. (This is one of the many +advantages of the algorithm: it is efficient even on 16-bit processors.) + +The IDEA benchmark considers an "iteration" to be an encryption and +decryption of a buffer of 4000 bytes. 
The test actually builds 3 buffers: +The first to hold the original plain-text, the second to hold the encrypted +text, and the third to hold the decrypted text (the contents of which should +match that of the first buffer). It reports its results in iterations per +second. + +Analysis + +Optimized 486 code: The algorithm actually spends most of its time (nearly +75%) within the mul() routine, which performs the multiplication modulo +2^16+1. This is a super-simple routine, consisting primarily of if +statements, shifts, and additions. + +The remaining time (around 24%) is spent in the balance of the cipher_idea() +routine. (Note that cipher_idea() calls the mul() routine frequently; so, +the 24% is comprised of the other lines of cipher_idea()). cipher_idea() is +littered with simple pointer-fetch-and-increment operations, some addition, +and some exclusive-or operations. + +Note that IDEA's exercise of system capabilities probably doesn't extend +beyond testing simple integer math operations. Since the buffer size is set +to 4000 bytes, the test will run entirely in processor cache on most +systems. Even the cache won't get a heavy "internal" workout, since the +algorithm proceeds sequentially through each buffer from lower to higher +addresses. + +680x0 code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based; consequently, it is impossible to determine execution profiles for +individual machine instructions. We can, however, get an idea of how much +time is spent in each routine. + +As with Huffman compression, the IDEA algorithm is written monolithically -- +a single, large routine does most of the work. However, a special +multiplication routine, mul(), is frequently called within each +encryption/decryption iteration (see above). + +In this instance, the results for the 68K system diverge widely from those +of the 486 system. 
The CodeWarrior profiler shows the mul() routine as +taking only 4% of the total time in the benchmark, even though it is called +over 20 million times. The outer routine is called 600,000 times, and +accounts for about 96% of the program's total time. + +Porting Considerations + +Since IDEA does its work in 16-bit units, it is particularly important that +u16 be defined to whatever datatype provides an unsigned 16-bit integer on +the test platform. Usually, unsigned short works for this. (You can verify +the size of a short by running the benchmarks with a command file that +includes ALLSTATS=T as one of the commands. This will cause the benchmark +program to display a message that tells the size of the int, short, and long +data-types in bytes.) + +Also, the mul() routine in IDEA requires the u32 datatype to define an +unsigned 32-bit integer. In most cases, unsigned long works. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, build a command file that executes only the IDEA algorithm, and +run the benchmark. Defining DEBUG will enable a section of code that +compares the original plain-text with the output of the test. (Remember, the +benchmark performs both encryption and decryption.) If the algorithm has +failed, the output will not match the input, and you'll see "IDEA Error" +messages all over your display. + +References + +Applied Cryptography: Protocols, Algorithms, and Source Code in C, Bruce +Schneier, John Wiley & Sons, Inc., New York, 1994. + +Neural Net + +Description + +The Neural Net simulation benchmark is based on a simple back-propagation +neural network presented by Maureen Caudill as part of a BYTE article that +appeared in the October, 1991 issue (see "Expert Networks" in that issue). +The network involved is a simple 3-layer (input neurodes, middle-layer +neurodes, and output neurodes) network that accepts a number of 5 x 7 input +patterns and produces a single 8-bit output pattern. 
+ +The test involves sending the network an input pattern that is the 5 x 7 +"image" of a character (1's and 0's -- 1's representing lit pixels, 0's +representing unlit pixels), and teaching it the 8-bit ASCII code for the +character. + +A thorough description of how the back propagation algorithm works is beyond +the scope of this paper. We recommend you search through the references +given at the end of this paper, particularly Ms. Caudill's article, for +detailed discussion. In brief, the benchmark is primarily an exercise in +floating-point operations, with some frequent use of the exp() function. It +also performs a great many array references, though the arrays in use are +well under 300 elements each (and less than 100 in most cases). + +The Neural Net benchmark considers an iteration to be a single learning +cycle. (A "learning cycle" is defined as the time it takes the network to be +able to associate all input patterns to the correct output patterns within a +specified tolerance.) It reports its results in iterations per second. + +Analysis + +Optimized 486 code: The forward pass of the network (i.e., calculating +outputs from inputs) utilizes a sigmoid function. This function has, at its +heart, a call to the exp() library routine. A small but non-negligible +amount of time is spent in that function (a little over 5% for the 486 +system we tested). + +The learning portion of the network benchmark depends on the derivative of +the sigmoid function, which turns out to require only multiplications and +subtractions. Consequently, each learning pass exercises only simple +floating-point operations. + +If we divide the time spent in the test into two parts -- forward pass and +backward pass (the latter being the learning pass) -- then the test appears +to spend the greatest part of its time in the learning phase. In fact, most +time is spent in the adjust_mid_wts() routine. 
This is the part of the +routine that alters the weights on the middle layer neurodes. (It accounts +for over 40% of the benchmark's time.) + +680x0 Code (Macintosh CodeWarrior): Though CodeWarrior's profiler is +function based, the neural net benchmark is highly modular. We can therefore +get a good breakdown of routine usage: + +worst_pass_error() - 304 microsecs (called 4680 times) + +adjust_mid_wts() - 83277 microsecs (called 46800 times) + +adjust_out_wts() - 17394 microsecs (called 46800 times) + +do_mid_error() - 11512 microsecs (called 46800 times) + +do_out_error() - 3002 microsecs (called 46800 times) + +do_mid_forward() - 49559 microsecs (called 46800 times) + +do_out_forward() - 20634 microsecs (called 46800 times) + +Again, most time was spent in adjust_mid_wts() (as on the 486), accounting +for almost twice as much time as do_mid_forward(). + +Porting Consideration + +The Neural Net benchmark is not dependent on any special data types. There +are a number of global variables and arrays that should not be altered in +any way. Most importantly, the #defines found in NBENCH1.H under the Neural +Net section should not be changed. These control not only the number of +neurodes in each layer; they also include constants that govern the learning +processes. + +Other globals to be aware of: + +MAXNNETLOOPS - This constant simply sets the upper limit on the number of +training loops the test will permit per iteration. The Neural Net benchmark +adjusts its workload by re-teaching itself over and over (each time it +begins a new training session, the network is "cleared" -- loaded with +random values). It is unlikely you will ever need to modify this constant. + +inpath - This string pointer is set to the path from which the neural net's +input data is read. It is currently hardwired to "NNET.DAT". You shouldn't +have to change this name, unless your file system requires directory +information as part of the path. 
+ +Note that the Neural Net benchmark is the only test that requires an +external data file. The contents of the file are listed in an attachment to +this paper. You should use the attachment to reconstruct the file should it +become lost or corrupted. Any changes to the file will invalidate the test +results. + +To test for correct execution of the benchmark: #define the symbol DEBUG, +recompile, build a command file that executes only the Neural Net test, and +run the benchmark. Defining DEBUG will enable a section of code that +displays how many passes through the learning process were required for the +net to learn. It should learn in 780 passes. + +References + +"Expert Networks," Maureen Caudill, BYTE Magazine, October, 1991. + +Simulating Neural Networks, Norbert Hoffmann, Verlag Vieweg, Wiesbaden, +1994. + +Signal and Image Processing with Neural Networks, Timothy Masters, John +Wiley and Sons, New York, 1994. + +Introduction to Neural Networks, Jeannette Stanley, California Scientific +Software, CA, 1989. + +LU Decomposition + +Description + +LU Decomposition is an algorithm that can be used as the heart of a program +for solving linear equations. Suppose you have a matrix A. LU Decomposition +determines the matrices L and U such that + +L . U = A + +where L is a lower triangular matrix and U is an upper triangular matrix. (A +lower triangular matrix has nonzero elements only on the main diagonal and +below. An upper triangular matrix has nonzero elements only on the main +diagonal and above.) + +Without going into the mathematical details too deeply, having the L and U +matrices makes the solution of linear equations (i.e., equations of the form +A . x = b) quite easy. It turns out that you can also use LU decomposition +to determine matrix inverses and determinants. 
+ +The algorithm used in the benchmarks was derived from Numerical Recipes in +Pascal (there is a C version of the book, which we did not have on hand), a +book we heartily recommend to anyone serious about mathematical and +scientific computing. The authors are approving of LU decomposition as a +means of solving linear equations, pointing out that their version (which +makes use of what we would have to call "Crout's method with partial +implicit pivoting") is a factor of 3 better than one of their Gauss-Jordan +routines, a factor of 1.5 better than another. They go on to demonstrate the +use of LU decomposition for iterative improvement of linear equation +solutions. + +The benchmark begins by creating a "solvable" linear system. This is easily +done by loading up the column vector b with random integers, then +initializing A with an identity matrix. The equations are then "scrambled" +by either multiplying a row by a constant, or adding one row to another. The +scrambled matrices are handed to the LU algorithm. + +The LU Decomposition benchmark considers a single iteration to be the +solution of one set of equations (the size of A is fixed at 101 x 101 +elements). It reports its results in iterations per second. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): The entire algorithm consists of two +parts: the LU decomposition itself, and the back substitution algorithm that +builds the solution vector. The majority of the algorithm's time takes place +within the former; the algorithm that builds the L and U matrices (this +takes place in routine ludcmp()). + +Within ludcmp(), there are two extremely tight for loops forming the heart +of Crout's algorithm that consume the majority of the time. 
The loops are +"tight" in that they each consist of only one line of code; in both cases, +the line of code is a "multiply and accumulate" operation (actually, it's +sort of a multiply and de-accumulate, since the result of the multiplication +is subtracted, not added). + +In both cases, the items multiplied are elements from the A array; and one +factor's row index is varying more rapidly, while another factor's column +index is varying more rapidly. + +Note that this is a good overall test of floating-point operations within +matrices. Most of the math is floating-point; primarily additions, +subtractions, and multiplications (only a few divisions). + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based. It is therefore impossible to determine execution profiles at the +machine-code level. The profiler does, however, allow us to determine how +much time the benchmark spends in each routine. This breakdown is as +follows: + +lusolve() - 3.4 microsecs (about 0% of the time) + +lubksb() - 1198 microsec (about 2% of the time) + +ludcmp() - 63171 microsec (about 91% of the time) + +The above percentages are for the whole program. Consequently, as a portion +of actual benchmark time, the amount attributed to each will be slightly +larger (though the proportions will remain the same). + +Since ludcmp() performs the actual LU decomposition, this is exactly where +we'd want the benchmark to spend its time. The lubksb() routine calls +ludcmp(), using the resulting matrix to "back-solve" the linear equation. + +Porting Considerations + +The LU Decomposition routine requires no special data types, and is immune +to byte ordering. It does make use of a typedef (LUdblptr) that includes an +embedded union; this allows the benchmark to "coerce" a pointer to double +into a pointer to a 2D array of double. This arrangement has not caused +problems with the compilers we have tested to date. 
+ +Other constants and globals to be aware of: + +LUARRAYROWS and LUARRAYCOLS - These constants set the size of the +coefficient matrix, A. They cannot be altered by command file. In fact, you +shouldn't alter them at all, or your results will be invalid. Currently, +they are both set to 101. + +MAXLUARRAYS - This is another "governor" constant. The algorithm performs +dynamic workload adjustment by building more and more arrays to solve per +timing round. This sets the maximum upper limit of arrays that it will +build. Currently, it is set to 1000, which should be more than enough for +the reasonable future (1000 arrays of 101 x 101 floating-point doubles would +require somewhere around 80 megabytes of RAM -- and that's not counting the +column vectors). + +To test for correct execution of the benchmark: Currently, there is no +simple technique for doing this. You can, however, either use your favorite +debugger (or embed a printf() statement) at the conclusion of the lubksb() +routine. When this routine concludes, the array b will hold the solution +vector. These items will be stored as floating-point doubles, and the first +14 are (with rounding): + +46 20 23 22 85 86 97 95 8 89 75 67 6 86 + +If you find these numbers as the first 14 in the array b[], then you're +virtually guaranteed that the algorithm is working correctly. + +*** The above is not correct, as the initial matrix is not the identity, +*** but a matrix with random nonzero entries on the diagonal (they have +*** altered the algorithm since they wrote the documentation). +*** I changed the output of the debugging routine, it now prints first +*** what the array b should hold (as righthand side divided by diagonal +*** entry), and then it prints what the array b does hold after the +*** decomposition has been done to compute the solution of the system. If +*** you get the same, then fine. +*** And, by the way, my original right hand sides are +*** 46 23 85 97 8 75 6 81 88 76 6 84 31 53 2 ... 
+*** and the diagonal entries are +*** 520 922 186 495 89 267 786 571 175 600 738 321 897 541 859 ... +*** You notice that one has every other number of the original sequence. +*** This is due to BYTE's change of the algorithm, as they now also use the +*** random number generator to generate the diagonal elements. +*** Here is the complete set of data: +*** 46/520=0.09 23/922=0.02 85/186=0.46 97/495=0.20 8/89=0.09 +*** 75/267=0.28 6/786=0.01 81/571=0.14 88/175=0.50 76/600=0.13 +*** 6/738=0.01 84/321=0.26 31/897=0.03 53/541=0.10 2/859=0.00 +*** 86/92=0.93 51/121=0.42 29/248=0.12 51/789=0.06 84/6=14.00 +*** 21/180=0.12 33/48=0.69 2/899=0.00 12/820=0.01 69/372=0.19 +*** 59/809=0.07 74/18=4.11 40/788=0.05 39/56=0.70 86/91=0.95 +*** 33/878=0.04 82/165=0.50 42/561=0.07 8/274=0.03 84/694=0.12 +*** 32/352=0.09 25/969=0.03 59/816=0.07 33/112=0.29 5/125=0.04 +*** 89/740=0.12 7/223=0.03 54/994=0.05 33/80=0.41 55/676=0.08 +*** 6/524=0.01 36/544=0.07 21/160=0.13 58/596=0.10 15/717=0.02 +*** 84/311=0.27 98/530=0.18 46/713=0.06 41/233=0.18 73/640=0.11 +*** 40/343=0.12 72/586=0.12 100/965=0.10 59/764=0.08 37/866=0.04 +*** 27/682=0.04 3/652=0.00 41/352=0.12 87/786=0.11 45/79=0.57 +*** 83/761=0.11 41/817=0.05 46/209=0.22 78/930=0.08 85/210=0.40 +*** 80/756=0.11 18/931=0.02 30/669=0.04 47/127=0.37 85/891=0.10 +*** 66/364=0.18 83/955=0.09 58/637=0.09 58/778=0.07 82/288=0.28 +*** 42/540=0.08 76/290=0.26 59/36=1.64 29/463=0.06 63/476=0.13 +*** 6/340=0.02 73/341=0.21 59/737=0.08 81/492=0.16 98/443=0.22 +*** 58/32=1.81 53/562=0.09 54/263=0.21 46/367=0.13 58/390=0.15 +*** 96/845=0.11 30/746=0.04 2/687=0.00 28/849=0.03 84/180=0.47 +*** 85/382=0.22 +*** Uwe F. Mayer + +References + +Numerical Recipes in Pascal: The Art of Scientific Computing, Press, +Flannery, Teukolsky, Vetterling, Cambridge University Press, New York, 1989. 
diff --git a/debugbit.good.gz b/debugbit.good.gz new file mode 100644 index 0000000..fdc893e Binary files /dev/null and b/debugbit.good.gz differ diff --git a/emfloat.c b/emfloat.c new file mode 100644 index 0000000..5e73890 --- /dev/null +++ b/emfloat.c @@ -0,0 +1,1343 @@ +/* +** emfloat.c +** Source for emulated floating-point routines. +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine. +** +** Created: +** Last update: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-Hill and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + + +#include +#include +#include "nmglobal.h" +#include "emfloat.h" + +/* +** Floating-point emulator. +** These routines are only "sort of" IEEE-compliant. All work is +** done using an internal representation. Also, the routines do +** not check for many of the exceptions that might occur. +** Still, the external formats produced are IEEE-compatible, +** with the restriction that they presume a little-endian machine +** (though the endianism will not affect the performance). +** +** Some code here was based on work done by Steve Snelgrove of +** Orem, UT. Other code comes from routines presented in +** the long-ago book: "Microprocessor Programming for +** Computer Hobbyists" by Neill Graham. 
+*/ + +/************************** +** SetupCPUEmFloatArrays ** +*************************** +** Set up the arrays that will be used in the emulated +** floating-point tests. +** This is done by loading abase and bbase elements with +** random numbers. We use our long-to-floating point +** routine to set them up. +** NOTE: We really don't need the pointer to cbase...cbase +** is overwritten in the benchmark. +*/ +void SetupCPUEmFloatArrays(InternalFPF *abase, + InternalFPF *bbase, + InternalFPF *cbase, + ulong arraysize) +{ +ulong i; +InternalFPF locFPF1,locFPF2; +/* +** Reset random number generator so things repeat. Inserted by Uwe F. Mayer. +*/ +extern int32 randnum(int32 lngval); +randnum((int32)13); + +for(i=0;itype=IFPF_IS_ZERO; +dest->sign=sign; +dest->exp=MIN_EXP; +for(i=0;imantissa[i]=0; +return; +} + +/*************************** +** SetInternalFPFInfinity ** +**************************** +** Set an internal floating-point-format number to infinity. +** This can happen if the exponent exceeds MAX_EXP. +** As above, sign picks the sign of infinity. +*/ +static void SetInternalFPFInfinity(InternalFPF *dest, + uchar sign) +{ +int i; /* Index */ + +dest->type=IFPF_IS_INFINITY; +dest->sign=sign; +dest->exp=MIN_EXP; +for(i=0;imantissa[i]=0; +return; +} + +/********************** +** SetInternalFPFNaN ** +*********************** +** Set an internal floating-point-format number to Nan +** (not a number). Note that we "emulate" an 80x87 as far +** as the mantissa bits go. +*/ +static void SetInternalFPFNaN(InternalFPF *dest) +{ +int i; /* Index */ + +dest->type=IFPF_IS_NAN; +dest->exp=MAX_EXP; +dest->sign=1; +dest->mantissa[0]=0x4000; +for(i=1;imantissa[i]=0; + +return; +} + +/******************* +** IsMantissaZero ** +******************** +** Pass this routine a pointer to an internal floating point format +** number's mantissa. It checks for an all-zero mantissa. +** Returns 0 if it is NOT all zeros, !=0 otherwise. 
+*/ +static int IsMantissaZero(u16 *mant) +{ +int i; /* Index */ +int n; /* Return value */ + +n=0; +for(i=0;i=0;i--) +{ accum=mantissa[i]; + new_carry=accum & 0x8000; /* Get new carry */ + accum=accum<<1; /* Do the shift */ + if(*carry) + accum|=1; /* Insert previous carry */ + *carry=new_carry; + mantissa[i]=accum; /* Return shifted value */ +} +return; +} + +/******************** +** ShiftMantRight1 ** +********************* +** Shift a mantissa right by 1 bit. Provides carry, as +** above +*/ +static void ShiftMantRight1(u16 *carry, + u16 *mantissa) +{ +int i; /* Index */ +int new_carry; +u16 accum; + +for(i=0;i>1; + if(*carry) + accum|=0x8000; + *carry=new_carry; + mantissa[i]=accum; +} +return; +} + + +/***************************** +** StickyShiftMantRight ** +****************************** +** This is a shift right of the mantissa with a "sticky bit". +** I.E., if a carry of 1 is shifted out of the least significant +** bit, the least significant bit is set to 1. +*/ +static void StickyShiftRightMant(InternalFPF *ptr, + int amount) +{ +int i; /* Index */ +u16 carry; /* Self-explanatory */ +u16 *mantissa; + +mantissa=ptr->mantissa; + +if(ptr->type!=IFPF_IS_ZERO) /* Don't bother shifting a zero */ +{ + /* + ** If the amount of shifting will shift everyting + ** out of existence, then just clear the whole mantissa + ** and set the lowmost bit to 1. + */ + if(amount>=INTERNAL_FPF_PRECISION * 16) + { + for(i=0;imantissa[0] & 0x8000) == 0) +{ + carry = 0; + ShiftMantLeft1(&carry, ptr->mantissa); + ptr->exp--; +} +return; +} + +/**************** +** denormalize ** +***************** +** Denormalize an internal-representation number. This means +** shifting it right until its exponent is equivalent to +** minimum_exponent. (You have to do this often in order +** to perform additions and subtractions). 
+*/ +static void denormalize(InternalFPF *ptr, + int minimum_exponent) +{ +long exponent_difference; + +if (IsMantissaZero(ptr->mantissa)) +{ + printf("Error: zero significand in denormalize\n"); +} + +exponent_difference = ptr->exp-minimum_exponent; +if (exponent_difference < 0) +{ + /* + ** The number is subnormal + */ + exponent_difference = -exponent_difference; + if (exponent_difference >= (INTERNAL_FPF_PRECISION * 16)) + { + /* Underflow */ + SetInternalFPFZero(ptr, ptr->sign); + } + else + { + ptr->exp+=exponent_difference; + StickyShiftRightMant(ptr, exponent_difference); + } +} +return; +} + + +/********************* +** RoundInternalFPF ** +********************** +** Round an internal-representation number. +** The kind of rounding we do here is simplest...referred to as +** "chop". "Extraneous" rightmost bits are simply hacked off. +*/ +void RoundInternalFPF(InternalFPF *ptr) +{ +/* int i; */ + +if (ptr->type == IFPF_IS_NORMAL || + ptr->type == IFPF_IS_SUBNORMAL) +{ + denormalize(ptr, MIN_EXP); + if (ptr->type != IFPF_IS_ZERO) + { + + /* clear the extraneous bits */ + ptr->mantissa[3] &= 0xfff8; +/* for (i=4; imantissa[i] = 0; + } +*/ + /* + ** Check for overflow + */ +/* Does not do anything as ptr->exp is a short and MAX_EXP=37268 + if (ptr->exp > MAX_EXP) + { + SetInternalFPFInfinity(ptr, ptr->sign); + } +*/ + } +} +return; +} + +/******************************************************* +** ARITHMETIC OPERATIONS ON INTERNAL REPRESENTATION ** +*******************************************************/ + +/*************** +** choose_nan ** +**************** +** Called by routines that are forced to perform math on +** a pair of NaN's. This routine "selects" which NaN is +** to be returned. +*/ +static void choose_nan(InternalFPF *x, + InternalFPF *y, + InternalFPF *z, + int intel_flag) +{ +int i; + +/* +** Compare the two mantissas, +** return the larger. Note that we will be emulating +** an 80387 in this operation. 
+*/ +for (i=0; imantissa[i] > y->mantissa[i]) + { + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + return; + } + if (x->mantissa[i] < y->mantissa[i]) + { + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + return; + } +} + +/* +** They are equal +*/ +if (!intel_flag) + /* if the operation is addition */ + memmove((void *)x,(void *)z,sizeof(InternalFPF)); +else + /* if the operation is multiplication */ + memmove((void *)y,(void *)z,sizeof(InternalFPF)); +return; +} + + +/********************** +** AddSubInternalFPF ** +*********************** +** Adding or subtracting internal-representation numbers. +** Internal-representation numbers pointed to by x and y are +** added/subtracted and the result returned in z. +*/ +static void AddSubInternalFPF(uchar operation, + InternalFPF *x, + InternalFPF *y, + InternalFPF *z) +{ +int exponent_difference; +u16 borrow; +u16 carry; +int i; +InternalFPF locx,locy; /* Needed since we alter them */ + +/* +** Following big switch statement handles the +** various combinations of operand types. 
+*/ +switch ((x->type * IFPF_TYPE_COUNT) + y->type) +{ +case ZERO_ZERO: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + if (x->sign ^ y->sign ^ operation) + { + z->sign = 0; /* positive */ + } + break; + +case NAN_ZERO: +case NAN_SUBNORMAL: +case NAN_NORMAL: +case NAN_INFINITY: +case SUBNORMAL_ZERO: +case NORMAL_ZERO: +case INFINITY_ZERO: +case INFINITY_SUBNORMAL: +case INFINITY_NORMAL: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + break; + + +case ZERO_NAN: +case SUBNORMAL_NAN: +case NORMAL_NAN: +case INFINITY_NAN: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + break; + +case ZERO_SUBNORMAL: +case ZERO_NORMAL: +case ZERO_INFINITY: +case SUBNORMAL_INFINITY: +case NORMAL_INFINITY: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + z->sign ^= operation; + break; + +case SUBNORMAL_SUBNORMAL: +case SUBNORMAL_NORMAL: +case NORMAL_SUBNORMAL: +case NORMAL_NORMAL: + /* + ** Copy x and y to locals, since we may have + ** to alter them. + */ + memmove((void *)&locx,(void *)x,sizeof(InternalFPF)); + memmove((void *)&locy,(void *)y,sizeof(InternalFPF)); + + /* compute sum/difference */ + exponent_difference = locx.exp-locy.exp; + if (exponent_difference == 0) + { + /* + ** locx.exp == locy.exp + ** so, no shifting required + */ + if (locx.type == IFPF_IS_SUBNORMAL || + locy.type == IFPF_IS_SUBNORMAL) + z->type = IFPF_IS_SUBNORMAL; + else + z->type = IFPF_IS_NORMAL; + + /* + ** Assume that locx.mantissa > locy.mantissa + */ + z->sign = locx.sign; + z->exp= locx.exp; + } + else + if (exponent_difference > 0) + { + /* + ** locx.exp > locy.exp + */ + StickyShiftRightMant(&locy, + exponent_difference); + z->type = locx.type; + z->sign = locx.sign; + z->exp = locx.exp; + } + else /* if (exponent_difference < 0) */ + { + /* + ** locx.exp < locy.exp + */ + StickyShiftRightMant(&locx, + -exponent_difference); + z->type = locy.type; + z->sign = locy.sign ^ operation; + z->exp = locy.exp; + } + + if (locx.sign ^ locy.sign ^ operation) + { + /* + ** Signs are 
different, subtract mantissas + */ + borrow = 0; + for (i=(INTERNAL_FPF_PRECISION-1); i>=0; i--) + Sub16Bits(&borrow, + &z->mantissa[i], + locx.mantissa[i], + locy.mantissa[i]); + + if (borrow) + { + /* The y->mantissa was larger than the + ** x->mantissa leaving a negative + ** result. Change the result back to + ** an unsigned number and flip the + ** sign flag. + */ + z->sign = locy.sign ^ operation; + borrow = 0; + for (i=(INTERNAL_FPF_PRECISION-1); i>=0; i--) + { + Sub16Bits(&borrow, + &z->mantissa[i], + 0, + z->mantissa[i]); + } + } + else + { + /* The assumption made above + ** (i.e. x->mantissa >= y->mantissa) + ** was correct. Therefore, do nothing. + ** z->sign = x->sign; + */ + } + + if (IsMantissaZero(z->mantissa)) + { + z->type = IFPF_IS_ZERO; + z->sign = 0; /* positive */ + } + else + if (locx.type == IFPF_IS_NORMAL || + locy.type == IFPF_IS_NORMAL) + { + normalize(z); + } + } + else + { + /* signs are the same, add mantissas */ + carry = 0; + for (i=(INTERNAL_FPF_PRECISION-1); i>=0; i--) + { + Add16Bits(&carry, + &z->mantissa[i], + locx.mantissa[i], + locy.mantissa[i]); + } + + if (carry) + { + z->exp++; + carry=0; + ShiftMantRight1(&carry,z->mantissa); + z->mantissa[0] |= 0x8000; + z->type = IFPF_IS_NORMAL; + } + else + if (z->mantissa[0] & 0x8000) + z->type = IFPF_IS_NORMAL; + } + break; + +case INFINITY_INFINITY: + SetInternalFPFNaN(z); + break; + +case NAN_NAN: + choose_nan(x, y, z, 1); + break; +} + +/* +** All the math is done; time to round. +*/ +RoundInternalFPF(z); +return; +} + + +/************************ +** MultiplyInternalFPF ** +************************* +** Two internal-representation numbers x and y are multiplied; the +** result is returned in z. 
+*/ +static void MultiplyInternalFPF(InternalFPF *x, + InternalFPF *y, + InternalFPF *z) +{ +int i; +int j; +u16 carry; +u16 extra_bits[INTERNAL_FPF_PRECISION]; +InternalFPF locy; /* Needed since this will be altered */ +/* +** As in the preceding function, this large switch +** statement selects among the many combinations +** of operands. +*/ +switch ((x->type * IFPF_TYPE_COUNT) + y->type) +{ +case INFINITY_SUBNORMAL: +case INFINITY_NORMAL: +case INFINITY_INFINITY: +case ZERO_ZERO: +case ZERO_SUBNORMAL: +case ZERO_NORMAL: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + z->sign ^= y->sign; + break; + +case SUBNORMAL_INFINITY: +case NORMAL_INFINITY: +case SUBNORMAL_ZERO: +case NORMAL_ZERO: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + z->sign ^= x->sign; + break; + +case ZERO_INFINITY: +case INFINITY_ZERO: + SetInternalFPFNaN(z); + break; + +case NAN_ZERO: +case NAN_SUBNORMAL: +case NAN_NORMAL: +case NAN_INFINITY: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + break; + +case ZERO_NAN: +case SUBNORMAL_NAN: +case NORMAL_NAN: +case INFINITY_NAN: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + break; + + +case SUBNORMAL_SUBNORMAL: +case SUBNORMAL_NORMAL: +case NORMAL_SUBNORMAL: +case NORMAL_NORMAL: + /* + ** Make a local copy of the y number, since we will be + ** altering it in the process of multiplying. 
+ */ + memmove((void *)&locy,(void *)y,sizeof(InternalFPF)); + + /* + ** Check for unnormal zero arguments + */ + if (IsMantissaZero(x->mantissa) || IsMantissaZero(y->mantissa)) + SetInternalFPFInfinity(z, 0); + + /* + ** Initialize the result + */ + if (x->type == IFPF_IS_SUBNORMAL || + y->type == IFPF_IS_SUBNORMAL) + z->type = IFPF_IS_SUBNORMAL; + else + z->type = IFPF_IS_NORMAL; + + z->sign = x->sign ^ y->sign; + z->exp = x->exp + y->exp ; + for (i=0; imantissa[i] = 0; + extra_bits[i] = 0; + } + + for (i=0; i<(INTERNAL_FPF_PRECISION*16); i++) + { + /* + ** Get rightmost bit of the multiplier + */ + carry = 0; + ShiftMantRight1(&carry, locy.mantissa); + if (carry) + { + /* + ** Add the multiplicand to the product + */ + carry = 0; + for (j=(INTERNAL_FPF_PRECISION-1); j>=0; j--) + Add16Bits(&carry, + &z->mantissa[j], + z->mantissa[j], + x->mantissa[j]); + } + else + { + carry = 0; + } + + /* + ** Shift the product right. Overflow bits get + ** shifted into extra_bits. We'll use it later + ** to help with the "sticky" bit. + */ + ShiftMantRight1(&carry, z->mantissa); + ShiftMantRight1(&carry, extra_bits); + } + + /* + ** Normalize + ** Note that we use a "special" normalization routine + ** because we need to use the extra bits. (These are + ** bits that may have been shifted off the bottom that + ** we want to reclaim...if we can. + */ + while ((z->mantissa[0] & 0x8000) == 0) + { + carry = 0; + ShiftMantLeft1(&carry, extra_bits); + ShiftMantLeft1(&carry, z->mantissa); + z->exp--; + } + + /* + ** Set the sticky bit if any bits set in extra bits. + */ + if (IsMantissaZero(extra_bits)) + { + z->mantissa[INTERNAL_FPF_PRECISION-1] |= 1; + } + break; + +case NAN_NAN: + choose_nan(x, y, z, 0); + break; +} + +/* +** All math done...do rounding. +*/ +RoundInternalFPF(z); +return; +} + + +/********************** +** DivideInternalFPF ** +*********************** +** Divide internal FPF number x by y. Return result in z. 
+*/ +static void DivideInternalFPF(InternalFPF *x, + InternalFPF *y, + InternalFPF *z) +{ +int i; +int j; +u16 carry; +u16 extra_bits[INTERNAL_FPF_PRECISION]; +InternalFPF locx; /* Local for x number */ + +/* +** As with preceding function, the following switch +** statement selects among the various possible +** operands. +*/ +switch ((x->type * IFPF_TYPE_COUNT) + y->type) +{ +case ZERO_ZERO: +case INFINITY_INFINITY: + SetInternalFPFNaN(z); + break; + +case ZERO_SUBNORMAL: +case ZERO_NORMAL: + if (IsMantissaZero(y->mantissa)) + { + SetInternalFPFNaN(z); + break; + } + +case ZERO_INFINITY: +case SUBNORMAL_INFINITY: +case NORMAL_INFINITY: + SetInternalFPFZero(z, x->sign ^ y->sign); + break; + +case SUBNORMAL_ZERO: +case NORMAL_ZERO: + if (IsMantissaZero(x->mantissa)) + { + SetInternalFPFNaN(z); + break; + } + +case INFINITY_ZERO: +case INFINITY_SUBNORMAL: +case INFINITY_NORMAL: + SetInternalFPFInfinity(z, 0); + z->sign = x->sign ^ y->sign; + break; + +case NAN_ZERO: +case NAN_SUBNORMAL: +case NAN_NORMAL: +case NAN_INFINITY: + memmove((void *)x,(void *)z,sizeof(InternalFPF)); + break; + +case ZERO_NAN: +case SUBNORMAL_NAN: +case NORMAL_NAN: +case INFINITY_NAN: + memmove((void *)y,(void *)z,sizeof(InternalFPF)); + break; + +case SUBNORMAL_SUBNORMAL: +case NORMAL_SUBNORMAL: +case SUBNORMAL_NORMAL: +case NORMAL_NORMAL: + /* + ** Make local copy of x number, since we'll be + ** altering it in the process of dividing. 
+ */ + memmove((void *)&locx,(void *)x,sizeof(InternalFPF)); + + /* + ** Check for unnormal zero arguments + */ + if (IsMantissaZero(locx.mantissa)) + { + if (IsMantissaZero(y->mantissa)) + SetInternalFPFNaN(z); + else + SetInternalFPFZero(z, 0); + break; + } + if (IsMantissaZero(y->mantissa)) + { + SetInternalFPFInfinity(z, 0); + break; + } + + /* + ** Initialize the result + */ + z->type = x->type; + z->sign = x->sign ^ y->sign; + z->exp = x->exp - y->exp + + ((INTERNAL_FPF_PRECISION * 16 * 2)); + for (i=0; imantissa[i] = 0; + extra_bits[i] = 0; + } + + while ((z->mantissa[0] & 0x8000) == 0) + { + carry = 0; + ShiftMantLeft1(&carry, locx.mantissa); + ShiftMantLeft1(&carry, extra_bits); + + /* + ** Time to subtract yet? + */ + if (carry == 0) + for (j=0; jmantissa[j] > extra_bits[j]) + { + carry = 0; + goto no_subtract; + } + if (y->mantissa[j] < extra_bits[j]) + break; + } + /* + ** Divisor (y) <= dividend (x), subtract + */ + carry = 0; + for (j=(INTERNAL_FPF_PRECISION-1); j>=0; j--) + Sub16Bits(&carry, + &extra_bits[j], + extra_bits[j], + y->mantissa[j]); + carry = 1; /* 1 shifted into quotient */ + no_subtract: + ShiftMantLeft1(&carry, z->mantissa); + z->exp--; + } + break; + +case NAN_NAN: + choose_nan(x, y, z, 0); + break; +} + +/* +** Math complete...do rounding +*/ +RoundInternalFPF(z); +} + +/********************** +** LongToInternalFPF ** +** Int32ToInternalFPF ** +*********************** +** Convert a signed (long) 32-bit integer into an internal FPF number. +*/ +/* static void LongToInternalFPF(long mylong, */ +static void Int32ToInternalFPF(int32 mylong, + InternalFPF *dest) +{ +int i; /* Index */ +u16 myword; /* Used to hold converted stuff */ +/* +** Save the sign and get the absolute value. This will help us +** with 64-bit machines, since we use only the lower 32 +** bits just in case. (No longer necessary after we use int32.) 
+*/ +/* if(mylong<0L) */ +if(mylong<(int32)0) +{ dest->sign=1; + mylong=(int32)0-mylong; +} +else + dest->sign=0; +/* +** Prepare the destination floating point number +*/ +dest->type=IFPF_IS_NORMAL; +for(i=0;imantissa[i]=0; + +/* +** See if we've got a zero. If so, make the resultant FP +** number a true zero and go home. +*/ +if(mylong==0) +{ dest->type=IFPF_IS_ZERO; + dest->exp=0; + return; +} + +/* +** Not a true zero. Set the exponent to 32 (internal FPFs have +** no bias) and load the low and high words into their proper +** locations in the mantissa. Then normalize. The action of +** normalizing slides the mantissa bits into place and sets +** up the exponent properly. +*/ +dest->exp=32; +myword=(u16)((mylong >> 16) & 0xFFFFL); +dest->mantissa[0]=myword; +myword=(u16)(mylong & 0xFFFFL); +dest->mantissa[1]=myword; +normalize(dest); +return; +} + +#ifdef DEBUG +/************************ +** InternalFPFToString ** +************************* +** FOR DEBUG PURPOSES +** This routine converts an internal floating point representation +** number to a string. Used in debugging the package. +** Returns length of converted number. +** NOTE: dest must point to a buffer big enough to hold the +** result. Also, this routine does append a null (an effect +** of using the sprintf() function). It also returns +** a length count. +** NOTE: This routine returns 5 significant digits. Thats +** about all I feel safe with, given the method of +** conversion. It should be more than enough for programmers +** to determine whether the package is properly ported. 
+*/ +static int InternalFPFToString(char *dest, + InternalFPF *src) +{ +InternalFPF locFPFNum; /* Local for src (will be altered) */ +InternalFPF IFPF10; /* Floating-point 10 */ +InternalFPF IFPFComp; /* For doing comparisons */ +int msign; /* Holding for mantissa sign */ +int expcount; /* Exponent counter */ +int ccount; /* Character counter */ +int i,j,k; /* Index */ +u16 carryaccum; /* Carry accumulator */ +u16 mycarry; /* Local for carry */ + +/* +** Check first for the simple things...Nan, Infinity, Zero. +** If found, copy the proper string in and go home. +*/ +switch(src->type) +{ + case IFPF_IS_NAN: + memcpy(dest,"NaN",3); + return(3); + + case IFPF_IS_INFINITY: + if(src->sign==0) + memcpy(dest,"+Inf",4); + else + memcpy(dest,"-Inf",4); + return(4); + + case IFPF_IS_ZERO: + if(src->sign==0) + memcpy(dest,"+0",2); + else + memcpy(dest,"-0",2); + return(2); +} + +/* +** Move the internal number into our local holding area, since +** we'll be altering it to print it out. +*/ +memcpy((void *)&locFPFNum,(void *)src,sizeof(InternalFPF)); + +/* +** Set up a floating-point 10...which we'll use a lot in a minute. +*/ +/* LongToInternalFPF(10L,&IFPF10); */ +Int32ToInternalFPF((int32)10,&IFPF10); + +/* +** Save the mantissa sign and make it positive. +*/ +msign=src->sign; + +/* src->sign=0 */ /* bug, fixed Nov. 13, 1997 */ +(&locFPFNum)->sign=0; + +expcount=0; /* Init exponent counter */ + +/* +** See if the number is less than 10. If so, multiply +** the number repeatedly by 10 until it's not. For each +** multiplication, decrement a counter so we can keep track +** of the exponent. +*/ + +while(1) +{ AddSubInternalFPF(1,&locFPFNum,&IFPF10,&IFPFComp); + if(IFPFComp.sign==0) break; + MultiplyInternalFPF(&locFPFNum,&IFPF10,&IFPFComp); + expcount--; + memcpy((void *)&locFPFNum,(void *)&IFPFComp,sizeof(InternalFPF)); +} +/* +** Do the reverse of the above. As long as the number is +** greater than or equal to 10, divide it by 10. 
Increment the +** exponent counter for each multiplication. +*/ + +while(1) +{ + AddSubInternalFPF(1,&locFPFNum,&IFPF10,&IFPFComp); + if(IFPFComp.sign!=0) break; + DivideInternalFPF(&locFPFNum,&IFPF10,&IFPFComp); + expcount++; + memcpy((void *)&locFPFNum,(void *)&IFPFComp,sizeof(InternalFPF)); +} + +/* +** About time to start storing things. First, store the +** mantissa sign. +*/ +ccount=1; /* Init character counter */ +if(msign==0) + *dest++='+'; +else + *dest++='-'; + +/* +** At this point we know that the number is in the range +** 10 > n >=1. We need to "strip digits" out of the +** mantissa. We do this by treating the mantissa as +** an integer and multiplying by 10. (Not a floating-point +** 10, but an integer 10. Since this is debug code and we +** could care less about speed, we'll do it the stupid +** way and simply add the number to itself 10 times. +** Anything that makes it to the left of the implied binary point +** gets stripped off and emitted. We'll do this for +** 5 significant digits (which should be enough to +** verify things). +*/ +/* +** Re-position radix point +*/ +carryaccum=0; +while(locFPFNum.exp>0) +{ + mycarry=0; + ShiftMantLeft1(&mycarry,locFPFNum.mantissa); + carryaccum=(carryaccum<<1); + if(mycarry) carryaccum++; + locFPFNum.exp--; +} + +while(locFPFNum.exp<0) +{ + mycarry=0; + ShiftMantRight1(&mycarry,locFPFNum.mantissa); + locFPFNum.exp++; +} + +for(i=0;i<6;i++) + if(i==1) + { /* Emit decimal point */ + *dest++='.'; + ccount++; + } + else + { /* Emit a digit */ + *dest++=('0'+carryaccum); + ccount++; + + carryaccum=0; + memcpy((void *)&IFPF10, + (void *)&locFPFNum, + sizeof(InternalFPF)); + + /* Do multiply via repeated adds */ + for(j=0;j<9;j++) + { + mycarry=0; + for(k=(INTERNAL_FPF_PRECISION-1);k>=0;k--) + Add16Bits(&mycarry,&(IFPFComp.mantissa[k]), + locFPFNum.mantissa[k], + IFPF10.mantissa[k]); + carryaccum+=mycarry ? 
1 : 0; + memcpy((void *)&locFPFNum, + (void *)&IFPFComp, + sizeof(InternalFPF)); + } + } + +/* +** Now move the 'E', the exponent sign, and the exponent +** into the string. +*/ +*dest++='E'; + +/* sprint is supposed to return an integer, but it caused problems on SunOS + * with the native cc. Hence we force it. + * Uwe F. Mayer + */ +ccount+=(int)sprintf(dest,"%4d",expcount); + +/* +** All done, go home. +*/ +return(ccount); + +} + +#endif diff --git a/emfloat.h b/emfloat.h new file mode 100644 index 0000000..41cc6d9 --- /dev/null +++ b/emfloat.h @@ -0,0 +1,154 @@ + +/* +** emfloat.h +** Header for emfloat.c +** +** BYTEmark (tm) +** BYTE Magazine's Native Mode benchmarks +** Rick Grehan, BYTE Magazine +** +** Create: +** Revision: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +#include + +/* Is this a 64 bit architecture? If so, this will define LONG64 */ +/* Uwe F. 
Mayer 15 November 1997 */ +#include "pointer.h" + +/* +** DEFINES +*/ +#define u8 unsigned char +#define u16 unsigned short +#ifdef LONG64 +#define u32 unsigned int +#else +#define u32 unsigned long +#endif +#define uchar unsigned char +#define ulong unsigned long + +#define MAX_EXP 32767L +#define MIN_EXP (-32767L) + +#define IFPF_IS_ZERO 0 +#define IFPF_IS_SUBNORMAL 1 +#define IFPF_IS_NORMAL 2 +#define IFPF_IS_INFINITY 3 +#define IFPF_IS_NAN 4 +#define IFPF_TYPE_COUNT 5 + +#define ZERO_ZERO 0 +#define ZERO_SUBNORMAL 1 +#define ZERO_NORMAL 2 +#define ZERO_INFINITY 3 +#define ZERO_NAN 4 + +#define SUBNORMAL_ZERO 5 +#define SUBNORMAL_SUBNORMAL 6 +#define SUBNORMAL_NORMAL 7 +#define SUBNORMAL_INFINITY 8 +#define SUBNORMAL_NAN 9 + +#define NORMAL_ZERO 10 +#define NORMAL_SUBNORMAL 11 +#define NORMAL_NORMAL 12 +#define NORMAL_INFINITY 13 +#define NORMAL_NAN 14 + +#define INFINITY_ZERO 15 +#define INFINITY_SUBNORMAL 16 +#define INFINITY_NORMAL 17 +#define INFINITY_INFINITY 18 +#define INFINITY_NAN 19 + +#define NAN_ZERO 20 +#define NAN_SUBNORMAL 21 +#define NAN_NORMAL 22 +#define NAN_INFINITY 23 +#define NAN_NAN 24 +#define OPERAND_ZERO 0 +#define OPERAND_SUBNORMAL 1 +#define OPERAND_NORMAL 2 +#define OPERAND_INFINITY 3 +#define OPERAND_NAN 4 + +/* +** Following already defined in NMGLOBAL.H +** +#define INTERNAL_FPF_PRECISION 4 +*/ + +/* +** TYPEDEFS +*/ + +typedef struct +{ + u8 type; /* Indicates, NORMAL, SUBNORMAL, etc. 
*/ + u8 sign; /* Mantissa sign */ + short exp; /* Signed exponent...no bias */ + u16 mantissa[INTERNAL_FPF_PRECISION]; +} InternalFPF; + +/* +** PROTOTYPES +*/ +void SetupCPUEmFloatArrays(InternalFPF *abase, + InternalFPF *bbase, InternalFPF *cbase, ulong arraysize); +ulong DoEmFloatIteration(InternalFPF *abase, + InternalFPF *bbase, InternalFPF *cbase, + ulong arraysize, ulong loops); +static void SetInternalFPFZero(InternalFPF *dest, + uchar sign); +static void SetInternalFPFInfinity(InternalFPF *dest, + uchar sign); +static void SetInternalFPFNaN(InternalFPF *dest); +static int IsMantissaZero(u16 *mant); +static void Add16Bits(u16 *carry,u16 *a,u16 b,u16 c); +static void Sub16Bits(u16 *borrow,u16 *a,u16 b,u16 c); +static void ShiftMantLeft1(u16 *carry,u16 *mantissa); +static void ShiftMantRight1(u16 *carry,u16 *mantissa); +static void StickyShiftRightMant(InternalFPF *ptr,int amount); +static void normalize(InternalFPF *ptr); +static void denormalize(InternalFPF *ptr,int minimum_exponent); +void RoundInternalFPF(InternalFPF *ptr); +static void choose_nan(InternalFPF *x,InternalFPF *y,InternalFPF *z, + int intel_flag); +static void AddSubInternalFPF(uchar operation,InternalFPF *x, + InternalFPF *y,InternalFPF *z); +static void MultiplyInternalFPF(InternalFPF *x,InternalFPF *y, + InternalFPF *z); +static void DivideInternalFPF(InternalFPF *x,InternalFPF *y, + InternalFPF *z); +/* static void LongToInternalFPF(long mylong, */ +static void Int32ToInternalFPF(int32 mylong, + InternalFPF *dest); +#ifdef DEBUG +static int InternalFPFToString(char *dest, + InternalFPF *src); +#endif + +/* +** EXTERNALS +*/ +extern ulong StartStopwatch(); +extern ulong StopStopwatch(ulong elapsed); +/* extern long randwc(long num); */ +extern int32 randwc(int32 num); diff --git a/hardware b/hardware new file mode 100755 index 0000000..6fb3293 Binary files /dev/null and b/hardware differ diff --git a/hardware.c b/hardware.c new file mode 100644 index 0000000..4838b2f --- /dev/null +++ 
b/hardware.c @@ -0,0 +1,202 @@ +#include +#include +#include + +#define BUF_SIZ 1024 + +/****************** +** output_string ** +******************* +** Displays a string on the screen. Also, if the flag +** write_to_file is set, outputs the string to the output file. +** Note, this routine presumes that you've included a carriage +** return at the end of the buffer. +*/ +static void output_string(const char *buffer, const int write_to_file, + FILE *global_ofile){ + printf("%s",buffer); + if(write_to_file!=0) + fprintf(global_ofile,"%s",buffer); + return; +} + + +/****************** +** removeNewLine ** +******************* +** Removes a trailing newline character if present +*/ +static void removeNewLine(char * s) { + if(strlen(s)>0 && s[strlen(s)-1] == '\n') { + s[strlen(s)-1] = '\0'; + } +} + + +/*************** +** runCommand ** +**************** +** Run the system command through a pipe +** The pointer result must point to a pre-allocated array of at least BUF_SIZ +*/ +static void runCommand (const char *command, char *result) { + FILE * pipe; + + pipe = popen(command, "r"); + if(pipe == NULL) { + /* command failed */ + result[0] = '\0'; + } else { + if(NULL == fgets(result, BUF_SIZ, pipe)){ + /* command failed */ + result[0] = '\0'; + } + pclose(pipe); + } + removeNewLine(result); +} + + +/******************** +** readProcCpuInfo ** +********************* +** Reads and parses /proc/cpuinfo on a Linux system +** The pointers must point to pre-allocated arrays of at least BUF_SIZ +*/ +static void readProcCpuInfo (char *model, char *cache) { + FILE * info; + char * cp; + int cpus = 0; + char * buffer_end; + char buffer[BUF_SIZ]; + char vendor_id[BUF_SIZ]; + char model_name[BUF_SIZ]; + char cpu_MHz[BUF_SIZ]; + int i; + float f; + + vendor_id[0] = model_name[0] = cpu_MHz[0] = model[0] = cache[0] = '\0'; + info = fopen("/proc/cpuinfo", "r"); + if(info != NULL) { + /* command did not fail */ + while(NULL != fgets(buffer, BUF_SIZ, info)){ + buffer_end = buffer + 
strlen(buffer); + cp = buffer; + if(! strncmp(buffer, "processor", 9)) { + cpus++; + } else if(! strncmp(buffer, "vendor_id", 9)) { + cp+=strlen("vendor_id"); + while(cp < buffer_end && ( *cp == ' ' || *cp == ':'|| *cp == '\t')) + cp++; + if(cp1) { + if (cpus==2) { + strcpy(model, "Dual"); + } else { + sprintf(model, "%d CPU", cpus); + } + } + cp = model + strlen(model); + if(vendor_id[0] != '\0'){ + if(cp != model){ + *cp++ = ' '; + } + strcpy(cp, vendor_id); + cp += strlen(vendor_id); + } + if(model_name[0] != '\0'){ + if(cp != model){ + *cp++ = ' '; + } + strcpy(cp, model_name); + cp += strlen(model_name); + } + if(cpu_MHz[0] != '\0'){ + if(cp != model){ + *cp++ = ' '; + } + f = atof(cpu_MHz); + i = (int)(f+0.5f); + sprintf(cpu_MHz, "%dMHz", i); + strcpy(cp, cpu_MHz); + cp += strlen(cpu_MHz); + } + fclose(info); + } +} + + +/************* +** hardware ** +************** +** Runs the system command "uname -s -r" +** Reads /proc/cpuinfo if on a linux system +** Writes output +*/ +void hardware(const int write_to_file, FILE *global_ofile) { + char buffer[BUF_SIZ]; + char os[BUF_SIZ]; + char model[BUF_SIZ]; + char cache[BUF_SIZ]; + char os_command[] = "uname -s -r"; +#ifdef NO_UNAME + os[0] = '\0'; +#else + runCommand(os_command, os); +#endif + if(NULL != strstr(os, "Linux")) { + readProcCpuInfo (model, cache); + } else { + model[0] = '\0'; + cache[0] = '\0'; + } + sprintf(buffer, "CPU : %s\n", model); + output_string(buffer, write_to_file, global_ofile); + sprintf(buffer, "L2 Cache : %s\n", cache); + output_string(buffer, write_to_file, global_ofile); + sprintf(buffer, "OS : %s\n", os); + output_string(buffer, write_to_file, global_ofile); +} + + +/************************ +** main for hardware.c ** +************************* +** For testing of code only +** Should be commented out +*/ +/* +int main(int argc, char * argv[]) { + hardware(0, NULL); + return 0; +} +*/ diff --git a/hardware.h b/hardware.h new file mode 100644 index 0000000..2a07934 --- /dev/null +++ 
b/hardware.h @@ -0,0 +1,2 @@ +extern +void hardware(const int write_to_file, FILE *global_ofile); diff --git a/hello.c b/hello.c new file mode 100644 index 0000000..c664483 --- /dev/null +++ b/hello.c @@ -0,0 +1,2 @@ +#include +int main () {printf("hello.\n");return(0);} diff --git a/misc.c b/misc.c new file mode 100644 index 0000000..a5144e4 --- /dev/null +++ b/misc.c @@ -0,0 +1,120 @@ + +/* +** misc.c +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +#include +#include "misc.h" + +/*********************************************************** +** MISCELLANEOUS BUT OTHERWISE NECESSARY ROUTINES ** +***********************************************************/ + +/**************************** +** RANDOM NUMBER GENERATOR ** +***************************** +** This is a second-order linear congruential random number +** generator. Its advantage is (of course) that it can be +** seeded and will thus produce repeatable sequences of +** random numbers. +*/ + +/**************************** +* randwc() * +***************************** +** Returns signed long random modulo num. +*/ +/* +long randwc(long num) +{ + return(randnum(0L)%num); +} +*/ +/* +** Returns signed 32-bit random modulo num. 
+*/ +int32 randwc(int32 num) +{ + return(randnum((int32)0)%num); +} + +/*************************** +** abs_randwc() ** +**************************** +** Same as randwc(), only this routine returns only +** positive numbers. +*/ +/* +unsigned long abs_randwc(unsigned long num) +{ +long temp; + +temp=randwc(num); +if(temp<0) temp=0L-temp; + +return((unsigned long)temp); +} +*/ +u32 abs_randwc(u32 num) +{ +int32 temp; /* Temporary storage */ + +temp=randwc(num); +if(temp<0) temp=(int32)0-temp; + +return((u32)temp); +} + +/**************************** +* randnum() * +***************************** +** Second order linear congruential generator. +** Constants suggested by J. G. Skellam. +** If val==0, returns next member of sequence. +** val!=0, restart generator. +*/ +/* +long randnum(long lngval) +{ + register long interm; + static long randw[2] = { 13L , 117L }; + + if (lngval!=0L) + { randw[0]=13L; randw[1]=117L; } + + interm=(randw[0]*254754L+randw[1]*529562L)%999563L; + randw[1]=randw[0]; + randw[0]=interm; + return(interm); +} +*/ +int32 randnum(int32 lngval) +{ + register int32 interm; + static int32 randw[2] = { (int32)13 , (int32)117 }; + + if (lngval!=(int32)0) + { randw[0]=(int32)13; randw[1]=(int32)117; } + + interm=(randw[0]*(int32)254754+randw[1]*(int32)529562)%(int32)999563; + randw[1]=randw[0]; + randw[0]=interm; + return(interm); +} + diff --git a/misc.h b/misc.h new file mode 100644 index 0000000..0f9bc13 --- /dev/null +++ b/misc.h @@ -0,0 +1,41 @@ +/* +** misc.h +** Header for misc.c +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. 
We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/************************ +** FUNCTION PROTOTYPES ** +************************/ + +/* +long randwc(long num); +unsigned long abs_randwc(unsigned long num); +long randnum(long lngval); +*/ + +#include "nmglobal.h" +int32 randwc(int32 num); +u32 abs_randwc(u32 num); +int32 randnum(int32 lngval); + + diff --git a/nbench0.c b/nbench0.c new file mode 100644 index 0000000..784b501 --- /dev/null +++ b/nbench0.c @@ -0,0 +1,1174 @@ + +/* +** nbench0.c +*/ + +/******************************************* +** BYTEmark (tm) ** +** BYTE MAGAZINE'S NATIVE MODE BENCHMARKS ** +** FOR CPU/FPU ** +** ver 2.0 ** +** Rick Grehan, BYTE Magazine ** +******************************************** +** NOTE: These benchmarks do NOT check for the presence +** of an FPU. You have to find that out manually. +** +** REVISION HISTORY FOR BENCHMARKS +** 9/94 -- First beta. --RG +** 12/94 -- Bug discovered in some of the integer routines +** (IDEA, Huffman,...). Routines were not accurately counting +** the number of loops. Fixed. --RG (Thanks to Steve A.) +** 12/94 -- Added routines to calculate and display index +** values. Indexes based on DELL XPS 90 (90 MHz Pentium). +** 1/95 -- Added Mac time manager routines for more accurate +** timing on Macintosh (said to be good to 20 usecs) -- RG +** 1/95 -- Re-did all the #defines so they made more +** sense. See NMGLOBAL.H -- RG +** 3/95 -- Fixed memory leak in LU decomposition. 
Did not +** invalidate previous results, just made it easier to run.--RG +** 3/95 -- Added TOOLHELP.DLL timing routine to Windows timer. --RG +** 10/95 -- Added memory array & alignment; moved memory +** allocation out of LU Decomposition -- RG +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +#include +#include +#include +#include +#include +#include +#include "nmglobal.h" +#include "nbench0.h" +#include "hardware.h" + +/************* +**** main **** +*************/ +#ifdef MAC +void main(void) +#else +int main(int argc, char *argv[]) +#endif +{ +int i; /* Index */ +time_t time_and_date; /* Self-explanatory */ +struct tm *loctime; +double bmean; /* Benchmark mean */ +double bstdev; /* Benchmark stdev */ +double lx_memindex; /* Linux memory index (mainly integer operations)*/ +double lx_intindex; /* Linux integer index */ +double lx_fpindex; /* Linux floating-point index */ +double intindex; /* Integer index */ +double fpindex; /* Floating-point index */ +ulong bnumrun; /* # of runs */ + +#ifdef MAC + MaxApplZone(); +#endif + +#ifdef MACTIMEMGR +/* Set up high res timer */ +MacHSTdelay=600*1000*1000; /* Delay is 10 minutes */ + +memset((char *)&myTMTask,0,sizeof(TMTask)); + +/* Prime and remove the task, calculating overhead */ 
+PrimeTime((QElemPtr)&myTMTask,-MacHSTdelay); +RmvTime((QElemPtr)&myTMTask); +MacHSTohead=MacHSTdelay+myTMTask.tmCount; +#endif + +#ifdef WIN31TIMER +/* Set up the size of the timer info structure */ +win31tinfo.dwSize=(DWORD)sizeof(TIMERINFO); +/* Load library */ +if((hThlp=LoadLibrary("TOOLHELP.DLL"))<32) +{ printf("Error loading TOOLHELP\n"); + exit(0); +} +if(!(lpfn=GetProcAddress(hThlp,"TimerCount"))) +{ printf("TOOLHELP error\n"); + exit(0); +} +#endif + +/* +** Set global parameters to default. +*/ +global_min_ticks=MINIMUM_TICKS; +global_min_seconds=MINIMUM_SECONDS; +global_allstats=0; +global_custrun=0; +global_align=8; +write_to_file=0; +lx_memindex=(double)1.0; /* set for geometric mean computations */ +lx_intindex=(double)1.0; +lx_fpindex=(double)1.0; +intindex=(double)1.0; +fpindex=(double)1.0; +mem_array_ents=0; /* Nothing in mem array */ + +/* +** We presume all tests will be run unless told +** otherwise +*/ +for(i=0;i1) + for(i=1;i(double)1e-100){ + /* avoid division by zero */ + sprintf(buffer," Relative standard deviation: %g %%\n", + (double)100*bstdev/bmean); + output_string(buffer); + } + sprintf(buffer," Number of runs: %lu\n",bnumrun); + output_string(buffer); + show_stats(i); + sprintf(buffer,"Done with %s\n\n",ftestnames[i]); + output_string(buffer); + } + } +} +/* printf("...done...\n"); */ + +/* +** Output the total indexes +*/ +if(global_custrun==0) +{ + output_string("==========================ORIGINAL BYTEMARK RESULTS==========================\n"); + sprintf(buffer,"INTEGER INDEX : %.3f\n", + pow(intindex,(double).142857)); + output_string(buffer); + sprintf(buffer,"FLOATING-POINT INDEX: %.3f\n", + pow(fpindex,(double).33333)); + output_string(buffer); + output_string("Baseline (MSDOS*) : Pentium* 90, 256 KB L2-cache, Watcom* compiler 10.0\n"); +#ifdef LINUX + output_string("==============================LINUX DATA BELOW===============================\n"); + hardware(write_to_file, global_ofile); +#include "sysinfoc.c" + 
sprintf(buffer,"MEMORY INDEX : %.3f\n", + pow(lx_memindex,(double).3333333333)); + output_string(buffer); + sprintf(buffer,"INTEGER INDEX : %.3f\n", + pow(lx_intindex,(double).25)); + output_string(buffer); + sprintf(buffer,"FLOATING-POINT INDEX: %.3f\n", + pow(lx_fpindex,(double).3333333333)); + output_string(buffer); + output_string("Baseline (LINUX) : AMD K6/233*, 512 KB L2-cache, gcc 2.7.2.3, libc-5.4.38\n"); +#endif +output_string("* Trademarks are property of their respective holder.\n"); +} + +exit(0); +} + +/************** +** parse_arg ** +*************** +** Given a pointer to a string, we assume that's an argument. +** Parse that argument and act accordingly. +** Return 0 if ok, else return -1. +*/ +static int parse_arg(char *argptr) +{ +int i; /* Index */ +FILE *cfile; /* Command file identifier */ + +/* +** First character has got to be a hyphen. +*/ +if(*argptr++!='-') return(-1); + +/* +** Convert the rest of the argument to upper case +** so there's little chance of confusion. +*/ +for(i=0;i]\n",progname); + printf(" -v = verbose\n"); + printf(" -c = input parameters thru command file \n"); + exit(0); +} + + +/***************** +** read_comfile ** +****************** +** Read the command file. Set global parameters as +** specified. This routine assumes that the command file +** is already open. +*/ +static void read_comfile(FILE *cfile) +{ +char inbuf[40]; +char *eptr; /* Offset to "=" sign */ +int i; /* Index */ + +/* +** Sit in a big loop, reading a line from the file at each +** pass. Terminate on EOF. +*/ +while(fgets(inbuf,39,cfile)!=(char *)NULL) +{ + /* Overwrite the CR character */ + if(strlen(inbuf)>0) + inbuf[strlen(inbuf)-1]='\0'; + + /* + ** Parse up to the "=" sign. If we don't find an + ** "=", then flag an error. + */ + if((eptr=strchr(inbuf,(int)'='))==(char *)NULL) + { printf("**COMMAND FILE ERROR at LINE:\n %s\n", + inbuf); + goto skipswitch; /* A GOTO!!!! 
*/ + } + + /* + ** Insert a null where the "=" was, then convert + ** the substring to uppercase. That will enable + ** us to perform the match. + */ + *eptr++='\0'; + strtoupper((char *)&inbuf[0]); + i=MAXPARAM; + do { + if(strcmp(inbuf,paramnames[i])==0) + break; + } while(--i>=0); + + if(i<0) + { printf("**COMMAND FILE ERROR -- UNKNOWN PARAM: %s", + inbuf); + goto skipswitch; + } + + /* + ** Advance eptr to the next field...which should be + ** the value assigned to the parameter. + */ + switch(i) + { + case PF_GMTICKS: /* GLOBALMINTICKS */ + global_min_ticks=(ulong)atol(eptr); + break; + + case PF_MINSECONDS: /* MINSECONDS */ + global_min_seconds=(ulong)atol(eptr); + set_request_secs(); + break; + + case PF_ALLSTATS: /* ALLSTATS */ + global_allstats=getflag(eptr); + break; + + case PF_OUTFILE: /* OUTFILE */ + strcpy(global_ofile_name,eptr); + global_ofile=fopen(global_ofile_name,"a"); + /* + ** Open the output file. + */ + if(global_ofile==(FILE *)NULL) + { printf("**Error opening output file: %s\n", + global_ofile_name); + ErrorExit(); + } + write_to_file=-1; + break; + + case PF_CUSTOMRUN: /* CUSTOMRUN */ + global_custrun=getflag(eptr); + for(i=0;i*sdev) + { is_beaten=i; + sdev_to_beat=*sdev; + } + } + + if(is_beaten!=-1) + { scores[is_beaten]=*newscore; + return(-1); + } + return(0); + } +#endif + +/******************** +** calc_confidence ** +********************* +** Given a set of numtries scores, calculate the confidence +** half-interval. We'll also return the sample mean and sample +** standard deviation. 
+** NOTE: This routines presumes a confidence of 95% and +** a confidence coefficient of .95 +** returns 0 if there is an error, otherwise -1 +*/ +static int calc_confidence(double scores[], /* Array of scores */ + int num_scores, /* number of scores in array */ + double *c_half_interval, /* Confidence half-int */ + double *smean, /* Standard mean */ + double *sdev) /* Sample stand dev */ +{ +/* Here is a list of the student-t distribution up to 29 degrees of + freedom. The value at 0 is bogus, as there is no value for zero + degrees of freedom. */ +double student_t[30]={0.0 , 12.706 , 4.303 , 3.182 , 2.776 , 2.571 , + 2.447 , 2.365 , 2.306 , 2.262 , 2.228 , + 2.201 , 2.179 , 2.160 , 2.145 , 2.131 , + 2.120 , 2.110 , 2.101 , 2.093 , 2.086 , + 2.080 , 2.074 , 2.069 , 2.064 , 2.060 , + 2.056 , 2.052 , 2.048 , 2.045 }; +int i; /* Index */ +if ((num_scores<2) || (num_scores>30)) { + output_string("Internal error: calc_confidence called with an illegal number of scores\n"); + return(-1); +} +/* +** First calculate mean. +*/ +*smean=(double)0.0; +for(i=0;i +#include +/* +** Timer globals for Mac +*/ +struct TMTask myTMTask; +long MacHSTdelay,MacHSTohead; + +#endif + +/* +** Following globals used by Win 31 timing routines. +** NOTE: This requires the includes of the w31timer.asm +** file in your project!! 
+*/ +#ifdef WIN31TIMER +#include +#include +extern TIMERINFO win31tinfo; +extern HANDLE hThlp; +extern FARPROC lpfn; +#endif + +/* +** PROTOTYPES +*/ +static int parse_arg(char *argptr); +static void display_help(char *progname); +static void read_comfile(FILE *cfile); +static int getflag(char *cptr); +static void strtoupper(char *s); +static void set_request_secs(void); +static int bench_with_confidence(int fid, + double *mean, double *stdev, ulong *numtries); +/* +static int seek_confidence(double scores[5], + double *newscore, double *c_half_interval, + double *smean,double *sdev); +*/ +static int calc_confidence(double scores[], + int num_scores, + double *c_half_interval,double *smean, + double *sdev); +static double getscore(int fid); +static void output_string(char *buffer); +static void show_stats(int bid); + +#ifdef MAC +void UCommandLine(void); +void UParse(void); +unsigned char *UField(unsigned char *ptr); +#endif + +/* +** EXTERNAL PROTOTYPES +*/ +extern void DoNumSort(void); /* From NBENCH1 */ +extern void DoStringSort(void); +extern void DoBitops(void); +extern void DoEmFloat(void); +extern void DoFourier(void); +extern void DoAssign(void); +extern void DoIDEA(void); +extern void DoHuffman(void); +extern void DoNNET(void); +extern void DoLU(void); + +extern void ErrorExit(void); /* From SYSSPEC */ + +/* +** Array of pointers to the benchmark functions. 
+*/ +void (*funcpointer[])(void) = +{ DoNumSort, + DoStringSort, + DoBitops, + DoEmFloat, + DoFourier, + DoAssign, + DoIDEA, + DoHuffman, + DoNNET, + DoLU }; + + diff --git a/nbench1.c b/nbench1.c new file mode 100644 index 0000000..05c35df --- /dev/null +++ b/nbench1.c @@ -0,0 +1,4445 @@ + +/* +** nbench1.c +*/ + +/******************************** +** BYTEmark (tm) ** +** BYTE NATIVE MODE BENCHMARKS ** +** VERSION 2 ** +** ** +** Included in this source ** +** file: ** +** Numeric Heapsort ** +** String Heapsort ** +** Bitfield test ** +** Floating point emulation ** +** Fourier coefficients ** +** Assignment algorithm ** +** IDEA Encyption ** +** Huffman compression ** +** Back prop. neural net ** +** LU Decomposition ** +** (linear equations) ** +** ---------- ** +** Rick Grehan, BYTE Magazine ** +********************************* +** +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** 10/95 - Removed allocation that was taking place inside +** the LU Decomposition benchmark. Though it didn't seem to +** make a difference on systems we ran it on, it nonetheless +** removes an operating system dependency that probably should +** not have been there. +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. 
+** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/* +** INCLUDES +*/ +#include +#include +#include +#include +#include +#include "nmglobal.h" +#include "nbench1.h" +#include "wordcat.h" + +#ifdef DEBUG +static int numsort_status=0; +static int stringsort_status=0; +#endif + +/********************* +** NUMERIC HEAPSORT ** +********************** +** This test implements a heapsort algorithm, performed on an +** array of longs. +*/ + +/************** +** DoNumSort ** +*************** +** This routine performs the CPU numeric sort test. +** NOTE: Last version incorrectly stated that the routine +** returned result in # of longword sorted per second. +** Not so; the routine returns # of iterations per sec. +*/ + +void DoNumSort(void) +{ +SortStruct *numsortstruct; /* Local pointer to global struct */ +farlong *arraybase; /* Base pointers of array */ +long accumtime; /* Accumulated time */ +double iterations; /* Iteration counter */ +char *errorcontext; /* Error context string pointer */ +int systemerror; /* For holding error codes */ + +/* +** Link to global structure +*/ +numsortstruct=&global_numsortstruct; + +/* +** Set the error context string. +*/ +errorcontext="CPU:Numeric Sort"; + +/* +** See if we need to do self adjustment code. +*/ +if(numsortstruct->adjust==0) +{ + /* + ** Self-adjustment code. The system begins by sorting 1 + ** array. If it does that in no time, then two arrays + ** are built and sorted. This process continues until + ** enough arrays are built to handle the tolerance. 
+ */ + numsortstruct->numarrays=1; + while(1) + { + /* + ** Allocate space for arrays + */ + arraybase=(farlong *)AllocateMemory(sizeof(long) * + numsortstruct->numarrays * numsortstruct->arraysize, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } + + /* + ** Do an iteration of the numeric sort. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then allocate for more arrays and + ** try again. + */ + if(DoNumSortIteration(arraybase, + numsortstruct->arraysize, + numsortstruct->numarrays)>global_min_ticks) + break; /* We're ok...exit */ + + FreeMemory((farvoid *)arraybase,&systemerror); + if(numsortstruct->numarrays++>NUMNUMARRAYS) + { printf("CPU:NSORT -- NUMNUMARRAYS hit.\n"); + ErrorExit(); + } + } +} +else +{ /* + ** Allocate space for arrays + */ + arraybase=(farlong *)AllocateMemory(sizeof(long) * + numsortstruct->numarrays * numsortstruct->arraysize, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } + +} +/* +** All's well if we get here. Repeatedly perform sorts until the +** accumulated elapsed time is greater than # of seconds requested. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoNumSortIteration(arraybase, + numsortstruct->arraysize, + numsortstruct->numarrays); + iterations+=(double)1.0; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. 
+*/ +FreeMemory((farvoid *)arraybase,&systemerror); + +numsortstruct->sortspersec=iterations * + (double)numsortstruct->numarrays / TicksToFracSecs(accumtime); + +if(numsortstruct->adjust==0) + numsortstruct->adjust=1; + +#ifdef DEBUG +if (numsort_status==0) printf("Numeric sort: OK\n"); +numsort_status=0; +#endif +return; +} + +/*********************** +** DoNumSortIteration ** +************************ +** This routine executes one iteration of the numeric +** sort benchmark. It returns the number of ticks +** elapsed for the iteration. +*/ +static ulong DoNumSortIteration(farlong *arraybase, + ulong arraysize, + uint numarrays) +{ +ulong elapsed; /* Elapsed ticks */ +ulong i; +/* +** Load up the array with random numbers +*/ +LoadNumArrayWithRand(arraybase,arraysize,numarrays); + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Execute a heap of heapsorts +*/ +for(i=0;i0; --i) + NumSift(array,i,top); + +/* +** Repeatedly extract maximum from heap and place it at the +** end of the array. When we get done, we'll have a sorted +** array. +*/ +for(i=top; i>0; --i) +{ NumSift(array,bottom,i); + temp=*array; /* Perform exchange */ + *array=*(array+i); + *(array+i)=temp; +} +return; +} + +/************ +** NumSift ** +************* +** Peforms the sift operation on a numeric array, +** constructing a heap in the array. +*/ +static void NumSift(farlong *array, /* Array of numbers */ + ulong i, /* Minimum of array */ + ulong j) /* Maximum of array */ +{ +unsigned long k; +long temp; /* Used for exchange */ + +while((i+i)<=j) +{ + k=i+i; + if(kadjust==0) +{ + /* + ** Initialize the number of arrays. + */ + strsortstruct->numarrays=1; + while(1) + { + /* + ** Allocate space for array. 
We'll add an extra 100 + ** bytes to protect memory as strings move around + ** (this can happen during string adjustment) + */ + arraybase=(faruchar *)AllocateMemory((strsortstruct->arraysize+100L) * + (long)strsortstruct->numarrays,&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + /* + ** Do an iteration of the string sort. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then de-allocate the array, reallocate a + ** an additional array, and try again. + */ + if(DoStringSortIteration(arraybase, + strsortstruct->numarrays, + strsortstruct->arraysize)>global_min_ticks) + break; /* We're ok...exit */ + + FreeMemory((farvoid *)arraybase,&systemerror); + strsortstruct->numarrays+=1; + } +} +else +{ + /* + ** We don't have to perform self adjustment code. + ** Simply allocate the space for the array. + */ + arraybase=(faruchar *)AllocateMemory((strsortstruct->arraysize+100L) * + (long)strsortstruct->numarrays,&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } +} +/* +** All's well if we get here. Repeatedly perform sorts until the +** accumulated elapsed time is greater than # of seconds requested. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoStringSortIteration(arraybase, + strsortstruct->numarrays, + strsortstruct->arraysize); + iterations+=(double)strsortstruct->numarrays; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. +** Set flag to show we don't need to rerun adjustment code. 
+*/ +FreeMemory((farvoid *)arraybase,&systemerror); +strsortstruct->sortspersec=iterations / (double)TicksToFracSecs(accumtime); +if(strsortstruct->adjust==0) + strsortstruct->adjust=1; +#ifdef DEBUG +if (stringsort_status==0) printf("String sort: OK\n"); +stringsort_status=0; +#endif +return; +} + +/************************** +** DoStringSortIteration ** +*************************** +** This routine executes one iteration of the string +** sort benchmark. It returns the number of ticks +** Note that this routine also builds the offset pointer +** array. +*/ +static ulong DoStringSortIteration(faruchar *arraybase, + uint numarrays,ulong arraysize) +{ +farulong *optrarray; /* Offset pointer array */ +unsigned long elapsed; /* Elapsed ticks */ +unsigned long nstrings; /* # of strings in array */ +int syserror; /* System error code */ +unsigned int i; /* Index */ +farulong *tempobase; /* Temporary offset pointer base */ +faruchar *tempsbase; /* Temporary string base pointer */ + +/* +** Load up the array(s) with random numbers +*/ +optrarray=LoadStringArray(arraybase,numarrays,&nstrings,arraysize); + +/* +** Set temp base pointers...they will be modified as the +** benchmark proceeds. +*/ +tempobase=optrarray; +tempsbase=arraybase; + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Execute heapsorts +*/ +for(i=0;i=arraysize) + { stringlength=(unsigned char)((arraysize-curroffset-1L) & + 0xFF); + fullflag=1; /* Indicates a full */ + } + + /* + ** Store length at curroffset and advance current offset. + */ + *(strarray+curroffset)=stringlength; + curroffset++; + + /* + ** Fill up the rest of the string with random bytes. + */ + for(i=0;i0; --i) + strsift(optrarray,strarray,numstrings,i,top); + +/* +** Repeatedly extract maximum from heap and place it at the +** end of the array. When we get done, we'll have a sorted +** array. 
+*/ +for(i=top; i>0; --i) +{ + strsift(optrarray,strarray,numstrings,0,i); + + /* temp = string[0] */ + tlen=*strarray; + MoveMemory((farvoid *)&temp[0], /* Perform exchange */ + (farvoid *)strarray, + (unsigned long)(tlen+1)); + + + /* string[0]=string[i] */ + tlen=*(strarray+*(optrarray+i)); + stradjust(optrarray,strarray,numstrings,0,tlen); + MoveMemory((farvoid *)strarray, + (farvoid *)(strarray+*(optrarray+i)), + (unsigned long)(tlen+1)); + + /* string[i]=temp */ + tlen=temp[0]; + stradjust(optrarray,strarray,numstrings,i,tlen); + MoveMemory((farvoid *)(strarray+*(optrarray+i)), + (farvoid *)&temp[0], + (unsigned long)(tlen+1)); + +} +return; +} + +/**************** +** str_is_less ** +***************** +** Pass this function: +** 1) A pointer to an array of offset pointers +** 2) A pointer to a string array +** 3) The number of elements in the string array +** 4) Offsets to two strings (a & b) +** This function returns TRUE if string a is < string b. +*/ +static int str_is_less(farulong *optrarray, /* Offset pointers */ + faruchar *strarray, /* String array */ + ulong numstrings, /* # of strings */ + ulong a, ulong b) /* Offsets */ +{ +int slen; /* String length */ + +/* +** Determine which string has the minimum length. Use that +** to call strncmp(). If they match up to that point, the +** string with the longer length wins. +*/ +slen=(int)*(strarray+*(optrarray+a)); +if(slen > (int)*(strarray+*(optrarray+b))) + slen=(int)*(strarray+*(optrarray+b)); + +slen=strncmp((char *)(strarray+*(optrarray+a)), + (char *)(strarray+*(optrarray+b)),slen); + +if(slen==0) +{ + /* + ** They match. Return true if the length of a + ** is greater than the length of b. 
+ */ + if(*(strarray+*(optrarray+a)) > + *(strarray+*(optrarray+b))) + return(TRUE); + return(FALSE); +} + +if(slen<0) return(TRUE); /* a is strictly less than b */ + +return(FALSE); /* Only other possibility */ +} + +/************ +** strsift ** +************* +** Pass this function: +** 1) A pointer to an array of offset pointers +** 2) A pointer to a string array +** 3) The number of elements in the string array +** 4) Offset within which to sort. +** Sift the array within the bounds of those offsets (thus +** building a heap). +*/ +static void strsift(farulong *optrarray, /* Offset pointers */ + faruchar *strarray, /* String array */ + ulong numstrings, /* # of strings */ + ulong i, ulong j) /* Offsets */ +{ +unsigned long k; /* Temporaries */ +unsigned char temp[80]; +unsigned char tlen; /* For string lengths */ + + +while((i+i)<=j) +{ + k=i+i; + if(kadjust==0) +{ + bitarraybase=(farulong *)AllocateMemory(locbitopstruct->bitfieldarraysize * + sizeof(ulong),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + /* + ** Initialize bitfield operations array to [2,30] elements + */ + locbitopstruct->bitoparraysize=30L; + + while(1) + { + /* + ** Allocate space for operations array + */ + bitoparraybase=(farulong *)AllocateMemory(locbitopstruct->bitoparraysize*2L* + sizeof(ulong), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)bitarraybase,&systemerror); + ErrorExit(); + } + /* + ** Do an iteration of the bitmap test. If the + ** elapsed time is less than or equal to the permitted + ** minimum, then de-allocate the array, reallocate a + ** larger version, and try again. 
+ */ + ticks=DoBitfieldIteration(bitarraybase, + bitoparraybase, + locbitopstruct->bitoparraysize, + &nbitops); +#ifdef DEBUG +#ifdef LINUX + if (locbitopstruct->bitoparraysize==30L){ + /* this is the first loop, write a debug file */ + FILE *file; + unsigned long *running_base; /* same as farulong */ + long counter; + file=fopen("debugbit.dat","w"); + running_base=bitarraybase; + for (counter=0;counter<(long)(locbitopstruct->bitfieldarraysize);counter++){ +#ifdef LONG64 + fprintf(file,"%08X",(unsigned int)(*running_base&0xFFFFFFFFL)); + fprintf(file,"%08X",(unsigned int)((*running_base>>32)&0xFFFFFFFFL)); + if ((counter+1)%4==0) fprintf(file,"\n"); +#else + fprintf(file,"%08lX",*running_base); + if ((counter+1)%8==0) fprintf(file,"\n"); +#endif + running_base=running_base+1; + } + fclose(file); + printf("\nWrote the file debugbit.dat, you may want to compare it to debugbit.good\n"); + } +#endif +#endif + + if (ticks>global_min_ticks) break; /* We're ok...exit */ + + FreeMemory((farvoid *)bitoparraybase,&systemerror); + locbitopstruct->bitoparraysize+=100L; + } +} +else +{ + /* + ** Don't need to do self adjustment, just allocate + ** the array space. + */ + bitarraybase=(farulong *)AllocateMemory(locbitopstruct->bitfieldarraysize * + sizeof(ulong),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + bitoparraybase=(farulong *)AllocateMemory(locbitopstruct->bitoparraysize*2L* + sizeof(ulong), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)bitarraybase,&systemerror); + ErrorExit(); + } +} + +/* +** All's well if we get here. Repeatedly perform bitops until the +** accumulated elapsed time is greater than # of seconds requested. 
+*/ +accumtime=0L; +iterations=(double)0.0; +do { + accumtime+=DoBitfieldIteration(bitarraybase, + bitoparraybase, + locbitopstruct->bitoparraysize,&nbitops); + iterations+=(double)nbitops; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. +** Also, set adjustment flag to show that we don't have +** to do self adjusting in the future. +*/ +FreeMemory((farvoid *)bitarraybase,&systemerror); +FreeMemory((farvoid *)bitoparraybase,&systemerror); +locbitopstruct->bitopspersec=iterations /TicksToFracSecs(accumtime); +if(locbitopstruct->adjust==0) + locbitopstruct->adjust=1; + +return; +} + +/************************ +** DoBitfieldIteration ** +************************* +** Perform a single iteration of the bitfield benchmark. +** Return the # of ticks accumulated by the operation. +*/ +static ulong DoBitfieldIteration(farulong *bitarraybase, + farulong *bitoparraybase, + long bitoparraysize, + ulong *nbitops) +{ +long i; /* Index */ +ulong bitoffset; /* Offset into bitmap */ +ulong elapsed; /* Time to execute */ +/* +** Clear # bitops counter +*/ +*nbitops=0L; + +/* +** Construct a set of bitmap offsets and run lengths. +** The offset can be any random number from 0 to the +** size of the bitmap (in bits). The run length can +** be any random number from 1 to the number of bits +** between the offset and the end of the bitmap. +** Note that the bitmap has 8192 * 32 bits in it. +** (262,144 bits) +*/ +/* +** Reset random number generator so things repeat. +** Also reset the bit array we work on. +** added by Uwe F. 
Mayer +*/ +randnum((int32)13); +for (i=0;i>6; /* Index is number /64 */ + bitnumb=bit_addr % 64; /* Bit number in word */ +#else + bindex=bit_addr>>5; /* Index is number /32 */ + bitnumb=bit_addr % 32; /* bit number in word */ +#endif + if(val) + bitmap[bindex]|=(1L<>6; /* Index is number /64 */ + bitnumb=bit_addr % 64; /* Bit number in longword */ +#else + bindex=bit_addr>>5; /* Index is number /32 */ + bitnumb=bit_addr % 32; /* Bit number in longword */ +#endif + bitmap[bindex]^=(1L<arraysize*sizeof(InternalFPF), + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + ErrorExit(); +} + +bbase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF), + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)abase,&systemerror); + ErrorExit(); +} + +cbase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF), + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + ErrorExit(); +} + +/* +** Set up the arrays +*/ +SetupCPUEmFloatArrays(abase,bbase,cbase,locemfloatstruct->arraysize); + +/* +** See if we need to do self-adjusting code. +*/ +if(locemfloatstruct->adjust==0) +{ + locemfloatstruct->loops=0; + + /* + ** Do an iteration of the tests. If the elapsed time is + ** less than minimum, increase the loop count and try + ** again. + */ + for(loops=1;loopsarraysize, + loops); + if(tickcount>global_min_ticks) + { locemfloatstruct->loops=loops; + break; + } + } +} + +/* +** Verify that selft adjustment code worked. +*/ +if(locemfloatstruct->loops==0) +{ printf("CPU:EMFPU -- CMPUEMFLOATLOOPMAX limit hit\n"); + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + FreeMemory((farvoid *)cbase,&systemerror); + ErrorExit(); +} + +/* +** All's well if we get here. 
Repeatedly perform floating +** tests until the accumulated time is greater than the +** # of seconds requested. +** Each iteration performs arraysize * 3 operations. +*/ +accumtime=0L; +iterations=(double)0.0; +do { + accumtime+=DoEmFloatIteration(abase,bbase,cbase, + locemfloatstruct->arraysize, + locemfloatstruct->loops); + iterations+=(double)1.0; +} while(TicksToSecs(accumtime)request_secs); + + +/* +** Clean up, calculate results, and go home. +** Also, indicate that adjustment is done. +*/ +FreeMemory((farvoid *)abase,&systemerror); +FreeMemory((farvoid *)bbase,&systemerror); +FreeMemory((farvoid *)cbase,&systemerror); + +locemfloatstruct->emflops=(iterations*(double)locemfloatstruct->loops)/ + (double)TicksToFracSecs(accumtime); +if(locemfloatstruct->adjust==0) + locemfloatstruct->adjust=1; + +#ifdef DEBUG +printf("----------------------------------------------------------------------------\n"); +#endif +return; +} + +/************************* +** FOURIER COEFFICIENTS ** +*************************/ + +/************** +** DoFourier ** +*************** +** Perform the transcendental/trigonometric portion of the +** benchmark. This benchmark calculates the first n +** fourier coefficients of the function (x+1)^x defined +** on the interval 0,2. +*/ +void DoFourier(void) +{ +FourierStruct *locfourierstruct; /* Local fourier struct */ +fardouble *abase; /* Base of A[] coefficients array */ +fardouble *bbase; /* Base of B[] coefficients array */ +unsigned long accumtime; /* Accumulated time in ticks */ +double iterations; /* # of iterations */ +char *errorcontext; /* Error context string pointer */ +int systemerror; /* For error code */ + +/* +** Link to global structure +*/ +locfourierstruct=&global_fourierstruct; + +/* +** Set error context string +*/ +errorcontext="FPU:Transcendental"; + +/* +** See if we need to do self-adjustment code. 
+*/ +if(locfourierstruct->adjust==0) +{ + locfourierstruct->arraysize=100L; /* Start at 100 elements */ + while(1) + { + + abase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + bbase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((void *)abase,&systemerror); + ErrorExit(); + } + /* + ** Do an iteration of the tests. If the elapsed time is + ** less than or equal to the permitted minimum, re-allocate + ** larger arrays and try again. + */ + if(DoFPUTransIteration(abase,bbase, + locfourierstruct->arraysize)>global_min_ticks) + break; /* We're ok...exit */ + + /* + ** Make bigger arrays and try again. + */ + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + locfourierstruct->arraysize+=50L; + } +} +else +{ /* + ** Don't need self-adjustment. Just allocate the + ** arrays, and go. + */ + abase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + ErrorExit(); + } + + bbase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double), + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((void *)abase,&systemerror); + ErrorExit(); + } +} +/* +** All's well if we get here. Repeatedly perform integration +** tests until the accumulated time is greater than the +** # of seconds requested. +*/ +accumtime=0L; +iterations=(double)0.0; +do { + accumtime+=DoFPUTransIteration(abase,bbase,locfourierstruct->arraysize); + iterations+=(double)locfourierstruct->arraysize*(double)2.0-(double)1.0; +} while(TicksToSecs(accumtime)request_secs); + + +/* +** Clean up, calculate results, and go home. +** Also set adjustment flag to indicate no adjust code needed. 
+*/ +FreeMemory((farvoid *)abase,&systemerror); +FreeMemory((farvoid *)bbase,&systemerror); + +locfourierstruct->fflops=iterations/(double)TicksToFracSecs(accumtime); + +if(locfourierstruct->adjust==0) + locfourierstruct->adjust=1; + +return; +} + +/************************ +** DoFPUTransIteration ** +************************* +** Perform an iteration of the FPU Transcendental/trigonometric +** benchmark. Here, an iteration consists of calculating the +** first n fourier coefficients of the function (x+1)^x on +** the interval 0,2. n is given by arraysize. +** NOTE: The # of integration steps is fixed at +** 200. +*/ +static ulong DoFPUTransIteration(fardouble *abase, /* A coeffs. */ + fardouble *bbase, /* B coeffs. */ + ulong arraysize) /* # of coeffs */ +{ +double omega; /* Fundamental frequency */ +unsigned long i; /* Index */ +unsigned long elapsed; /* Elapsed time */ + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Calculate the fourier series. Begin by +** calculating A[0]. +*/ + +*abase=TrapezoidIntegrate((double)0.0, + (double)2.0, + 200, + (double)0.0, /* No omega * n needed */ + 0 )/(double)2.0; + +/* +** Calculate the fundamental frequency. +** ( 2 * pi ) / period...and since the period +** is 2, omega is simply pi. +*/ +omega=(double)3.1415926535897932; + +for(i=1;iadjust==0) +{ + /* + ** Self-adjustment code. The system begins by working on 1 + ** array. If it does that in no time, then two arrays + ** are built. This process continues until + ** enough arrays are built to handle the tolerance. + */ + locassignstruct->numarrays=1; + while(1) + { + /* + ** Allocate space for arrays + */ + arraybase=(farlong *) AllocateMemory(sizeof(long)* + ASSIGNROWS*ASSIGNCOLS*locassignstruct->numarrays, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } + + /* + ** Do an iteration of the assignment alg. 
If the + ** elapsed time is less than or equal to the permitted + ** minimum, then allocate for more arrays and + ** try again. + */ + if(DoAssignIteration(arraybase, + locassignstruct->numarrays)>global_min_ticks) + break; /* We're ok...exit */ + + FreeMemory((farvoid *)arraybase, &systemerror); + locassignstruct->numarrays++; + } +} +else +{ /* + ** Allocate space for arrays + */ + arraybase=(farlong *)AllocateMemory(sizeof(long)* + ASSIGNROWS*ASSIGNCOLS*locassignstruct->numarrays, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)arraybase, + &systemerror); + ErrorExit(); + } +} + +/* +** All's well if we get here. Do the tests. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoAssignIteration(arraybase, + locassignstruct->numarrays); + iterations+=(double)1.0; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +FreeMemory((farvoid *)arraybase,&systemerror); + +locassignstruct->iterspersec=iterations * + (double)locassignstruct->numarrays / TicksToFracSecs(accumtime); + +if(locassignstruct->adjust==0) + locassignstruct->adjust=1; + +return; + +} + +/********************** +** DoAssignIteration ** +*********************** +** This routine executes one iteration of the assignment test. +** It returns the number of ticks elapsed in the iteration. +*/ +static ulong DoAssignIteration(farlong *arraybase, + ulong numarrays) +{ +longptr abase; /* local pointer */ +ulong elapsed; /* Elapsed ticks */ +ulong i; + +/* +** Set up local pointer +*/ +abase.ptrs.p=arraybase; + +/* +** Load up the arrays with a random table. +*/ +LoadAssignArrayWithRand(arraybase,numarrays); + +/* +** Start the stopwatch +*/ +elapsed=StartStopwatch(); + +/* +** Execute assignment algorithms +*/ +for(i=0;i1) + for(i=1;i>encrypt>> crypt1 >>decrypt>> plain2. +** So, plain1 and plain2 should match. 
+** Also, fill up plain1 with sample text. +*/ +plain1=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror); +if(systemerror) +{ + ReportError(errorcontext,systemerror); + ErrorExit(); +} + +crypt1=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror); +if(systemerror) +{ + ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)plain1,&systemerror); + ErrorExit(); +} + +plain2=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror); +if(systemerror) +{ + ReportError(errorcontext,systemerror); + FreeMemory((farvoid *)plain1,&systemerror); + FreeMemory((farvoid *)crypt1,&systemerror); + ErrorExit(); +} +/* +** Note that we build the "plaintext" by simply loading +** the array up with random numbers. +*/ +for(i=0;iarraysize;i++) + plain1[i]=(uchar)(abs_randwc(255) & 0xFF); + +/* +** See if we need to perform self adjustment loop. +*/ +if(locideastruct->adjust==0) +{ + /* + ** Do self-adjustment. This involves initializing the + ** # of loops and increasing the loop count until we + ** get a number of loops that we can use. + */ + for(locideastruct->loops=100L; + locideastruct->loopsloops+=10L) + if(DoIDEAIteration(plain1,crypt1,plain2, + locideastruct->arraysize, + locideastruct->loops, + Z,DK)>global_min_ticks) break; +} + +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoIDEAIteration(plain1,crypt1,plain2, + locideastruct->arraysize, + locideastruct->loops,Z,DK); + iterations+=(double)locideastruct->loops; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. 
+*/ +FreeMemory((farvoid *)plain1,&systemerror); +FreeMemory((farvoid *)crypt1,&systemerror); +FreeMemory((farvoid *)plain2,&systemerror); +locideastruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(locideastruct->adjust==0) + locideastruct->adjust=1; + +return; + +} + +/******************** +** DoIDEAIteration ** +********************* +** Execute a single iteration of the IDEA encryption algorithm. +** Actually, a single iteration is one encryption and one +** decryption. +*/ +static ulong DoIDEAIteration(faruchar *plain1, + faruchar *crypt1, + faruchar *plain2, + ulong arraysize, + ulong nloops, + IDEAkey Z, + IDEAkey DK) +{ +register ulong i; +register ulong j; +ulong elapsed; +#ifdef DEBUG +int status=0; +#endif + +/* +** Start the stopwatch. +*/ +elapsed=StartStopwatch(); + +/* +** Do everything for nloops. +*/ +for(i=0;i>16); + return(b-a+(b> 7); + Z+=i&8; + i&=7; +} +return; +} + +/**************** +** de_key_idea ** +***************** +** Compute IDEA decryption subkeys DK from encryption +** subkeys Z. 
+*/ +static void de_key_idea(IDEAkey Z, IDEAkey DK) +{ +IDEAkey TT; +int j; +u16 t1, t2, t3; +u16 *p; +p=(u16 *)(TT+KEYLEN); + +t1=inv(*Z++); +t2=-*Z++; +t3=-*Z++; +*--p=inv(*Z++); +*--p=t3; +*--p=t2; +*--p=t1; + +for(j=1;jarraysize,&systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + ErrorExit(); +} +comparray=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory(plaintext,&systemerror); + ErrorExit(); +} +decomparray=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory(plaintext,&systemerror); + FreeMemory(comparray,&systemerror); + ErrorExit(); +} + +hufftree=(huff_node *)AllocateMemory(sizeof(huff_node) * 512, + &systemerror); +if(systemerror) +{ ReportError(errorcontext,systemerror); + FreeMemory(plaintext,&systemerror); + FreeMemory(comparray,&systemerror); + FreeMemory(decomparray,&systemerror); + ErrorExit(); +} + +/* +** Build the plaintext buffer. Since we want this to +** actually be able to compress, we'll use the +** wordcatalog to build the plaintext stuff. +*/ +/* +** Reset random number generator so things repeat. +** added by Uwe F. Mayer +*/ +randnum((int32)13); +create_text_block(plaintext,lochuffstruct->arraysize-1,(ushort)500); +plaintext[lochuffstruct->arraysize-1L]='\0'; +plaintextlen=lochuffstruct->arraysize; + +/* +** See if we need to perform self adjustment loop. +*/ +if(lochuffstruct->adjust==0) +{ + /* + ** Do self-adjustment. This involves initializing the + ** # of loops and increasing the loop count until we + ** get a number of loops that we can use. + */ + for(lochuffstruct->loops=100L; + lochuffstruct->loopsloops+=10L) + if(DoHuffIteration(plaintext, + comparray, + decomparray, + lochuffstruct->arraysize, + lochuffstruct->loops, + hufftree)>global_min_ticks) break; +} + +/* +** All's well if we get here. Do the test. 
+*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoHuffIteration(plaintext, + comparray, + decomparray, + lochuffstruct->arraysize, + lochuffstruct->loops, + hufftree); + iterations+=(double)lochuffstruct->loops; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +FreeMemory((farvoid *)plaintext,&systemerror); +FreeMemory((farvoid *)comparray,&systemerror); +FreeMemory((farvoid *)decomparray,&systemerror); +FreeMemory((farvoid *)hufftree,&systemerror); +lochuffstruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(lochuffstruct->adjust==0) + lochuffstruct->adjust=1; + +} + +/********************* +** create_text_line ** +********************** +** Create a random line of text, stored at *dt. The line may be +** no more than nchars long. +*/ +static void create_text_line(farchar *dt, + long nchars) +{ +long charssofar; /* # of characters so far */ +long tomove; /* # of characters to move */ +char myword[40]; /* Local buffer for words */ +farchar *wordptr; /* Pointer to word from catalog */ + +charssofar=0; + +do { +/* +** Grab a random word from the wordcatalog +*/ +/* wordptr=wordcatarray[abs_randwc((long)WORDCATSIZE)];*/ +wordptr=wordcatarray[abs_randwc((int32)WORDCATSIZE)]; +MoveMemory((farvoid *)myword, + (farvoid *)wordptr, + (unsigned long)strlen(wordptr)+1); + +/* +** Append a blank. +*/ +tomove=strlen(myword)+1; +myword[tomove-1]=' '; + +/* +** See how long it is. If its length+charssofar > nchars, we have +** to trim it. +*/ +if((tomove+charssofar)>nchars) + tomove=nchars-charssofar; +/* +** Attach the word to the current line. Increment counter. +*/ +MoveMemory((farvoid *)dt,(farvoid *)myword,(unsigned long)tomove); +charssofar+=tomove; +dt+=tomove; + +/* +** If we're done, bail out. Otherwise, go get another word. 
+*/ +} while(charssofartblen) + linelen=tblen-bytessofar; + +if(linelen>1) +{ + create_text_line(tb,linelen); +} +tb+=linelen-1; /* Add the carriage return */ +*tb++='\n'; + +bytessofar+=linelen; + +} while(bytessofar>3; +bitnumb=bitoffset % 8; + +/* +** Set or clear +*/ +if(bitchar=='1') + comparray[byteoffset]|=(1<>3; +bitnumb=bitoffset % 8; + +/* +** Fetch +*/ +return((1<adjust==0) +{ + /* + ** Do self-adjustment. This involves initializing the + ** # of loops and increasing the loop count until we + ** get a number of loops that we can use. + */ + for(locnnetstruct->loops=1L; + locnnetstruct->loopsloops++) + { /*randnum(3L); */ + randnum((int32)3); + if(DoNNetIteration(locnnetstruct->loops) + >global_min_ticks) break; + } +} + +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + /* randnum(3L); */ /* Gotta do this for Neural Net */ + randnum((int32)3); /* Gotta do this for Neural Net */ + accumtime+=DoNNetIteration(locnnetstruct->loops); + iterations+=(double)locnnetstruct->loops; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +locnnetstruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(locnnetstruct->adjust==0) + locnnetstruct->adjust=1; + + +return; +} + +/******************** +** DoNNetIteration ** +********************* +** Do a single iteration of the neural net benchmark. +** By iteration, we mean a "learning" pass. +*/ +static ulong DoNNetIteration(ulong nloops) +{ +ulong elapsed; /* Elapsed time */ +int patt; + +/* +** Run nloops learning cycles. Notice that, counted with +** the learning cycle is the weight randomization and +** zeroing of changes. This should reduce clock jitter, +** since we don't have to stop and start the clock for +** each iteration. 
+*/ +elapsed=StartStopwatch(); +while(nloops--) +{ + randomize_wts(); + zero_changes(); + iteration_count=1; + learned = F; + numpasses = 0; + while (learned == F) + { + for (patt=0; patt tot_error) + tot_error = -error; /* worst error this pattern */ + } + else + { + sum += error; + if (error > tot_error) + tot_error = error; /* worst error this pattern */ + } +} +avg_out_error[patt] = sum/OUT_SIZE; +tot_out_error[patt] = tot_error; +return; +} + +/*********************** +** worst_pass_error() ** +************************ +** Find the worst and average error in the pass and save it +**/ +static void worst_pass_error() +{ +double error,sum; + +int i; + +error = 0.0; +sum = 0.0; +for (i=0; i error) error = tot_out_error[i]; + sum += avg_out_error[i]; +} +worst_error = error; +average_error = sum/numpats; +return; +} + +/******************* +** do_mid_error() ** +******************** +** Compute the error for the middle layer neurodes +** This is based on the output errors computed above. +** Note that the derivative of the sigmoid f(x) is +** f'(x) = f(x)(1 - f(x)) +** Recall that f(x) is merely the output of the middle +** layer neurode on the forward pass. 
+**/ +static void do_mid_error() +{ +double sum; +int neurode, i; + +for (neurode=0; neurode= STOP) result = F; + if (tot_out_error[i] >= 16.0) error = T; +} + +if (error == T) result = ERR; + + +#ifdef DEBUG +/* printf("\n Error this pass thru data: Worst: %8.3f; Average: %8.3f", + worst_error,average_error); +*/ +/* fprintf(outfile, + "\n Error this pass thru data: Worst: %8.3f; Average: %8.3f", + worst_error, average_error); */ +#endif + +return(result); +} + + +/******************* +** zero_changes() ** +******************** +** Zero out all the wt change arrays +**/ +static void zero_changes() +{ +int i,j; + +for (i = 0; i MAXPATS) + numpats = MAXPATS; + +for (patt=0; patt= 0.9) + in_pats[patt][i] = 0.9; + if (in_pats[patt][i] <= 0.1) + in_pats[patt][i] = 0.1; + } + element = 0; + vals_read = fscanf(infile,"%d %d %d %d %d %d %d %d", + &val1, &val2, &val3, &val4, &val5, &val6, &val7, &val8); + + out_pats[patt][element] = (double) val1; element++; + out_pats[patt][element] = (double) val2; element++; + out_pats[patt][element] = (double) val3; element++; + out_pats[patt][element] = (double) val4; element++; + out_pats[patt][element] = (double) val5; element++; + out_pats[patt][element] = (double) val6; element++; + out_pats[patt][element] = (double) val7; element++; + out_pats[patt][element] = (double) val8; element++; +} + +/* printf("\n Closing the input file now. 
"); */ + +fclose(infile); +return(0); +} + +/********************* +** initialize_net() ** +********************** +** Do all the initialization stuff before beginning +*/ +/* +static int initialize_net() +{ +int err_code; + +randomize_wts(); +zero_changes(); +err_code = read_data_file(); +iteration_count = 1; +return(err_code); +} +*/ + +/********************** +** display_mid_wts() ** +*********************** +** Display the weights on the middle layer neurodes +** NOTE: This routine is not used in the benchmark +** test -- RG +**/ +/* static void display_mid_wts() +{ +int neurode, weight, row, col; + +fprintf(outfile,"\n Weights of Middle Layer neurodes:"); + +for (neurode=0; neurodeadjust==0) +{ + loclustruct->numarrays=0; + for(i=1;i<=MAXLUARRAYS;i++) + { + abase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYCOLS*LUARRAYROWS*(i+1),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + LUFreeMem(a,b,(fardouble *)NULL,(fardouble *)NULL); + ErrorExit(); + } + bbase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYROWS*(i+1),&systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + LUFreeMem(a,b,abase,(fardouble *)NULL); + ErrorExit(); + } + if(DoLUIteration(a,b,abase,bbase,i)>global_min_ticks) + { loclustruct->numarrays=i; + break; + } + /* + ** Not enough arrays...free them all and try again + */ + FreeMemory((farvoid *)abase,&systemerror); + FreeMemory((farvoid *)bbase,&systemerror); + } + /* + ** Were we able to do it? + */ + if(loclustruct->numarrays==0) + { printf("FPU:LU -- Array limit reached\n"); + LUFreeMem(a,b,abase,bbase); + ErrorExit(); + } +} +else +{ /* + ** Don't need to adjust -- just allocate the proper + ** number of arrays and proceed. 
+ */ + abase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYCOLS*LUARRAYROWS*loclustruct->numarrays, + &systemerror); + if(systemerror) + { ReportError(errorcontext,systemerror); + LUFreeMem(a,b,(fardouble *)NULL,(fardouble *)NULL); + ErrorExit(); + } + bbase=(fardouble *)AllocateMemory(sizeof(double) * + LUARRAYROWS*loclustruct->numarrays,&systemerror); + if(systemerror) + { + ReportError(errorcontext,systemerror); + LUFreeMem(a,b,abase,(fardouble *)NULL); + ErrorExit(); + } +} +/* +** All's well if we get here. Do the test. +*/ +accumtime=0L; +iterations=(double)0.0; + +do { + accumtime+=DoLUIteration(a,b,abase,bbase, + loclustruct->numarrays); + iterations+=(double)loclustruct->numarrays; +} while(TicksToSecs(accumtime)request_secs); + +/* +** Clean up, calculate results, and go home. Be sure to +** show that we don't have to rerun adjustment code. +*/ +loclustruct->iterspersec=iterations / TicksToFracSecs(accumtime); + +if(loclustruct->adjust==0) + loclustruct->adjust=1; + +LUFreeMem(a,b,abase,bbase); +return; +} + +/************** +** LUFreeMem ** +*************** +** Release memory associated with LU benchmark. +*/ +static void LUFreeMem(fardouble *a, fardouble *b, + fardouble *abase,fardouble *bbase) +{ +int systemerror; + +FreeMemory((farvoid *)a,&systemerror); +FreeMemory((farvoid *)b,&systemerror); +FreeMemory((farvoid *)LUtempvv,&systemerror); + +if(abase!=(fardouble *)NULL) FreeMemory((farvoid *)abase,&systemerror); +if(bbase!=(fardouble *)NULL) FreeMemory((farvoid *)bbase,&systemerror); +return; +} + +/****************** +** DoLUIteration ** +******************* +** Perform an iteration of the LU decomposition benchmark. +** An iteration refers to the repeated solution of several +** identical matrices. 
+*/ +static ulong DoLUIteration(fardouble *a,fardouble *b, + fardouble *abase, fardouble *bbase, + ulong numarrays) +{ +fardouble *locabase; +fardouble *locbbase; +LUdblptr ptra; /* For converting ptr to 2D array */ +ulong elapsed; +ulong j,i; /* Indexes */ + + +/* +** Move the seed arrays (a & b) into the destination +** arrays; +*/ +for(j=0;j big) + big=fabs(a[i][j]); + /* Bail out on singular matrix */ + if(big==(double)0.0) return(0); + LUtempvv[i]=1.0/big; +} + +/* +** Crout's algorithm...loop over columns. +*/ +for(j=0;j=big) + { big=dum; + imax=i; + } + } + if(j!=imax) /* Interchange rows if necessary */ + { for(k=0;k=0;i--) +{ + sum=b[i]; + if(i!=(n-1)) + for(j=(i+1);j 64K chunks of memory. +*/ +/* #define DOS16MEM */ + +/* Define MACMEM to use the Mac's GetPtr call to allocate +** memory (instead of malloc()). +*/ +/* #define MACMEM */ + +/* +++ TIMING +++ */ +/* +** You must define ONLY ONE of the following identifiers to pick +** the timing routine used. +** CLOCKWCPS +** CLOCKWCT +** MACTIMEMGR +** WIN31TIMER +*/ + +/* +** Define CLOCKWCPS if you are using the clock() routine and the +** constant used as the divisor to determine seconds is +** CLOCKS_PER_SEC. This is the default in most cases. +*/ +#define CLOCKWCPS + +/* +** Define CLOCKWCT if you are using the clock() routine and the +** constant used as the divisor to determine seconds is CLK_TCK +*/ +/* #define CLOCKWCT */ + +/* +** Define MACTIMEMGR to use the Mac Time manager routines. +** You'll need to be running at least system 6.0.3 or +** better...extended time manager is recommended (system 7 or +** better). +*/ +/* #define MACTIMEMGR */ + +/* +** Define WIN31TIMER to user the timing routines in TOOLHELP.DLL. +** Gets accuracy down to the millisecond. +*/ +/* #define WIN31TIMER */ + +/* +++ MISCELLANEOUS +++ */ + +/* +** Define DOS16 if you'll be compiling under DOS in 16-bit +** (non DOS-extended) mode. 
This will enable proper definitions +** for the far*** typedefs +*/ +/* #define DOS16 */ + +/* +** Define MAC if you're compiling on a Macintosh. This +** does a number of things: +** includes unix.h +** Incorporates code to mimic the command line via either +** the console library (Symantec/Think) or the SIOUX +** library (Code Warrior). +*/ +/* #define MAC */ + +/* +** Define LONG64 if your compiler emits 64-bit longs. +** This is typically true of Alpha compilers on Unix +** systems...though, who knows, this may change in the +** future. I MOVED THIS DEFINITION INTO THE FILE pointer.h. DO NOT +** DEFINE IT HERE. IT WILL AUTOMATICALLY BE DEFINED IF NECESSARY. +** Uwe F. Mayer, Dec 15, 1996, Nov 15, 1997 +*/ +/* #define LONG64 */ + +/* +** Define MACCWPROF if you are profiling on the Mac using +** Code Warrior. This enables code that turns off the +** profiler in the event of an error exit. +*/ +/* #define MACCWPROF */ + +#ifdef MAC +#include <unix.h> +#endif + +/* +** ERROR CODES +*/ +#define ERROR_MEMORY 1 +#define ERROR_MEMARRAY_FULL 2 +#define ERROR_MEMARRAY_NFOUND 3 +#define ERROR_FILECREATE 10 +#define ERROR_FILEREAD 11 +#define ERROR_FILEWRITE 12 +#define ERROR_FILEOPEN 13 +#define ERROR_FILESEEK 14 + +/* +** MINIMUM_TICKS +** +** This sets the default number of minimum ticks. +** It can, of course, be overridden by the input +** command file. +** This ultimately gets loaded into the variable +** global_min_ticks, which specifies the minimum +** number of ticks that must take place between +** a StartStopwatch() and StopStopwatch() call. +** The idea is to reduce error buildup. +*/ +#define MINIMUM_TICKS 60 + +/* +** MINIMUM_SECONDS +** +** Minimum number of seconds to run each test. +*/ +#define MINIMUM_SECONDS 5 + +/* +** MAXPOSLONG +** +** This is the maximum positive long.
+*/ +#ifdef LONG64 +#define MAXPOSLONG 0x7FFFFFFFFFFFFFFFL +#else +#define MAXPOSLONG 0x7FFFFFFFL +#endif + +/* +** OTHER DEFINES +*/ +#ifndef MAC +#define TRUE 1 +#define FALSE 0 +#endif + +/* +** Memory array size. Used in SYSSPEC for keeping track +** of re-aligned memory. +*/ +#define MEM_ARRAY_SIZE 20 + +/* +** TYPEDEFS +*/ +#define ulong unsigned long +#define uchar unsigned char +#define uint unsigned int +#define ushort unsigned short +/* +typedef unsigned char uchar; +typedef unsigned int uint; +typedef unsigned short ushort; +typedef unsigned long ulong; +*/ +/* +** The 'farxxx' typedefs were added in deference to DOS, which +** requires far pointers to handle some of the bigger +** memory structures. Other systems will simply +** map 'farxxx' to 'xxx' +*/ +#ifdef DOS16 +typedef void huge farvoid; +typedef double huge fardouble; +typedef long huge farlong; +typedef unsigned long huge farulong; +typedef char huge farchar; +typedef unsigned char huge faruchar; + +#else + +typedef void farvoid; +typedef double fardouble; +typedef long farlong; +typedef unsigned long farulong; +typedef char farchar; +typedef unsigned char faruchar; + +#endif + +/* +** The following typedefs are used when element size +** is critical. You'll have to alter these for +** your specific platform/compiler. +*/ +typedef unsigned char u8; /* Unsigned 8 bits */ +typedef unsigned short u16; /* Unsigned 16 bits */ +#ifdef LONG64 +typedef unsigned int u32; /* Unsigned 32 bits */ +typedef int int32; /* Signed 32 bit integer */ +#else +typedef unsigned long u32; /* Unsigned 32 bits */ +typedef long int32; /* Signed 32 bit integer */ +#endif + +/***************** +** NUMERIC SORT ** +*****************/ +/* +** DEFINES +*/ + +/* +** The following constant, NUMNUMARRAYS (no, it is not a +** Peter Sellers joke) is the maximum number of arrays +** that can be built by the numeric sorting benchmark +** before it gives up. This maximum is dependent on the +** amount of memory in the system.
+*/ +/*#define NUMNUMARRAYS 1000*/ +#define NUMNUMARRAYS 10000 + +/* +** The following constant NUMARRAYSIZE determines the +** default # of elements in each numeric array. Ordinarily +** this is something you shouldn't fool with, though as +** with most of the constants here, it is adjustable. +*/ +#define NUMARRAYSIZE 8111L + + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of seconds requested */ + double sortspersec; /* # of sort iterations per sec */ + ushort numarrays; /* # of arrays */ + ulong arraysize; /* # of elements in array */ +} SortStruct; + +/**************** +** STRING SORT ** +***************** +** Note: The string sort benchmark uses the same structure to +** communicate parameters as does the numeric sort benchmark. +** (i.e., SortStruct...see above.) +*/ + +/* +** DEFINES +*/ +/* +** The following constant STRINGARRAYSIZE determines +** the default # of bytes allocated to each string array. +** Though the actual size can be pre-set from the command +** file, this constant should be left unchanged. +*/ +#define STRINGARRAYSIZE 8111L + +/************************ +** BITFIELD OPERATIONS ** +************************* +*/ + +/* +** DEFINES +*/ + +/* +** Following field sets the size of the bitfield array (in longs).
+*/ +#ifdef LONG64 +#define BITFARRAYSIZE 16384L +#else +#define BITFARRAYSIZE 32768L +#endif + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of seconds requested */ + double bitopspersec; /* # of bitfield ops per sec */ + ulong bitoparraysize; /* Total # of bitfield ops */ + ulong bitfieldarraysize; /* Bit field array size */ +} BitOpStruct; + +/**************************** +** EMULATED FLOATING POINT ** +****************************/ +/* +** DEFINES +*/ +#define INTERNAL_FPF_PRECISION 4 + +/* +** The following constant is the maximum number of loops +** of the emulated floating point test that the system +** will allow before flagging an error. This is not a +** critical constant, and can be altered if your system is +** a real barn-burner. +*/ +/*#define CPUEMFLOATLOOPMAX 50000L*/ +#define CPUEMFLOATLOOPMAX 500000L + +/* +** Set size of array +*/ +#define EMFARRAYSIZE 3000L + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of seconds requested */ + ulong arraysize; /* Size of array */ + ulong loops; /* Loops per iterations */ + double emflops; /* Results */ +} EmFloatStruct; + +/************************* +** FOURIER COEFFICIENTS ** +*************************/ + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* # of requested seconds */ + ulong arraysize; /* Size of coeff. arrays */ + double fflops; /* Results */ +} FourierStruct; + +/************************* +** ASSIGNMENT ALGORITHM ** +*************************/ + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong numarrays; /* # of arrays */ + double iterspersec; /* Results */ +} AssignStruct; + +/******************** +** IDEA ENCRYPTION ** +********************/ + +/* +** DEFINES +*/ +/* Following constant defines the max number of loops the +** system will attempt. 
Keeps things from going off into the +** weeds. */ +/*#define MAXIDEALOOPS 50000L*/ +#define MAXIDEALOOPS 500000L + +/* +** Following constant sets the size of the arrays. +** NOTE: For the IDEA algorithm to work properly, this +** number MUST be some multiple of 8. +*/ +#define IDEAARRAYSIZE 4000L + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong arraysize; /* Size of array */ + ulong loops; /* # of times to convert */ + double iterspersec; /* Results */ +} IDEAStruct; + + +/************************ +** HUFFMAN COMPRESSION ** +************************/ + +/* +** DEFINES +*/ +/* +** MAXHUFFLOOPS +** +** This constant specifies the maximum number of Huffman +** compression loops the system will try for. This keeps +** the test from going off into the weeds. This is not +** a critical constant, and can be increased if your +** system is a real barn-burner. +*/ +/*#define MAXHUFFLOOPS 50000L*/ +#define MAXHUFFLOOPS 500000L + +/* +** Following constant sets the size of the arrays to +** be compressed/uncompressed. +*/ +#define HUFFARRAYSIZE 5000L + +/* +** TYPEDEFS +*/ + +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong arraysize; /* Size of array */ + ulong loops; /* # of times to compress/decompress */ + double iterspersec; /* Results */ +} HuffStruct; + +/******************************** +** BACK PROPAGATION NEURAL NET ** +********************************/ + +/* +** MAXNNETLOOPS +** +** This constant sets the max number of loops through the neural +** net that the system will attempt before giving up. This +** is not a critical constant. You can alter it if your system +** has sufficient horsepower. 
+*/ +/*#define MAXNNETLOOPS 50000L*/ +#define MAXNNETLOOPS 500000L + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong loops; /* # of times to learn */ + double iterspersec; /* Results */ +} NNetStruct; + +/*********************** +** LU DECOMPOSITION ** +** (Linear Equations) ** +***********************/ + +/* +** MAXLUARRAYS +** +** This sets the upper limit on the number of arrays +** that the benchmark will attempt to build before +** flagging an error. It is not a critical constant, and +** may be increased if your system has the horsepower. +*/ +/*#define MAXLUARRAYS 1000*/ +#define MAXLUARRAYS 10000 + +/* +** TYPEDEFS +*/ +typedef struct { + int adjust; /* Set adjust code */ + ulong request_secs; /* Requested # of seconds */ + ulong numarrays; /* # of arrays */ + double iterspersec; /* Results */ +} LUStruct; + diff --git a/pointer.c b/pointer.c new file mode 100644 index 0000000..f4de577 --- /dev/null +++ b/pointer.c @@ -0,0 +1,6 @@ +#include +int main(){ + printf("%d",(int)sizeof(long)); + return(0); +} + diff --git a/sysinfo.c.example b/sysinfo.c.example new file mode 100644 index 0000000..db650f0 --- /dev/null +++ b/sysinfo.c.example @@ -0,0 +1,10 @@ +sprintf(buffer,"**System used for compilation:\n"); +output_string(buffer); +sprintf(buffer,"**Linux mimi 2.0.31 #5 Thu Oct 23 10:02:08 CDT 1997 i486\n"); +output_string(buffer); +sprintf(buffer,"**C compiler: gcc version 2.7.2.3\n"); +output_string(buffer); +sprintf(buffer,"**libc: libc.so.5.4.38\n"); +output_string(buffer); +sprintf(buffer,"**Date of compilation: Thu Nov 20 10:04:43 CST 1997\n"); +output_string(buffer); diff --git a/sysinfo.c.template b/sysinfo.c.template new file mode 100644 index 0000000..c1a986c --- /dev/null +++ b/sysinfo.c.template @@ -0,0 +1,10 @@ +sprintf(buffer,"**System used for compilation:\n"); +output_string(buffer); +sprintf(buffer,"**%SYSTEM%\n"); +output_string(buffer); +sprintf(buffer,"**C 
compiler: %CCVERSION%\n"); +output_string(buffer); +sprintf(buffer,"**libc: %LIBCVERSION%\n"); +output_string(buffer); +sprintf(buffer,"**Date of compilation: %DATE%\n"); +output_string(buffer); diff --git a/sysinfo.sh b/sysinfo.sh new file mode 100755 index 0000000..57754fe --- /dev/null +++ b/sysinfo.sh @@ -0,0 +1,78 @@ +#!/bin/sh + +# the arguments of this script are the compiler name and flags + +# try to solve a chicken-and-egg problem on SunOS +# ucb's test program does not handle -L like the other test programs +# let's try to find another implementation +if test -x /bin/test; then + TEST=/bin/test; +else + if test -x /usr/bin/test; then + TEST=/usr/bin/test; + else + # cross your fingers that it's not like ucb test + TEST=test; + fi +fi + +compiler=`echo $* | sed -e 's/-static//g' -e 's/-Bstatic//g'` +if $TEST `basename $1` = "gcc" && ($compiler -v) >/dev/null 2>&1 ; then +# Cygwin writes more than one line with "version" in it + gccversion=`$compiler -v 2>&1 | sed -e "/version/!d" | tail -n 1` +else + gccversion="$1" +fi + +libcversion="" +if ($* hello.c -o hello) >/dev/null 2>&1; then + ldd_output=`(ldd hello) 2>&1` + libcversion=`echo $ldd_output | sed -e 's/.*static.*/static/' \ + -e 's/.*not a dynamic.*/static/'` + if $TEST "$libcversion" = "static" ; then + if ($compiler hello.c -o hello) >/dev/null 2>&1; then + if (ldd hello) >/dev/null 2>/dev/null; then + libcversion=`(ldd hello) 2>&1` + libcversion=`echo $libcversion | sed -e '/libc/!d'\ + -e 's/^[ ]*//' \ + -e 's/.*=>[ ][ ]*\([^ ]*\).*/\1/'` + # remember the current directory + current=`pwd` + while $TEST -L "$libcversion" && ! 
$TEST "$libcversion" = "" ; do + libcitself=`basename $libcversion` + libpath=`echo $libcversion | sed -e "s/$libcitself$//"` + if $TEST -d "$libpath" ; then + cd $libpath + fi + if ls $libcitself >/dev/null 2>/dev/null ; then + libcversion=`ls -l $libcitself | \ + sed -e 's/.*->[ ][ ]*\(.*\)$/\1/'` + else + # something must have gone wrong, let's bail out + libcversion="" + fi + done + # return to the current directory + cd $current + fi + fi + else + libcversion="" + fi +fi + +rm -f sysinfo.crm sysinfoc.c hello + +# this bombs out on Ultrix which expect "cut -d" + +compsystem=`uname -a | cut -b 1-78` +compdate=`date|cut -b1-55` + +# let's hope that ctrl-c is not part of any string here +# this also will barf later if " is in any of the strings + +for i in sysinfo.c sysinfoc.c ; do + sed -e "s%CCVERSION%$gccversion" -e "s%LIBCVERSION%$libcversion"\ + -e "s%SYSTEM%$compsystem" -e "s%DATE%$compdate"\ + ${i}.template > $i +done diff --git a/sysinfoc.c.example b/sysinfoc.c.example new file mode 100644 index 0000000..7da71ac --- /dev/null +++ b/sysinfoc.c.example @@ -0,0 +1,4 @@ +sprintf(buffer,"C compiler : gcc version 2.7.2.3\n"); +output_string(buffer); +sprintf(buffer,"libc : libc.so.5.4.38\n"); +output_string(buffer); diff --git a/sysinfoc.c.template b/sysinfoc.c.template new file mode 100644 index 0000000..922a5de --- /dev/null +++ b/sysinfoc.c.template @@ -0,0 +1,4 @@ +sprintf(buffer,"C compiler : %CCVERSION%\n"); +output_string(buffer); +sprintf(buffer,"libc : %LIBCVERSION%\n"); +output_string(buffer); diff --git a/sysspec.c b/sysspec.c new file mode 100644 index 0000000..a97010d --- /dev/null +++ b/sysspec.c @@ -0,0 +1,884 @@ + +/* +** sysspec.c +** System-specific routines. +** +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95;10/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. 
+** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. +*/ + +/*********************************** +** SYSTEM-SPECIFIC ROUTINES ** +************************************ +** +** These are the routines that provide functions that are +** system-specific. If the benchmarks are to be ported +** to new hardware/new O.S., this is the first place to +** start. +*/ +#include "sysspec.h" + +#ifdef DOS16 +#include +#include +#include +#endif +/********************************* +** MEMORY MANAGEMENT ROUTINES ** +*********************************/ + + +/**************************** +** AllocateMemory +** This routine returns a void pointer to a memory +** block. The size of the memory block is given in bytes +** as the first argument. This routine also returns an +** error code in the second argument. +** 10/95 Update: +** Added an associative array for memory alignment reasons. +** mem_array[2][MEM_ARRAY_SIZE] +** mem_array[0][n] = Actual address (from malloc) +** mem_array[1][n] = Aligned address +** Currently, mem_array[][] is only used if you use malloc; +** it is not used for the 16-bit DOS and MAC versions. +*/ +farvoid *AllocateMemory(unsigned long nbytes, /* # of bytes to alloc */ + int *errorcode) /* Returned error code */ +{ +#ifdef DOS16MEM +union REGS registers; +unsigned short nparas; /* # of paragraphs */ + +/* +** Set # of paragraphs to nbytes/16 +1. The +1 is a +** slop factor. 
+*/ +nparas=(unsigned short)(nbytes/16L) + 1; + +/* +** Set incoming registers. +*/ +registers.h.ah=0x48; /* Allocate memory */ +registers.x.bx=nparas; /* # of paragraphs */ + + +intdos(&registers,&registers); /* Call DOS */ + +/* +** See if things succeeded. +*/ +if(registers.x.cflag) +{ printf("error: %d Lgst: %d\n",registers.x.ax,registers.x.bx); + *errorcode=ERROR_MEMORY; + return((farvoid *)NULL); +} + +/* +** Create a farvoid pointer to return. +*/ +*errorcode=0; +return((farvoid *)MK_FP(registers.x.ax,0)); + +#endif + +#ifdef MACMEM +/* +** For MAC CodeWarrior, we'll use the MacOS NewPtr call +*/ +farvoid *returnval; +returnval=(farvoid *)NewPtr((Size)nbytes); +if(returnval==(farvoid *)NULL) + *errorcode=ERROR_MEMORY; +else + *errorcode=0; +return(returnval); +#endif + +#ifdef MALLOCMEM +/* +** Everyone else, it's pretty straightforward, given +** that you use a 32-bit compiler which treats size_t as +** a 4-byte entity. +*/ +farvoid *returnval; /* Return value */ +ulong true_addr; /* True address */ +ulong adj_addr; /* Adjusted address */ + +returnval=(farvoid *)malloc((size_t)(nbytes+2L*(long)global_align)); +if(returnval==(farvoid *)NULL) + *errorcode=ERROR_MEMORY; +else + *errorcode=0; + +/* +** Check for alignment +*/ +adj_addr=true_addr=(ulong)returnval; +if(global_align==0) +{ + if(AddMemArray(true_addr, adj_addr)) + *errorcode=ERROR_MEMARRAY_FULL; + return(returnval); +} + +if(global_align==1) +{ + if(true_addr%2==0) adj_addr++; +} +else +{ + while(adj_addr%global_align!=0) ++adj_addr; + if(adj_addr%(global_align*2)==0) adj_addr+=global_align; +} +returnval=(void *)adj_addr; +if(AddMemArray(true_addr,adj_addr)) + *errorcode=ERROR_MEMARRAY_FULL; +return(returnval); +#endif + +} + + +/**************************** +** FreeMemory +** This is the reverse of AllocateMemory. The memory +** block passed in is freed. Should an error occur, +** that error is returned in errorcode.
+*/ +void FreeMemory(farvoid *mempointer, /* Pointer to memory block */ + int *errorcode) +{ +#ifdef DOS16MEM +/* +** 16-bit DOS VERSION!! +*/ +unsigned int segment; +unsigned int offset; +union REGS registers; +struct SREGS sregisters; + +/* +** First get the segment/offset of the farvoid pointer. +*/ +segment=FP_SEG(mempointer); +offset=FP_OFF(mempointer); + +/* +** Align the segment properly. For as long as offset > 16, +** subtract 16 from offset and add 1 to segment. +*/ +while(offset>=16) +{ offset-=16; + segment++; +} + +/* +** Build the call to DOS +*/ +registers.h.ah=0x49; /* Free memory */ +sregisters.es=segment; + +intdosx(®isters,®isters,&sregisters); + +/* +** Check for error +*/ +if(registers.x.cflag) +{ *errorcode=ERROR_MEMORY; + return; +} + +*errorcode=0; +return; +#endif + +#ifdef MACMEM +DisposPtr((Ptr)mempointer); +*errorcode=0; +return; +#endif + +#ifdef MALLOCMEM +ulong adj_addr, true_addr; + +/* Locate item in memory array */ +adj_addr=(ulong)mempointer; +if(RemoveMemArray(adj_addr, &true_addr)) +{ *errorcode=ERROR_MEMARRAY_NFOUND; + return; +} +mempointer=(void *)true_addr; +free(mempointer); +*errorcode=0; +return; +#endif +} + +/**************************** +** MoveMemory +** Moves n bytes from a to b. Handles overlap. +** In most cases, this is just a memmove operation. +** But, not in DOS....noooo.... +*/ +void MoveMemory( farvoid *destination, /* Destination address */ + farvoid *source, /* Source address */ + unsigned long nbytes) +{ + +/* +++16-bit DOS VERSION+++ */ +#ifdef DOS16MEM + + FarDOSmemmove( destination, source, nbytes); + +#else + +memmove(destination, source, nbytes); + +#endif +} + +#ifdef DOS16MEM + +/**************************** +** FarDOSmemmove +** Performs the same function as memmove for DOS when +** the arrays are defined with far pointers. 
+*/ +void FarDOSmemmove(farvoid *destination, /* Destination pointer */ + farvoid *source, /* Source pointer */ + unsigned long nbytes) /* # of bytes to move */ +{ +unsigned char huge *uchsource; /* Temp source */ +unsigned char huge *uchdest; /* Temp destination */ +unsigned long saddr; /* Source "true" address */ +unsigned long daddr; /* Destination "true" address */ + + +/* +** Get unsigned char pointer equivalents +*/ +uchsource=(unsigned char huge *)source; +uchdest=(unsigned char huge *)destination; + +/* +** Calculate true address of source and destination and +** compare. +*/ +saddr=(unsigned long)(FP_SEG(source)*16 + FP_OFF(source)); +daddr=(unsigned long)(FP_SEG(destination)*16 + FP_OFF(destination)); + +if(saddr > daddr) +{ + /* + ** Source is greater than destination. + ** Use a series of standard move operations. + ** We'll move 65535 bytes at a time. + */ + while(nbytes>=65535L) + { _fmemmove((farvoid *)uchdest, + (farvoid *)uchsource, + (size_t) 65535); + uchsource+=65535; /* Advance pointers */ + uchdest+=65535; + nbytes-=65535; + } + + /* + ** Move remaining bytes + */ + if(nbytes!=0L) + _fmemmove((farvoid *)uchdest, + (farvoid *)uchsource, + (size_t)(nbytes & 0xFFFF)); + +} +else +{ + /* + ** Destination is greater than source. + ** Advance pointers to the end of their + ** respective blocks. + */ + uchsource+=nbytes; + uchdest+=nbytes; + + /* + ** Again, move 65535 bytes at a time. However, + ** "back" the pointers up before doing the + ** move. + */ + while(nbytes>=65535L) + { + uchsource-=65535; + uchdest-=65535; + _fmemmove((farvoid *)uchdest, + (farvoid *)uchsource, + (size_t) 65535); + nbytes-=65535; + } + + /* + ** Move remaining bytes. 
+ */ + if(nbytes!=0L) + { uchsource-=nbytes; + uchdest-=nbytes; + _fmemmove((farvoid *)uchdest, + (farvoid *)uchsource, + (size_t)(nbytes & 0xFFFF)); + } +} +return; +} +#endif + +/*********************************** +** MEMORY ARRAY HANDLING ROUTINES ** +***********************************/ +/**************************** +** InitMemArray +** Initialize the memory array. This simply amounts to +** setting mem_array_ents to zero, indicating that there +** isn't anything in the memory array. +*/ +void InitMemArray(void) +{ +mem_array_ents=0; +return; +} + +/*************************** +** AddMemArray +** Add a pair of items to the memory array. +** true_addr is the true address (mem_array[0][n]) +** adj_addr is the adjusted address (mem_array[0][n]) +** Returns 0 if ok +** -1 if not enough room +*/ +int AddMemArray(ulong true_addr, + ulong adj_addr) +{ +if(mem_array_ents>=MEM_ARRAY_SIZE) + return(-1); + +mem_array[0][mem_array_ents]=true_addr; +mem_array[1][mem_array_ents]=adj_addr; +mem_array_ents++; +return(0); +} + +/************************* +** RemoveMemArray +** Given an adjusted address value (mem_array[1][n]), locate +** the entry and remove it from the mem_array. +** Also returns the associated true address. +** Returns 0 if ok +** -1 if not found. +*/ +int RemoveMemArray(ulong adj_addr,ulong *true_addr) +{ +int i,j; + +/* Locate the item in the array. 
*/ +for(i=0;i +#include +#include +#include + +#include "nmglobal.h" + +#if !defined(MAC) && !defined(OSX) +#include +#endif + + +/* +** System-specific includes +*/ + +#ifdef DOS16MEM +#include "dos.h" +#endif + +/* #include "time.h" +#include "io.h" +#include "fcntl.h" +#include "sys\stat.h" */ +/* Removed for MSVC++ +#include "alloc.h" +*/ + +/* +** MAC Time Manager routines (from Code Warrior) +*/ +#ifdef MACTIMEMGR +#include +#include +#include +#include +extern struct TMTask myTMTask; +extern long MacHSTdelay,MacHSTohead; +#endif + +/* +** Windows 3.1 timer defines +*/ +#ifdef WIN31TIMER +#include +#include +TIMERINFO win31tinfo; +HANDLE hThlp; +FARPROC lpfn; +#endif + +/************** +** EXTERNALS ** +**************/ +extern ulong mem_array[2][MEM_ARRAY_SIZE]; +extern int mem_array_ents; +extern int global_align; + +/**************************** +** FUNCTION PROTOTYPES ** +****************************/ + +farvoid *AllocateMemory(unsigned long nbytes, + int *errorcode); + +void FreeMemory(farvoid *mempointer, + int *errorcode); + +void MoveMemory( farvoid *destination, + farvoid *source, + unsigned long nbytes); + +#ifdef DOS16MEM +void FarDOSmemmove(farvoid *destination, + farvoid *source, + unsigned long nbytes); +#endif + +void InitMemArray(void); + +int AddMemArray(ulong true_addr, ulong adj_addr); + +int RemoveMemArray(ulong adj_addr,ulong *true_addr); + +void ReportError(char *context, int errorcode); + +void ErrorExit(); + +void CreateFile(char *filename, + int *errorcode); + +#ifdef DOS16 +int bmOpenFile(char *fname, + int *errorcode); + +void CloseFile(int fhandle, + int *errorcode); + +void readfile(int fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); + +void writefile(int fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); +#endif + +#ifdef LINUX +FILE *bmOpenFile(char *fname, + int *errorcode); + +void CloseFile(FILE *fhandle, + int *errorcode); + +void 
readfile(FILE *fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); + +void writefile(FILE *fhandle, + unsigned long offset, + unsigned long nbytes, + void *buffer, + int *errorcode); + +#endif + +unsigned long StartStopwatch(); + +unsigned long StopStopwatch(unsigned long startticks); + +unsigned long TicksToSecs(unsigned long tickamount); + +double TicksToFracSecs(unsigned long tickamount); + diff --git a/wordcat.h b/wordcat.h new file mode 100644 index 0000000..9f18b42 --- /dev/null +++ b/wordcat.h @@ -0,0 +1,81 @@ +/* +** wordcat.h +** Word catalog +** BYTEmark (tm) +** BYTE's Native Mode Benchmarks +** Rick Grehan, BYTE Magazine +** +** Creation: +** Revision: 3/95 +** +** DISCLAIMER +** The source, executable, and documentation files that comprise +** the BYTEmark benchmarks are made available on an "as is" basis. +** This means that we at BYTE Magazine have made every reasonable +** effort to verify that the there are no errors in the source and +** executable code. We cannot, however, guarantee that the programs +** are error-free. Consequently, McGraw-HIll and BYTE Magazine make +** no claims in regard to the fitness of the source code, executable +** code, and documentation of the BYTEmark. +** Furthermore, BYTE Magazine, McGraw-Hill, and all employees +** of McGraw-Hill cannot be held responsible for any damages resulting +** from the use of this code or the results obtained from using +** this code. 
+*/ + +/* +** Word catalog +** Table of WORDCATSIZE (50) words; note that "that" is listed twice. +** NOTE(review): the array is defined (not merely declared) in this +** header, so presumably only one source file includes it -- confirm. +*/ +#define WORDCATSIZE 50 + +char *wordcatarray[WORDCATSIZE] = +{ "Hello", + "He", + "Him", + "the", + "this", + "that", + "though", + "rough", + "cough", + "obviously", + "But", + "but", + "bye", + "begin", + "beginning", + "beginnings", + "of", + "our", + "ourselves", + "yourselves", + "to", + "together", + "togetherness", + "from", + "either", + "I", + "A", + "return", + "However", + "that", + "example", + "yet", + "quickly", + "all", + "if", + "were", + "includes", + "always", + "never", + "not", + "small", + "returns", + "set", + "basic", + "Entered", + "with", + "used", + "shown", + "you", + "know" }; -- cgit v1.2.3