123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501 |
- /*
- * mhz.c - calculate clock rate and megahertz
- *
- * Usage: mhz [-c]
- *
- *******************************************************************
- *
- * Caveat emptor and other warnings
- *
- * This code must be compiled using the optimizer! If you don't
- * compile this using the optimizer, then many compilers don't
- * make good use of the registers and your inner loops end up
- * using stack variables, which is SLOW.
- *
- * Also, it is sensitive to other processor load. When running
- * mhz with "rtprio" (real-time priority), I have never had mhz
- * make a mistake on my machine. At other times mhz has been
- * wrong about 10% of the time.
- *
- * If there is too much noise/error in the data, then this program
- * will usually return a clock speed that is too high.
- *
- *******************************************************************
- *
- * Constraints
- *
- * mhz.c is meant to be platform independent ANSI/C code, and it
- * has as little platform dependent code as possible.
- *
- * This version of mhz is designed to eliminate the variable
- * instruction counts used by different compilers on different
- * architectures and instruction sets. It is also structured to
- * be tightly interlocked so processors with super-scalar elements
- * or dynamic instructure reorder buffers cannot overlap the
- * execution of the expressions.
- *
- * We have to try and make sure that the code in the various
- * inner loops does not fall out of the on-chip instruction cache
- * and that the inner loop variables fit inside the register set.
- * The i386 only has six addressable registers, so we had to make
- * sure that the inner loop procedures had fewer variables so they
- * would not spill onto the stack.
- *
- *******************************************************************
- *
- * Algorithm
- *
- * We can compute the CPU cycle time if we can get the compiler
- * to generate (at least) two instruction sequences inside loops
- * where the inner loop instruction counts are relatively prime.
- * We have several different loops to increase the chance that
- * two of them will be relatively prime on any given architecture.
- *
- * This technique makes no assumptions about the cost of any single
- * instruction or the number of instructions used to implement a
- * given expression. We just hope that the compiler gets at least
- * two inner loop instruction sequences with lengths that are
- * relatively prime. The "relatively prime" makes the greatest
- * common divisor method work. If all the instructions sequences
- * have a common factor (e.g. 2), then the apparent CPU speed will
- * be off by that common factor. Also, if there is too much
- * variability in the data so there is no apparent least common
- * multiple within the error bounds set in multiple_approx, then
- * we simply return the maximum clock rate found in the loops.
- *
- * The processor's clock speed is the greatest common divisor
- * of the execution frequencies of the various loops. For
- * example, suppose we are trying to compute the clock speed
- * for a 120Mhz processor, and we have two loops:
- * SHR --- two cycles to shift right
- * SHR;ADD --- three cycles to SHR and add
- * then the expression duration will be:
- * SHR 11.1ns (2 cycles/SHR)
- * SHR;ADD 16.6ns (3 cycles/SHR;ADD)
- * so the greatest common divisor is 5.55ns and the clock speed
- * is 120Mhz. Aside from extraneous variability added by poor
- * benchmarking hygiene, this method should always work when we
- * are able to get loops with cycle counts that are relatively
- * prime.
- *
- * Suppose we are unlucky, and we have our two loops do
- * not have relatively prime instruction counts. Suppose
- * our two loops are:
- * SHR 11.1ns (2 cycles/SHR)
- * SHR;ADD;SUB 22.2ns (4 cycles/SHR;ADD;SUB)
- * then the greatest common divisor will be 11.1ns, so the clock
- * speed will appear to be 60Mhz.
- *
- * The loops provided so far should have at least two relatively
- * prime loops on nearly all architectures.
- *
- *******************************************************************
- *
- * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with
- * additional restriction that results may published only if
- * (1) the benchmark is unmodified, and
- * (2) the version in the sccsid below is included in the report.
- * Support for this development by Silicon Graphics is gratefully acknowledged.
- * Support for this development by Hewlett Packard is gratefully acknowledged.
- * Support for this development by Sun Microsystems is gratefully acknowledged.
- *
- *******************************************************************
- */
- char *id = "$Id$\n";
- #include "bench.h"
- #include <math.h>
- typedef long TYPE;
- #define TEN(A) A A A A A A A A A A
- #define HUNDRED(A) TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) \
- TEN(A) TEN(A) TEN(A) TEN(A) TEN(A)
- #define MHZ(M, contents) \
- char* \
- name_##M() \
- { \
- return #contents; \
- } \
- \
- TYPE** \
- _mhz_##M (register long n, register TYPE **p, \
- register TYPE a, register TYPE b) \
- { \
- for (; n > 0; --n) { \
- HUNDRED(contents) \
- } \
- return p + a + b; \
- } \
- \
- void \
- mhz_##M(int enough) \
- { \
- TYPE __i = 1; \
- TYPE *__x=(TYPE *)&__x, **__p=(TYPE **)__x, **__q = NULL; \
- _mhz_##M(1, __p, 1, 1); \
- BENCH1(__q = _mhz_##M(__n, __p, __i, __i); __n = 1;, enough) \
- use_pointer((void*)__q); \
- save_n(100 * get_n()); /* # of expressions executed */ \
- }
- MHZ(1, p=(TYPE**)*p;)
- MHZ(2, a^=a+a;)
- MHZ(3, a^=a+a+a;)
- MHZ(4, a>>=b;)
- MHZ(5, a>>=a+a;)
- MHZ(6, a^=a<<b;)
- MHZ(7, a^=a+b;)
- MHZ(8, a+=(a+b)&07;)
- MHZ(9, a++;a^=1;a<<=1;)
- typedef void (*loop_f)(int);
- loop_f loops[] = {
- mhz_1,
- mhz_2,
- mhz_3,
- mhz_4,
- mhz_5,
- mhz_6,
- mhz_7,
- mhz_8,
- mhz_9,
- };
- #define NTESTS (sizeof(loops) / sizeof(loop_f))
- #define BIT_SET(A,bit) ((A) & 1 << (bit))
- /*
- * This is used to filter out bad points (mostly ones that have had
- * their inner loop optimized away). Bad points are those with values
- * less than 1/20th of the median value and more than 20 times the
- * median value.
- *
- * filter_data returns the number of valid data points, and puts the
- * valid points in the lower part of the values[] array.
- */
- int
- filter_data(double values[], int size)
- {
- int i;
- int tests;
- double median;
- double *d = (double *)malloc(size * sizeof(double));
- for (i = 0; i < size; ++i) d[i] = values[i];
- qsort(d, size, sizeof(double), double_compare);
- median = d[size/2];
- if (size > 0 && size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0;
- free(d);
- /* if the data point is inside the envelope of acceptable
- * results, then keep it, otherwise discard it
- */
- for (i = 0, tests = 0; i < size; ++i)
- if (0.05 * median < values[i] && values[i] < 20.0 * median) {
- if (i > tests) values[tests] = values[i];
- tests++;
- }
- return tests;
- }
- /*
- * make sure that there are enough points with significantly
- * different data values (greater than 5% difference) in the
- * data subset.
- */
- int
- classes(double values[], int size)
- {
- int i;
- double median;
- double *d = (double *)malloc(size * sizeof(double));
- int classid;
- for (i = 0; i < size; ++i) d[i] = values[i];
- qsort(d, size, sizeof(double), double_compare);
- median = d[size/2];
- if (size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0;
- /* if the difference is less than 1/20th of the median, then
- * we assume that the two points are the same
- */
- for (i = 1, classid = 1; i < size; ++i)
- if ((d[i] - d[i-1]) > 0.05 * median) classid++;
- free(d);
- return classid;
- }
- /*
- * mode
- *
- * return the most common value (within 1MHz)
- */
- int
- mode(double values[], int n)
- {
- int i, n_mode, n_curr;
- int mode, curr;
- qsort(values, n, sizeof(double), double_compare);
- n_mode = 1;
- n_curr = 1;
- mode = (int)(values[0] + 0.5);
- curr = (int)(values[0] + 0.5);
- for (i = 1; i < n; ++i) {
- int v = (int)(values[i] + 0.5);
- if (curr != v) {
- curr = v;
- n_curr = 0;
- }
- n_curr++;
- if (n_curr > n_mode) {
- mode = curr;
- n_mode = n_curr;
- }
- }
- return mode;
- }
- /*
- * cross_values
- *
- * This routine will create new data points by subtracting pairs
- * of data points.
- */
- void
- cross_values(double values[], int size, double **cvalues, int *csize)
- {
- int i, j;
- *cvalues = (double *)malloc(size * size * sizeof(double));
- *csize = 0;
- for (i = 0; i < size; ++i) {
- (*cvalues)[(*csize)++] = values[i];
- /* create new points with the differences */
- for (j = i + 1; j < size; ++j) {
- (*cvalues)[(*csize)++] = ABS(values[i] - values[j]);
- }
- }
- }
- /*
- * gcd
- *
- * return the greatest common divisor of the passed values (within a
- * margin of error because these are experimental results, not
- * theoretical numbers). We do this by guessing how many instructions
- * are in each loop, and then trying to fit a straight line through
- * the (instruction count, time) points. The regression is of the
- * form:
- *
- * y = a + b * x
- *
- * The time for an individual instruction is "b", while "a" should
- * be 0. The trick is to figure out which guess is the right one!
- *
- * We assume that the gcd is the first value at which we have
- * significantly improved regression fit (as measured by chi2).
- *
- * We increase the number of experimental points (and generate
- * more small points) by adding points for the differences between
- * measured values (and compute the standard error appropriately).
- *
- * We want the regression line to go through the origin, so we
- * add an artificial point at (0,0) with a tiny standard error.
- */
- double
- gcd(double values[], int size)
- {
- /* assumption: shortest inner loop has no more than this many instructions */
- #define MAX_COUNT 6
- int i, n, count;
- double min, result, min_chi2 = 0.0, a, b, sig_a, sig_b, chi2;
- double *y, *x = (double *)malloc(size * size * sizeof(double));
- /* find the smallest value */
- result = min = double_min(values, size);
- /* create new points by subtracting each pair of values */
- cross_values(values, size, &y, &n);
- /* make sure the regression goes through the origin */
- y[n++] = 0.0;
- for (count = 1; count < MAX_COUNT; ++count) {
- /*
- * given the minimum loop has "count" instructions,
- * guess how many instructions each other loop contains
- */
- for (i = 0; i < n; ++i) {
- int m = (int)((double)count * y[i] / min + 0.5);
- x[i] = (double)m;
- }
- /* find the regression of the samples */
- regression(x, y, NULL, n, &a, &b, &sig_a, &sig_b, &chi2);
- if (count == 1 || count * count * chi2 < min_chi2) {
- result = b;
- min_chi2 = chi2;
- }
- }
- free(x);
- free(y);
- return result;
- }
- /*
- * compute the gcd of many possible combinations of experimental values
- * and return the mode of the results to reduce the impact
- * of a few bad experimental measurements on the computed result.
- *
- * r - pointer to the array of experimental results
- * off - offset of the result we want. TRIES-1 == minimum result.
- */
- int
- compute_mhz(result_t *r)
- {
- int i, j, mhz[2], n, subset, ntests;
- double data[NTESTS], results[1<<NTESTS];
- for (i = 0; i < 2; ++i) {
- for (subset = 0, ntests = 0; subset < (1<<NTESTS); ++subset) {
- for (j = 0, n = 0; j < NTESTS; ++j)
- if (BIT_SET(subset, j) && r[j].N > TRIES/2)
- data[n++] = r[j].u[r[j].N-1-i] / (double)r[j].n[r[j].N-1-i];
- if (n < 2
- || (n = filter_data(data, n)) < 2
- || classes(data, n) < 2)
- continue;
- results[ntests++] = 1.0 / gcd(data, n);
- }
- mhz[i] = mode(results, ntests);
- }
- /* if the results agree within 1% or 1MHz, accept them */
- if (ABS(mhz[0] - mhz[1]) / (double)mhz[0] <= 0.01
- || ABS(mhz[0] - mhz[1]) <= 1)
- return mhz[0];
- return -1;
- }
- void
- save_data(result_t* data, result_t* data_save)
- {
- int i;
- for (i = 0; i < NTESTS; ++i) {
- data_save[i] = data[i];
- }
- }
- void
- print_data(double mhz, result_t* data)
- {
- int i, j;
- char *CPU_name = "CPU";
- char *uname = "uname";
- char *email = "email";
- int speed = -1;
- char *names[NTESTS];
- names[0] = name_1();
- names[1] = name_2();
- names[2] = name_3();
- names[3] = name_4();
- names[4] = name_5();
- names[5] = name_6();
- names[6] = name_7();
- names[7] = name_8();
- names[8] = name_9();
- printf("/* \"%s\", \"%s\", \"%s\", %d, %.0f, %d, %f, %f */\n",
- CPU_name, uname, email, speed,
- mhz, get_enough(0), l_overhead(), t_overhead());
- printf("result_t* data[] = { \n");
- for (i = 0; i < NTESTS; ++i) {
- printf("\t/* %s */ { %d, {", names[i], data[i].N);
- for (j = 0; j < data[i].N; ++j) {
- printf("\n\t\t{ /* %f */ %lu, %lu}", data[i].u[j] / (100. * data[i].n[j]), (unsigned long)data[i].u[j], (unsigned long)data[i].n[j]);
- if (j < TRIES - 1) printf(", ");
- }
- if (i < NTESTS - 1) printf("}},\n");
- else printf("}}\n");
- }
- printf("};\n");
- }
- int
- main(int ac, char **av)
- {
- int i, j, k, mhz = -1;
- double runtime;
- result_t data[NTESTS];
- result_t data_save[NTESTS];
- putenv("LOOP_O=0.0"); /* should be at most 1% */
- runtime = (NTESTS * TRIES * 3 * get_enough(0)) / 1000000.;
- if (runtime > 3.) {
- fprintf(stderr, "mhz: should take approximately %.0f seconds\n", runtime);
- }
- /* make three efforts to get reliable data */
- for (i = 0; i < 3 && mhz < 0; ++i) {
- /* initialize the data arrays */
- for (j = 0; j < NTESTS; ++j)
- insertinit(&data[j]);
- /*
- * collect the data; try to minimize impact of activity bursts
- * by putting NTESTS in the inner loop so a burst will affect
- * one data point for all expressions first, rather than all
- * data points for one expression.
- */
- for (j = 0; j < TRIES; ++j) {
- for (k = 0; k < NTESTS; ++k) {
- (*loops[k])(0);
- insertsort(gettime(), get_n(), &data[k]);
- }
- }
- save_data(data, data_save);
- mhz = compute_mhz(data);
- }
- if (ac > 1 && !strcmp(av[1], "-d")) {
- if (ac > 1) {
- ac --;
- for (i = 1; i < ac; ++i) {
- av[i] = av[i+1];
- }
- }
- print_data(mhz, data_save);
- }
- if (mhz < 0.) {
- printf("-1 System too busy\n");
- exit(1);
- }
- if (ac == 2 && !strcmp(av[1], "-c")) {
- printf("%.4f\n", 1000. / (double)mhz);
- } else {
- printf("%d MHz, %.2f nanosec clock\n",
- mhz, 1000. / (double)mhz);
- }
- exit(0);
- }
|