mhz.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. /*
  2. * mhz.c - calculate clock rate and megahertz
  3. *
  4. * Usage: mhz [-c]
  5. *
  6. *******************************************************************
  7. *
  8. * Caveat emptor and other warnings
  9. *
  10. * This code must be compiled using the optimizer! If you don't
  11. * compile this using the optimizer, then many compilers don't
  12. * make good use of the registers and your inner loops end up
  13. * using stack variables, which is SLOW.
  14. *
  15. * Also, it is sensitive to other processor load. When running
  16. * mhz with "rtprio" (real-time priority), I have never had mhz
  17. * make a mistake on my machine. At other times mhz has been
  18. * wrong about 10% of the time.
  19. *
  20. * If there is too much noise/error in the data, then this program
  21. * will usually return a clock speed that is too high.
  22. *
  23. *******************************************************************
  24. *
  25. * Constraints
  26. *
  27. * mhz.c is meant to be platform independent ANSI/C code, and it
  28. * has as little platform dependent code as possible.
  29. *
  30. * This version of mhz is designed to eliminate the variable
  31. * instruction counts used by different compilers on different
  32. * architectures and instruction sets. It is also structured to
  33. * be tightly interlocked so processors with super-scalar elements
  34. * or dynamic instructure reorder buffers cannot overlap the
  35. * execution of the expressions.
  36. *
  37. * We have to try and make sure that the code in the various
  38. * inner loops does not fall out of the on-chip instruction cache
  39. * and that the inner loop variables fit inside the register set.
  40. * The i386 only has six addressable registers, so we had to make
  41. * sure that the inner loop procedures had fewer variables so they
  42. * would not spill onto the stack.
  43. *
  44. *******************************************************************
  45. *
  46. * Algorithm
  47. *
  48. * We can compute the CPU cycle time if we can get the compiler
  49. * to generate (at least) two instruction sequences inside loops
  50. * where the inner loop instruction counts are relatively prime.
  51. * We have several different loops to increase the chance that
  52. * two of them will be relatively prime on any given architecture.
  53. *
  54. * This technique makes no assumptions about the cost of any single
  55. * instruction or the number of instructions used to implement a
  56. * given expression. We just hope that the compiler gets at least
  57. * two inner loop instruction sequences with lengths that are
  58. * relatively prime. The "relatively prime" makes the greatest
  59. * common divisor method work. If all the instructions sequences
  60. * have a common factor (e.g. 2), then the apparent CPU speed will
  61. * be off by that common factor. Also, if there is too much
  62. * variability in the data so there is no apparent least common
  63. * multiple within the error bounds set in multiple_approx, then
  64. * we simply return the maximum clock rate found in the loops.
  65. *
  66. * The processor's clock speed is the greatest common divisor
  67. * of the execution frequencies of the various loops. For
  68. * example, suppose we are trying to compute the clock speed
  69. * for a 120Mhz processor, and we have two loops:
  70. * SHR --- two cycles to shift right
  71. * SHR;ADD --- three cycles to SHR and add
  72. * then the expression duration will be:
  73. * SHR 11.1ns (2 cycles/SHR)
  74. * SHR;ADD 16.6ns (3 cycles/SHR;ADD)
  75. * so the greatest common divisor is 5.55ns and the clock speed
  76. * is 120Mhz. Aside from extraneous variability added by poor
  77. * benchmarking hygiene, this method should always work when we
  78. * are able to get loops with cycle counts that are relatively
  79. * prime.
  80. *
  81. * Suppose we are unlucky, and we have our two loops do
  82. * not have relatively prime instruction counts. Suppose
  83. * our two loops are:
  84. * SHR 11.1ns (2 cycles/SHR)
  85. * SHR;ADD;SUB 22.2ns (4 cycles/SHR;ADD;SUB)
  86. * then the greatest common divisor will be 11.1ns, so the clock
  87. * speed will appear to be 60Mhz.
  88. *
  89. * The loops provided so far should have at least two relatively
  90. * prime loops on nearly all architectures.
  91. *
  92. *******************************************************************
  93. *
  94. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with
  95. * additional restriction that results may published only if
  96. * (1) the benchmark is unmodified, and
  97. * (2) the version in the sccsid below is included in the report.
  98. * Support for this development by Silicon Graphics is gratefully acknowledged.
  99. * Support for this development by Hewlett Packard is gratefully acknowledged.
  100. * Support for this development by Sun Microsystems is gratefully acknowledged.
  101. *
  102. *******************************************************************
  103. */
  104. char *id = "$Id$\n";
  105. #include "bench.h"
  106. #include <math.h>
  107. typedef long TYPE;
  108. #define TEN(A) A A A A A A A A A A
  109. #define HUNDRED(A) TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) \
  110. TEN(A) TEN(A) TEN(A) TEN(A) TEN(A)
  111. #define MHZ(M, contents) \
  112. char* \
  113. name_##M() \
  114. { \
  115. return #contents; \
  116. } \
  117. \
  118. TYPE** \
  119. _mhz_##M (register long n, register TYPE **p, \
  120. register TYPE a, register TYPE b) \
  121. { \
  122. for (; n > 0; --n) { \
  123. HUNDRED(contents) \
  124. } \
  125. return p + a + b; \
  126. } \
  127. \
  128. void \
  129. mhz_##M(int enough) \
  130. { \
  131. TYPE __i = 1; \
  132. TYPE *__x=(TYPE *)&__x, **__p=(TYPE **)__x, **__q = NULL; \
  133. _mhz_##M(1, __p, 1, 1); \
  134. BENCH1(__q = _mhz_##M(__n, __p, __i, __i); __n = 1;, enough) \
  135. use_pointer((void*)__q); \
  136. save_n(100 * get_n()); /* # of expressions executed */ \
  137. }
  138. MHZ(1, p=(TYPE**)*p;)
  139. MHZ(2, a^=a+a;)
  140. MHZ(3, a^=a+a+a;)
  141. MHZ(4, a>>=b;)
  142. MHZ(5, a>>=a+a;)
  143. MHZ(6, a^=a<<b;)
  144. MHZ(7, a^=a+b;)
  145. MHZ(8, a+=(a+b)&07;)
  146. MHZ(9, a++;a^=1;a<<=1;)
  147. typedef void (*loop_f)(int);
  148. loop_f loops[] = {
  149. mhz_1,
  150. mhz_2,
  151. mhz_3,
  152. mhz_4,
  153. mhz_5,
  154. mhz_6,
  155. mhz_7,
  156. mhz_8,
  157. mhz_9,
  158. };
  159. #define NTESTS (sizeof(loops) / sizeof(loop_f))
  160. #define BIT_SET(A,bit) ((A) & 1 << (bit))
  161. /*
  162. * This is used to filter out bad points (mostly ones that have had
  163. * their inner loop optimized away). Bad points are those with values
  164. * less than 1/20th of the median value and more than 20 times the
  165. * median value.
  166. *
  167. * filter_data returns the number of valid data points, and puts the
  168. * valid points in the lower part of the values[] array.
  169. */
  170. int
  171. filter_data(double values[], int size)
  172. {
  173. int i;
  174. int tests;
  175. double median;
  176. double *d = (double *)malloc(size * sizeof(double));
  177. for (i = 0; i < size; ++i) d[i] = values[i];
  178. qsort(d, size, sizeof(double), double_compare);
  179. median = d[size/2];
  180. if (size > 0 && size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0;
  181. free(d);
  182. /* if the data point is inside the envelope of acceptable
  183. * results, then keep it, otherwise discard it
  184. */
  185. for (i = 0, tests = 0; i < size; ++i)
  186. if (0.05 * median < values[i] && values[i] < 20.0 * median) {
  187. if (i > tests) values[tests] = values[i];
  188. tests++;
  189. }
  190. return tests;
  191. }
  192. /*
  193. * make sure that there are enough points with significantly
  194. * different data values (greater than 5% difference) in the
  195. * data subset.
  196. */
  197. int
  198. classes(double values[], int size)
  199. {
  200. int i;
  201. double median;
  202. double *d = (double *)malloc(size * sizeof(double));
  203. int classid;
  204. for (i = 0; i < size; ++i) d[i] = values[i];
  205. qsort(d, size, sizeof(double), double_compare);
  206. median = d[size/2];
  207. if (size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0;
  208. /* if the difference is less than 1/20th of the median, then
  209. * we assume that the two points are the same
  210. */
  211. for (i = 1, classid = 1; i < size; ++i)
  212. if ((d[i] - d[i-1]) > 0.05 * median) classid++;
  213. free(d);
  214. return classid;
  215. }
  216. /*
  217. * mode
  218. *
  219. * return the most common value (within 1MHz)
  220. */
  221. int
  222. mode(double values[], int n)
  223. {
  224. int i, n_mode, n_curr;
  225. int mode, curr;
  226. qsort(values, n, sizeof(double), double_compare);
  227. n_mode = 1;
  228. n_curr = 1;
  229. mode = (int)(values[0] + 0.5);
  230. curr = (int)(values[0] + 0.5);
  231. for (i = 1; i < n; ++i) {
  232. int v = (int)(values[i] + 0.5);
  233. if (curr != v) {
  234. curr = v;
  235. n_curr = 0;
  236. }
  237. n_curr++;
  238. if (n_curr > n_mode) {
  239. mode = curr;
  240. n_mode = n_curr;
  241. }
  242. }
  243. return mode;
  244. }
  245. /*
  246. * cross_values
  247. *
  248. * This routine will create new data points by subtracting pairs
  249. * of data points.
  250. */
  251. void
  252. cross_values(double values[], int size, double **cvalues, int *csize)
  253. {
  254. int i, j;
  255. *cvalues = (double *)malloc(size * size * sizeof(double));
  256. *csize = 0;
  257. for (i = 0; i < size; ++i) {
  258. (*cvalues)[(*csize)++] = values[i];
  259. /* create new points with the differences */
  260. for (j = i + 1; j < size; ++j) {
  261. (*cvalues)[(*csize)++] = ABS(values[i] - values[j]);
  262. }
  263. }
  264. }
  265. /*
  266. * gcd
  267. *
  268. * return the greatest common divisor of the passed values (within a
  269. * margin of error because these are experimental results, not
  270. * theoretical numbers). We do this by guessing how many instructions
  271. * are in each loop, and then trying to fit a straight line through
  272. * the (instruction count, time) points. The regression is of the
  273. * form:
  274. *
  275. * y = a + b * x
  276. *
  277. * The time for an individual instruction is "b", while "a" should
  278. * be 0. The trick is to figure out which guess is the right one!
  279. *
  280. * We assume that the gcd is the first value at which we have
  281. * significantly improved regression fit (as measured by chi2).
  282. *
  283. * We increase the number of experimental points (and generate
  284. * more small points) by adding points for the differences between
  285. * measured values (and compute the standard error appropriately).
  286. *
  287. * We want the regression line to go through the origin, so we
  288. * add an artificial point at (0,0) with a tiny standard error.
  289. */
  290. double
  291. gcd(double values[], int size)
  292. {
  293. /* assumption: shortest inner loop has no more than this many instructions */
  294. #define MAX_COUNT 6
  295. int i, n, count;
  296. double min, result, min_chi2 = 0.0, a, b, sig_a, sig_b, chi2;
  297. double *y, *x = (double *)malloc(size * size * sizeof(double));
  298. /* find the smallest value */
  299. result = min = double_min(values, size);
  300. /* create new points by subtracting each pair of values */
  301. cross_values(values, size, &y, &n);
  302. /* make sure the regression goes through the origin */
  303. y[n++] = 0.0;
  304. for (count = 1; count < MAX_COUNT; ++count) {
  305. /*
  306. * given the minimum loop has "count" instructions,
  307. * guess how many instructions each other loop contains
  308. */
  309. for (i = 0; i < n; ++i) {
  310. int m = (int)((double)count * y[i] / min + 0.5);
  311. x[i] = (double)m;
  312. }
  313. /* find the regression of the samples */
  314. regression(x, y, NULL, n, &a, &b, &sig_a, &sig_b, &chi2);
  315. if (count == 1 || count * count * chi2 < min_chi2) {
  316. result = b;
  317. min_chi2 = chi2;
  318. }
  319. }
  320. free(x);
  321. free(y);
  322. return result;
  323. }
  324. /*
  325. * compute the gcd of many possible combinations of experimental values
  326. * and return the mode of the results to reduce the impact
  327. * of a few bad experimental measurements on the computed result.
  328. *
  329. * r - pointer to the array of experimental results
  330. * off - offset of the result we want. TRIES-1 == minimum result.
  331. */
  332. int
  333. compute_mhz(result_t *r)
  334. {
  335. int i, j, mhz[2], n, subset, ntests;
  336. double data[NTESTS], results[1<<NTESTS];
  337. for (i = 0; i < 2; ++i) {
  338. for (subset = 0, ntests = 0; subset < (1<<NTESTS); ++subset) {
  339. for (j = 0, n = 0; j < NTESTS; ++j)
  340. if (BIT_SET(subset, j) && r[j].N > TRIES/2)
  341. data[n++] = r[j].u[r[j].N-1-i] / (double)r[j].n[r[j].N-1-i];
  342. if (n < 2
  343. || (n = filter_data(data, n)) < 2
  344. || classes(data, n) < 2)
  345. continue;
  346. results[ntests++] = 1.0 / gcd(data, n);
  347. }
  348. mhz[i] = mode(results, ntests);
  349. }
  350. /* if the results agree within 1% or 1MHz, accept them */
  351. if (ABS(mhz[0] - mhz[1]) / (double)mhz[0] <= 0.01
  352. || ABS(mhz[0] - mhz[1]) <= 1)
  353. return mhz[0];
  354. return -1;
  355. }
  356. void
  357. save_data(result_t* data, result_t* data_save)
  358. {
  359. int i;
  360. for (i = 0; i < NTESTS; ++i) {
  361. data_save[i] = data[i];
  362. }
  363. }
  364. void
  365. print_data(double mhz, result_t* data)
  366. {
  367. int i, j;
  368. char *CPU_name = "CPU";
  369. char *uname = "uname";
  370. char *email = "email";
  371. int speed = -1;
  372. char *names[NTESTS];
  373. names[0] = name_1();
  374. names[1] = name_2();
  375. names[2] = name_3();
  376. names[3] = name_4();
  377. names[4] = name_5();
  378. names[5] = name_6();
  379. names[6] = name_7();
  380. names[7] = name_8();
  381. names[8] = name_9();
  382. printf("/* \"%s\", \"%s\", \"%s\", %d, %.0f, %d, %f, %f */\n",
  383. CPU_name, uname, email, speed,
  384. mhz, get_enough(0), l_overhead(), t_overhead());
  385. printf("result_t* data[] = { \n");
  386. for (i = 0; i < NTESTS; ++i) {
  387. printf("\t/* %s */ { %d, {", names[i], data[i].N);
  388. for (j = 0; j < data[i].N; ++j) {
  389. printf("\n\t\t{ /* %f */ %lu, %lu}", data[i].u[j] / (100. * data[i].n[j]), (unsigned long)data[i].u[j], (unsigned long)data[i].n[j]);
  390. if (j < TRIES - 1) printf(", ");
  391. }
  392. if (i < NTESTS - 1) printf("}},\n");
  393. else printf("}}\n");
  394. }
  395. printf("};\n");
  396. }
  397. int
  398. main(int ac, char **av)
  399. {
  400. int i, j, k, mhz = -1;
  401. double runtime;
  402. result_t data[NTESTS];
  403. result_t data_save[NTESTS];
  404. putenv("LOOP_O=0.0"); /* should be at most 1% */
  405. runtime = (NTESTS * TRIES * 3 * get_enough(0)) / 1000000.;
  406. if (runtime > 3.) {
  407. fprintf(stderr, "mhz: should take approximately %.0f seconds\n", runtime);
  408. }
  409. /* make three efforts to get reliable data */
  410. for (i = 0; i < 3 && mhz < 0; ++i) {
  411. /* initialize the data arrays */
  412. for (j = 0; j < NTESTS; ++j)
  413. insertinit(&data[j]);
  414. /*
  415. * collect the data; try to minimize impact of activity bursts
  416. * by putting NTESTS in the inner loop so a burst will affect
  417. * one data point for all expressions first, rather than all
  418. * data points for one expression.
  419. */
  420. for (j = 0; j < TRIES; ++j) {
  421. for (k = 0; k < NTESTS; ++k) {
  422. (*loops[k])(0);
  423. insertsort(gettime(), get_n(), &data[k]);
  424. }
  425. }
  426. save_data(data, data_save);
  427. mhz = compute_mhz(data);
  428. }
  429. if (ac > 1 && !strcmp(av[1], "-d")) {
  430. if (ac > 1) {
  431. ac --;
  432. for (i = 1; i < ac; ++i) {
  433. av[i] = av[i+1];
  434. }
  435. }
  436. print_data(mhz, data_save);
  437. }
  438. if (mhz < 0.) {
  439. printf("-1 System too busy\n");
  440. exit(1);
  441. }
  442. if (ac == 2 && !strcmp(av[1], "-c")) {
  443. printf("%.4f\n", 1000. / (double)mhz);
  444. } else {
  445. printf("%d MHz, %.2f nanosec clock\n",
  446. mhz, 1000. / (double)mhz);
  447. }
  448. exit(0);
  449. }