/*
 * lat_mem_rd.c - measure memory load latency
 *
 * usage: lat_mem_rd size-in-MB stride [stride ...]
 *
 * Copyright (c) 1994 Larry McVoy.  Distributed under the FSF GPL with
 * additional restriction that results may be published only if
 *	(1) the benchmark is unmodified, and
 *	(2) the version in the sccsid below is included in the report.
 * Support for this development by Sun Microsystems is gratefully acknowledged.
 */
char	*id = "$Id$\n";

#include "bench.h"

#define	N		1000000	/* Don't change this */
#define	STRIDE		(512/sizeof(char *))
#define	MEMTRIES	4
#define	LOWER		512

void	loads(char *addr, size_t range, size_t stride);
size_t	step(size_t k);

int
main(int ac, char **av)
{
	size_t	len;
	size_t	range;
	size_t	stride;
	int	i;
	char	*addr;

	if (ac < 2) {
		fprintf(stderr, "usage: %s size-in-MB [stride ...]\n", av[0]);
		return (1);
	}
	len = (size_t)atoi(av[1]) * 1024 * 1024;
	addr = (char *)malloc(len);
	if (addr == NULL) {
		perror("lat_mem_rd: malloc");
		return (1);
	}

	if (av[2] == 0) {
		fprintf(stderr, "\"stride=%lu\n", (unsigned long)STRIDE);
		for (range = LOWER; range <= len; range = step(range)) {
			loads(addr, range, STRIDE);
		}
	} else {
		for (i = 2; i < ac; ++i) {
			stride = bytes(av[i]);
			fprintf(stderr, "\"stride=%lu\n", (unsigned long)stride);
			for (range = LOWER; range <= len; range = step(range)) {
				loads(addr, range, stride);
			}
			fprintf(stderr, "\n");
		}
	}
	return (0);
}

void
loads(char *addr, size_t range, size_t stride)
{
	register char **p = 0 /* lint */;
	size_t	i;
	int	tries = 0;
	int	result = 0x7fffffff;
	double	time;

	if (stride & (sizeof(char *) - 1)) {
		printf("lat_mem_rd: stride must be aligned.\n");
		return;
	}
	if (range < stride) {
		return;
	}

	/*
	 * First create a list of pointers.
	 *
	 * This used to go forwards; we want to go backwards to try and
	 * defeat HP's fetch ahead.
	 *
	 * We really need to do a random pattern once we are doing one hit
	 * per page.
	 */
	for (i = stride; i < range; i += stride) {
		*(char **)&addr[i] = (char *)&addr[i - stride];
	}
	*(char **)&addr[0] = (char *)&addr[i - stride];
	p = (char **)&addr[0];

	/*
	 * Now walk them and time it.  The body is unrolled 1000 loads at a
	 * time so loop overhead is small compared to the loads themselves.
	 */
	for (tries = 0; tries < MEMTRIES; ++tries) {
		/* time loop with loads */
#define	ONE	p = (char **)*p;
#define	FIVE	ONE ONE ONE ONE ONE
#define	TEN	FIVE FIVE
#define	FIFTY	TEN TEN TEN TEN TEN
#define	HUNDRED	FIFTY FIFTY
		i = N;
		start(0);
		while (i >= 1000) {
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			i -= 1000;
		}
		i = stop(0, 0);
		use_pointer((void *)p);
		if (i < result) {
			result = i;
		}
	}

	/*
	 * We want to get to nanoseconds / load.  We don't want to
	 * lose any precision in the process.  What we have is the
	 * microseconds it took to do N loads, where N is 1 million,
	 * and we expect that each load took between 10 and 2000
	 * nanoseconds.
	 *
	 * We want just the memory latency time, not including the
	 * time to execute the load instruction.  We allow one clock
	 * for the instruction itself.  So we need to subtract off
	 * N * clk nanoseconds.
	 *
	 * lmbench 2.0 - do the subtraction later, in the summary.
	 * Doing it here was problematic.
	 *
	 * XXX - we do not account for loop overhead here.
	 */
	time = (double)result;
	time *= 1000.;			/* convert to nanoseconds */
	time /= (double)N;		/* nanoseconds per load */
	fprintf(stderr, "%.5f %.3f\n", range / (1024. * 1024), time);
}

/*
 * Choose the next array size to test: double below 1KB, grow by 1KB up
 * to 4KB, then grow in increments of 1/16th of the smallest power of two
 * (at least 32KB) that exceeds the current size, so larger sizes are
 * sampled on a roughly logarithmic grid.
 */
size_t
step(size_t k)
{
	if (k < 1024) {
		k = k * 2;
	} else if (k < 4 * 1024) {
		k += 1024;
	} else {
		size_t	s;

		for (s = 32 * 1024; s <= k; s *= 2)
			;
		k += s / 16;
	}
	return (k);
}
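
/*
 * Editor's sketch, not part of the original benchmark: the comment in
 * loads() notes that "we really need to do a random pattern once we are
 * doing one hit per page".  One way to do that is to link the
 * stride-aligned slots into a single cycle in Fisher-Yates-shuffled
 * order, so a hardware prefetcher cannot guess the next address.  The
 * function name build_random_chain and the use of rand() are
 * illustrative assumptions, not lmbench API; loads() does not call this
 * routine, and rand()/malloc() are assumed to come in via bench.h as
 * elsewhere in this file.
 */
void
build_random_chain(char *addr, size_t range, size_t stride)
{
	size_t	n = range / stride;	/* number of pointer-sized slots */
	size_t	i, j, tmp;
	size_t	*order;

	if (n < 2 || (order = (size_t *)malloc(n * sizeof(size_t))) == NULL) {
		return;
	}
	for (i = 0; i < n; ++i) {
		order[i] = i;
	}
	/* Fisher-Yates shuffle of the slot visiting order. */
	for (i = n - 1; i > 0; --i) {
		j = (size_t)rand() % (i + 1);
		tmp = order[i];
		order[i] = order[j];
		order[j] = tmp;
	}
	/* Link the slots into one cycle in the shuffled order. */
	for (i = 0; i < n; ++i) {
		*(char **)&addr[order[i] * stride] =
		    (char *)&addr[order[(i + 1) % n] * stride];
	}
	free(order);
}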