  1. .\" This document is GNU groff -mgs -t -p -R -s
  2. .\" It will not print with normal troffs, it uses groff features, in particular,
  3. .\" long names for registers & strings.
  4. .\" Deal with it and use groff - it makes things portable.
  5. .\"
  6. .\" $X$ xroff -mgs -t -p -R -s $file
  7. .\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more
  8. .\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr
  9. .VARPS
  10. .\" Define a page top that looks cool
  11. .\" HELLO CARL! To turn this off, s/PT/oldPT/
  12. .de draftPT
  13. .\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP'
  14. ..
  15. .de PT
  16. .if \\n%>1 \{\
  17. . sp -.1i
  18. . ps 14
  19. . ft 3
  20. . nr big 24
  21. . nr space \\w'XXX'
  22. . nr titlewid \\w'\\*[title]'
  23. . nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2
  24. . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25'
  25. . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0
  26. . ce 1
  27. \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar]
  28. . ps
  29. . sp -.70
  30. . ps 12
  31. \\l'\\n[LL]u'
  32. . ft
  33. . ps
  34. .\}
  35. ..
  36. .\" Define a page bottom that looks cool
  37. .\" HELLO CARL! To turn this off, s/BT/oldBT/
  38. .de draftBT
  39. .\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP'
  40. ..
  41. .de BT
  42. . ps 9
  43. \v'-1'\\l'\\n(LLu'
  44. . sp -1
  45. . tl '\(co 1995 \\*[author]'\\*(DY'%'
  46. . ps
  47. ..
  48. .de SP
  49. . if t .sp .5
  50. . if n .sp 1
  51. ..
  52. .de BU
  53. . SP
  54. . ne 2
  55. \(bu\
  56. . if \\n[.$] \fB\\$1\fP\\$2
  57. ..
  58. .nr FIGURE 0
  59. .nr TABLE 0
  60. .nr SMALL .25i
  61. .de TSTART
  62. . KF
  63. . if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0
  64. . ps -1
  65. . vs -1
  66. ..
  67. .de TEND
  68. . ps +1
  69. . vs +1
  70. . if \\n[.$]=2 \{\
  71. . sp -.5
  72. \s(24\\l'\\n[pg@colw]u'\s0 \}
  73. . sp .25
  74. . nr TABLE \\n[TABLE]+1
  75. . ce 1
  76. \fBTable \\n[TABLE].\ \ \\$1\fP
  77. . SP
  78. . KE
  79. ..
  80. .de FEND
  81. . ps +1
  82. . vs +1
  83. . if \\n[.$]=2 \{\
  84. . sp -.5
  85. \s(24\\l'\\n[pg@colw]u'\s0 \}
  86. . sp .25
  87. . nr FIGURE \\n[FIGURE]+1
  88. . ce 1
  89. \fBFigure \\n[FIGURE].\ \ \\$1\fP
  90. . SP
  91. . KE
  92. ..
  93. .\" Configuration
  94. .nr PI 3n
  95. .nr HM .95i
  96. .nr FM 1i
  97. .nr PO .95i
  98. .if t .po .95i
  99. .nr LL 6.5i
  100. .if n .nr PO 0i
  101. .if n .nr LL 7.75i
  102. .nr PS 10
  103. .nr VS \n(PS+1
  104. .ds title Portable tools for performance analysis
  105. .ds author Larry McVoy
  106. .ds lmbench \f(CWlmbench\fP
  107. .ds lmdd \f(CWlmdd\fP
  108. .ds bcopy \f(CWbcopy\fP
  109. .ds connect \f(CWconnect\fP
  110. .ds execlp \f(CWexeclp\fP
  111. .ds exit \f(CWexit\fP
  112. .ds fork \f(CWfork\fP
  113. .ds gcc \f(CWgcc\fP
  114. .ds getpid \f(CWgetpid\fP
  116. .ds gettimeofday \f(CWgettimeofday\fP
  117. .ds kill \f(CWkill\fP
  118. .ds memmove \f(CWmemmove\fP
  119. .ds mmap \f(CWmmap\fP
  120. .ds popen \f(CWpopen\fP
  121. .ds read \f(CWread\fP
  122. .ds stream \f(CWstream\fP
  123. .ds system \f(CWsystem\fP
  124. .ds uiomove \f(CWuiomove\fP
  125. .ds write \f(CWwrite\fP
  126. .ds yield \f(CWyield\fP
  127. .\" References stuff
  128. .de RN \"Reference Name: .RN $1 -- prints the reference prettily
  129. .\"[\s-2\\$1\s+2]\\$2
  130. [\s-1\\$1\s0]\\$2
  131. ..
  132. .\" .R1
  133. .\" sort A+DT
  134. .\" database references
  135. .\" label-in-text
  136. .\" label A.nD.y-2
  137. .\" bracket-label \*([. \*(.] ", "
  138. .\" .R2
  139. .TL
  140. \s(14lmbench: Portable tools for performance analysis\s0\**
  141. .AU
  142. \s+2\fR\*[author]\fP\s0
  143. .AI
  144. \fI\s+2Silicon Graphics, Inc.\s0\fP
  145. .AU
  146. \s+2\fRCarl Staelin\fP
  147. .AI
  148. \s+2\fIHewlett-Packard Laboratories\s0\fP
  149. .SP
  150. .AB
  151. \*[lmbench] is a micro-benchmark suite designed to focus
  152. attention on the basic building blocks of many
  153. common system applications, such as databases, simulations,
  154. software development, and networking. In almost all
  155. cases, the individual tests are the result of analysis and isolation
  156. of a customer's actual performance problem.
  157. .\" .SP
  158. These tools can be, and currently are, used to compare different
  159. system implementations from different vendors.
  160. In several cases,
  161. the benchmarks have uncovered previously unknown bugs and design flaws.
  162. The results have shown a strong
  163. correlation between memory system performance and overall performance.
  164. .\" XXX - MP versus uniprocessors?
  165. \*[lmbench] includes an extensible database of
  166. results from systems current as of late 1995.
  167. .AE
  168. .if t .MC 3.05i
  169. .FS
  170. This paper first appeared in the January 1996 Usenix conference proceedings.
  171. The version you are reading has new results as well as some corrections.
  172. .FE
  173. .NH 1
  174. Introduction
  175. .PP
  176. \*[lmbench]
  177. provides a suite of benchmarks that attempt to measure the most commonly
  178. found performance bottlenecks in a wide range of system applications.
  179. These bottlenecks have been identified, isolated, and reproduced in a set
  180. of small micro-benchmarks, which measure
  181. system latency and bandwidth of data movement among
  182. the processor and memory, network, file system, and disk.
  183. The intent is to produce numbers that real
  184. applications can reproduce, rather than the frequently
  185. quoted and somewhat less reproducible marketing performance numbers.
  186. .PP
  187. The benchmarks focus on latency and bandwidth because
  188. performance issues are usually caused by latency
  189. problems, bandwidth problems, or some combination of the two. Each benchmark
  190. exists because it captures some unique performance problem present in
  191. one or more important applications.
  192. For example, the TCP latency benchmark is an accurate predictor of the
  193. Oracle distributed lock manager's performance, the memory latency
  194. benchmark gives a strong indication of Verilog simulation performance,
  195. and the file system latency benchmark models a critical path
  196. in software development.
  197. .PP
  198. \*[lmbench] was developed to identify and evaluate system performance
  199. bottlenecks present in many machines in 1993-1995. It is entirely
  200. possible that computer architectures will have changed and advanced
  201. enough in the next few years to render parts of this benchmark suite
  202. obsolete or irrelevant.
  203. .PP
  204. \*[lmbench] is already in widespread use at many sites by both end
  205. users and system designers. In some cases, \*[lmbench] has provided
  206. the data necessary to discover and correct critical performance
  207. problems that might have gone unnoticed. \*[lmbench] uncovered a
  208. problem in Sun's memory management software
  209. that made all pages map to the same location in the cache, effectively
  210. turning a 512 kilobyte (K) cache into a 4K cache.
  211. .PP
  212. \*[lmbench] measures only a system's ability
  213. to transfer data between processor, cache, memory, network, and disk.
  214. It does not measure other parts of the system, such as the graphics subsystem,
  215. nor is it a MIPS, MFLOPS,
  216. throughput, saturation, stress, graphics, or multiprocessor test suite.
  217. It is frequently run on multiprocessor (MP) systems to compare their performance
  218. against
  219. uniprocessor systems, but it does not take advantage of any multiprocessor
  220. features.
  221. .PP
  222. The benchmarks are written using standard, portable
  223. system interfaces and facilities commonly
  224. used by applications, so
  225. \*[lmbench]
  226. is portable and comparable over a wide set of Unix systems.
  227. \*[lmbench] has been run on
  228. AIX,
  229. BSDI,
  230. HP-UX,
  231. IRIX,
  232. Linux,
  233. FreeBSD,
  234. NetBSD,
  235. OSF/1,
  236. Solaris,
  237. and
  238. SunOS.
  239. Part of the suite has been run on Windows/NT as well.
  240. .PP
  241. \*[lmbench]
  242. is freely distributed under
  243. the Free Software Foundation's General Public License
  244. .RN Stallman89 ,
  245. with the additional restriction
  246. that results may be reported only if the benchmarks are unmodified.
  247. .NH 1
  248. Prior work
  249. .PP
  250. Benchmarking and performance analysis is not a new endeavor.
  251. There are too many other benchmark suites to list all of
  252. them here. We compare \*[lmbench]
  253. to a set of similar benchmarks.
  254. .BU "I/O (disk) benchmarks" :
  255. IOstone
  256. .RN Park90
  257. wants to be an I/O benchmark, but actually measures the memory
  258. subsystem; all of the tests fit easily in the cache.
  259. IObench
  260. .RN Wolman89
  261. is a systematic file system and disk benchmark, but it is
  262. complicated and unwieldy.
  263. In
  264. .RN McVoy91
  265. we reviewed many I/O benchmarks and found them all
  266. lacking because they took too long to run and
  267. were too complex a solution to a fairly simple problem. We wrote a
small, simple I/O benchmark, \*[lmdd], that
  269. measures sequential and random I/O far
  270. faster than either IOstone or IObench. As part of
  271. .RN McVoy91
  272. the results from \*[lmdd] were checked against IObench (as well as some other
  273. Sun internal I/O benchmarks). \*[lmdd] proved to be more accurate than any
  274. of the other benchmarks.
  275. At least one disk vendor
  276. routinely uses \*[lmdd] to do performance testing of its disk drives.
  277. .SP
  278. Chen and Patterson
  279. .RN "Chen93, Chen94"
  280. measure I/O performance under a
  281. variety of workloads that are automatically varied to test the
  282. range of the system's performance.
  283. Our efforts differ in that we are more interested in the CPU overhead
  284. of a single request, rather than the capacity of the system as a whole.
  285. .BU "Berkeley Software Distribution's microbench suite" :
  286. The BSD effort generated an extensive set of
  287. test benchmarks to do regression testing (both quality and performance)
  288. of the BSD releases.
  289. We did not use this as a basis for our work (although we used ideas)
  290. for the following reasons:
  291. (a) missing tests \(em such as memory latency,
(b) too many tests \(em the results tended to be obscured under a mountain
  293. of numbers,
  294. and (c) wrong copyright \(em we wanted the
  295. Free Software Foundation's General Public License.
  296. .BU "Ousterhout's Operating System benchmark" :
  297. .RN Ousterhout90
  298. proposes several system benchmarks to measure system call
  299. latency, context switch time, and file system performance.
  300. We used the same ideas as a basis for our work, while trying to
  301. go farther. We measured a more complete set of
  302. primitives, including some hardware measurements; went into greater depth
  303. on some of the tests, such as context switching; and went to great
  304. lengths to make the benchmark portable and extensible.
  305. .BU "Networking benchmarks" :
  306. \f(CWNetperf\fP measures networking bandwidth and latency and
  307. was written by Rick Jones of Hewlett-Packard.
  308. \*[lmbench] includes a smaller,
  309. less complex benchmark that produces similar results.
  310. .SP
  311. \f(CWttcp\fP is a widely used benchmark in the Internet community.
  312. Our version of the same benchmark
  313. routinely delivers bandwidth numbers that are within 2% of the numbers
  314. quoted by \f(CWttcp\fP.
  315. .BU "McCalpin's stream benchmark" :
  316. .RN McCalpin95
  317. has memory bandwidth measurements and results for a large number of
  318. high-end systems.
  319. We did not use these because we discovered them only after
  320. we had results using our versions.
  321. We will probably include McCalpin's benchmarks in \*[lmbench]
  322. in the future.
  323. .PP
  324. In summary, we rolled our own because we wanted simple, portable
  325. benchmarks that accurately measured a wide variety of operations that we
  326. consider crucial to performance on today's systems. While portions of
  327. other benchmark suites include similar work, none includes all of it,
  328. few are as portable, and almost all are far more complex. Less filling,
  329. tastes great.
  330. .NH 1
  331. Benchmarking notes
  332. .NH 2
  333. Sizing the benchmarks
  334. .PP
  335. The proper sizing of various benchmark parameters is crucial to ensure
  336. that the benchmark is measuring the right component of system performance.
  337. For example, memory-to-memory copy
  338. speeds are dramatically affected by the location of the data: if
the size parameter is so small that
the data fits in a cache, then the performance may be as much as ten times
faster than if the data is in memory.
On the other hand, if the memory size parameter is so large that the data
  343. is paged to disk, then performance may be slowed to such an extent
  344. that the benchmark seems to `never finish.'
  345. .PP
  346. \*[lmbench] takes the following approach to the cache and memory
  347. size issues:
  348. .BU
  349. All of the benchmarks that could be affected
  350. by cache size are run in a loop,
  351. with increasing sizes (typically powers of two) until some maximum size
  352. is reached. The results may then be plotted to see where the benchmark
  353. no longer fits in the cache.
  354. .BU
  355. The benchmark verifies that there is sufficient memory to run all of the
  356. benchmarks in main memory. A small test program allocates as much memory
  357. as it can, clears the memory,
  358. and then strides through that memory a page at a time, timing
  359. each reference. If any reference takes more than a few microseconds, the
  360. page is no longer in memory. The test program starts small and works forward
  361. until either enough memory is seen as present or the memory limit is reached.
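.PP
A rough sketch of that check appears below. It is illustrative only
and is not the actual \*[lmbench] source; \f(CWmem_present\fP and the
five microsecond threshold are our own choices.
.DS
.nf
.ft CW
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

static double
usecs(void)
{
    struct timeval t;

    gettimeofday(&t, 0);
    return (t.tv_sec * 1e6 + t.tv_usec);
}

/* Is "size" bytes of real memory present? */
int
mem_present(size_t size)
{
    long pg = sysconf(_SC_PAGESIZE);
    char *p = malloc(size);
    size_t i;
    int ok = (p != 0);

    if (ok)
        memset(p, 0, size);  /* clear it */
    for (i = 0; ok && i < size; i += pg) {
        double t = usecs();

        /* time one reference per page */
        (void)*(volatile char *)&p[i];
        if (usecs() - t > 5.0)
            ok = 0;
    }
    free(p);
    return (ok);
}
.ft
.fi
.DE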
  362. .NH 2
  363. Compile time issues
  364. .PP
  365. The GNU C compiler, \*[gcc], is the compiler we chose because
  366. it gave the most reproducible results across platforms.
  367. When \*[gcc] was not present, we used the vendor-supplied \f(CWcc\fP.
  368. All of the benchmarks were compiled with optimization \f(CW-O\fP
  369. except
  370. the benchmarks that calculate clock speed and the context switch times,
  371. which must be compiled without optimization in order to produce
  372. correct results. No other optimization flags were enabled because
  373. we wanted results that would be commonly seen by application writers.
  374. .PP
  375. All of the benchmarks were linked using the default manner of
  376. the target system. For most if not all systems, the
  377. binaries were linked using shared libraries.
  378. .NH 2
  379. Multiprocessor issues
  380. .PP
  381. All of the multiprocessor systems ran the benchmarks in the same way as
  382. the uniprocessor systems. Some systems allow users to pin processes
  383. to a particular CPU, which sometimes results in better cache reuse. We
  384. do not pin processes because it defeats the MP scheduler.
  385. .\" XXX - I should do this on an IP19 and mark it as pinned.
  386. In certain cases, this decision yields interesting results discussed later.
  387. .NH 2
  388. Timing issues
  389. .LP
  390. .sp -.5
  391. .BU "Clock resolution" :
  392. The benchmarks measure the elapsed time by reading the system clock via the
  393. \*[gettimeofday] interface. On some systems this interface has a resolution
  394. of 10 milliseconds, a long time relative to many of the benchmarks which
  395. have results measured in tens to hundreds of microseconds. To compensate for
  396. the coarse clock resolution, the benchmarks are hand-tuned to measure
  397. many operations within a single time interval lasting for many clock ticks.
  398. Typically, this is done by executing the operation in a small loop, sometimes
  399. unrolled if the operation is exceedingly fast, and then dividing
  400. the loop time by the loop count.
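.PP
The pattern is roughly the following; this is an illustrative fragment
rather than the \*[lmbench] timing harness, and \f(CWgetppid\fP merely
stands in for the operation being timed.
.DS
.nf
.ft CW
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

int
main(void)
{
    struct timeval a, b;
    int i, n = 1000000;
    double t;

    gettimeofday(&a, 0);
    for (i = 0; i < n; i++)
        getppid();  /* operation under test */
    gettimeofday(&b, 0);
    t = (b.tv_sec - a.tv_sec) * 1e6
        + (b.tv_usec - a.tv_usec);
    printf("%.3f usec/op\en", t / n);
    return (0);
}
.ft
.fi
.DE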
  401. .BU Caching :
  402. If the benchmark expects the data to be in the cache, the benchmark is
  403. typically run several times; only the last result is recorded.
  404. .SP
  405. If the benchmark does not want to measure cache performance it sets
  406. the size parameter larger than the cache. For example, the
  407. \*[bcopy] benchmark by default copies 8 megabytes to 8 megabytes,
  408. which largely defeats any second-level cache in use today. (Note that the
  409. benchmarks are not trying to defeat the file or process page cache,
  410. only the hardware caches.)
  411. .br
  412. .di bigtable
  413. .ev keep
  414. .ps 8
  415. .vs 9
  416. .so systems.tbl
  417. .ps \n[PS]
  418. .vs \n[VS]
  419. .nr TABLE \n[TABLE]+1
  420. .ce 1
  421. .SP
  422. \fBTable \n[TABLE].\ \ System descriptions.\fP
  423. .SP
  424. .di
  425. .ev
  426. .nr WHEN \n[dn]+\n[FM]
  427. .nr THT \n[dn]
  428. .de print*table
  429. ' sp .5
  430. ' ev keep
  431. ' nf
  432. ' bigtable
  433. . ne 1
  434. . wh -\n[WHEN]u skip*page
  435. . fi
  436. . ev
  437. ..
  438. .de skip*page
  439. ' sp \n[THT]u
  440. . wh -\n[WHEN]u
  441. ..
  442. .wh -\n[WHEN]u print*table
  443. .BU Variability :
  444. The results of some benchmarks, most notably the context switch benchmark, had a tendency
  445. to vary quite a bit, up to 30%. We suspect that the
  446. operating system is not using the same set of physical
  447. pages each time a process is created and we are seeing the effects of
  448. collisions in the external caches. We compensate by running the
  449. benchmark in a loop and taking the minimum result. Users interested in
  450. the most accurate data are advised to verify the results on their
  451. own platforms.
  452. .PP
  453. Many of the results included in the database were donated by users
  454. and were not created by the authors.
  455. Good benchmarking practice suggests that one should run the benchmarks
  456. as the only user of a machine, without other resource intensive
  457. or unpredictable processes or daemons.
  458. .NH 2
  459. Using the \f(CBlmbench\fP database
  460. .PP
  461. \*[lmbench] includes a database of results that
  462. is useful for comparison purposes. It is quite easy to
  463. build the source, run the benchmark, and produce a table of results
  464. that includes the run. All of the tables in this paper were produced
  465. from the database included in \*[lmbench]. This paper is also
  466. included with \*[lmbench] and may be reproduced incorporating new results.
  467. For more information, consult the file \f(CWlmbench-HOWTO\fP in the
  468. \*[lmbench] distribution.
  469. .NH 1
  470. Systems tested
  471. .PP
  472. \*[lmbench] has been run on a wide variety of platforms. This
  473. paper includes results from a representative subset of machines and
  474. operating systems.
  475. Comparisons between similar hardware running different operating
  476. systems can be very illuminating, and we have included a few examples
  477. in our results.
  478. .PP
The systems are briefly characterized in Table 1. Please note that the list prices
are very approximate, as is the year of introduction.
The SPECInt92 numbers are a little suspect since
some vendors have been ``optimizing'' for certain parts of SPEC. We try to
quote the original SPECInt92 numbers where we can.
  484. .NH 2
  485. Reading the result tables
  486. .PP
  487. Throughout the rest of this paper, we present tables of results for many of the
  488. benchmarks. All of the tables are sorted, from best to worst. Some tables
  489. have multiple columns of results and those tables are sorted on only one of
  490. the columns. The sorted column's heading will be in \fBbold\fP.
  491. .NH 1
  492. Bandwidth benchmarks
  493. .PP
  494. By bandwidth, we mean the rate at which a particular facility can move
  495. data.
  496. We attempt to measure the data movement ability of a number of
  497. different facilities:
  498. library \*[bcopy],
  499. hand-unrolled \*[bcopy],
  500. direct-memory read and write (no copying),
  501. pipes,
  502. TCP sockets,
  503. the \*[read] interface,
  504. and
  505. the \*[mmap] interface.
  506. .NH 2
  507. Memory bandwidth
  508. .PP
  509. Data movement is fundamental to any operating system.
  510. In the past, performance
  511. was frequently measured in MFLOPS because floating point units were
  512. slow enough that microprocessor systems were
  513. rarely limited by memory bandwidth. Today, floating point units are usually much
  514. faster than memory bandwidth, so many current MFLOP ratings can not be
  515. maintained using memory-resident data; they are ``cache only'' ratings.
  516. .PP
  517. We measure the ability to
  518. copy, read, and write data over a varying set of sizes.
  519. There are too many results to report all of them here, so we concentrate on
  520. large memory transfers.
  521. .PP
  522. We measure copy bandwidth two ways. The first is the user-level library
  523. \*[bcopy] interface.
  524. The second is a hand-unrolled loop that loads and stores
  525. aligned 8-byte words.
  526. In both cases, we took care to
  527. ensure that the source and destination locations would not map to the same
lines if any of the caches were direct-mapped.
  529. In order to test memory bandwidth rather than cache bandwidth,
  530. both benchmarks copy an 8M\** area to another 8M area.
  531. (As secondary caches reach 16M, these benchmarks will have to
  532. be resized to reduce caching effects.)
  533. .FS
  534. Some of the PCs had less than 16M of available memory;
  535. those machines copied 4M.
  536. .FE
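.PP
A sketch of the hand-unrolled copy loop is shown below. It is written
in present-day C, is unrolled only four ways for brevity, and omits
both the timing and the buffer offsetting used to avoid direct-mapped
cache collisions.
.DS
.nf
.ft CW
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define SIZE (8 * 1024 * 1024)  /* 8M */

int
main(void)
{
    uint64_t *s = malloc(SIZE);
    uint64_t *d = malloc(SIZE);
    size_t i, n = SIZE / sizeof(uint64_t);

    if (!s || !d)
        return (1);
    memset(s, 1, SIZE);  /* fault it in */
    /* aligned 8-byte loads and stores */
    for (i = 0; i < n; i += 4) {
        d[i] = s[i];
        d[i + 1] = s[i + 1];
        d[i + 2] = s[i + 2];
        d[i + 3] = s[i + 3];
    }
    return (0);
}
.ft
.fi
.DE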
  537. .PP
  538. The copy results actually represent one-half to one-third of the memory
  539. bandwidth used to obtain those results since we are reading and writing
  540. memory. If the cache line size is larger than the word stored, then
  541. the written cache line will typically be read before it is written. The
  542. actual amount of memory bandwidth used varies because some architectures
  543. have special instructions specifically designed for the \*[bcopy]
  544. function. Those architectures will move twice as much memory as
  545. reported by this benchmark; less advanced architectures move three
  546. times as much memory: the memory read, the memory read because it is
  547. about to be overwritten, and the memory written.
  548. .PP
  549. The \*[bcopy] results reported in Table 2
  550. may be correlated with John McCalpin's \*[stream]
  551. .RN McCalpin95
  552. benchmark results in the following manner:
  553. the \*[stream] benchmark reports all of the memory moved
  554. whereas the \*[bcopy] benchmark reports the bytes copied. So our
  555. numbers should be approximately one-half to one-third of his numbers.
  556. .PP
  557. Memory reading is measured by an unrolled loop that sums up a series of
  558. integers. On most (perhaps all) systems measured the integer
  559. size is 4 bytes. The loop is unrolled such that most compilers generate
  560. code that uses a constant offset with the load, resulting in a load and
  561. an add for each word of memory. The add is an integer add that completes
in one cycle on all of the processors. Given that today's processors
typically cycle in 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000
  564. ns per cache line, the results reported here should be dominated by the
  565. memory subsystem, not the processor add unit.
  566. .PP
  567. The memory contents are added up because almost all C compilers
  568. would optimize out the whole loop when optimization was turned on, and
  569. would generate far too many instructions without optimization.
  570. The solution is to
  571. add up the data and pass the result as an unused argument to the
  572. ``finish timing'' function.
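.PP
A sketch of the read loop follows; the \f(CWvolatile\fP sink plays the
role of the unused argument just described, and the loop is unrolled
only four ways here.
.DS
.nf
.ft CW
#include <stdlib.h>
#include <string.h>

#define SIZE (8 * 1024 * 1024)

static volatile int sink;

int
main(void)
{
    int *buf = malloc(SIZE);
    int *p, *end = buf + SIZE / sizeof(int);
    int sum = 0;

    if (!buf)
        return (1);
    memset(buf, 1, SIZE);  /* fault it in */
    for (p = buf; p < end; p += 4)
        sum += p[0] + p[1] + p[2] + p[3];
    sink = sum;  /* "use" the result */
    return (0);
}
.ft
.fi
.DE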
  573. .PP
  574. Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect
  575. that pure reads should run at roughly twice the speed of \*[bcopy].
  576. Exceptions to this rule should be studied, for exceptions indicate a bug
  577. in the benchmarks, a problem in \*[bcopy], or some unusual hardware.
  578. .TSTART
  579. .so ../Results/tmp/bw_allmem.tbl
  580. .TEND "Memory bandwidth (MB/s)"
  581. .PP
  582. Memory writing is measured by an unrolled loop that stores a value into
  583. an integer (typically a 4 byte integer) and then increments the pointer.
  584. The processor cost of each memory operation is approximately the same
  585. as the cost in the read case.
  586. .PP
  587. The numbers reported in Table \n[TABLE]
  588. are not the raw hardware speed in some cases.
  589. The Power2\** is capable of up to 800M/sec read rates
  590. .FS
  591. Someone described this machine as a $1,000 processor on a $99,000 memory
  592. subsystem.
  593. .FE
  594. .RN McCalpin95
  595. and HP PA RISC (and other prefetching)
systems also do better if higher levels of code optimization are used
and/or the code is hand-tuned.
  598. .PP
  599. The Sun libc bcopy in Table \n[TABLE]
is faster because it uses a hardware-specific bcopy
routine built on instructions new in SPARC V9 that were added specifically
  602. for memory movement.
  603. .PP
  604. The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because,
  605. according to Intel, the write transaction turns into a read followed by
  606. a write to maintain cache consistency for MP systems.
  607. .NH 2
  608. IPC bandwidth
  609. .PP
  610. Interprocess communication bandwidth is frequently a performance issue.
  611. Many Unix applications are composed of several processes communicating
  612. through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation
  613. system that prepared this paper, the \f(CWX Window System\fP, remote file access,
  614. and \f(CWWorld Wide Web\fP servers.
  615. .PP
  616. Unix pipes are an interprocess communication mechanism implemented as
  617. a one-way byte stream. Each end of the stream has an associated file
  618. descriptor; one is the write descriptor and the other the read
  619. descriptor.
  620. TCP sockets are similar
  621. to pipes except they are bidirectional and can cross machine
  622. boundaries.
  623. .PP
  624. Pipe bandwidth is measured by creating two processes, a writer and a
  625. reader, which transfer 50M of data in 64K transfers.
  626. The transfer size was chosen so that the overhead of system calls
  627. and context switching would not dominate the benchmark time.
  628. The reader prints the timing results, which guarantees that all
  629. data has been moved before the timing is finished.
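.PP
In outline, the benchmark looks something like the fragment below,
with the timing and error handling omitted.
.DS
.nf
.ft CW
#include <stdlib.h>
#include <unistd.h>

#define XFER  (64 * 1024)         /* 64K */
#define TOTAL (50 * 1024 * 1024)  /* 50M */

int
main(void)
{
    int fd[2], n;
    long left;
    char *buf = malloc(XFER);

    if (!buf || pipe(fd) < 0)
        return (1);
    if (fork() == 0) {  /* writer */
        close(fd[0]);
        for (left = TOTAL; left > 0; left -= XFER)
            write(fd[1], buf, XFER);
        exit(0);
    }
    close(fd[1]);       /* reader */
    for (left = TOTAL; left > 0; left -= n) {
        n = read(fd[0], buf, XFER);
        if (n <= 0)
            break;
    }
    /* the clock would stop here */
    return (0);
}
.ft
.fi
.DE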
  630. .PP
  631. TCP bandwidth is measured similarly, except the data is transferred in
  632. 1M page aligned transfers instead of 64K transfers. If the TCP
  633. implementation supports it, the send and receive socket buffers are
  634. enlarged to 1M, instead of the default 4-60K. We have found that
  635. setting the transfer size equal to the socket buffer size produces the
  636. greatest throughput over the most implementations.
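.PP
The socket buffer adjustment is done with \f(CWsetsockopt\fP, roughly
as below. \f(CWsock_buffers\fP is our name for the helper, not an
\*[lmbench] interface, and some systems clamp or reject sizes larger
than they support.
.DS
.nf
.ft CW
#include <sys/types.h>
#include <sys/socket.h>

/* ask for "size"-byte send/receive buffers */
int
sock_buffers(int sock, int size)
{
    if (setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
        &size, sizeof(size)) < 0)
        return (-1);
    if (setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
        &size, sizeof(size)) < 0)
        return (-1);
    return (0);
}
.ft
.fi
.DE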
  637. .TSTART
  638. .so ../Results/tmp/bw_ipc.tbl
  639. .TEND "Pipe and local TCP bandwidth (MB/s)"
  640. .PP
  641. \*[bcopy] is important to this test because the
  642. pipe write/read is typically implemented as a \*[bcopy] into the kernel
  643. from the writer and then a \*[bcopy] from the kernel to the reader.
  644. Ideally, these results would be approximately one-half of the
  645. \*[bcopy] results. It is possible for the kernel \*[bcopy]
  646. to be faster than the C library \*[bcopy] since the kernel may have
  647. access to \*[bcopy] hardware unavailable to the C library.
  648. .PP
  649. It is interesting to compare pipes with TCP because the TCP benchmark is
  650. identical to the pipe benchmark except for the transport mechanism.
  651. Ideally, the TCP bandwidth would be as good as the pipe
  652. bandwidth. It is not widely known that the
  653. majority of the TCP cost is in the \*[bcopy], the checksum,
  654. and the network interface driver.
  655. The checksum and the driver may be safely eliminated in the loopback
case, and if those costs are eliminated, then TCP should be just as
  657. fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to
  658. see that Solaris and HP-UX have done this optimization.
  659. .PP
  660. Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the
  661. pipe transfers are done in 64K buffers, a size that frequently fits in
  662. caches, while the bcopy is typically an 8M-to-8M copy, which does not
  663. fit in the cache.
  664. .PP
  665. In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than
the SGI MP on pipe bandwidth because of caching effects \(em in the UP
  667. case, both processes share the cache; on the MP, each process is
  668. communicating with a different cache.
  669. .PP
  670. All of the TCP results in Table \n[TABLE] are in loopback mode \(em that
is, both ends of the socket are on the same machine. It was impossible
  672. to get remote networking results for all the machines included in this
  673. paper. We are interested in receiving more results for identical
  674. machines with a dedicated network connecting them. The results we have
for over-the-wire TCP bandwidth are shown below.
  676. .TSTART
  677. .so tcp_bw.tbl
  678. .TEND "Remote TCP bandwidth (MB/s)"
  679. .PP
  680. The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE].
  681. The SGI Hippi interface has hardware support for TCP checksums and
  682. the IRIX operating system uses virtual memory tricks to avoid copying
  683. data as much as possible.
  684. For larger transfers, SGI Hippi has reached 92MB/s over TCP.
  685. .PP
  686. 100baseT is looking quite competitive when compared to FDDI in Table
  687. \n[TABLE], even though FDDI has packets that are almost three times
  688. larger. We wonder how long it will be before we see gigabit ethernet
  689. interfaces.
  690. .NH 2
  691. Cached I/O bandwidth
  692. .PP
  693. Experience has shown us that reusing data in the file system
  694. page cache can be a performance issue. This
  695. section measures that operation through two interfaces, \*[read] and
  696. \*[mmap].
  697. The benchmark here is not an I/O benchmark in that no disk activity is
  698. involved.
  699. We wanted to measure the overhead
  700. of reusing data, an overhead that is CPU intensive, rather than disk intensive.
  701. .PP
  702. The \*[read] interface copies data from the kernel's file system page cache into the
  703. process's buffer, using 64K buffers. The transfer size was chosen
  704. to minimize the kernel entry overhead while
  705. remaining realistically sized.
  706. .PP
  707. The difference between the \*[bcopy] and the \*[read] benchmarks
  708. is the cost of the file and virtual memory system overhead. In most
  709. systems, the \*[bcopy] speed should be faster than the \*[read] speed. The
  710. exceptions usually have hardware specifically designed
  711. for the \*[bcopy] function and that hardware may be available only to
  712. the operating system.
  713. .PP
  714. The \*[read] benchmark is implemented by rereading a file
  715. (typically 8M) in 64K
  716. buffers. Each buffer is summed as a series of integers in the user
  717. process. The summing is done for two reasons: for an apples-to-apples
  718. comparison the memory-mapped benchmark needs to touch all the data,
  719. and the file system can sometimes transfer data into memory faster than the
  720. processor can read the data.
  721. For example, \s-1SGI\s0's XFS can move data into memory at
  722. rates in excess of 500M per second, but it can move data into
  723. the cache at only 68M per second. The intent is to measure performance
  724. delivered to the application, not DMA performance to memory.
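.PP
One pass of the reread loop might look like the sketch below.
\f(CWdatafile\fP is a placeholder name, 4-byte integers are assumed,
and the real benchmark rereads the file repeatedly, timing only the
rereads.
.DS
.nf
.ft CW
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

#define XFER (64 * 1024)

static volatile int sink;

int
main(void)
{
    int fd = open("datafile", O_RDONLY);
    int *buf = malloc(XFER);
    int i, n, sum = 0;

    if (fd < 0 || !buf)
        return (1);
    while ((n = read(fd, buf, XFER)) > 0)
        for (i = 0; i < n / 4; i++)
            sum += buf[i];
    sink = sum;  /* touch all the data */
    close(fd);
    return (0);
}
.ft
.fi
.DE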
  725. .TSTART
  726. .so ../Results/tmp/bw_reread2.tbl
  727. .TEND "File vs. memory bandwidth (MB/s)"
  728. .PP
  729. The \*[mmap] interface provides a way to access the kernel's file cache
  730. without copying the data.
  731. The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M)
  732. into the
  733. process's address space. The file is then summed to force the data
  734. into the cache.
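.PP
A sketch of the \*[mmap] version follows; again \f(CWdatafile\fP is a
placeholder and the timing is omitted.
.DS
.nf
.ft CW
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static volatile int sink;

int
main(void)
{
    int fd = open("datafile", O_RDONLY);
    struct stat sb;
    int *p, *end, sum = 0;

    if (fd < 0 || fstat(fd, &sb) < 0)
        return (1);
    p = mmap(0, sb.st_size, PROT_READ,
        MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        return (1);
    end = p + sb.st_size / sizeof(int);
    while (p < end)     /* sum the whole file */
        sum += *p++;
    sink = sum;
    return (0);
}
.ft
.fi
.DE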
  735. .PP
  736. In Table \n[TABLE],
  737. a good system will have \fIFile read\fP as fast as (or even faster than)
  738. \fILibc bcopy\fP because as the file system overhead goes to zero, the
  739. file reread case is virtually the same as the library \*[bcopy] case.
  740. However, file reread can be faster because the kernel may have access to
  741. \*[bcopy] assist hardware not available to the C library.
  742. Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP
  743. performance, but \*[mmap] is often dramatically worse.
  744. Judging by the results, this looks to be a
  745. potential area for operating system improvements.
  746. .PP
  747. In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes
  748. full advantage of the memory subsystem from inside the kernel.
  749. The mmap reread is probably slower because of the lower clock rate;
  750. the page faults start to show up as a significant cost.
  751. .PP
  752. It is surprising that the Sun Ultra1 was able to bcopy at the high
  753. rates shown in Table 2 but did not show those rates for file reread
  754. in Table \n[TABLE].
HP has the opposite problem: it gets file reread faster than bcopy,
  756. perhaps because the kernel \*[bcopy] has access to hardware support.
  757. .PP
  758. The Unixware system has outstanding mmap reread rates, better than
  759. systems of substantially higher cost. Linux needs to do some work on
  760. the \f(CWmmap\fP code.
  761. .NH 1
  762. Latency measurements
  763. .PP
  764. Latency is an often-overlooked
  765. area of performance problems, possibly because resolving latency issues
  766. is frequently much harder than resolving bandwidth issues. For example,
  767. memory bandwidth may be increased by making wider cache lines and increasing
  768. memory ``width'' and interleave,
  769. but memory latency can be improved only by shortening paths or increasing
  770. (successful) prefetching.
  771. The first step toward improving latency is understanding the
  772. current latencies in a system.
  773. .PP
  774. The latency measurements included in this suite are
  775. memory latency,
  776. basic operating system entry cost,
  777. signal handling cost,
  778. process creation times,
  779. context switching,
  780. interprocess communication,
  781. .\" virtual memory system latency,
  782. file system latency,
  783. and disk latency.
  784. .NH 2
  785. Memory read latency background
  786. .PP
  787. In this section, we expend considerable effort to define the different memory
  788. latencies and to explain and justify our benchmark.
  789. The background is a bit tedious but important, since we believe the
  790. memory
  791. latency measurements to be one of the most thought-provoking and useful
  792. measurements in \*[lmbench].
  793. .PP
  794. The most basic latency measurement is memory latency since most of
  795. the other latency measurements can be expressed in terms of memory
  796. latency. For example, context switches require saving the current
  797. process state and loading the state of the next process. However, memory
  798. latency is rarely accurately measured and frequently misunderstood.
  799. .PP
  800. Memory read latency has many definitions;
  801. the most common,
  802. in increasing time order,
  803. are memory chip cycle time, processor-pins-to-memory-and-back time,
  804. load-in-a-vacuum time, and back-to-back-load time.
  805. .BU "Memory chip cycle latency" :
  806. Memory chips are rated in nanoseconds; typical speeds are around 60ns.
  807. A general overview on DRAM architecture may be found in
  808. .RN Hennessy96 .
  809. The
  810. specific information we describe here is from
  811. .RN Toshiba94
  812. and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0
  813. \s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the
  814. time from
  815. .ps -1
  816. .nr width \w'R\&A\&S'
  817. .nr height \n[rst]+1000
  818. RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u'
  819. .ps
assertion to when
  821. the data will be available on the \s-1DRAM\s0 pins (assuming
  822. .ps -1
  823. .nr width \w'C\&A\&S'
  824. .nr height \n[rst]+1000
  825. CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u'
  826. .ps
  827. access time requirements were met).
  828. While it is possible
  829. to get data out of a \s-1DRAM\s0 in 60ns, that is not all of
  830. the time involved. There is a precharge time that must occur after
  831. every access.
  832. .RN Toshiba94
  833. quotes 110ns as the random read or write cycle time and this
  834. time is more representative of the cycle time.
  835. .\" For example, most systems offer a wide range of memory
  836. .\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number
  837. .\" of simms range from 1 to 16. The more simms there are, the more
  838. .\" capacitance there is in the memory subsystem. More capacitance means
  839. .\" longer setup times for the fully populated memory subsystem. System
  840. .\" designers have to allow time for this setup.
  841. .\" For more details, consult [XXX - reference on DRAM].
  842. .\" This is sometimes referred to as the chip latency. The
  843. .\" chip cycle time is the chip latency plus the time required to restore
  844. .\" the data in the capacitors which is often referred to as the precharge
  845. .\" time. This means that 60 nanosecond memory chips really are more like
  846. .\" 100 nanosecond memory chips. Some systems operate memory in ``page
  847. .\" mode'' or ``static column'' memory systems hold either RAS or CAS and
  848. .\" allow subsequent accesses in the same row or column in one cycle instead
  849. .\" of two.
  850. .BU "Pin-to-pin latency" :
  851. This number represents the time needed
  852. for the memory request to travel from the processor's pins to the memory
  853. subsystem and back again. Many vendors have used the pin-to-pin
  854. definition of memory latency in their reports. For example,
  855. .RN Fenwick95
  856. while describing the \s-1DEC\s0 8400
  857. quotes memory latencies of 265ns; a careful
  858. reading of that paper shows that these are pin-to-pin numbers. In spite
  859. of the historical precedent in vendor reports, this definition of memory
  860. latency is misleading since it ignores actual delays seen when a load
  861. instruction is immediately followed by a use of the data being loaded.
  862. The number of additional cycles inside the processor can be significant
  863. and grows more significant with today's highly pipelined architectures.
  864. .PP
  865. It is worth noting that the pin-to-pin numbers
  866. include the amount of time it takes to charge
  867. the lines going to the \s-1SIMM\s0s, a time that increases with the
  868. (potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean
more capacitance, which results in longer charge times. This is one reason
  870. why personal computers frequently have better memory latencies than
  871. workstations: the PCs typically have less memory capacity.
  872. .BU "Load-in-a-vacuum latency" :
  873. A load in a vacuum is the time that the processor will wait for one load that
  874. must be fetched from main memory (i.e., a cache miss). The ``vacuum''
  875. means that there is no other activity on the system bus, including no other
  876. loads.
  877. While this number is frequently used as the memory latency, it is not very
  878. useful. It is basically a ``not to exceed'' number important only for
  879. marketing reasons.
  880. Some architects point out that since most processors implement nonblocking
  881. loads (the load does not cause a stall until the data is used), the perceived
load latency may be much less than the real latency. When pressed, however,
  883. most will admit that cache misses occur in bursts, resulting in perceived
  884. latencies of at least the load-in-a-vacuum latency.
  885. .BU "Back-to-back-load latency" :
  886. Back-to-back-load latency is the time that each load takes, assuming
  887. that the instructions before and after are also cache-missing loads.
  888. Back-to-back loads may take longer than loads in a vacuum for the
  889. following reason: many systems implement something known as \fIcritical
  890. word first\fP, which means that the subblock of the cache line that
  891. contains the word being loaded is delivered to the processor before the
  892. entire cache line has been brought into the cache. If another load
  893. occurs quickly enough after the processor gets restarted from the
  894. current load, the second load may stall because the cache is still
  895. busy filling the cache line for the previous load. On some systems,
  896. such as the current implementation of UltraSPARC,
  897. the difference between back to back and load in a vacuum is about 35%.
  898. .PP
  899. \*[lmbench] measures back-to-back-load latency because it is the
  900. only measurement that may be easily measured from software and
  901. because we feel that it is what most software developers consider to be memory
  902. latency. Consider the following C code fragment:
  903. .DS
  904. .nf
  905. .ft CW
  906. p = head;
  907. while (p->p_next)
        p = p->p_next;
  909. .ft
  910. .fi
  911. .DE
  912. On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the
load. A 300 MHz processor has a 3.33ns cycle time, so the loop
could execute in slightly less than 10ns. However, the load itself
takes 400ns on a 300 MHz \s-1DEC\s0 8400. In other words, the
instructions cost 10ns but the load stalls for 400ns. Another
  917. way to look at it is that 400/3.3, or 121, nondependent,
  918. nonloading instructions following the load would be needed
  919. to hide the load latency.
  920. Because superscalar processors typically execute multiple operations
  921. per clock cycle, they need even more useful operations between cache
  922. misses to keep the processor from stalling.
  923. .PP
  924. This benchmark illuminates the tradeoffs in processor cache design.
  925. Architects like large cache lines, up to 64 bytes or so, because
  926. the prefetch effect of gathering a whole line increases
  927. hit rate given reasonable spatial locality.
  928. Small stride sizes have high spatial locality and should have higher
  929. performance, but large stride sizes have poor spatial locality causing
  930. the system to prefetch useless data.
  931. So the benchmark provides the following insight into negative
  932. effects of large line prefetch:
  933. .BU
  934. Multi-cycle fill operations are typically atomic events at the
  935. caches, and sometimes block other cache accesses until they
  936. complete.
  937. .BU
  938. Caches are typically single-ported. Having a large line prefetch
  939. of unused data causes extra bandwidth
  940. demands at the cache, and can cause increased access latency for
  941. normal cache accesses.
  942. .PP
  943. In summary, we believe that processors are so fast that the average
  944. load latency for cache misses will be closer to the
  945. back-to-back-load number than to the load-in-a-vacuum number. We are
  946. hopeful that the industry will standardize on this definition of
  947. memory latency.
  948. .NH 2
  949. Memory read latency
  950. .PP
  951. The entire memory hierarchy can be measured, including on-board data
  952. cache latency and size, external data cache latency and size, and
  953. main memory latency.
  954. Instruction caches are not measured.
  955. TLB miss latency can also be measured, as in
  956. .RN Saavedra92 ,
  957. but we stopped at main memory. Measuring TLB miss time is problematic
  958. because different systems map different amounts of memory with their
  959. TLB hardware.
  960. .PP
  961. The benchmark varies two parameters, array size and array stride.
  962. For each size, a list of pointers is created for all of the different
  963. strides. Then the list is walked thus:
  964. .DS
  965. .ft CW
  966. mov r4,(r4) # C code: p = *p;
  967. .ft
  968. .DE
  969. The time to do about 1,000,000 loads (the list wraps) is measured and
  970. reported. The time reported is pure latency time and may be zero even though
  971. the load instruction does not execute in zero time. Zero is defined as one
  972. clock cycle; in other words, the time reported is \fBonly\fP memory latency
  973. time, as it does not include the instruction execution time. It is assumed
  974. that all processors can do a load instruction in one processor cycle
  975. (not counting stalls). In other words, if the processor cache load time
  976. is 60ns on a 20ns processor, the load latency reported
would be 40ns; the additional 20ns is for the load instruction
  978. itself.\**
  979. .FS
  980. In retrospect, this was a bad idea because we calculate the clock
  981. rate to get the instruction execution time. If the clock rate is off,
  982. so is the load time.
  983. .FE
  984. Processors that can manage to get the load address out to the
  985. address pins before the end of the load cycle get some free time in this
  986. benchmark (we don't know of any processors that do that).
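.PP
The list of pointers might be built along the lines of the sketch
below; it assumes the stride is at least \f(CWsizeof(char *)\fP and
that the array size is a multiple of the stride. The timed loop then
simply repeats \f(CWp = *p\fP over the resulting chain.
.DS
.nf
.ft CW
#include <stdlib.h>

/*
 * Element i points to element i + stride;
 * the last element points back to the start
 * so that the walk wraps.
 */
char **
make_chain(size_t size, size_t stride)
{
    char *a = malloc(size);
    size_t i;

    if (!a)
        return (0);
    for (i = 0; i + stride < size; i += stride)
        *(char **)&a[i] = &a[i + stride];
    *(char **)&a[i] = a;  /* wrap */
    return ((char **)a);
}
.ft
.fi
.DE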
  987. .PP
  988. This benchmark has been validated by logic analyzer measurements
  989. on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer
  990. Research Center.
  991. .TSTART 1
  992. .so mem.pic
  993. .FEND "Memory latency" 1
  994. .PP
  995. Results from the memory latency benchmark are plotted as a series of data sets
  996. as shown in Figure \n[FIGURE].
  997. Each data set represents a stride size,
  998. with the array size varying from 512 bytes up to 8M or more.
  999. The curves contain a series of
  1000. horizontal plateaus, where each plateau represents a level in the
  1001. memory hierarchy.
  1002. The point where each plateau ends and the line rises marks the
  1003. end of that portion of the memory hierarchy (e.g., external cache).
  1004. Most machines have similar memory hierarchies:
  1005. on-board cache, external cache, main memory, and main memory plus TLB
  1006. miss costs.
  1007. There are variations: some processors are missing a cache, while
  1008. others add another cache to the hierarchy.
  1009. .\" XXX Larry please double-check this; I am going on dim memory...
  1010. For example, the Alpha 8400 has two on-board caches, one 8K
  1011. and the other 96K.
  1012. .PP
  1013. The cache line size can be derived by comparing curves and noticing which
  1014. strides are faster than main memory times. The smallest stride that is
  1015. the same as main memory speed is likely to be the cache line size because
  1016. the strides that are faster than memory are
  1017. getting more than one hit per cache line.
  1018. .\" Prefetching may confuse
  1019. .\" the issue because a demand read may stall behind a prefetch load,
  1020. .\" causing cache lines to appear twice as large as they are.
  1021. .\" XXX
  1022. .\" Larry --- can we use prime modulus arithmetic to set up pointer
  1023. .\" loops which might appear random but which really aren't and which
  1024. .\" hit every stride once before looping?
  1025. .\"
  1026. .\" XXX
  1027. .\" Larry --- is there any way we can defeat/disable prefetching
  1028. .\" so the cache line size can be more accurately determined?
  1029. .\"
  1030. .\" XXX
  1031. .\" Larry --- can we create a benchmark for TLB misses?
  1032. .\" I think it was Tom Rokicki who suggested that we create a
  1033. .\" benchmark where the data fits in the cache, but the pages don't
  1034. .\" fit in the TLB.
  1035. .\"
  1036. .\" XXX
  1037. .\" Larry --- is the description of the memory hierarchy correct?
  1038. .\" I am not sure I haven't added an extra level of external cache...
  1039. .EQ
  1040. delim $$
  1041. .EN
  1042. .PP
  1043. Figure \n[FIGURE] shows memory latencies on a nicely made machine,
  1044. a \s-1DEC\s0 Alpha.
  1045. We use this machine as the example
  1046. because it shows the latencies and sizes of
  1047. the on-chip level 1 and motherboard level 2 caches, and because it
  1048. has good all-around numbers, especially considering it can support a
  1049. 4M level 2 cache.
  1050. The on-board cache is $2 sup 13$ bytes or 8K, while the
  1051. external cache is $2 sup 19$ bytes or 512K.
  1052. .EQ
  1053. delim off
  1054. .EN
  1055. .TSTART
  1056. .so lat_allmem.tbl
  1057. .TEND "Cache and memory latency (ns)"
  1058. .nr MEMTABLE \n[TABLE]
  1059. .PP
  1060. Table \n[TABLE] shows the cache size, cache latency, and main memory
  1061. latency as extracted from the memory latency graphs.
  1062. The graphs and the tools for extracting the data are
  1063. included with \*[lmbench].
  1064. It is worthwhile to plot all of the graphs and examine them since the
  1065. table is missing some details, such as the
  1066. \s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache.
  1067. .PP
  1068. We sorted Table \n[TABLE] on level 2 cache latency because we think
  1069. that many applications will fit in the level 2 cache. The HP and IBM
  1070. systems have only one level of cache so we count that as both level 1
  1071. and level 2. Those two systems have remarkable cache performance for
  1072. caches of that size. In both cases, the cache delivers data in one
  1073. clock cycle after the load instruction.
  1074. .PP
  1075. HP systems usually focus on
large caches as close as possible to the processor. An older HP
  1077. multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped
  1078. cache with a 2K victim cache, accessible in one clock (16ns).\** That system is
  1079. primarily a database server.
  1080. .FS
  1081. The Usenix version of this paper had this as a set associate cache; that was
  1082. incorrect.
  1083. .FE
.PP
The IBM focus is on low latency, high
bandwidth memory. The IBM memory subsystem is good because all of
memory is close to the processor, but has the weakness that it is
extremely difficult to evolve the design to a multiprocessor system.
.PP
The 586 and PowerPC motherboards have quite poor second level caches;
the caches are not substantially better than main memory.
.PP
The Pentium Pro and Sun Ultra second level caches are of medium speed,
at 5-6 clocks of latency each. 5-6 clocks seems fast until it is compared
against the HP and IBM one cycle latency caches of similar size.
Given the tight integration of the Pentium Pro level 2 cache, it is
surprising that it has such high latencies.
.PP
The 300MHz DEC Alpha has a rather high 22 clock latency to the second
level cache, which is probably one of the reasons that they needed a 96K
level 1.5 cache. SGI and DEC have used large second level caches
to hide their long latency from main memory.
.PP
.NH 2
Operating system entry
.PP
Entry into the operating system is required for many system facilities.
When calculating the cost of a facility, it is useful to know how
expensive it is to perform a nontrivial entry into the operating system.
.PP
We measure nontrivial entry into the system by repeatedly writing one
word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but
discard the data. This particular entry point was chosen because it has
never been optimized in any system that we have measured. Other entry
points, typically \*[getpid] and \*[gettimeofday], are heavily used,
heavily optimized, and sometimes implemented as user-level library
routines rather than system calls.
A write to the \f(CW/dev/null\fP driver will go
through the system call table to \*[write], verify the user area as
readable, look up the file descriptor to get the vnode, call the vnode's
write function, and then return.
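.PP
A minimal sketch of this measurement (not the \*[lmbench] source; the
iteration count is arbitrary and error checking is omitted):
.DS
.ft CW
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>

#define N 100000

int
main(void)
{
    int i, word = 1, fd = open("/dev/null", O_WRONLY);
    struct timeval s, e;

    gettimeofday(&s, 0);
    for (i = 0; i < N; i++)
        write(fd, &word, sizeof(word));    /* one word per call */
    gettimeofday(&e, 0);
    printf("%.2f microseconds/call\en",
        ((e.tv_sec - s.tv_sec) * 1e6 + (e.tv_usec - s.tv_usec)) / N);
    return 0;
}
.ft
.DE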
.TSTART
.so ../Results/tmp/lat_nullsys.tbl
.TEND "Simple system call time (microseconds)"
.PP
Linux is the clear winner in the system call time. The reasons are
twofold: Linux is a uniprocessor operating system, without any
MP overhead, and Linux is a small operating system, without all
of the ``features'' accumulated by the commercial offerings.
.PP
Unixware and Solaris are doing quite well, given that they are both fairly
large, commercially oriented operating systems with a large accumulation
of ``features.''
.NH 2
Signal handling cost
.PP
Signals in Unix are a way to tell another process to handle an event. They
are to processes as interrupts are to the CPU.
.PP
Signal handling is often critical to layered systems. Some applications,
such as databases, software development environments, and threading libraries,
provide an operating system-like layer on top of the operating system,
making signal handling a critical path in many of these applications.
.PP
\*[lmbench] measures both signal installation and signal dispatching in two
separate loops, within the context of one process.
It measures signal handling by installing a signal handler and then repeatedly
sending itself the signal.
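.PP
A minimal sketch of the dispatch measurement (not the \*[lmbench]
source; \f(CWsignal\fP and an arbitrary iteration count stand in for
the real harness, and installation would be timed the same way around
the handler installation alone):
.DS
.ft CW
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/time.h>

#define N 100000

void handler(int sig) { }

int
main(void)
{
    int i;
    struct timeval s, e;

    signal(SIGUSR1, handler);
    gettimeofday(&s, 0);
    for (i = 0; i < N; i++)
        kill(getpid(), SIGUSR1);    /* send the signal to ourselves */
    gettimeofday(&e, 0);
    printf("%.2f microseconds/signal\en",
        ((e.tv_sec - s.tv_sec) * 1e6 + (e.tv_usec - s.tv_usec)) / N);
    return 0;
}
.ft
.DE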
.TSTART
.so ../Results/tmp/lat_signal.tbl
.TEND "Signal times (microseconds)"
.PP
Table \n[TABLE] shows the signal handling costs.
Note that there are no context switches in this benchmark; the signal goes
to the same process that generated the signal. In real applications,
the signals usually go to another process, which implies
that the true cost of sending that signal is the signal overhead plus the
context switch overhead. We wanted to measure signal and context
switch overheads separately since context
switch times vary widely among operating systems.
.PP
SGI does very well on signal processing,
especially since their hardware is of an older generation than
many of the others.
.PP
The Linux/Alpha signal handling numbers are so poor
that we suspect that this is a bug, especially given that the Linux/x86
numbers are quite reasonable.
.NH 2
Process creation costs
.PP
Process benchmarks are used to measure the basic process primitives,
such as creating a new process, running a different program, and context
switching. Process creation benchmarks are of particular interest
in distributed systems since many remote operations include the creation
of a remote process to shepherd the remote operation to completion.
Context switching is important for the same reasons.
.BU "Simple process creation" .
The Unix process creation primitive is \*[fork], which
creates a (virtually) exact copy of the calling process.
Unlike VMS and some other operating systems, Unix starts any new process
with a \*[fork].
Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and
``light,'' facts that many have been ignoring for some time.
.PP
\*[lmbench] measures simple process creation by creating a process
and immediately
exiting the child process. The parent process waits for the child
process to exit.
The benchmark is intended to measure the overhead for creating a
new thread of control, so it includes the \*[fork] and
the \*[exit] time.
.PP
The benchmark also includes a \f(CWwait\fP system call in the parent and
context switches from the parent to the child and back again. Given that
context switches of this sort are on the order of 20 microseconds and a
system call is on the order of 5 microseconds, and that the entire benchmark
time is on the order of a millisecond or more, the extra overhead
is insignificant.
Note that even this relatively simple task is very expensive and is
measured in milliseconds while most of the other operations we consider are
measured in microseconds.
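.PP
A minimal sketch of the simple process creation measurement (not the
\*[lmbench] source; the iteration count is arbitrary and error
checking is omitted):
.DS
.ft CW
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/time.h>

#define N 1000

int
main(void)
{
    int i;
    struct timeval s, e;

    gettimeofday(&s, 0);
    for (i = 0; i < N; i++) {
        if (fork() == 0)
            _exit(0);    /* child exits immediately */
        wait(0);         /* parent reaps the child */
    }
    gettimeofday(&e, 0);
    printf("%.2f milliseconds/fork\en",
        ((e.tv_sec - s.tv_sec) * 1e3 + (e.tv_usec - s.tv_usec) / 1e3) / N);
    return 0;
}
.ft
.DE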
.BU "New process creation" .
The preceding benchmark did not create a new application; it created a
copy of the old application. This benchmark measures the cost of creating a
new process and changing that process into a new application, which
forms the basis of every Unix command
line interface, or shell.
\*[lmbench] measures this facility by forking a new child and having that child
execute a new program \(em in this case, a tiny program that prints
``hello world'' and exits.
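.PP
A minimal sketch of this measurement (not the \*[lmbench] source; the
path \f(CW./hello\fP and the iteration count are assumptions, and
error checking is omitted):
.DS
.ft CW
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/time.h>

#define N 100

int
main(void)
{
    int i;
    struct timeval s, e;
    char *argv[] = { "hello", 0 };
    char *envp[] = { 0 };

    gettimeofday(&s, 0);
    for (i = 0; i < N; i++) {
        if (fork() == 0) {
            execve("./hello", argv, envp);
            _exit(1);    /* reached only if execve fails */
        }
        wait(0);
    }
    gettimeofday(&e, 0);
    printf("%.2f milliseconds/exec\en",
        ((e.tv_sec - s.tv_sec) * 1e3 + (e.tv_usec - s.tv_usec) / 1e3) / N);
    return 0;
}
.ft
.DE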
.PP
The startup cost is especially noticeable
on (some) systems that have shared libraries. Shared libraries can
introduce a substantial (tens of milliseconds) startup cost.
.\" XXX - statically linked example?
.TSTART
.so ../Results/tmp/lat_allproc.tbl
.TEND "Process creation time (milliseconds)"
.BU "Complicated new process creation" .
When programs start other programs, they frequently use one of
three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first
two interfaces start a new process by invoking the standard command
interpreter, \f(CW/bin/sh\fP, to start the process. Starting programs this way
guarantees that the shell will look for the requested application
in all of the places that the user would look \(em in other words, the shell
uses the user's $PATH variable as a list of places to find the
application. \*[execlp] is a C library routine which also looks for the
program using the user's $PATH variable.
.PP
Since this is a common way of starting applications, we felt it
was useful to show the costs of the generality.
.PP
We measure this by starting \f(CW/bin/sh\fP to start the same tiny
program we ran in the last case.
In Table \n[TABLE] the cost of asking the shell to go
look for the program is
quite large, frequently ten times as expensive as just creating a
new process, and four times as expensive as explicitly naming the location
of the new program.
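.PP
A minimal sketch of the shell-based measurement (not the \*[lmbench]
source; it assumes a \f(CWhello\fP program on the user's $PATH and
uses \*[system], which invokes \f(CW/bin/sh\fP):
.DS
.ft CW
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 100

int
main(void)
{
    int i;
    struct timeval s, e;

    gettimeofday(&s, 0);
    for (i = 0; i < N; i++)
        system("hello > /dev/null");    /* shell finds hello via $PATH */
    gettimeofday(&e, 0);
    printf("%.2f milliseconds/command\en",
        ((e.tv_sec - s.tv_sec) * 1e3 + (e.tv_usec - s.tv_usec) / 1e3) / N);
    return 0;
}
.ft
.DE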
.PP
The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results.
Given that the processor is one of the fastest, the problem is likely to be
software. There is room for substantial improvement in the Solaris
process creation code.
.NH 2
Context switching
.PP
Context switch time is defined here as
the time needed to save the state of one process and restore the state
of another process.
.PP
Context switches are frequently in the critical performance path of
distributed applications. For example, the multiprocessor versions
of the IRIX operating system use
processes to move data through the networking stack. This means that the
processing time for each new packet arriving at an idle system includes
the time needed to switch in the networking process.
.PP
Typical context switch benchmarks measure just the minimal context switch
time \(em the time to switch between two processes that are doing nothing
but context switching. We feel that this is
misleading because there are frequently more than two active processes,
and they usually have a larger working set (cache footprint)
than the benchmark processes.
.PP
Other benchmarks frequently include the cost of
the system calls needed to force the context switches.
For example, Ousterhout's context switch benchmark
measures context switch time plus a \*[read] and a \*[write]
on a pipe.
In many of the systems measured by \*[lmbench], the pipe overhead
varies between 30% and 300% of the context switch time, so we were
careful to factor out the pipe overhead.
.BU "Number of processes."
The context switch benchmark is implemented as
a ring of two to twenty processes that are connected with Unix pipes.
A token is passed from process to process, forcing context switches.
The benchmark measures the time needed to pass
the token two thousand times from process to process.
Each transfer of the token has two costs: the context switch, and
the overhead of passing the token.
In order to calculate just the context switching time, the benchmark first
measures the cost of passing the token through a ring of pipes in a
single process. This overhead time is defined as the cost of passing
the token and is not included in the reported context switch time.
.BU "Size of processes."
In order to measure more realistic context switch times, we add
an artificial variable size ``cache footprint'' to the switching
processes. The cost of the context switch then includes the cost
of restoring user-level state (cache footprint). The cache footprint
is implemented by having the process allocate an array of data\**
.FS
All arrays are at the same virtual
address in all processes.
.FE
and sum
the array as a series of integers after receiving the token but before
passing the token to the next process. Since most systems will cache data
across context switches, the working set for the benchmark is slightly
larger than the number of processes times the array size.
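.PP
The following sketch shows the shape of the ring and the cache
footprint just described (it is not the \*[lmbench] source: the
process count, token count, and footprint size are fixed rather than
variable, the single-process overhead measurement is omitted, and
there is no error checking):
.DS
.ft CW
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/time.h>

int
main(void)
{
    int nproc = 10, rounds = 2000, bytes = 32 * 1024;
    int i, j, r, p[20][2];
    volatile int sum;
    char token = 'x';
    int *data = malloc(bytes);
    struct timeval s, e;

    memset(data, 1, bytes);       /* array is inherited by every process */
    for (i = 0; i < nproc; i++)
        pipe(p[i]);
    for (i = 1; i < nproc; i++) {
        if (fork() == 0) {        /* child i: read, sum, pass on */
            for (r = 0; r < rounds; r++) {
                read(p[i][0], &token, 1);
                for (sum = j = 0; j < bytes / 4; j++)
                    sum += data[j];
                write(p[(i + 1) % nproc][1], &token, 1);
            }
            _exit(0);
        }
    }
    gettimeofday(&s, 0);
    for (r = 0; r < rounds; r++) {    /* parent is process 0 */
        write(p[1][1], &token, 1);
        read(p[0][0], &token, 1);
    }
    gettimeofday(&e, 0);
    for (i = 1; i < nproc; i++)
        wait(0);
    printf("%.2f microseconds/transfer, including pipe overhead\en",
        ((e.tv_sec - s.tv_sec) * 1e6 + (e.tv_usec - s.tv_usec)) /
        (double)(rounds * nproc));
    return 0;
}
.ft
.DE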
.PP
It is worthwhile to point out that the overhead mentioned above
also includes the cost of accessing the data, in the same way as
the actual benchmark. However, because the overhead is measured
in a single process, the cost is typically the cost with ``hot''
caches. In Figure 2, each size is plotted as a line, with
context switch times on the Y axis, number of processes on the
X axis, and the process size as the data set.
The process size and the hot cache overhead costs for
the pipe read/writes and any data access are what is labeled
as \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead
is in microseconds.
.PP
The context switch time does not include anything other than
the context switch, provided that all the benchmark processes fit in the
cache. If the total size of all of the benchmark processes is larger
than the cache size, the cost of each context switch will include cache
misses.
We are trying to show realistic context switch times as a
function of both size and number of processes.
.TSTART 1
.so ctx.pic
.FEND "Context switch times" 1
.PP
Results for an Intel Pentium Pro system running Linux at 167 MHz are
shown in Figure \n[FIGURE].
The data points on the figure are labeled with the working set
due to the sum of data in all of the processes. The actual working set is
larger, as it includes the process and kernel overhead as well.
One would expect the context switch times to stay constant until
the working set is
approximately the size of the second level cache. The Intel system has a
256K second level cache, and the context switch times
stay almost constant until about 256K (marked as .25M in the graph).
.BU "Cache issues"
The context switch benchmark is a deliberate measurement of the
effectiveness of the caches across process context switches. If the
cache does not include the process identifier (PID, also sometimes
called an address space identifier) as part of the address, then the
cache must be flushed on every context switch. If the cache does not map
the same virtual addresses from different processes to different cache
lines, then the cache will appear to be flushed on every context
switch.
.PP
If the caches did not retain data across context switches, there would be
no grouping at the lower left corner of Figure \n[FIGURE]; instead, the
graph would appear as a series of straight, horizontal, parallel lines.
The number of processes would not matter: the two-process case would be
just as bad as the twenty-process case, since the cache would not be
useful across context switches.
.TSTART
.so ../Results/tmp/ctx.tbl
.TEND "Context switch time (microseconds)"
.PP
We picked four points on the graph and extracted those values for Table
\n[TABLE]. The complete set of values, as well as tools to graph them,
are included with \*[lmbench].
.PP
Note that multiprocessor context switch times are frequently more expensive
than uniprocessor context switch times. This is because multiprocessor
operating systems tend to have very complicated scheduling code.
We believe that multiprocessor context switch times can be, and should be,
within 10% of the uniprocessor times.
.PP
Linux does quite well on context switching, especially on the more
recent architectures. By comparing the Linux two-process/0K case to the
Linux two-process/32K case, it is apparent that there is something wrong
with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can
find at least part of the cause: the second level cache latency for the
i586 is substantially worse than that of either the i686 or the Alpha.
.PP
Given the poor second level cache behavior of the PowerPC, it is surprising
that it does so well on context switches, especially in the larger sized cases.
.PP
The Sun Ultra 1 context switches quite well in part because of enhancements
to the register window handling in SPARC V9.
.NH 2
Interprocess communication latencies
.PP
Interprocess communication latency is important because many operations
are control messages to another process (frequently on another
system). The time to tell the remote process to
do something is pure overhead and is frequently in the critical path
of important functions such as distributed applications (e.g.,
databases, network servers).
.PP
The interprocess communication latency benchmarks typically have the
following form: pass a small message (a byte or so) back and forth between two
processes. The reported results are always the microseconds needed
to do one round trip. For one-way timing,
about half the round-trip time is a reasonable estimate. However, the CPU
cycles tend to be somewhat asymmetric for one trip: receiving is typically
more expensive than sending.
.BU "Pipe latency" .
Unix pipes are an interprocess communication mechanism implemented as
a one-way byte stream. Each end of the stream has an associated file
descriptor; one is the write descriptor and the other the read
descriptor.
.PP
Pipes are frequently used as a local IPC mechanism. Because of the
simplicity of pipes, they are frequently the fastest portable
communication mechanism.
.PP
Pipe latency is measured by creating a pair of pipes, forking a child process,
and passing a word back and forth. This benchmark is identical to the
two-process, zero-sized context switch benchmark, except that it includes
both the context switching time and the pipe overhead in the results.
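.PP
A minimal sketch of the pipe latency measurement (not the \*[lmbench]
source; the iteration count is arbitrary and error checking is
omitted):
.DS
.ft CW
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/time.h>

#define N 10000

int
main(void)
{
    int i, word = 0, ping[2], pong[2];
    struct timeval s, e;

    pipe(ping);
    pipe(pong);
    if (fork() == 0) {            /* child: echo the word back */
        for (i = 0; i < N; i++) {
            read(ping[0], &word, sizeof(word));
            write(pong[1], &word, sizeof(word));
        }
        _exit(0);
    }
    gettimeofday(&s, 0);
    for (i = 0; i < N; i++) {     /* parent: send, wait for the echo */
        write(ping[1], &word, sizeof(word));
        read(pong[0], &word, sizeof(word));
    }
    gettimeofday(&e, 0);
    wait(0);
    printf("%.2f microseconds/round trip\en",
        ((e.tv_sec - s.tv_sec) * 1e6 + (e.tv_usec - s.tv_usec)) / N);
    return 0;
}
.ft
.DE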
.nr NTABLE \n[TABLE]+1
.nr LTABLE \n[TABLE]
Table \n[NTABLE] shows the round trip latency from process A to process B
and back to process A.
.TSTART
.so ../Results/tmp/lat_pipe.tbl
.TEND "Pipe latency (microseconds)"
.PP
The time can be broken down into two context switches plus four system calls
plus the pipe overhead. The context switch component is that of two small
processes in Table \n[LTABLE].
This benchmark is identical to the context switch benchmark in
.RN Ousterhout90 .
.BU "TCP and RPC/TCP latency" .
TCP sockets may be viewed as an interprocess communication mechanism similar
to pipes with the added feature that TCP sockets work across machine
boundaries.
.PP
TCP and RPC/TCP connections are frequently used in low-bandwidth,
latency-sensitive applications. The default Oracle distributed
lock manager uses TCP sockets, and the locks per second available
from this service are accurately modeled by the TCP latency test.
.TSTART
.so ../Results/tmp/lat_tcp.tbl
.TEND "TCP latency (microseconds)"
.PP
Sun's RPC is layered either over TCP or over UDP.
The RPC layer is responsible for managing connections (the port mapper),
managing different byte orders and word sizes (XDR), and implementing a
remote procedure call abstraction.
Table \n[TABLE] shows the same benchmark with and
without the RPC layer to show the cost of the RPC implementation.
.PP
TCP latency is measured by having a server process that waits for connections
and a client process that connects to the server. The two processes then
exchange a word between them in a loop. The latency reported is one
round-trip time. The measurements in Table \n[TABLE] are local
or loopback measurements,
since our intent is to show the overhead of the software. The same benchmark
may be, and frequently is, used to measure host-to-host latency.
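.PP
A minimal sketch of the TCP latency measurement over loopback (not
the \*[lmbench] source; the port number is an arbitrary choice and
error handling is omitted):
.DS
.ft CW
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/wait.h>
#include <sys/time.h>

#define N    10000
#define PORT 5001

int
main(void)
{
    int i, word = 0, s, fd;
    struct sockaddr_in sin;
    struct timeval b, e;

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(PORT);
    sin.sin_addr.s_addr = inet_addr("127.0.0.1");

    s = socket(AF_INET, SOCK_STREAM, 0);
    bind(s, (struct sockaddr *)&sin, sizeof(sin));
    listen(s, 1);

    if (fork() == 0) {            /* client: send, wait for the echo */
        fd = socket(AF_INET, SOCK_STREAM, 0);
        connect(fd, (struct sockaddr *)&sin, sizeof(sin));
        gettimeofday(&b, 0);
        for (i = 0; i < N; i++) {
            write(fd, &word, sizeof(word));
            read(fd, &word, sizeof(word));
        }
        gettimeofday(&e, 0);
        printf("%.2f microseconds/round trip\en",
            ((e.tv_sec - b.tv_sec) * 1e6 + (e.tv_usec - b.tv_usec)) / N);
        _exit(0);
    }
    fd = accept(s, 0, 0);         /* server: echo the word back */
    for (i = 0; i < N; i++) {
        read(fd, &word, sizeof(word));
        write(fd, &word, sizeof(word));
    }
    wait(0);
    return 0;
}
.ft
.DE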
.PP
Note that the RPC layer frequently adds hundreds of microseconds of
additional latency. The problem is not the external data
representation (XDR) layer \(em the
data being passed back and forth is a byte, so there is no XDR to be done.
There is no justification for the extra cost; it is simply
an expensive implementation. DCE RPC is worse.
.TSTART
.so ../Results/tmp/lat_udp.tbl
.TEND "UDP latency (microseconds)"
.BU "UDP and RPC/UDP latency" .
UDP sockets are an alternative to TCP sockets. They differ in that UDP
sockets are unreliable messages that leave the retransmission issues to
the application. UDP sockets have a few advantages, however. They preserve
message boundaries, whereas TCP does not; and a single UDP socket may
send messages
to any number of other sockets, whereas TCP sends data to only one place.
.PP
UDP and RPC/UDP messages are commonly used in many client/server applications.
NFS is probably the most widely used RPC/UDP application in the world.
.PP
Like TCP latency, UDP latency is measured by having a server process
that waits for messages
and a client process that sends messages to the server. The two processes then
exchange a word between them in a loop. The latency reported is round-trip
time. The measurements in Table \n[TABLE] are local or loopback measurements,
since our intent is to show the overhead of the software.
Again, note that the RPC library can add hundreds of microseconds of extra
latency.
.\" .PP
.\" It is interesting to compare UDP latency with TCP latency. In many cases the
.\" TCP latency is \fBless\fP than the UDP latency. This flies in the face
.\" of conventional wisdom, which says that TCP is an inherently more expensive
.\" protocol than UDP. The reasons that TCP may appear faster are: in this
.\" benchmark, the protocol costs are dwarfed by the other costs (context
.\" switching, system calls, and driver overhead); and TCP is frequently
.\" hand-tuned for performance, while UDP is rarely hand-tuned.
.TSTART
.so ipc.tbl
.TEND "Remote latencies (microseconds)"
.BU "Network latency" .
We have a few results for over the wire latency included in Table \n[TABLE].
As might be expected, the most heavily used network interfaces (i.e., ethernet)
have the lowest latencies. The times shown include the time on the wire,
which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit
ethernet and FDDI, and less than 10 microseconds for Hippi.
.BU "TCP connection latency" .
TCP is a connection-based, reliable, byte-stream-oriented protocol. As
part of this reliability, a connection must be established before any
data can be transferred. The connection is accomplished by a ``three-way
handshake,'' an exchange of packets when the client attempts to connect
to the server.
.PP
Unlike UDP, where no connection is established, TCP sends packets
at startup time. If an application creates a TCP connection to send
one message, then the startup time can be a substantial
fraction of the total connection and transfer costs.
The benchmark shows that the connection cost is approximately half of
the total cost.
.PP
Connection cost is measured by having a server, registered using
the port mapper, waiting for connections. The client figures out where the
server is registered and then repeatedly times a \*[connect] system call to
the server. The socket is closed after each connect. Twenty connects
are completed and the fastest of them is used as the result. The time measured
will include two of the three packets that make up the three-way TCP handshake,
so the cost is actually greater than the times listed.
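.PP
A minimal sketch of the connection measurement (not the \*[lmbench]
source; it omits the port mapper registration and simply connects to a
local listener, the port number is an arbitrary choice, and error
checking is omitted):
.DS
.ft CW
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/time.h>

#define PORT 5002

int
main(void)
{
    int i, s, fd;
    double t, best = 1e9;
    struct sockaddr_in sin;
    struct timeval b, e;

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(PORT);
    sin.sin_addr.s_addr = inet_addr("127.0.0.1");

    s = socket(AF_INET, SOCK_STREAM, 0);
    bind(s, (struct sockaddr *)&sin, sizeof(sin));
    listen(s, 5);

    for (i = 0; i < 20; i++) {
        fd = socket(AF_INET, SOCK_STREAM, 0);
        gettimeofday(&b, 0);
        connect(fd, (struct sockaddr *)&sin, sizeof(sin));
        gettimeofday(&e, 0);
        close(accept(s, 0, 0));   /* drain the server side */
        close(fd);
        t = (e.tv_sec - b.tv_sec) * 1e6 + (e.tv_usec - b.tv_usec);
        if (t < best)
            best = t;
    }
    printf("%.2f microseconds/connect\en", best);
    return 0;
}
.ft
.DE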
.\" XXX Larry --- if a machine's clock granularity is on the order of
.\" 10 milliseconds, won't this benchmark run into granularity problems?
.TSTART
.so ../Results/tmp/lat_connect.tbl
.TEND "TCP connect latency (microseconds)"
.PP
Table \n[TABLE] shows that if the need is to send
a quick message to another process, given that most packets get through,
a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive
acknowledgments are needed, which they are in order to have an apples-to-apples
comparison with TCP). If the transmission medium is 10Mbit Ethernet, the
time on the wire will be approximately 65 microseconds each way, or 130
microseconds total. To do the same thing with a short-lived TCP
connection would cost 896 microseconds of wire time alone.
.PP
The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor
is the point to suggest that all messages should be UDP. In many cases,
the difference between 130 microseconds and 900 microseconds is
insignificant compared with other aspects of application performance.
However, if the application is very latency sensitive
and the transmission medium is slow (such as a serial link or a message
through many routers), then a UDP message may prove cheaper.
.NH 2
File system latency
.PP
File system latency is defined as the time required to create or delete
a zero length file.
We define it this way because in many file systems,
such as the BSD fast file system, the directory operations are done
synchronously in order to maintain on-disk integrity. Since the
file data is typically cached and sent to disk at some later date,
the file creation and deletion become the bottleneck
seen by an application. This bottleneck is substantial: to do
a synchronous update to a disk is a matter of tens of milliseconds.
In many cases, this bottleneck is much more of a perceived performance
issue than processor speed.
.PP
The benchmark creates 1,000 zero-sized files and then deletes them.
All the files are created in one directory and their names are
short, such as "a", "b", "c", ... "aa", "ab", ....
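.PP
A minimal sketch of the benchmark (not the \*[lmbench] source; the
file names differ from those described above and error checking is
omitted):
.DS
.ft CW
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>

#define N 1000

double
usecs(struct timeval *s, struct timeval *e)
{
    return (e->tv_sec - s->tv_sec) * 1e6 + (e->tv_usec - s->tv_usec);
}

int
main(void)
{
    int i;
    char name[32];
    struct timeval s, m, e;

    gettimeofday(&s, 0);
    for (i = 0; i < N; i++) {          /* create zero-length files */
        sprintf(name, "f%d", i);
        close(creat(name, 0666));
    }
    gettimeofday(&m, 0);
    for (i = 0; i < N; i++) {          /* now delete them */
        sprintf(name, "f%d", i);
        unlink(name);
    }
    gettimeofday(&e, 0);
    printf("%.1f microseconds/create, %.1f microseconds/delete\en",
        usecs(&s, &m) / N, usecs(&m, &e) / N);
    return 0;
}
.ft
.DE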
.TSTART
.so lat_fs.tbl
.TEND "File system latency (microseconds)"
.PP
The create and delete latencies are shown in Table \n[TABLE].
Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster
than the slowest systems. However, Linux does not guarantee
anything about the disk integrity; the directory operations are done in
memory. Other fast systems, such as SGI's XFS, use a log to guarantee the
file system integrity.
The slower systems, all those with ~10 millisecond file latencies, are
using synchronous writes to guarantee the file system integrity.
Unless Unixware has modified UFS substantially, it must be running in
an unsafe mode, since the FreeBSD UFS is much slower and both file
systems are basically the 4BSD fast file system.
.NH 2
Disk latency
.\" XXX - either get more results for this benchmark or delete it.
.\" I'd really like to not delete it - lmdd is probably the most
.\" useful tool and it gets the least press.
.PP
Included with \*[lmbench] is a small benchmarking program useful for
measuring disk and file I/O. \*[lmdd], which is patterned after
the Unix utility \f(CWdd\fP, measures both sequential and random I/O,
optionally generates patterns on output and checks them on input,
supports flushing the data from the buffer cache on systems that
support \f(CWmsync\fP, and has a very flexible user interface.
Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script
wrapped around \*[lmdd].
.PP
While we could have generated both sequential and random I/O results as
part of this paper, we did not because those benchmarks are heavily
influenced by the performance of the disk drives used in the test. We
intentionally measure only the system overhead of a SCSI command since
that overhead may become a bottleneck in large database configurations.
.PP
Some important applications, such as transaction processing, are
limited by random disk IO latency.
Administrators can increase the number of disk operations per
second by buying more disks, until the processor overhead becomes
the bottleneck.
The \f(CWlmdd\fP benchmark measures the processor overhead associated with each
disk operation, and it can provide an upper bound on the number of
disk operations the processor can support.
It is designed for SCSI disks, and it assumes that most
disks have 32-128K read-ahead buffers and that they can read ahead
faster than the processor can request the chunks of data.\**
.FS
This may not always be true: a processor could be fast enough to make the
requests faster than the rotating disk.
If we take 6M/second to be disk
speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or
81 microseconds/IO. We don't know of any processor/OS/IO controller
combinations that can do an IO in 81 microseconds.
.FE
.PP
The benchmark simulates a large number of disks by reading 512-byte
transfers sequentially from the raw disk device (raw disks are unbuffered
and are not read ahead by Unix).
Since the disk can read ahead faster than the system can request
data, the benchmark is doing small transfers of data from the
disk's track buffer.
Another way to look at this is that the benchmark
is doing memory-to-memory transfers across a SCSI channel.
It is possible to generate loads of more than 1,000 SCSI
operations/second on a single SCSI disk. For comparison, disks under
database load typically run at 20-80 operations per second.
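.PP
A minimal sketch of the measurement (not \*[lmdd]; the raw device path
is an assumption, the transfer count is arbitrary, error checking is
omitted, and some systems also require a sector-aligned buffer for raw
I/O):
.DS
.ft CW
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>

#define N 10000

int
main(void)
{
    int i, fd = open("/dev/rdsk/c0t1d0s0", O_RDONLY);
    char buf[512];
    struct timeval s, e;

    gettimeofday(&s, 0);
    for (i = 0; i < N; i++)           /* sequential 512-byte reads */
        read(fd, buf, sizeof(buf));
    gettimeofday(&e, 0);
    printf("%.1f microseconds/operation\en",
        ((e.tv_sec - s.tv_sec) * 1e6 + (e.tv_usec - s.tv_usec)) / N);
    return 0;
}
.ft
.DE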
.TSTART
.so ../Results/tmp/lat_disk.tbl
.TEND "SCSI I/O overhead (microseconds)"
.PP
The resulting overhead number represents a
\fBlower\fP bound on the overhead of a disk I/O.
The real overhead numbers will be higher on SCSI systems because
most SCSI controllers will not disconnect if the request can be
satisfied immediately.
During the benchmark, the processor simply sends the request and
transfers the data, while
during normal operation, the processor will send the request,
disconnect, get interrupted, reconnect, and transfer the data.
.PP
This technique can be used to discover how many drives a system can support
before the system becomes CPU-limited because it can produce the
overhead load of a fully configured system with just a few disks.
.NH 1
Future work
.PP
There are several known improvements and extensions that could be made
to \*[lmbench].
.BU "Memory latency" .
The current benchmark measures clean-read latency. By clean, we mean that
the cache lines being replaced are highly likely to be unmodified, so there
is no associated write-back cost. We would like to extend the benchmark
to measure dirty-read latency, as well as write latency. Other changes
include making the benchmark impervious to sequential prefetching and
measuring TLB miss cost.
.BU "MP benchmarks" .
None of the benchmarks in \*[lmbench] is designed to measure any
multiprocessor features directly. At a minimum, we could measure
cache-to-cache latency as well as cache-to-cache bandwidth.
.BU "Static vs. dynamic processes" .
In the process creation section, we allude to the cost of starting up processes
that use shared libraries. When we figure out how to create statically linked
processes on all or most systems, we could quantify these costs exactly.
.BU "McCalpin's stream benchmark" .
We will probably incorporate part or all of this benchmark into \*[lmbench].
.BU "Automatic sizing" .
We have enough technology that we could determine the size of the external
cache and autosize the memory used such that the external cache had no effect.
.BU "More detailed papers" .
There are several areas that could yield some interesting papers. The
memory latency section could use an in-depth treatment, and the
context switching section could turn into an interesting discussion of
caching technology.
.NH 1
Conclusion
.PP
\*[lmbench] is a useful, portable micro-benchmark suite designed to
measure important aspects of system performance. We have found that a good
memory subsystem is at least as important as the processor speed.
As processors get faster and faster, more and more of the system design
effort will need to move to the cache and memory subsystems.
.NH 1
Acknowledgments
.PP
Many people have provided invaluable help and insight into both the
benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers
were especially helpful.
We thank all of them
and especially thank:
Ken Okin \s-1(SUN)\s0,
Kevin Normoyle \s-1(SUN)\s0,
Satya Nishtala \s-1(SUN)\s0,
Greg Chesson \s-1(SGI)\s0,
John Mashey \s-1(SGI)\s0,
Neal Nuckolls \s-1(SGI)\s0,
John McCalpin \s-1(Univ. of Delaware)\s0,
Ron Minnich \s-1(Sarnoff)\s0,
Chris Ruemmler \s-1(HP)\s0,
Tom Rokicki \s-1(HP)\s0,
and
John Weitz \s-1(Digidesign)\s0.
.PP
We would also like to thank all of the people who have run the
benchmark and contributed their results; none of this would have been possible
without their assistance.
.PP
Our thanks to
all of the free software community for tools that were used during this
project.
\*[lmbench] is currently developed on Linux, a copylefted Unix written by
Linus Torvalds and his band of happy hackers.
This paper and all of the
\*[lmbench] documentation were produced using
the \f(CWgroff\fP suite of tools written by James Clark.
Finally, all of the data processing of the results is done with
\f(CWperl\fP written by Larry Wall.
.PP
Sun Microsystems, and in particular Paul Borrill,
supported the initial development of this project. Silicon Graphics
has supported ongoing development that turned into far more time than we
ever imagined. We are grateful to both of these companies for their
financial support.
.NH 1
Obtaining the benchmarks
.PP
The benchmarks are available at
.ft I
http://reality.sgi.com/employees/lm_engr/lmbench.tgz
.ft
as well as via a mail server.
You may request the latest version of \*[lmbench] by sending email
to \fIarchives@slovax.engr.sgi.com\fP with \fIlmbench-current*\fP
as the subject.
.\" .R1
.\" bibliography references
.\" .R2
.\"********************************************************************
.\" Redefine the IP paragraph format so it won't insert a useless line
.\" break when the paragraph tag is longer than the indent distance
.\"
.de @IP
.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2)
.par*start \\n[\\n[.ev]:ai] 0
.if !'\\$1'' \{\
. \" Divert the label so as to freeze any spaces.
. di par*label
. in 0
. nf
\&\\$1
. di
. in
. fi
. chop par*label
. ti -\\n[\\n[.ev]:ai]u
. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c
. el \{\
\\*[par*label]
.\". br
. \}
. rm par*label
.\}
..
.\"********************************************************************
.\" redefine the way the reference tag is printed so it is enclosed in
.\" square brackets
.\"
.de ref*end-print
.ie d [F .IP "[\\*([F]" 2
.el .XP
\\*[ref*string]
..
.\"********************************************************************
.\" Get journal number entries right. Now will print as V(N) rather
.\" than the awful V, N.
.\"
.de ref*add-N
.ref*field N "" ( )
..
.\"********************************************************************
.\" Get journal volume entries right. Now will print as V(N) rather
.\" than the awful V, N.
.\"
.de ref*add-V
.ref*field V , "" "" ""
..
.\"********************************************************************
.\" Get the date entry right. Should not be enclosed in parentheses.
.\"
.de ref*add-D
.ref*field D ","
..
.R1
accumulate
sort A+DT
database references
label-in-text
label A.nD.y-2
bracket-label [ ] ", "
bibliography references
.R2
.so bios