- .\" This document is GNU groff -mgs -t -p -R -s
- .\" It will not print with normal troffs, it uses groff features, in particular,
- .\" long names for registers & strings.
- .\" Deal with it and use groff - it makes things portable.
- .\"
- .\" $X$ xroff -mgs -t -p -R -s $file
- .\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more
- .\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr
- .VARPS
- .\" Define a page top that looks cool
- .\" HELLO CARL! To turn this off, s/PT/oldPT/
- .de draftPT
- .\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP'
- ..
- .de PT
- .if \\n%>1 \{\
- . sp -.1i
- . ps 14
- . ft 3
- . nr big 24
- . nr space \\w'XXX'
- . nr titlewid \\w'\\*[title]'
- . nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2
- . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25'
- . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0
- . ce 1
- \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar]
- . ps
- . sp -.70
- . ps 12
- \\l'\\n[LL]u'
- . ft
- . ps
- .\}
- ..
- .\" Define a page bottom that looks cool
- .\" HELLO CARL! To turn this off, s/BT/oldBT/
- .de draftBT
- .\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP'
- ..
- .de BT
- . ps 9
- \v'-1'\\l'\\n(LLu'
- . sp -1
- . tl '\(co 1995 \\*[author]'\\*(DY'%'
- . ps
- ..
- .de SP
- . if t .sp .5
- . if n .sp 1
- ..
- .de BU
- . SP
- . ne 2
- \(bu\
- . if \\n[.$] \fB\\$1\fP\\$2
- ..
- .nr FIGURE 0
- .nr TABLE 0
- .nr SMALL .25i
- .de TSTART
- . KF
- . if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0
- . ps -1
- . vs -1
- ..
- .de TEND
- . ps +1
- . vs +1
- . if \\n[.$]=2 \{\
- . sp -.5
- \s(24\\l'\\n[pg@colw]u'\s0 \}
- . sp .25
- . nr TABLE \\n[TABLE]+1
- . ce 1
- \fBTable \\n[TABLE].\ \ \\$1\fP
- . SP
- . KE
- ..
- .de FEND
- . ps +1
- . vs +1
- . if \\n[.$]=2 \{\
- . sp -.5
- \s(24\\l'\\n[pg@colw]u'\s0 \}
- . sp .25
- . nr FIGURE \\n[FIGURE]+1
- . ce 1
- \fBFigure \\n[FIGURE].\ \ \\$1\fP
- . SP
- . KE
- ..
- .\" Configuration
- .nr PI 3n
- .nr HM .95i
- .nr FM 1i
- .nr PO .95i
- .if t .po .95i
- .nr LL 6.5i
- .if n .nr PO 0i
- .if n .nr LL 7.75i
- .nr PS 10
- .nr VS \n(PS+1
- .ds title Portable tools for performance analysis
- .ds author Larry McVoy
- .ds lmbench \f(CWlmbench\fP
- .ds lmdd \f(CWlmdd\fP
- .ds bcopy \f(CWbcopy\fP
- .ds connect \f(CWconnect\fP
- .ds execlp \f(CWexeclp\fP
- .ds exit \f(CWexit\fP
- .ds fork \f(CWfork\fP
- .ds gcc \f(CWgcc\fP
- .ds getpid \f(CWgetpid\fP
- .ds gettimeofday \f(CWgettimeofday\fP
- .ds kill \f(CWkill\fP
- .ds memmove \f(CWmemmove\fP
- .ds mmap \f(CWmmap\fP
- .ds popen \f(CWpopen\fP
- .ds read \f(CWread\fP
- .ds stream \f(CWstream\fP
- .ds system \f(CWsystem\fP
- .ds uiomove \f(CWuiomove\fP
- .ds write \f(CWwrite\fP
- .ds yield \f(CWyield\fP
- .\" References stuff
- .de RN \"Reference Name: .RN $1 -- prints the reference prettily
- .\"[\s-2\\$1\s+2]\\$2
- [\s-1\\$1\s0]\\$2
- ..
- .\" .R1
- .\" sort A+DT
- .\" database references
- .\" label-in-text
- .\" label A.nD.y-2
- .\" bracket-label \*([. \*(.] ", "
- .\" .R2
- .TL
- \s(14lmbench: Portable tools for performance analysis\s0\**
- .AU
- \s+2\fR\*[author]\fP\s0
- .AI
- \fI\s+2Silicon Graphics, Inc.\s0\fP
- .AU
- \s+2\fRCarl Staelin\fP
- .AI
- \s+2\fIHewlett-Packard Laboratories\s0\fP
- .SP
- .AB
- \*[lmbench] is a micro-benchmark suite designed to focus
- attention on the basic building blocks of many
- common system applications, such as databases, simulations,
- software development, and networking. In almost all
- cases, the individual tests are the result of analysis and isolation
- of a customer's actual performance problem.
- .\" .SP
- These tools can be, and currently are, used to compare different
- system implementations from different vendors.
- In several cases,
- the benchmarks have uncovered previously unknown bugs and design flaws.
- The results have shown a strong
- correlation between memory system performance and overall performance.
- .\" XXX - MP versus uniprocessors?
- \*[lmbench] includes an extensible database of
- results from systems current as of late 1995.
- .AE
- .if t .MC 3.05i
- .FS
- This paper first appeared in the January 1996 Usenix conference proceedings.
- The version you are reading has new results as well as some corrections.
- .FE
- .NH 1
- Introduction
- .PP
- \*[lmbench]
- provides a suite of benchmarks that attempt to measure the most commonly
- found performance bottlenecks in a wide range of system applications.
- These bottlenecks have been identified, isolated, and reproduced in a set
- of small micro-benchmarks, which measure
- system latency and bandwidth of data movement among
- the processor and memory, network, file system, and disk.
- The intent is to produce numbers that real
- applications can reproduce, rather than the frequently
- quoted and somewhat less reproducible marketing performance numbers.
- .PP
- The benchmarks focus on latency and bandwidth because
- performance issues are usually caused by latency
- problems, bandwidth problems, or some combination of the two. Each benchmark
- exists because it captures some unique performance problem present in
- one or more important applications.
- For example, the TCP latency benchmark is an accurate predictor of the
- Oracle distributed lock manager's performance, the memory latency
- benchmark gives a strong indication of Verilog simulation performance,
- and the file system latency benchmark models a critical path
- in software development.
- .PP
- \*[lmbench] was developed to identify and evaluate system performance
- bottlenecks present in many machines in 1993-1995. It is entirely
- possible that computer architectures will have changed and advanced
- enough in the next few years to render parts of this benchmark suite
- obsolete or irrelevant.
- .PP
- \*[lmbench] is already in widespread use at many sites by both end
- users and system designers. In some cases, \*[lmbench] has provided
- the data necessary to discover and correct critical performance
- problems that might have gone unnoticed. \*[lmbench] uncovered a
- problem in Sun's memory management software
- that made all pages map to the same location in the cache, effectively
- turning a 512 kilobyte (K) cache into a 4K cache.
- .PP
- \*[lmbench] measures only a system's ability
- to transfer data between processor, cache, memory, network, and disk.
- It does not measure other parts of the system, such as the graphics subsystem,
- nor is it a MIPS, MFLOPS,
- throughput, saturation, stress, graphics, or multiprocessor test suite.
- It is frequently run on multiprocessor (MP) systems to compare their performance
- against
- uniprocessor systems, but it does not take advantage of any multiprocessor
- features.
- .PP
- The benchmarks are written using standard, portable
- system interfaces and facilities commonly
- used by applications, so
- \*[lmbench]
- is portable and comparable over a wide set of Unix systems.
- \*[lmbench] has been run on
- AIX,
- BSDI,
- HP-UX,
- IRIX,
- Linux,
- FreeBSD,
- NetBSD,
- OSF/1,
- Solaris,
- and
- SunOS.
- Part of the suite has been run on Windows/NT as well.
- .PP
- \*[lmbench]
- is freely distributed under
- the Free Software Foundation's General Public License
- .RN Stallman89 ,
- with the additional restriction
- that results may be reported only if the benchmarks are unmodified.
- .NH 1
- Prior work
- .PP
- Benchmarking and performance analysis is not a new endeavor.
- There are too many other benchmark suites to list all of
- them here. We compare \*[lmbench]
- to a set of similar benchmarks.
- .BU "I/O (disk) benchmarks" :
- IOstone
- .RN Park90
- wants to be an I/O benchmark, but actually measures the memory
- subsystem; all of the tests fit easily in the cache.
- IObench
- .RN Wolman89
- is a systematic file system and disk benchmark, but it is
- complicated and unwieldy.
- In
- .RN McVoy91
- we reviewed many I/O benchmarks and found them all
- lacking because they took too long to run and
- were too complex a solution to a fairly simple problem. We wrote a
- small, simple I/O benchmark, \*[lmdd], that
- measures sequential and random I/O far
- faster than either IOstone or IObench. As part of
- .RN McVoy91
- the results from \*[lmdd] were checked against IObench (as well as some other
- Sun internal I/O benchmarks). \*[lmdd] proved to be more accurate than any
- of the other benchmarks.
- At least one disk vendor
- routinely uses \*[lmdd] to do performance testing of its disk drives.
- .SP
- Chen and Patterson
- .RN "Chen93, Chen94"
- measure I/O performance under a
- variety of workloads that are automatically varied to test the
- range of the system's performance.
- Our efforts differ in that we are more interested in the CPU overhead
- of a single request, rather than the capacity of the system as a whole.
- .BU "Berkeley Software Distribution's microbench suite" :
- The BSD effort generated an extensive set of
- test benchmarks to do regression testing (both quality and performance)
- of the BSD releases.
- We did not use this as a basis for our work (although we used ideas)
- for the following reasons:
- (a) missing tests \(em such as memory latency,
- (b) too many tests: the results tended to be obscured under a mountain
- of numbers,
- and (c) wrong copyright \(em we wanted the
- Free Software Foundation's General Public License.
- .BU "Ousterhout's Operating System benchmark" :
- .RN Ousterhout90
- proposes several system benchmarks to measure system call
- latency, context switch time, and file system performance.
- We used the same ideas as a basis for our work, while trying to
- go farther. We measured a more complete set of
- primitives, including some hardware measurements; went into greater depth
- on some of the tests, such as context switching; and went to great
- lengths to make the benchmark portable and extensible.
- .BU "Networking benchmarks" :
- \f(CWNetperf\fP measures networking bandwidth and latency and
- was written by Rick Jones of Hewlett-Packard.
- \*[lmbench] includes a smaller,
- less complex benchmark that produces similar results.
- .SP
- \f(CWttcp\fP is a widely used benchmark in the Internet community.
- Our version of the same benchmark
- routinely delivers bandwidth numbers that are within 2% of the numbers
- quoted by \f(CWttcp\fP.
- .BU "McCalpin's stream benchmark" :
- .RN McCalpin95
- has memory bandwidth measurements and results for a large number of
- high-end systems.
- We did not use these because we discovered them only after
- we had results using our versions.
- We will probably include McCalpin's benchmarks in \*[lmbench]
- in the future.
- .PP
- In summary, we rolled our own because we wanted simple, portable
- benchmarks that accurately measured a wide variety of operations that we
- consider crucial to performance on today's systems. While portions of
- other benchmark suites include similar work, none includes all of it,
- few are as portable, and almost all are far more complex. Less filling,
- tastes great.
- .NH 1
- Benchmarking notes
- .NH 2
- Sizing the benchmarks
- .PP
- The proper sizing of various benchmark parameters is crucial to ensure
- that the benchmark is measuring the right component of system performance.
- For example, memory-to-memory copy
- speeds are dramatically affected by the location of the data: if
- the size parameter is so small that
- the data fits in a cache, then the performance may be as much as ten times
- faster than if the data is in memory.
- On the other hand, if the memory size parameter is so big that the data
- is paged to disk, then performance may be slowed to such an extent
- that the benchmark seems to `never finish.'
- .PP
- \*[lmbench] takes the following approach to the cache and memory
- size issues:
- .BU
- All of the benchmarks that could be affected
- by cache size are run in a loop,
- with increasing sizes (typically powers of two) until some maximum size
- is reached. The results may then be plotted to see where the benchmark
- no longer fits in the cache.
- .BU
- The benchmark verifies that there is sufficient memory to run all of the
- benchmarks in main memory. A small test program allocates as much memory
- as it can, clears the memory,
- and then strides through that memory a page at a time, timing
- each reference. If any reference takes more than a few microseconds, the
- page is no longer in memory. The test program starts small and works forward
- until either enough memory is seen as present or the memory limit is reached.
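- .SP
- In outline, the probe looks like the following sketch; the
- \f(CWusecs\fP helper, the five-microsecond threshold, and the
- page-touch details are illustrative, not the actual \*[lmbench] code:
- .DS
- .nf
- .ft CW
- #include <string.h>
- #include <sys/time.h>
- 
- static double
- usecs(void)
- {
-         struct timeval t;
- 
-         gettimeofday(&t, 0);
-         return (t.tv_sec * 1000000. + t.tv_usec);
- }
- 
- /* Return about how much of "size" bytes is really in memory. */
- int
- resident(char *base, int size, int pagesize)
- {
-         int     off;
-         double  before;
- 
-         memset(base, 0, size);          /* force the pages in */
-         for (off = 0; off < size; off += pagesize) {
-                 before = usecs();
-                 base[off]++;            /* touch one word per page */
-                 if (usecs() - before > 5.)
-                         return (off);   /* too slow: not resident */
-         }
-         return (size);
- }
- .ft
- .fi
- .DE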
- .NH 2
- Compile time issues
- .PP
- The GNU C compiler, \*[gcc], is the compiler we chose because
- it gave the most reproducible results across platforms.
- When \*[gcc] was not present, we used the vendor-supplied \f(CWcc\fP.
- All of the benchmarks were compiled with optimization \f(CW-O\fP
- except
- the benchmarks that calculate clock speed and the context switch times,
- which must be compiled without optimization in order to produce
- correct results. No other optimization flags were enabled because
- we wanted results that would be commonly seen by application writers.
- .PP
- All of the benchmarks were linked using the default manner of
- the target system. For most if not all systems, the
- binaries were linked using shared libraries.
- .NH 2
- Multiprocessor issues
- .PP
- All of the multiprocessor systems ran the benchmarks in the same way as
- the uniprocessor systems. Some systems allow users to pin processes
- to a particular CPU, which sometimes results in better cache reuse. We
- do not pin processes because it defeats the MP scheduler.
- .\" XXX - I should do this on an IP19 and mark it as pinned.
- In certain cases, this decision yields interesting results discussed later.
- .NH 2
- Timing issues
- .LP
- .sp -.5
- .BU "Clock resolution" :
- The benchmarks measure the elapsed time by reading the system clock via the
- \*[gettimeofday] interface. On some systems this interface has a resolution
- of 10 milliseconds, a long time relative to many of the benchmarks which
- have results measured in tens to hundreds of microseconds. To compensate for
- the coarse clock resolution, the benchmarks are hand-tuned to measure
- many operations within a single time interval lasting for many clock ticks.
- Typically, this is done by executing the operation in a small loop, sometimes
- unrolled if the operation is exceedingly fast, and then dividing
- the loop time by the loop count.
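- .SP
- The technique, in miniature (a sketch, not the \*[lmbench] timing
- harness; \f(CWgetppid\fP stands in for the operation being measured
- and the iteration count is arbitrary):
- .DS
- .nf
- .ft CW
- #include <stdio.h>
- #include <unistd.h>
- #include <sys/time.h>
- 
- int
- main(void)
- {
-         struct timeval s, e;
-         int     i, n = 1000000;
-         double  us;
- 
-         gettimeofday(&s, 0);
-         for (i = 0; i < n; ++i)
-                 getppid();      /* the measured operation */
-         gettimeofday(&e, 0);
-         us = (e.tv_sec - s.tv_sec) * 1000000. +
-             (e.tv_usec - s.tv_usec);
-         printf("%f usecs/op\en", us / n);
-         return (0);
- }
- .ft
- .fi
- .DE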
- .BU Caching :
- If the benchmark expects the data to be in the cache, the benchmark is
- typically run several times; only the last result is recorded.
- .SP
- If the benchmark does not want to measure cache performance it sets
- the size parameter larger than the cache. For example, the
- \*[bcopy] benchmark by default copies 8 megabytes to 8 megabytes,
- which largely defeats any second-level cache in use today. (Note that the
- benchmarks are not trying to defeat the file or process page cache,
- only the hardware caches.)
- .br
- .di bigtable
- .ev keep
- .ps 8
- .vs 9
- .so systems.tbl
- .ps \n[PS]
- .vs \n[VS]
- .nr TABLE \n[TABLE]+1
- .ce 1
- .SP
- \fBTable \n[TABLE].\ \ System descriptions.\fP
- .SP
- .di
- .ev
- .nr WHEN \n[dn]+\n[FM]
- .nr THT \n[dn]
- .de print*table
- ' sp .5
- ' ev keep
- ' nf
- ' bigtable
- . ne 1
- . wh -\n[WHEN]u skip*page
- . fi
- . ev
- ..
- .de skip*page
- ' sp \n[THT]u
- . wh -\n[WHEN]u
- ..
- .wh -\n[WHEN]u print*table
- .BU Variability :
- The results of some benchmarks, most notably the context switch benchmark, had a tendency
- to vary quite a bit, up to 30%. We suspect that the
- operating system is not using the same set of physical
- pages each time a process is created and we are seeing the effects of
- collisions in the external caches. We compensate by running the
- benchmark in a loop and taking the minimum result. Users interested in
- the most accurate data are advised to verify the results on their
- own platforms.
- .PP
- Many of the results included in the database were donated by users
- and were not created by the authors.
- Good benchmarking practice suggests that one should run the benchmarks
- as the only user of a machine, without other resource intensive
- or unpredictable processes or daemons.
- .NH 2
- Using the \f(CBlmbench\fP database
- .PP
- \*[lmbench] includes a database of results that
- is useful for comparison purposes. It is quite easy to
- build the source, run the benchmark, and produce a table of results
- that includes the run. All of the tables in this paper were produced
- from the database included in \*[lmbench]. This paper is also
- included with \*[lmbench] and may be reproduced incorporating new results.
- For more information, consult the file \f(CWlmbench-HOWTO\fP in the
- \*[lmbench] distribution.
- .NH 1
- Systems tested
- .PP
- \*[lmbench] has been run on a wide variety of platforms. This
- paper includes results from a representative subset of machines and
- operating systems.
- Comparisons between similar hardware running different operating
- systems can be very illuminating, and we have included a few examples
- in our results.
- .PP
- The systems are briefly characterized in Table 1. Please note that the list prices
- are very approximate, as is the year of introduction.
- The SPECInt92 numbers are a little suspect since
- some vendors have been ``optimizing'' for certain parts of SPEC. We try to
- quote the original SPECInt92 numbers where we can.
- .NH 2
- Reading the result tables
- .PP
- Throughout the rest of this paper, we present tables of results for many of the
- benchmarks. All of the tables are sorted, from best to worst. Some tables
- have multiple columns of results and those tables are sorted on only one of
- the columns. The sorted column's heading will be in \fBbold\fP.
- .NH 1
- Bandwidth benchmarks
- .PP
- By bandwidth, we mean the rate at which a particular facility can move
- data.
- We attempt to measure the data movement ability of a number of
- different facilities:
- library \*[bcopy],
- hand-unrolled \*[bcopy],
- direct-memory read and write (no copying),
- pipes,
- TCP sockets,
- the \*[read] interface,
- and
- the \*[mmap] interface.
- .NH 2
- Memory bandwidth
- .PP
- Data movement is fundamental to any operating system.
- In the past, performance
- was frequently measured in MFLOPS because floating point units were
- slow enough that microprocessor systems were
- rarely limited by memory bandwidth. Today, floating point units are usually much
- faster than memory bandwidth, so many current MFLOP ratings cannot be
- maintained using memory-resident data; they are ``cache only'' ratings.
- .PP
- We measure the ability to
- copy, read, and write data over a varying set of sizes.
- There are too many results to report all of them here, so we concentrate on
- large memory transfers.
- .PP
- We measure copy bandwidth two ways. The first is the user-level library
- \*[bcopy] interface.
- The second is a hand-unrolled loop that loads and stores
- aligned 8-byte words.
- In both cases, we took care to
- ensure that the source and destination locations would not map to the same
- lines if any of the caches were direct-mapped.
- In order to test memory bandwidth rather than cache bandwidth,
- both benchmarks copy an 8M\** area to another 8M area.
- (As secondary caches reach 16M, these benchmarks will have to
- be resized to reduce caching effects.)
- .FS
- Some of the PCs had less than 16M of available memory;
- those machines copied 4M.
- .FE
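- .PP
- The hand-unrolled loop is similar in spirit to the sketch below; the
- unrolling depth and the use of \f(CWdouble\fP for aligned 8-byte words
- are illustrative, not the exact benchmark code:
- .DS
- .nf
- .ft CW
- /* Copy aligned 8-byte words, 64 bytes per pass. */
- void
- copy8(double *dst, double *src, int bytes)
- {
-         int     n = bytes / (8 * sizeof(double));
- 
-         while (n-- > 0) {
-                 dst[0] = src[0]; dst[1] = src[1];
-                 dst[2] = src[2]; dst[3] = src[3];
-                 dst[4] = src[4]; dst[5] = src[5];
-                 dst[6] = src[6]; dst[7] = src[7];
-                 dst += 8; src += 8;
-         }
- }
- .ft
- .fi
- .DE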
- .PP
- The copy results actually represent one-half to one-third of the memory
- bandwidth used to obtain those results since we are reading and writing
- memory. If the cache line size is larger than the word stored, then
- the written cache line will typically be read before it is written. The
- actual amount of memory bandwidth used varies because some architectures
- have special instructions specifically designed for the \*[bcopy]
- function. Those architectures will move twice as much memory as
- reported by this benchmark; less advanced architectures move three
- times as much memory: the memory read, the memory read because it is
- about to be overwritten, and the memory written.
- .PP
- The \*[bcopy] results reported in Table 2
- may be correlated with John McCalpin's \*[stream]
- .RN McCalpin95
- benchmark results in the following manner:
- the \*[stream] benchmark reports all of the memory moved
- whereas the \*[bcopy] benchmark reports the bytes copied. So our
- numbers should be approximately one-half to one-third of his numbers.
- .PP
- Memory reading is measured by an unrolled loop that sums up a series of
- integers. On most (perhaps all) systems measured the integer
- size is 4 bytes. The loop is unrolled such that most compilers generate
- code that uses a constant offset with the load, resulting in a load and
- an add for each word of memory. The add is an integer add that completes
- in one cycle on all of the processors. Given that today's processor
- typically cycles at 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000
- ns per cache line, the results reported here should be dominated by the
- memory subsystem, not the processor add unit.
- .PP
- The memory contents are added up because almost all C compilers
- would optimize out the whole loop when optimization was turned on, and
- would generate far too many instructions without optimization.
- The solution is to
- add up the data and pass the result as an unused argument to the
- ``finish timing'' function.
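- .PP
- The read loop, then, looks roughly like this sketch (the unrolling
- depth is illustrative; the sum is returned so the loop survives
- optimization):
- .DS
- .nf
- .ft CW
- /* One load and one integer add per word of memory. */
- int
- sumit(int *p, int nwords)
- {
-         int     sum = 0, n = nwords / 8;
- 
-         while (n-- > 0) {
-                 sum += p[0] + p[1] + p[2] + p[3] +
-                     p[4] + p[5] + p[6] + p[7];
-                 p += 8;
-         }
-         return (sum);   /* pass to the finish-timing routine */
- }
- .ft
- .fi
- .DE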
- .PP
- Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect
- that pure reads should run at roughly twice the speed of \*[bcopy].
- Exceptions to this rule should be studied, for exceptions indicate a bug
- in the benchmarks, a problem in \*[bcopy], or some unusual hardware.
- .TSTART
- .so ../Results/tmp/bw_allmem.tbl
- .TEND "Memory bandwidth (MB/s)"
- .PP
- Memory writing is measured by an unrolled loop that stores a value into
- an integer (typically a 4 byte integer) and then increments the pointer.
- The processor cost of each memory operation is approximately the same
- as the cost in the read case.
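- .PP
- The corresponding write loop, as a sketch:
- .DS
- .nf
- .ft CW
- /* One store per word of memory. */
- void
- storeit(int *p, int nwords)
- {
-         int     n = nwords / 8;
- 
-         while (n-- > 0) {
-                 p[0] = 1; p[1] = 1; p[2] = 1; p[3] = 1;
-                 p[4] = 1; p[5] = 1; p[6] = 1; p[7] = 1;
-                 p += 8;
-         }
- }
- .ft
- .fi
- .DE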
- .PP
- The numbers reported in Table \n[TABLE]
- are not the raw hardware speed in some cases.
- The Power2\** is capable of up to 800M/sec read rates
- .FS
- Someone described this machine as a $1,000 processor on a $99,000 memory
- subsystem.
- .FE
- .RN McCalpin95
- and HP PA RISC (and other prefetching)
- systems also do better if higher levels of code optimization are used
- and/or the code is hand-tuned.
- .PP
- The Sun libc bcopy in Table \n[TABLE]
- is faster because it uses a hardware-specific bcopy
- routine built on instructions new in SPARC V9 that were added specifically
- for memory movement.
- .PP
- The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because,
- according to Intel, the write transaction turns into a read followed by
- a write to maintain cache consistency for MP systems.
- .NH 2
- IPC bandwidth
- .PP
- Interprocess communication bandwidth is frequently a performance issue.
- Many Unix applications are composed of several processes communicating
- through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation
- system that prepared this paper, the \f(CWX Window System\fP, remote file access,
- and \f(CWWorld Wide Web\fP servers.
- .PP
- Unix pipes are an interprocess communication mechanism implemented as
- a one-way byte stream. Each end of the stream has an associated file
- descriptor; one is the write descriptor and the other the read
- descriptor.
- TCP sockets are similar
- to pipes except they are bidirectional and can cross machine
- boundaries.
- .PP
- Pipe bandwidth is measured by creating two processes, a writer and a
- reader, which transfer 50M of data in 64K transfers.
- The transfer size was chosen so that the overhead of system calls
- and context switching would not dominate the benchmark time.
- The reader prints the timing results, which guarantees that all
- data has been moved before the timing is finished.
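- .PP
- The structure of the pipe benchmark is sketched below; the timing
- calls are elided and error handling is omitted for brevity:
- .DS
- .nf
- .ft CW
- #include <unistd.h>
- 
- #define XFER    (64 << 10)      /* 64K per transfer */
- #define TOTAL   (50 << 20)      /* 50M in all */
- 
- int
- main(void)
- {
-         static char buf[XFER];
-         int     fd[2], n, moved = 0;
- 
-         pipe(fd);
-         if (fork() == 0) {      /* child: writer */
-                 for (; moved < TOTAL; moved += XFER)
-                         write(fd[1], buf, XFER);
-                 _exit(0);
-         }
-         /* parent: reader; stop timing after the last read */
-         while (moved < TOTAL && (n = read(fd[0], buf, XFER)) > 0)
-                 moved += n;
-         return (0);
- }
- .ft
- .fi
- .DE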
- .PP
- TCP bandwidth is measured similarly, except the data is transferred in
- 1M page-aligned transfers instead of 64K transfers. If the TCP
- implementation supports it, the send and receive socket buffers are
- enlarged to 1M, instead of the default 4-60K. We have found that
- setting the transfer size equal to the socket buffer size produces the
- greatest throughput across the widest range of implementations.
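- .PP
- The buffer enlargement uses the standard socket option interface,
- roughly as in this sketch (whether the request is honored is
- implementation-dependent):
- .DS
- .nf
- .ft CW
- #include <sys/types.h>
- #include <sys/socket.h>
- 
- /* Ask for "size" byte send and receive buffers. */
- void
- sockbuf(int sock, int size)
- {
-         setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-             (char *)&size, sizeof(size));
-         setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
-             (char *)&size, sizeof(size));
- }
- .ft
- .fi
- .DE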
- .TSTART
- .so ../Results/tmp/bw_ipc.tbl
- .TEND "Pipe and local TCP bandwidth (MB/s)"
- .PP
- \*[bcopy] is important to this test because the
- pipe write/read is typically implemented as a \*[bcopy] into the kernel
- from the writer and then a \*[bcopy] from the kernel to the reader.
- Ideally, these results would be approximately one-half of the
- \*[bcopy] results. It is possible for the kernel \*[bcopy]
- to be faster than the C library \*[bcopy] since the kernel may have
- access to \*[bcopy] hardware unavailable to the C library.
- .PP
- It is interesting to compare pipes with TCP because the TCP benchmark is
- identical to the pipe benchmark except for the transport mechanism.
- Ideally, the TCP bandwidth would be as good as the pipe
- bandwidth. It is not widely known that the
- majority of the TCP cost is in the \*[bcopy], the checksum,
- and the network interface driver.
- The checksum and the driver may be safely eliminated in the loopback
- case and if the costs have been eliminated, then TCP should be just as
- fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to
- see that Solaris and HP-UX have done this optimization.
- .PP
- Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the
- pipe transfers are done in 64K buffers, a size that frequently fits in
- caches, while the bcopy is typically an 8M-to-8M copy, which does not
- fit in the cache.
- .PP
- In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than
- the SGI MP on pipe bandwidth because of caching effects \(em in the UP
- case, both processes share the cache; on the MP, each process is
- communicating with a different cache.
- .PP
- All of the TCP results in Table \n[TABLE] are in loopback mode \(em that
- is, both ends of the socket are on the same machine. It was impossible
- to get remote networking results for all the machines included in this
- paper. We are interested in receiving more results for identical
- machines with a dedicated network connecting them. The results we have
- for over-the-wire TCP bandwidth are shown below.
- .TSTART
- .so tcp_bw.tbl
- .TEND "Remote TCP bandwidth (MB/s)"
- .PP
- The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE].
- The SGI Hippi interface has hardware support for TCP checksums and
- the IRIX operating system uses virtual memory tricks to avoid copying
- data as much as possible.
- For larger transfers, SGI Hippi has reached 92MB/s over TCP.
- .PP
- 100baseT is looking quite competitive when compared to FDDI in Table
- \n[TABLE], even though FDDI has packets that are almost three times
- larger. We wonder how long it will be before we see gigabit ethernet
- interfaces.
- .NH 2
- Cached I/O bandwidth
- .PP
- Experience has shown us that reusing data in the file system
- page cache can be a performance issue. This
- section measures that operation through two interfaces, \*[read] and
- \*[mmap].
- The benchmark here is not an I/O benchmark in that no disk activity is
- involved.
- We wanted to measure the overhead
- of reusing data, an overhead that is CPU intensive, rather than disk intensive.
- .PP
- The \*[read] interface copies data from the kernel's file system page cache into the
- process's buffer, using 64K buffers. The transfer size was chosen
- to minimize the kernel entry overhead while
- remaining realistically sized.
- .PP
- The difference between the \*[bcopy] and the \*[read] benchmarks
- is the cost of the file and virtual memory system overhead. In most
- systems, the \*[bcopy] speed should be faster than the \*[read] speed. The
- exceptions usually have hardware specifically designed
- for the \*[bcopy] function and that hardware may be available only to
- the operating system.
- .PP
- The \*[read] benchmark is implemented by rereading a file
- (typically 8M) in 64K
- buffers. Each buffer is summed as a series of integers in the user
- process. The summing is done for two reasons: for an apples-to-apples
- comparison the memory-mapped benchmark needs to touch all the data,
- and the file system can sometimes transfer data into memory faster than the
- processor can read the data.
- For example, \s-1SGI\s0's XFS can move data into memory at
- rates in excess of 500M per second, but it can move data into
- the cache at only 68M per second. The intent is to measure performance
- delivered to the application, not DMA performance to memory.
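- .PP
- In outline, the reread loop looks like this sketch (timing elided):
- .DS
- .nf
- .ft CW
- #include <fcntl.h>
- #include <unistd.h>
- 
- #define CHUNK   (64 << 10)      /* 64K buffers */
- 
- int
- reread(char *file)
- {
-         static int buf[CHUNK / sizeof(int)];
-         int     fd, n, i, sum = 0;
- 
-         fd = open(file, O_RDONLY);
-         while ((n = read(fd, buf, CHUNK)) > 0)
-                 for (i = 0; i < n / (int)sizeof(int); ++i)
-                         sum += buf[i];
-         close(fd);
-         return (sum);
- }
- .ft
- .fi
- .DE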
- .TSTART
- .so ../Results/tmp/bw_reread2.tbl
- .TEND "File vs. memory bandwidth (MB/s)"
- .PP
- The \*[mmap] interface provides a way to access the kernel's file cache
- without copying the data.
- The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M)
- into the
- process's address space. The file is then summed to force the data
- into the cache.
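- .PP
- A sketch of the \*[mmap] variant (timing and error handling elided):
- .DS
- .nf
- .ft CW
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <sys/mman.h>
- #include <fcntl.h>
- #include <unistd.h>
- 
- int
- mmap_reread(char *file)
- {
-         struct stat sb;
-         int     fd, sum = 0;
-         int     *p, *end;
- 
-         fd = open(file, O_RDONLY);
-         fstat(fd, &sb);
-         p = (int *)mmap(0, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
-         for (end = p + sb.st_size / sizeof(int); p < end; ++p)
-                 sum += *p;
-         close(fd);
-         return (sum);
- }
- .ft
- .fi
- .DE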
- .PP
- In Table \n[TABLE],
- a good system will have \fIFile read\fP as fast as (or even faster than)
- \fILibc bcopy\fP because as the file system overhead goes to zero, the
- file reread case is virtually the same as the library \*[bcopy] case.
- However, file reread can be faster because the kernel may have access to
- \*[bcopy] assist hardware not available to the C library.
- Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP
- performance, but \*[mmap] is often dramatically worse.
- Judging by the results, this looks to be a
- potential area for operating system improvements.
- .PP
- In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes
- full advantage of the memory subsystem from inside the kernel.
- The mmap reread is probably slower because of the lower clock rate;
- the page faults start to show up as a significant cost.
- .PP
- It is surprising that the Sun Ultra1 was able to bcopy at the high
- rates shown in Table 2 but did not show those rates for file reread
- in Table \n[TABLE].
- HP has the opposite problem: file reread is faster than bcopy,
- perhaps because the kernel \*[bcopy] has access to hardware support.
- .PP
- The Unixware system has outstanding mmap reread rates, better than
- systems of substantially higher cost. Linux needs to do some work on
- the \f(CWmmap\fP code.
- .NH 1
- Latency measurements
- .PP
- Latency is an often-overlooked
- area of performance problems, possibly because resolving latency issues
- is frequently much harder than resolving bandwidth issues. For example,
- memory bandwidth may be increased by making wider cache lines and increasing
- memory ``width'' and interleave,
- but memory latency can be improved only by shortening paths or increasing
- (successful) prefetching.
- The first step toward improving latency is understanding the
- current latencies in a system.
- .PP
- The latency measurements included in this suite are
- memory latency,
- basic operating system entry cost,
- signal handling cost,
- process creation times,
- context switching,
- interprocess communication,
- .\" virtual memory system latency,
- file system latency,
- and disk latency.
- .NH 2
- Memory read latency background
- .PP
- In this section, we expend considerable effort to define the different memory
- latencies and to explain and justify our benchmark.
- The background is a bit tedious but important, since we believe the
- memory
- latency measurements to be one of the most thought-provoking and useful
- measurements in \*[lmbench].
- .PP
- The most basic latency measurement is memory latency since most of
- the other latency measurements can be expressed in terms of memory
- latency. For example, context switches require saving the current
- process state and loading the state of the next process. However, memory
- latency is rarely accurately measured and frequently misunderstood.
- .PP
- Memory read latency has many definitions;
- the most common,
- in increasing time order,
- are memory chip cycle time, processor-pins-to-memory-and-back time,
- load-in-a-vacuum time, and back-to-back-load time.
- .BU "Memory chip cycle latency" :
- Memory chips are rated in nanoseconds; typical speeds are around 60ns.
- A general overview on DRAM architecture may be found in
- .RN Hennessy96 .
- The
- specific information we describe here is from
- .RN Toshiba94
- and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0
- \s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the
- time from
- .ps -1
- .nr width \w'R\&A\&S'
- .nr height \n[rst]+1000
- RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u'
- .ps
- assertion to the time when
- the data will be available on the \s-1DRAM\s0 pins (assuming
- .ps -1
- .nr width \w'C\&A\&S'
- .nr height \n[rst]+1000
- CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u'
- .ps
- access time requirements were met).
- While it is possible
- to get data out of a \s-1DRAM\s0 in 60ns, that is not all of
- the time involved. There is a precharge time that must occur after
- every access.
- .RN Toshiba94
- quotes 110ns as the random read or write cycle time and this
- time is more representative of the cycle time.
- .\" For example, most systems offer a wide range of memory
- .\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number
- .\" of simms range from 1 to 16. The more simms there are, the more
- .\" capacitance there is in the memory subsystem. More capacitance means
- .\" longer setup times for the fully populated memory subsystem. System
- .\" designers have to allow time for this setup.
- .\" For more details, consult [XXX - reference on DRAM].
- .\" This is sometimes referred to as the chip latency. The
- .\" chip cycle time is the chip latency plus the time required to restore
- .\" the data in the capacitors which is often referred to as the precharge
- .\" time. This means that 60 nanosecond memory chips really are more like
- .\" 100 nanosecond memory chips. Some systems operate memory in ``page
- .\" mode'' or ``static column'' memory systems hold either RAS or CAS and
- .\" allow subsequent accesses in the same row or column in one cycle instead
- .\" of two.
- .BU "Pin-to-pin latency" :
- This number represents the time needed
- for the memory request to travel from the processor's pins to the memory
- subsystem and back again. Many vendors have used the pin-to-pin
- definition of memory latency in their reports. For example,
- .RN Fenwick95
- while describing the \s-1DEC\s0 8400
- quotes memory latencies of 265ns; a careful
- reading of that paper shows that these are pin-to-pin numbers. In spite
- of the historical precedent in vendor reports, this definition of memory
- latency is misleading since it ignores actual delays seen when a load
- instruction is immediately followed by a use of the data being loaded.
- The number of additional cycles inside the processor can be significant
- and grows more significant with today's highly pipelined architectures.
- .PP
- It is worth noting that the pin-to-pin numbers
- include the amount of time it takes to charge
- the lines going to the \s-1SIMM\s0s, a time that increases with the
- (potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean
- more capacitance, which results in longer charge times. This is one reason
- why personal computers frequently have better memory latencies than
- workstations: the PCs typically have less memory capacity.
- .BU "Load-in-a-vacuum latency" :
- A load in a vacuum is the time that the processor will wait for one load that
- must be fetched from main memory (i.e., a cache miss). The ``vacuum''
- means that there is no other activity on the system bus, including no other
- loads.
- While this number is frequently used as the memory latency, it is not very
- useful. It is basically a ``not to exceed'' number important only for
- marketing reasons.
- Some architects point out that since most processors implement nonblocking
- loads (the load does not cause a stall until the data is used), the perceived
- load latency may be much less than the real latency. When pressed, however,
- most will admit that cache misses occur in bursts, resulting in perceived
- latencies of at least the load-in-a-vacuum latency.
- .BU "Back-to-back-load latency" :
- Back-to-back-load latency is the time that each load takes, assuming
- that the instructions before and after are also cache-missing loads.
- Back-to-back loads may take longer than loads in a vacuum for the
- following reason: many systems implement something known as \fIcritical
- word first\fP, which means that the subblock of the cache line that
- contains the word being loaded is delivered to the processor before the
- entire cache line has been brought into the cache. If another load
- occurs quickly enough after the processor gets restarted from the
- current load, the second load may stall because the cache is still
- busy filling the cache line for the previous load. On some systems,
- such as the current implementation of UltraSPARC,
- the difference between back-to-back and load-in-a-vacuum latencies is about 35%.
- .PP
- \*[lmbench] measures back-to-back-load latency because it is the
- only measurement that may be easily measured from software and
- because we feel that it is what most software developers consider to be memory
- latency. Consider the following C code fragment:
- .DS
- .nf
- .ft CW
- p = head;
- while (p->p_next)
- p = p->p_next;
- .ft
- .fi
- .DE
- On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the
- load. A 300 MHz processor has a 3.33ns cycle time, so the loop
- could execute in slightly less than 10ns. However, the load itself
- takes 400ns on a 300 MHz \s-1DEC\s0 8400. In other words, the
- instructions cost 10ns but the load stalls for 400ns. Another
- way to look at it is that 400/3.33, or 120, nondependent,
- nonloading instructions following the load would be needed
- to hide the load latency.
- Because superscalar processors typically execute multiple operations
- per clock cycle, they need even more useful operations between cache
- misses to keep the processor from stalling.
- .PP
- This benchmark illuminates the tradeoffs in processor cache design.
- Architects like large cache lines, up to 64 bytes or so, because
- the prefetch effect of gathering a whole line increases
- hit rate given reasonable spatial locality.
- Small stride sizes have high spatial locality and should have higher
- performance, but large stride sizes have poor spatial locality causing
- the system to prefetch useless data.
- So the benchmark provides the following insight into negative
- effects of large line prefetch:
- .BU
- Multi-cycle fill operations are typically atomic events at the
- caches, and sometimes block other cache accesses until they
- complete.
- .BU
- Caches are typically single-ported. Having a large line prefetch
- of unused data causes extra bandwidth
- demands at the cache, and can cause increased access latency for
- normal cache accesses.
- .PP
- In summary, we believe that processors are so fast that the average
- load latency for cache misses will be closer to the
- back-to-back-load number than to the load-in-a-vacuum number. We are
- hopeful that the industry will standardize on this definition of
- memory latency.
- .NH 2
- Memory read latency
- .PP
- The entire memory hierarchy can be measured, including on-board data
- cache latency and size, external data cache latency and size, and
- main memory latency.
- Instruction caches are not measured.
- TLB miss latency can also be measured, as in
- .RN Saavedra92 ,
- but we stopped at main memory. Measuring TLB miss time is problematic
- because different systems map different amounts of memory with their
- TLB hardware.
- .PP
- The benchmark varies two parameters, array size and array stride.
- For each size, a list of pointers is created for all of the different
- strides. Then the list is walked thus:
- .DS
- .ft CW
- mov r4,(r4) # C code: p = *p;
- .ft
- .DE
- The time to do about 1,000,000 loads (the list wraps) is measured and
- reported. The time reported is pure latency time and may be zero even though
- the load instruction does not execute in zero time. Zero is defined as one
- clock cycle; in other words, the time reported is \fBonly\fP memory latency
- time, as it does not include the instruction execution time. It is assumed
- that all processors can do a load instruction in one processor cycle
- (not counting stalls). In other words, if the processor cache load time
- is 60ns on a 20ns processor, the load latency reported
- would be 40ns; the additional 20ns is for the load instruction
- itself.\**
- .FS
- In retrospect, this was a bad idea because we calculate the clock
- rate to get the instruction execution time. If the clock rate is off,
- so is the load time.
- .FE
- Processors that can manage to get the load address out to the
- address pins before the end of the load cycle get some free time in this
- benchmark (we don't know of any processors that do that).
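- .PP
- The list construction and walk look roughly like this sketch
- (assuming the stride is at least \f(CWsizeof(char *)\fP; the real
- benchmark differs in detail):
- .DS
- .nf
- .ft CW
- #include <stdlib.h>
- 
- char *
- loads(int size, int stride)
- {
-         char    *array = malloc(size);
-         char    **p;
-         int     i, n = size / stride;
- 
-         for (i = 0; i < n - 1; ++i)     /* point at the next element */
-                 *(char **)(array + i * stride) =
-                     array + (i + 1) * stride;
-         *(char **)(array + (n - 1) * stride) = array;   /* wrap */
- 
-         p = (char **)array;
-         for (i = 0; i < 1000000; ++i)
-                 p = (char **)*p;        /* the measured load: p = *p */
-         return ((char *)p);     /* defeat dead-code elimination */
- }
- .ft
- .fi
- .DE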
- .PP
- This benchmark has been validated by logic analyzer measurements
- on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer
- Research Center.
- .TSTART 1
- .so mem.pic
- .FEND "Memory latency" 1
- .PP
- Results from the memory latency benchmark are plotted as a series of data sets
- as shown in Figure \n[FIGURE].
- Each data set represents a stride size,
- with the array size varying from 512 bytes up to 8M or more.
- The curves contain a series of
- horizontal plateaus, where each plateau represents a level in the
- memory hierarchy.
- The point where each plateau ends and the line rises marks the
- end of that portion of the memory hierarchy (e.g., external cache).
- Most machines have similar memory hierarchies:
- on-board cache, external cache, main memory, and main memory plus TLB
- miss costs.
- There are variations: some processors are missing a cache, while
- others add another cache to the hierarchy.
- .\" XXX Larry please double-check this; I am going on dim memory...
- For example, the Alpha 8400 has two on-board caches, one 8K
- and the other 96K.
- .PP
- The cache line size can be derived by comparing curves and noticing which
- strides are faster than main memory times. The smallest stride that is
- the same as main memory speed is likely to be the cache line size because
- the strides that are faster than memory are
- getting more than one hit per cache line.
- .\" Prefetching may confuse
- .\" the issue because a demand read may stall behind a prefetch load,
- .\" causing cache lines to appear twice as large as they are.
- .\" XXX
- .\" Larry --- can we use prime modulus arithmetic to set up pointer
- .\" loops which might appear random but which really aren't and which
- .\" hit every stride once before looping?
- .\"
- .\" XXX
- .\" Larry --- is there any way we can defeat/disable prefetching
- .\" so the cache line size can be more accurately determined?
- .\"
- .\" XXX
- .\" Larry --- can we create a benchmark for TLB misses?
- .\" I think it was Tom Rokicki who suggested that we create a
- .\" benchmark where the data fits in the cache, but the pages don't
- .\" fit in the TLB.
- .\"
- .\" XXX
- .\" Larry --- is the description of the memory hierarchy correct?
- .\" I am not sure I haven't added an extra level of external cache...
- .EQ
- delim $$
- .EN
- .PP
- Figure \n[FIGURE] shows memory latencies on a nicely made machine,
- a \s-1DEC\s0 Alpha.
- We use this machine as the example
- because it shows the latencies and sizes of
- the on-chip level 1 and motherboard level 2 caches, and because it
- has good all-around numbers, especially considering it can support a
- 4M level 2 cache.
- The on-board cache is $2 sup 13$ bytes or 8K, while the
- external cache is $2 sup 19$ bytes or 512K.
- .EQ
- delim off
- .EN
- .TSTART
- .so lat_allmem.tbl
- .TEND "Cache and memory latency (ns)"
- .nr MEMTABLE \n[TABLE]
- .PP
- Table \n[TABLE] shows the cache size, cache latency, and main memory
- latency as extracted from the memory latency graphs.
- The graphs and the tools for extracting the data are
- included with \*[lmbench].
- It is worthwhile to plot all of the graphs and examine them since the
- table is missing some details, such as the
- \s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache.
- .PP
- We sorted Table \n[TABLE] on level 2 cache latency because we think
- that many applications will fit in the level 2 cache. The HP and IBM
- systems have only one level of cache so we count that as both level 1
- and level 2. Those two systems have remarkable cache performance for
- caches of that size. In both cases, the cache delivers data in one
- clock cycle after the load instruction.
- .PP
- HP systems usually focus on
- large caches as close as possible to the processor. An older HP
- multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped
- cache with a 2K victim cache, accessible in one clock (16ns).\** That system is
- primarily a database server.
- .FS
- The Usenix version of this paper had this as a set-associative cache; that was
- incorrect.
- .FE
- .PP
- The IBM focus is on low latency, high
- bandwidth memory. The IBM memory subsystem is good because all of
- memory is close to the processor, but has the weakness that it is
- extremely difficult to evolve the design to a multiprocessor system.
- .PP
- The 586 and PowerPC motherboards have quite poor second-level caches;
- the caches are not substantially better than main memory.
- .PP
- The Pentium Pro and Sun Ultra second level caches are of medium speed
- at 5-6 clocks latency each. 5-6 clocks seems fast until it is compared
- against the HP and IBM one cycle latency caches of similar size.
- Given the tight integration of the Pentium Pro level 2 cache, it is
- surprising that it has such high latencies.
- .PP
- The 300MHz DEC Alpha has a rather high 22-clock latency to the second
- level cache, which is probably one of the reasons that they needed a 96K
- level 1.5 cache. SGI and DEC have used large second level caches
- to hide their long latency from main memory.
- .NH 2
- Operating system entry
- .PP
- Entry into the operating system is required for many system facilities.
- When calculating the cost of a facility, it is useful to know how
- expensive it is to perform a nontrivial entry into the operating system.
- .PP
- We measure nontrivial entry into the system by repeatedly writing one
- word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but
- discard the data. This particular entry point was chosen because it has
- never been optimized in any system that we have measured. Other entry
- points, typically \*[getpid] and \*[gettimeofday], are heavily used,
- heavily optimized, and sometimes implemented as user-level library
- routines rather than system calls.
- A write to the \f(CW/dev/null\fP driver will go
- through the system call table to \*[write], verify the user area as
- readable, look up the file descriptor to get the vnode, call the vnode's
- write function, and then return.
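- .PP
- The measurement loop is tiny (a sketch; the iteration count is
- arbitrary and the timing calls are elided):
- .DS
- .nf
- .ft CW
- #include <fcntl.h>
- #include <unistd.h>
- 
- int
- main(void)
- {
-         int     fd, i, word = 0;
- 
-         fd = open("/dev/null", O_WRONLY);
-         for (i = 0; i < 100000; ++i)    /* time, divide by count */
-                 write(fd, &word, 4);
-         return (0);
- }
- .ft
- .fi
- .DE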
- .TSTART
- .so ../Results/tmp/lat_nullsys.tbl
- .TEND "Simple system call time (microseconds)"
- .PP
- Linux is the clear winner in the system call time. The reasons are
- twofold: Linux is a uniprocessor operating system, without any
- MP overhead, and Linux is a small operating system, without all
- of the ``features'' accumulated by the commercial offerings.
- .PP
- Unixware and Solaris are doing quite well, given that they are both fairly
- large, commercially oriented operating systems with a large accumulation
- of ``features.''
- .NH 2
- Signal handling cost
- .PP
- Signals in Unix are a way to tell another process to handle an event. They
- are to processes as interrupts are to the CPU.
- .PP
- Signal handling is often critical to layered systems. Some applications,
- such as databases, software development environments, and threading libraries
- provide an operating system-like layer on top of the operating system,
- making signal handling a critical path in many of these applications.
- .PP
- \*[lmbench] measures both signal installation and signal dispatching in two separate
- loops, within the context of one process.
- It measures signal handling by installing a signal handler and then repeatedly
- sending itself the signal.
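- .PP
- In outline (a sketch; the real benchmark times installation and
- dispatch in separate loops and deals with signal-reinstallation
- semantics portably):
- .DS
- .nf
- .ft CW
- #include <signal.h>
- #include <unistd.h>
- 
- void
- handler(int sig)
- {
- }
- 
- int
- main(void)
- {
-         int     i;
- 
-         signal(SIGUSR1, handler);       /* installation */
-         for (i = 0; i < 100000; ++i)
-                 kill(getpid(), SIGUSR1);        /* dispatch */
-         return (0);
- }
- .ft
- .fi
- .DE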
- .TSTART
- .so ../Results/tmp/lat_signal.tbl
- .TEND "Signal times (microseconds)"
- .PP
- Table \n[TABLE] shows the signal handling costs.
- Note that there are no context switches in this benchmark; the signal goes
- to the same process that generated the signal. In real applications,
- the signals usually go to another process, which implies
- that the true cost of sending that signal is the signal overhead plus the
- context switch overhead. We wanted to measure signal and context
- switch overheads separately since context
- switch times vary widely among operating systems.
- .PP
- SGI does very well on signal processing,
- especially since their hardware is of an older generation than
- many of the others.
- .PP
- The Linux/Alpha signal handling numbers are so poor
- that we suspect that this is a bug, especially given that the Linux/x86
- numbers are quite reasonable.
- .NH 2
- Process creation costs
- .PP
- Process benchmarks are used to measure the basic process primitives,
- such as creating a new process, running a different program, and context
- switching. Process creation benchmarks are of particular interest
- in distributed systems since many remote operations include the creation
- of a remote process to shepherd the remote operation to completion.
- Context switching is important for the same reasons.
- .BU "Simple process creation" .
- The Unix process creation primitive is \*[fork], which
- creates a (virtually) exact copy of the calling process.
- Unlike VMS and some other operating systems, Unix starts any new process
- with a \*[fork].
- Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and
- ``light,'' facts that many have been ignoring for some time.
- .PP
- \*[lmbench] measures simple process creation by creating a process
- and immediately
- exiting the child process. The parent process waits for the child
- process to exit.
- The benchmark is intended to measure the overhead for creating a
- new thread of control, so it includes the \*[fork] and
- the \*[exit] time.
- .PP
- The benchmark also includes a \f(CWwait\fP system call in the parent and
- context switches from the parent to the child and back again. Given that
- context switches of this sort are on the order of 20 microseconds and a
- system call is on the order of 5 microseconds, and that the entire benchmark
- time is on the order of a millisecond or more, the extra overhead
- is insignificant.
- Note that even this relatively simple task is very expensive and is
- measured in milliseconds while most of the other operations we consider are
- measured in microseconds.
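.PP
The core of the measurement is a loop like the following sketch
(simplified from the idea, not the actual \*[lmbench] source;
timing and error handling are omitted):
.DS
.ft CW
/* Fork a child that exits immediately; reap it in the parent. */
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
    int     i, n = 1000;

    for (i = 0; i < n; i++) {       /* time this loop */
        if (fork() == 0)
            _exit(0);               /* child: exit at once */
        wait(0);                    /* parent: reap the child */
    }
    return (0);
}
.ft
.DE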
- .BU "New process creation" .
- The preceding benchmark did not create a new application; it created a
- copy of the old application. This benchmark measures the cost of creating a
new process and changing that process into a new application, which
forms the basis of every Unix command
line interface, or shell.
- \*[lmbench] measures this facility by forking a new child and having that child
- execute a new program \(em in this case, a tiny program that prints
- ``hello world'' and exits.
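.PP
In sketch form (simplified, with error handling omitted; the path
\f(CW./hello\fP stands in for the tiny test program):
.DS
.ft CW
/* Fork a child and turn it into a new application. */
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
    char    *argv[] = { "hello", 0 };
    char    *envp[] = { 0 };

    if (fork() == 0) {
        execve("./hello", argv, envp);
        _exit(1);                   /* reached only on failure */
    }
    wait(0);
    return (0);
}
.ft
.DE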
- .PP
- The startup cost is especially noticeable
- on (some) systems that have shared libraries. Shared libraries can
- introduce a substantial (tens of milliseconds) startup cost.
- .\" XXX - statically linked example?
- .TSTART
- .so ../Results/tmp/lat_allproc.tbl
- .TEND "Process creation time (milliseconds)"
- .BU "Complicated new process creation" .
- When programs start other programs, they frequently use one of
- three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first
- two interfaces start a new process by invoking the standard command
- interpreter, \f(CW/bin/sh\fP, to start the process. Starting programs this way
- guarantees that the shell will look for the requested application
- in all of the places that the user would look \(em in other words, the shell
- uses the user's $PATH variable as a list of places to find the
- application. \*[execlp] is a C library routine which also looks for the
- program using the user's $PATH variable.
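.PP
The three interfaces are used roughly as follows (a sketch;
\f(CWhello\fP again stands in for the application being started):
.DS
.ft CW
/* Three standard ways to start a program found via $PATH. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
    FILE    *f = popen("hello", "r");   /* /bin/sh -c, plus a pipe */

    if (f)
        pclose(f);
    system("hello");                    /* /bin/sh -c, wait for exit */
    execlp("hello", "hello", (char *)0); /* $PATH search, no shell */
    return (1);                 /* reached only if execlp fails */
}
.ft
.DE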
- .PP
- Since this is a common way of starting applications, we felt it
- was useful to show the costs of the generality.
- .PP
- We measure this by starting \f(CW/bin/sh\fP to start the same tiny
- program we ran in the last case.
- In Table \n[TABLE] the cost of asking the shell to go
- look for the program is
- quite large, frequently ten times as expensive as just creating a
- new process, and four times as expensive as explicitly naming the location
- of the new program.
- .PP
- The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results.
- Given that the processor is one of the fastest, the problem is likely to be
- software. There is room for substantial improvement in the Solaris
- process creation code.
- .NH 2
- Context switching
- .PP
- Context switch time is defined here as
- the time needed to save the state of one process and restore the state
- of another process.
- .PP
- Context switches are frequently in the critical performance path of
- distributed applications. For example, the multiprocessor versions
- of the IRIX operating system use
- processes to move data through the networking stack. This means that the
- processing time for each new packet arriving at an idle system includes
- the time needed to switch in the networking process.
- .PP
- Typical context switch benchmarks measure just the minimal context switch
- time \(em the time to switch between two processes that are doing nothing
- but context switching. We feel that this is
- misleading because there are frequently more than two active processes,
- and they usually have a larger working set (cache footprint)
- than the benchmark processes.
- .PP
- Other benchmarks frequently include the cost of
- the system calls needed to force the context switches.
- For example, Ousterhout's context switch benchmark
- measures context switch time plus a \*[read] and a \*[write]
- on a pipe.
- In many of the systems measured by \*[lmbench], the pipe overhead
- varies between 30% and 300% of the context switch time, so we were
- careful to factor out the pipe overhead.
- .BU "Number of processes."
- The context switch benchmark is implemented as
- a ring of two to twenty processes that are connected with Unix pipes.
- A token is passed from process to process, forcing context switches.
- The benchmark measures the time needed to pass
- the token two thousand times from process to process.
- Each transfer of the token has two costs: the context switch, and
- the overhead of passing the token.
- In order to calculate just the context switching time, the benchmark first
- measures the cost of passing the token through a ring of pipes in a
- single process. This overhead time is defined as the cost of passing
- the token and is not included in the reported context switch time.
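.PP
A simplified sketch of the ring follows; the pipe plumbing is
shown, but the timing and the single-process overhead measurement
are omitted, and all error handling is elided:
.DS
.ft CW
/* Pass a token around a ring of processes connected by pipes. */
#include <unistd.h>

#define NPROC   4       /* ring size; lmbench varies this from 2 to 20 */

int
main(void)
{
    int     i, j, token = 0;
    int     p[NPROC][2];

    for (i = 0; i < NPROC; i++)
        pipe(p[i]);
    for (i = 1; i < NPROC; i++)
        if (fork() == 0) {      /* child i: pass the token along */
            for (j = 0; j < 1000; j++) {
                read(p[i-1][0], &token, sizeof(token));
                write(p[i][1], &token, sizeof(token));
            }
            _exit(0);
        }
    for (j = 0; j < 1000; j++) {    /* parent: time the circulations */
        write(p[0][1], &token, sizeof(token));
        read(p[NPROC-1][0], &token, sizeof(token));
    }
    return (0);
}
.ft
.DE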
- .BU "Size of processes."
- In order to measure more realistic context switch times, we add
- an artificial variable size ``cache footprint'' to the switching
- processes. The cost of the context switch then includes the cost
- of restoring user-level state (cache footprint). The cache footprint
- is implemented by having the process allocate an array of data\**
- .FS
- All arrays are at the same virtual
- address in all processes.
- .FE
- and sum
- the array as a series of integers after receiving the token but before
- passing the token to the next process. Since most systems will cache data
- across context switches, the working set for the benchmark is slightly
- larger than the number of processes times the array size.
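.PP
The footprint itself is just a summation pass over the array; a
sketch of the per-token work follows (the function name
\f(CWsumit\fP is ours, for illustration only):
.DS
.ft CW
/* Drag ``size'' bytes through the cache between token passes. */
int
sumit(int *data, int size)
{
    int     i, n = size / sizeof(int);
    int     sum = 0;

    for (i = 0; i < n; i++)
        sum += data[i];
    return (sum);
}
.ft
.DE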
- .PP
- It is worthwhile to point out that the overhead mentioned above
- also includes the cost of accessing the data, in the same way as
- the actual benchmark. However, because the overhead is measured
- in a single process, the cost is typically the cost with ``hot''
caches. In Figure 2, each size is plotted as a line, with
context switch times on the Y axis, number of processes on the
X axis, and the process size as the data set.
The process size and the hot-cache overhead cost for
the pipe read/writes and any data access are given in the label,
as in \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead
is in microseconds.
- .PP
- The context switch time does not include anything other than
- the context switch, provided that all the benchmark processes fit in the
- cache. If the total size of all of the benchmark processes is larger
- than the cache size, the cost of each context switch will include cache
- misses.
- We are trying to show realistic context switch times as a
- function of both size and number of processes.
- .TSTART 1
- .so ctx.pic
- .FEND "Context switch times" 1
- .PP
- Results for an Intel Pentium Pro system running Linux at 167 MHz are
- shown in Figure \n[FIGURE].
The data points on the figure are labeled with the working set,
the sum of the data arrays in all of the processes. The actual working set is
larger, as it includes the process and kernel overhead as well.
- One would expect the context switch times to stay constant until
- the working set is
- approximately the size of the second level cache. The Intel system has a
- 256K second level cache, and the context switch times
- stay almost constant until about 256K (marked as .25M in the graph).
- .BU "Cache issues"
- The context switch benchmark is a deliberate measurement of the
- effectiveness of the caches across process context switches. If the
- cache does not include the process identifier (PID, also sometimes
- called an address space identifier) as part of the address, then the
- cache must be flushed on every context switch. If the cache does not map
- the same virtual addresses from different processes to different cache
- lines, then the cache will appear to be flushed on every context
- switch.
- .PP
If the caches did not retain data across context switches, there
would be no grouping at the lower left corner of Figure \n[FIGURE];
instead, the graph would appear as a series of straight, horizontal,
parallel lines. The number of processes would not matter: the
two-process case would be just as bad as the twenty-process case,
since the cache would not be useful across context switches.
- .TSTART
- .so ../Results/tmp/ctx.tbl
- .TEND "Context switch time (microseconds)"
- .PP
- We picked four points on the graph and extracted those values for Table
- \n[TABLE]. The complete set of values, as well as tools to graph them,
- are included with \*[lmbench].
- .PP
- Note that multiprocessor context switch times are frequently more expensive
- than uniprocessor context switch times. This is because multiprocessor
- operating systems tend to have very complicated scheduling code.
- We believe that multiprocessor context switch times can be, and should be,
- within 10% of the uniprocessor times.
- .PP
Linux does quite well on context switching, especially on the more
recent architectures. By comparing the Linux two-process/0KB case to the
Linux two-process/32KB case, it is apparent that there is something wrong
with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can
- find at least part of the cause. The second level cache latency for the
- i586 is substantially worse than either the i686 or the Alpha.
- .PP
- Given the poor second level cache behavior of the PowerPC, it is surprising
- that it does so well on context switches, especially the larger sized cases.
- .PP
The Sun Ultra 1 context switches quite well, in part because of enhancements
- to the register window handling in SPARC V9.
- .NH 2
- Interprocess communication latencies
- .PP
- Interprocess communication latency is important because many operations
- are control messages to another process (frequently on another
- system). The time to tell the remote process to
- do something is pure overhead and is frequently in the critical path
- of important functions such as distributed applications (e.g.,
- databases, network servers).
- .PP
- The interprocess communication latency benchmarks typically have the
- following form: pass a small message (a byte or so) back and forth between two
- processes. The reported results are always the microseconds needed
to do one round trip. For one-way timing,
about half the round-trip time is a good estimate. However, the CPU cycles tend to be
somewhat asymmetric for one trip: receiving is typically more
expensive than sending.
- .BU "Pipe latency" .
- Unix pipes are an interprocess communication mechanism implemented as
- a one-way byte stream. Each end of the stream has an associated file
- descriptor; one is the write descriptor and the other the read
- descriptor.
- .PP
- Pipes are frequently used as a local IPC mechanism. Because of the
- simplicity of pipes, they are frequently the fastest portable
- communication mechanism.
- .PP
- Pipe latency is measured by creating a pair of pipes, forking a child process,
- and passing a word back and forth. This benchmark is identical to the
- two-process, zero-sized context switch benchmark, except that it includes
- both the context switching time and the pipe overhead in the results.
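.PP
A sketch of the measurement follows (simplified from the idea, not
the actual source; timing and error handling are omitted):
.DS
.ft CW
/* Bounce one word between parent and child through two pipes. */
#include <unistd.h>

int
main(void)
{
    int     w = 0, i, n = 10000;
    int     up[2], down[2];

    pipe(up);
    pipe(down);
    if (fork() == 0) {          /* child: echo the word back */
        for (i = 0; i < n; i++) {
            read(down[0], &w, sizeof(w));
            write(up[1], &w, sizeof(w));
        }
        _exit(0);
    }
    for (i = 0; i < n; i++) {   /* parent: one round trip per pass */
        write(down[1], &w, sizeof(w));
        read(up[0], &w, sizeof(w));
    }
    return (0);
}
.ft
.DE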
- .nr NTABLE \n[TABLE]+1
- .nr LTABLE \n[TABLE]
- Table \n[NTABLE] shows the round trip latency from process A to process B
- and back to process A.
- .TSTART
- .so ../Results/tmp/lat_pipe.tbl
- .TEND "Pipe latency (microseconds)"
- .PP
The time can be broken down into two context switches plus four system calls
- plus the pipe overhead. The context switch component is two of the small
- processes in Table \n[LTABLE].
- This benchmark is identical to the context switch benchmark in
- .RN Ousterhout90 .
- .BU "TCP and RPC/TCP latency" .
- TCP sockets may be viewed as an interprocess communication mechanism similar
- to pipes with the added feature that TCP sockets work across machine
- boundaries.
- .PP
- TCP and RPC/TCP connections are frequently used in low-bandwidth,
- latency-sensitive applications. The default Oracle distributed
- lock manager uses TCP sockets, and the locks per second available
- from this service are accurately modeled by the TCP latency test.
- .TSTART
- .so ../Results/tmp/lat_tcp.tbl
- .TEND "TCP latency (microseconds)"
- .PP
- Sun's RPC is layered either over TCP or over UDP.
- The RPC layer is responsible for managing connections (the port mapper),
- managing different byte orders and word sizes (XDR), and implementing a
- remote procedure call abstraction.
- Table \n[TABLE] shows the same benchmark with and
- without the RPC layer to show the cost of the RPC implementation.
- .PP
- TCP latency is measured by having a server process that waits for connections
- and a client process that connects to the server. The two processes then
- exchange a word between them in a loop. The latency reported is one
- round-trip time. The measurements in Table \n[TABLE] are local
- or loopback measurements,
- since our intent is to show the overhead of the software. The same benchmark
- may be, and frequently is, used to measure host-to-host latency.
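.PP
In sketch form, over the loopback interface (a simplification of
the idea, not the actual source; port 31234 is an arbitrary choice
for the sketch, and all error handling is omitted):
.DS
.ft CW
/* TCP ping-pong: server echoes one word, client times round trips. */
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int
main(void)
{
    int     s, c, fd, w = 0, i, n = 10000;
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(31234);
    sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    s = socket(AF_INET, SOCK_STREAM, 0);
    bind(s, (struct sockaddr *)&sin, sizeof(sin));
    listen(s, 1);
    if (fork() == 0) {          /* server: echo the word */
        fd = accept(s, 0, 0);
        for (i = 0; i < n; i++) {
            read(fd, &w, sizeof(w));
            write(fd, &w, sizeof(w));
        }
        _exit(0);
    }
    c = socket(AF_INET, SOCK_STREAM, 0);
    connect(c, (struct sockaddr *)&sin, sizeof(sin));
    for (i = 0; i < n; i++) {   /* client: time these round trips */
        write(c, &w, sizeof(w));
        read(c, &w, sizeof(w));
    }
    return (0);
}
.ft
.DE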
- .PP
- Note that the RPC layer frequently adds hundreds of microseconds of
- additional latency. The problem is not the external data
- representation (XDR) layer \(em the
- data being passed back and forth is a byte, so there is no XDR to be done.
- There is no justification for the extra cost; it is simply
- an expensive implementation. DCE RPC is worse.
- .TSTART
- .so ../Results/tmp/lat_udp.tbl
- .TEND "UDP latency (microseconds)"
- .BU "UDP and RPC/UDP latency" .
- UDP sockets are an alternative to TCP sockets. They differ in that UDP
- sockets are unreliable messages that leave the retransmission issues to
- the application. UDP sockets have a few advantages, however. They preserve
- message boundaries, whereas TCP does not; and a single UDP socket may
- send messages
- to any number of other sockets, whereas TCP sends data to only one place.
- .PP
- UDP and RPC/UDP messages are commonly used in many client/server applications.
- NFS is probably the most widely used RPC/UDP application in the world.
- .PP
- Like TCP latency, UDP latency is measured by having a server process
- that waits for connections
- and a client process that connects to the server. The two processes then
- exchange a word between them in a loop. The latency reported is round-trip
- time. The measurements in Table \n[TABLE] are local or loopback measurements,
- since our intent is to show the overhead of the software.
- Again, note that the RPC library can add hundreds of microseconds of extra
- latency.
- .\" .PP
- .\" It is interesting to compare UDP latency with TCP latency. In many cases the
- .\" TCP latency is \fBless\fP than the UDP latency. This flies in the face
- .\" of conventional wisdom, which says that TCP is an inherently more expensive
- .\" protocol than UDP. The reasons that TCP may appear faster are: in this
- .\" benchmark, the protocol costs are dwarfed by the other costs (context
- .\" switching, system calls, and driver overhead); and TCP is frequently
- .\" hand-tuned for performance, while UDP is rarely hand-tuned.
- .TSTART
- .so ipc.tbl
- .TEND "Remote latencies (microseconds)"
- .BU "Network latency" .
We have a few results for over-the-wire latency included in Table \n[TABLE].
As might be expected, the most heavily used network interfaces (i.e., Ethernet)
have the lowest latencies. The times shown include the time on the wire,
which is about 130 microseconds for 10Mbit Ethernet, 13 microseconds for 100Mbit
Ethernet and FDDI, and less than 10 microseconds for HIPPI.
- .BU "TCP connection latency" .
- TCP is a connection-based, reliable, byte-stream-oriented protocol. As
- part of this reliability, a connection must be established before any
- data can be transferred. The connection is accomplished by a ``three-way
- handshake,'' an exchange of packets when the client attempts to connect
- to the server.
- .PP
- Unlike UDP, where no connection is established, TCP sends packets
- at startup time. If an application creates a TCP connection to send
- one message, then the startup time can be a substantial
- fraction of the total connection and transfer costs.
The benchmark shows that the connection cost is approximately half of
the total cost.
- .PP
- Connection cost is measured by having a server, registered using
- the port mapper, waiting for connections. The client figures out where the
- server is registered and then repeatedly times a \*[connect] system call to
- the server. The socket is closed after each connect. Twenty connects
- are completed and the fastest of them is used as the result. The time measured
will include two of the three packets that make up the three-way TCP handshake,
- so the cost is actually greater than the times listed.
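.PP
A sketch of the connect loop follows (simplified; the loopback
address and port are placeholders, since the real benchmark finds
the server through the port mapper and times each \*[connect]):
.DS
.ft CW
/* Repeatedly connect to a listening server, closing after each. */
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int
main(void)
{
    int     s, c, i;
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(31235);    /* arbitrary port for the sketch */
    sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    s = socket(AF_INET, SOCK_STREAM, 0);
    bind(s, (struct sockaddr *)&sin, sizeof(sin));
    listen(s, 5);
    if (fork() == 0) {          /* server: accept and discard */
        for (i = 0; i < 20; i++)
            close(accept(s, 0, 0));
        _exit(0);
    }
    for (i = 0; i < 20; i++) {  /* time each of these connects */
        c = socket(AF_INET, SOCK_STREAM, 0);
        connect(c, (struct sockaddr *)&sin, sizeof(sin));
        close(c);
    }
    return (0);
}
.ft
.DE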
- .\" XXX Larry --- if a machine's clock granularity is on the order of
- .\" 10 milliseconds, won't this benchmark run into granularity problems?
- .TSTART
- .so ../Results/tmp/lat_connect.tbl
- .TEND "TCP connect latency (microseconds)"
- .PP
Table \n[TABLE] shows that if the need is to send
a quick message to another process, and most packets get through,
a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (positive
acknowledgments are assumed, to keep an apples-to-apples
comparison with TCP). If the transmission medium is 10Mbit Ethernet, the
- time on the wire will be approximately 65 microseconds each way, or 130
- microseconds total. To do the same thing with a short-lived TCP
- connection would cost 896 microseconds of wire time alone.
- .PP
- The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor
- is the point to suggest that all messages should be UDP. In many cases,
- the difference between 130 microseconds and 900 microseconds is
- insignificant compared with other aspects of application performance.
- However, if the application is very latency sensitive
and the transmission medium is slow (such as a serial link or a message
- through many routers), then a UDP message may prove cheaper.
- .NH 2
- File system latency
- .PP
- File system latency is defined as the time required to create or delete
- a zero length file.
- We define it this way because in many file systems,
- such as the BSD fast file system, the directory operations are done
- synchronously in order to maintain on-disk integrity. Since the
- file data is typically cached and sent to disk at some later date,
- the file creation and deletion become the bottleneck
- seen by an application. This bottleneck is substantial: to do
- a synchronous update to a disk is a matter of tens of milliseconds.
- In many cases, this bottleneck is much more of a perceived performance
- issue than processor speed.
- .PP
- The benchmark creates 1,000 zero-sized files and then deletes them.
- All the files are created in one directory and their names are
- short, such as "a", "b", "c", ... "aa", "ab", ....
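.PP
In sketch form (numeric names stand in for the short names
described above, and the timing is omitted):
.DS
.ft CW
/* Create 1,000 zero-length files, then delete them. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
    int     i;
    char    name[32];

    for (i = 0; i < 1000; i++) {    /* time the creates */
        sprintf(name, "%d", i);
        close(creat(name, 0666));
    }
    for (i = 0; i < 1000; i++) {    /* time the deletes */
        sprintf(name, "%d", i);
        unlink(name);
    }
    return (0);
}
.ft
.DE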
- .TSTART
- .so lat_fs.tbl
- .TEND "File system latency (microseconds)"
- .PP
- The create and delete latencies are shown in Table \n[TABLE].
- Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster
- than the slowest systems. However, Linux does not guarantee
- anything about the disk integrity; the directory operations are done in
- memory. Other fast systems, such as SGI's XFS, use a log to guarantee the
- file system integrity.
- The slower systems, all those with ~10 millisecond file latencies, are
- using synchronous writes to guarantee the file system integrity.
Unless Unixware has modified UFS substantially, it must be running in
an unsafe mode, since the FreeBSD UFS is much slower and both file
systems are basically the 4BSD fast file system.
- .NH 2
- Disk latency
- .\" XXX - either get more results for this benchmark or delete it.
- .\" I'd really like to not delete it - lmdd is probably the most
- .\" useful tool and it gets the least press.
- .PP
- Included with \*[lmbench] is a small benchmarking program useful for
- measuring disk and file I/O. \*[lmdd], which is patterned after
- the Unix utility \f(CWdd\fP, measures both sequential and random I/O,
- optionally generates patterns on output and checks them on input,
- supports flushing the data from the buffer cache on systems that
- support \f(CWmsync\fP, and has a very flexible user interface.
- Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script
- wrapped around \*[lmdd].
- .PP
- While we could have generated both sequential and random I/O results as
- part of this paper, we did not because those benchmarks are heavily
- influenced by the performance of the disk drives used in the test. We
- intentionally measure only the system overhead of a SCSI command since
- that overhead may become a bottleneck in large database configurations.
- .PP
- Some important applications, such as transaction processing, are
- limited by random disk IO latency.
- Administrators can increase the number of disk operations per
- second by buying more disks, until the processor overhead becomes
- the bottleneck.
- The \f(CWlmdd\fP benchmark measures the processor overhead associated with each
- disk operation, and it can provide an upper bound on the number of
- disk operations the processor can support.
- It is designed for SCSI disks, and it assumes that most
- disks have 32-128K read-ahead buffers and that they can read ahead
- faster than the processor can request the chunks of data.\**
- .FS
- This may not always be true: a processor could be fast enough to make the
- requests faster than the rotating disk.
- If we take 6M/second to be disk
- speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or
- 81 microseconds/IO. We don't know of any processor/OS/IO controller
- combinations that can do an IO in 81 microseconds.
- .FE
- .PP
The benchmark simulates a large number of disks by reading 512-byte
- transfers sequentially from the raw disk device (raw disks are unbuffered
- and are not read ahead by Unix).
- Since the disk can read ahead faster than the system can request
- data, the benchmark is doing small transfers of data from the
- disk's track buffer.
- Another way to look at this is that the benchmark
- is doing memory-to-memory transfers across a SCSI channel.
- It is possible to generate loads of more than 1,000 SCSI
- operations/second on a single SCSI disk. For comparison, disks under
- database load typically run at 20-80 operations per second.
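.PP
The essence of the measurement is a loop like the following sketch
(the raw device name is system-dependent; \f(CW/dev/rsd0c\fP is
only an example, and the timing is omitted):
.DS
.ft CW
/* Sequential 512-byte reads from a raw (unbuffered) disk. */
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
    int     fd, i;
    char    buf[512];

    fd = open("/dev/rsd0c", O_RDONLY);  /* raw device */
    for (i = 0; i < 10000; i++)         /* time these reads */
        read(fd, buf, sizeof(buf));
    close(fd);
    return (0);
}
.ft
.DE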
- .TSTART
- .so ../Results/tmp/lat_disk.tbl
- .TEND "SCSI I/O overhead (microseconds)"
- .PP
- The resulting overhead number represents a
- \fBlower\fP bound on the overhead of a disk I/O.
- The real overhead numbers will be higher on SCSI systems because
- most SCSI controllers will not disconnect if the request can be
- satisfied immediately.
- During the benchmark, the processor simply sends the request and
- transfers the data, while
- during normal operation, the processor will send the request,
- disconnect, get interrupted, reconnect, and transfer the data.
- .PP
- This technique can be used to discover how many drives a system can support
- before the system becomes CPU-limited because it can produce the
- overhead load of a fully configured system with just a few disks.
- .NH 1
- Future work
- .PP
- There are several known improvements and extensions that could be made
- to \*[lmbench].
- .BU "Memory latency" .
- The current benchmark measures clean-read latency. By clean, we mean that
- the cache lines being replaced are highly likely to be unmodified, so there
- is no associated write-back cost. We would like to extend the benchmark
- to measure dirty-read latency, as well as write latency. Other changes
- include making the benchmark impervious to sequential prefetching and
- measuring TLB miss cost.
- .BU "MP benchmarks" .
- None of the benchmarks in \*[lmbench] is designed to measure any
- multiprocessor features directly. At a minimum, we could measure
- cache-to-cache latency as well as cache-to-cache bandwidth.
- .BU "Static vs. dynamic processes" .
- In the process creation section, we allude to the cost of starting up processes
- that use shared libraries. When we figure out how to create statically linked
- processes on all or most systems, we could quantify these costs exactly.
- .BU "McCalpin's stream benchmark" .
- We will probably incorporate part or all of this benchmark into \*[lmbench].
- .BU "Automatic sizing" .
- We have enough technology that we could determine the size of the external
cache and autosize the memory used such that the external cache has no effect.
- .BU "More detailed papers" .
- There are several areas that could yield some interesting papers. The
- memory latency section could use an in-depth treatment, and the
- context switching section could turn into an interesting discussion of
- caching technology.
- .NH 1
- Conclusion
- .PP
- \*[lmbench] is a useful, portable micro-benchmark suite designed to
- measure important aspects of system performance. We have found that a good
- memory subsystem is at least as important as the processor speed.
- As processors get faster and faster, more and more of the system design
- effort will need to move to the cache and memory subsystems.
- .NH 1
- Acknowledgments
- .PP
- Many people have provided invaluable help and insight into both the
- benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers
- were especially helpful.
- We thank all of them
- and especially thank:
- Ken Okin \s-1(SUN)\s0,
- Kevin Normoyle \s-1(SUN)\s0,
- Satya Nishtala \s-1(SUN)\s0,
- Greg Chesson \s-1(SGI)\s0,
- John Mashey \s-1(SGI)\s0,
- Neal Nuckolls \s-1(SGI)\s0,
- John McCalpin \s-1(Univ. of Delaware)\s0,
- Ron Minnich \s-1(Sarnoff)\s0,
- Chris Ruemmler \s-1(HP)\s0,
- Tom Rokicki \s-1(HP)\s0,
- and
- John Weitz \s-1(Digidesign)\s0.
- .PP
We would also like to thank all of the people who have run the
- benchmark and contributed their results; none of this would have been possible
- without their assistance.
- .PP
- Our thanks to
- all of the free software community for tools that were used during this
- project.
- \*[lmbench] is currently developed on Linux, a copylefted Unix written by
- Linus Torvalds and his band of happy hackers.
- This paper and all of the
- \*[lmbench] documentation was produced using
- the \f(CWgroff\fP suite of tools written by James Clark.
- Finally, all of the data processing of the results is done with
- \f(CWperl\fP written by Larry Wall.
- .PP
- Sun Microsystems, and in particular Paul Borrill,
- supported the initial development of this project. Silicon Graphics
has supported ongoing development that turned into far more time than we
- ever imagined. We are grateful to both of these companies for their
- financial support.
- .NH 1
- Obtaining the benchmarks
- .PP
- The benchmarks are available at
- .ft I
- http://reality.sgi.com/employees/lm_engr/lmbench.tgz
- .ft
- as well as via a mail server.
- You may request the latest version of \*[lmbench] by sending email
- to \fIarchives@slovax.engr.sgi.com\fP with \fIlmbench-current*\fP
- as the subject.
- .\" .R1
- .\" bibliography references
- .\" .R2
- .\"********************************************************************
- .\" Redefine the IP paragraph format so it won't insert a useless line
- .\" break when the paragraph tag is longer than the indent distance
- .\"
- .de @IP
- .if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2)
- .par*start \\n[\\n[.ev]:ai] 0
- .if !'\\$1'' \{\
- . \" Divert the label so as to freeze any spaces.
- . di par*label
- . in 0
- . nf
- \&\\$1
- . di
- . in
- . fi
- . chop par*label
- . ti -\\n[\\n[.ev]:ai]u
- . ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c
- . el \{\
- \\*[par*label]
- .\". br
- . \}
- . rm par*label
- .\}
- ..
- .\"********************************************************************
- .\" redefine the way the reference tag is printed so it is enclosed in
- .\" square brackets
- .\"
- .de ref*end-print
- .ie d [F .IP "[\\*([F]" 2
- .el .XP
- \\*[ref*string]
- ..
- .\"********************************************************************
- .\" Get journal number entries right. Now will print as V(N) rather
- .\" than the awful V, N.
- .\"
- .de ref*add-N
- .ref*field N "" ( )
- ..
- .\"********************************************************************
- .\" Get journal volume entries right. Now will print as V(N) rather
- .\" than the awful V, N.
- .\"
- .de ref*add-V
- .ref*field V , "" "" ""
- ..
- .\"********************************************************************
- .\" Get the date entry right. Should not be enclosed in parentheses.
- .\"
- .de ref*add-D
- .ref*field D ","
- ..
- .R1
- accumulate
- sort A+DT
- database references
- label-in-text
- label A.nD.y-2
- bracket-label [ ] ", "
- bibliography references
- .R2
- .so bios
|