% Partial bibliography on data prefetching.
@string(acmcs = "ACM Computing Surveys")
@string(alewife = "Alewife group, MIT LCS")
@string(asplos = "International Conference on Architectural Support for Programming Languages and Operating Systems")
@string(can = "ACM Computer Architecture News")
@string(comp = "IEEE Computer")
@string(concurrency = "Concurrency: Practice and Experience")
@string(cva = "Concurrent VLSI Architecture group, MIT AI Lab")
@string(dc = "Distributed Computing")
@string(dmcc = "Distributed Memory Computing Conference")
@string(fgcs = "Future Generation Computer Systems")
@string(focs = "Symposium on the Foundations of Computer Science")
@string(ibmtech = "IBM Technical Disclosure Bulletin")
@string(ic = "Information and Computation")
@string(ics = "ACM International Conference on Supercomputing")
@string(icpp = "International Conference on Parallel Processing")
@string(ieee = "Journal of the IEEE")
@string(ijpp = "International Journal of Parallel Programming")
@string(ipps = "International Parallel Processing Symposium")
@string(isca = "Annual International Symposium on Computer Architecture")
@string(ism = "Annual International Symposium on Microarchitecture")
@string(jpdc = "Journal of Parallel and Distributed Computing")
@string(js = "The Journal of Supercomputing")
@string(jvlsics = "Journal of VLSI and Computer Systems")
@string(jvlsisp = "Journal of VLSI Signal Processing")
@string(micro = "IEEE Micro")
@string(ncc = "AFIPS Proceedings of the National Computer Conference")
@string(parcomp = "Parallel Computing")
@string(pieee = "Proceedings of the IEEE")
@string(ppopp = "ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming")
@string(sc = "Proceedings of Supercomputing")
@string(scjapan = "Systems and Computers in Japan")
@string(sfmpc = "Symposium on the Frontiers of Massively Parallel Computation")
@string(siamcomp = "SIAM Journal on Computing")
@string(spaa = "ACM Symposium on Parallel Architectures and Algorithms")
@string(stoc = "ACM Symposium on Theory of Computing")
@string(tcj = "The Computer Journal")
@string(tcomms = "IEEE Transactions on Communications")
@string(tcomps = "IEEE Transactions on Computers")
@string(tds = "ACM Transactions on Database Systems")
@string(tocs = "ACM Transactions on Computer Systems")
@string(tpami = "IEEE Transactions on Pattern Analysis and Machine Intelligence")
@string(tpds = "IEEE Transactions on Parallel and Distributed Systems")
@string(ureport = "Microprocessor Report")
@string(wlcpc = "Workshop on Languages and Compilers for Parallel Computing")
@inproceedings(rogers-asplos,
title = "Software support for speculative loads",
author = "Anne Rogers and Kai Li",
booktitle = "Fifth " # asplos,
year = 1992, month = oct,
pages = "38--50",
annote = {\bn {\bf Article \#2.} This article uses {\em
speculative loads} as its key technology, replacing normal
loads to registers, and augmenting the register file with
presence and poison bits \`a la scoreboarding. SPIM is used, with
no second-level cache and a write-through first-level cache.
Lifting speculative loads in various ways is described, though
no compiler support exists. They show various Livermore Loops
kernels, applying unrolling and lifting for the loads. Results
range from 10\% to about a 4$\times$ improvement. They mention
problems caused by extra register pressure. \en}
)
@inproceedings(chen-baer-asplos,
title = "Reducing memory latency via non-blocking and prefetching
caches",
author = "Tien-Fu Chen and Jean-Loup Baer",
booktitle = "Fifth " # asplos,
year = 1992, month = oct,
pages = "51--61",
note = {Also available as U. Washington CS TR 92-06-03.},
annote = {Also available as U. Washington CS TR 92-06-03.
\bn {\bf Article \#3.} This paper explores two hardware-based
solutions to memory latency in a variety of memory systems.
They run SPEC on a MIPS architecture. Starting with
non-blocking writes with bypass, they add prefetching using a
branch-prediction plus write-prediction lookahead in hardware
to reduce the read penalty by 35\%, and use non-blocking caches
to reduce it by 16\%. They also show a hybrid scheme that works
well. Instruction scheduling and register renaming are used to
improve the results by up to 24\%. \en}
)
@inproceedings(baer-sc,
title = "An effective on-chip preloading scheme to reduce data access
penalty",
author = "Jean-Loup Baer and Tien-Fu Chen",
booktitle = sc # " '91",
year = 1991, month = nov,
pages = "176--186",
note = {Also available as U. Washington CS TR 91-03-07.},
annote = {Also available as U. Washington CS TR 91-03-07.
\bn This article explores the LA-PC and RPT \cite{chen-baer-asplos}
technique in more detail, with good comparisons of hardware
and software techniques. The hardware is keyed to the load's PC,
detects zero-stride and irregular-stride accesses using
a standard branch-prediction technique, and
only prefetches constant-stride references (sketched in the
comment following this entry). They use {\em pixie} on MIPS
with some scientific and Unix programs, and nonoverlapped,
overlapped, and pipelined memory. They vary the LA distance and
conclude that a lookahead of 1--2$\times$ the memory latency is good. \en}
)
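% A minimal sketch in C (a reader's illustration, not the paper's exact
% design) of the constant-stride RPT idea above: a small table keyed by
% the load's PC remembers the last address and stride, and predicts a
% prefetch address once the same stride repeats. The real RPT also keeps
% a transient/steady state machine per entry, omitted here.
%
%   #define RPT_SIZE 256
%
%   struct rpt_entry { unsigned pc, last_addr; int stride; };
%   static struct rpt_entry rpt[RPT_SIZE];
%
%   /* Called on every load; returns a predicted address to prefetch,
%    * or 0 when no stable nonzero stride has been seen yet. */
%   static unsigned rpt_access(unsigned pc, unsigned addr) {
%       struct rpt_entry *e = &rpt[(pc >> 2) % RPT_SIZE];
%       unsigned predict = 0;
%       if (e->pc == pc && e->stride != 0 &&
%           (int)(addr - e->last_addr) == e->stride)
%           predict = addr + e->stride;   /* same stride seen twice */
%       e->stride = (e->pc == pc) ? (int)(addr - e->last_addr) : 0;
%       e->pc = pc;
%       e->last_addr = addr;
%       return predict;
%   }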
@inproceedings(mowry-lam-asplos,
title = "Design and evaluation of a compiler algorithm for prefetching",
author = "Todd C. Mowry and Monica S. Lam and Anoop Gupta",
booktitle = "Fifth " # asplos,
year = 1992, month = oct,
pages = "62--73",
annote = {\bn This paper presents a strong compiler prefetch
scheme (MIPS with a mixed suite of apps). Prefetch instructions
are issued after performing locality analysis and loop peeling
and splitting. They show selective prefetch is better than
indiscriminate prefetch, and that their scheme is robust across
various parameters. It also interacts well with locality
optimizers, covering the misses that blocking leaves behind (the
usual prefetch-distance computation is noted in the comment
following this entry). It has a good related-work section. \en}
)
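% Reader's note on the scheme above: the prefetch distance (the number of
% loop iterations by which a prefetch precedes the use it covers) is
% commonly computed as $d = \lceil l / s \rceil$, where $l$ is the
% expected memory latency in cycles and $s$ is the cycle count of the
% shortest path through one loop iteration. Stated from memory as an
% aid, not quoted from the paper.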
@inproceedings(przybylski,
title = "The performance impact of block sizes and fetch strategies",
author = "Steven Przybylski",
booktitle = "The 17th " # isca,
year = 1990, month = jun,
pages = "160--169",
annote = {\bn Good simulation methodology: compares simple
fetching against some simple hardware prefetching techniques, and
concludes that it isn't worthwhile to work hard on prefetching
(a gain of about 2--5\%). Investigates cycle-time penalties for
larger cache blocks, which quickly yield small optimal block sizes,
and maps out the useful prefetching space in terms of bus utilization. \en}
)
@inproceedings(klaiber-91,
title = "An architecture for software-controlled data prefetching",
author = "Alexander C. Klaiber and Henry M. Levy",
booktitle = "The 18th " # isca,
year = 1991, month = may,
pages = "43--53",
annote = {\bn {\tt FETCH} instructions are used via fetch queues to
a fetch unit that loads a
fully-associative {\em fetchbuffer}, and a {\em prewriteback}
technique is discussed to avoid replacement writeback stalls.
Data is either in the cache or the fetchbuffer, not both. There
is an analysis of optimal prefetch distance in terms of number
of references; no compiler support yet. Simulation is via a simple
pessimistic MIPS simulator, on extended Livermore Loops. They get
up to 10-fold speedup on some loops, much less on others. \en}
)
@inproceedings(fu-patel-isca,
title = "Data prefetching in multiprocessor vector cache memories",
author = "John W. C. Fu and Janak H. Patel",
booktitle = "The 18th " # isca,
year = 1991, month = may,
pages = "54--63",
annote = {\bn Using the Perfect Club (vectorized numeric) benchmarks,
the paper shows how two simple prefetching schemes can outperform
a non-prefetching cache when dealing with long stride vector
accesses and multiprocessor block invalidations in vector caches.
Simulation was with the authors' `cheap' simulator, with Alliant
FX/8 traces. Stride distance is assumed available to the cache.
{\em Sequential prefetch} only prefetches scalar and short stride
vectors, and works well. {\em Stride-prefetch} prefetches with the
stride for long stride vectors, and is better. The paper discusses
both miss ratio and completion time. \en}
)
@inproceedings(callahan-sc,
title = "Data cache performance of supercomputer applications",
author = "David Callahan and Allan Porterfield",
booktitle = sc # " '90",
year = 1990, month = nov,
pages = "564--572",
annote = {\bn This paper describes PFC-Sim and RiCEPS in detail.
Cache performance is examined (size, associativity). Simple
line-size prefetching (successful overall) and next-block prefetching
(generally a big win with little overhead) are assessed. Hardware
prefetching is ``very effective'' for column-oriented programs;
the paper looks to software for further improvements. \en}
)
@inproceedings(callahan-asplos,
title = "Software prefetching",
author = "David Callahan and Ken Kennedy and Allan Porterfield",
booktitle = "Fourth " # asplos,
year = 1991, month = apr,
pages = "40--52",
annote = {\bn A nonblocking prefetch instruction with a simple
compiler algorithm is used, tested on the RiCEPS suite (large
scientific codes) with PFC-Sim. Long line lengths and simple
hardware prefetching are examined and found adequate. Software
prefetching is done by prefetching the next iteration of the loop,
including the indexed elements of sparse matrices (sketched in the
comment following this entry). Simulation gave results on how far
ahead the value was returned. The paper assesses the address-generation
overhead of prefetching, and looks to multiple
issue, register allocation, and compiler technology to reduce it.
The best average overhead is estimated at 12\%, which is suggested to
cover latencies of 20+ cycles. \en}
)
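% A minimal sketch in C (a hypothetical illustration, not the paper's
% notation) of next-iteration software prefetching as described above;
% GCC's __builtin_prefetch stands in for the nonblocking prefetch
% instruction.
%
%   /* Dense case: prefetch the operands of iteration i+1 while doing
%    * iteration i. __builtin_prefetch is non-faulting, so the
%    * one-past-the-end prefetch on the last iteration is harmless. */
%   void daxpy(int n, double a, const double *x, double *y) {
%       for (int i = 0; i < n; i++) {
%           __builtin_prefetch(&x[i + 1]);
%           __builtin_prefetch(&y[i + 1]);
%           y[i] += a * x[i];
%       }
%   }
%
%   /* Sparse case: the index array is also used to prefetch the indexed
%    * element one iteration ahead. */
%   void gather_add(int n, const int *idx, const double *v, double *out) {
%       for (int i = 0; i < n; i++) {
%           if (i + 1 < n)
%               __builtin_prefetch(&v[idx[i + 1]]);
%           out[i] += v[idx[i]];
%       }
%   }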
@phdthesis(porterfield-rice,
title = "Software Methods for Improvement of Cache Performance
on Supercomputer Applications",
author = "A. K. Porterfield",
school = "Rice University",
year = 1989, month = may,
note = {Also available as Rice COMP TR 89-93.},
annote = {Also available as Rice COMP TR 89-93.
\bn This thesis expands on \cite{callahan-asplos}, giving most
of the corroborating detail. Simple hardware prefetching is
shown to work well. Various compiler transformations are
discussed. Cache prefetching is considered, as well as
overhead, and methods to reduce it. \en}
)
@inproceedings(gupta-isca,
title = "Comparative evaluation of latency reducing and tolerating
techniques",
author = "Anoop Gupta and John Hennessy and Kourosh Gharachorloo
and Todd Mowry and Wolf-Dietrich Weber",
booktitle = "The 18th " # ISCA,
year = 1991, month = may,
pages = "254--263",
annote = {\bn This article is an overview of four techniques: hardware
coherency, relaxed memory consistency, software prefetching, and
multiple contexts. They modelled a prefetch buffer and introduced
read and read-exclusive prefetching commands manually, getting
generally good results, though less so where prefetch overhead was
high. They found prefetching did not work well with multiple
contexts: if one technique alone would suffice, you still pay both
overheads, and context switching may invalidate prefetched
data. \en}
)
@article(trivedi-tcomps,
title = "Prepaging and applications to array algorithms",
author = "Kishor S. Trivedi",
journal = tcomps,
volume = "C-25", number = 9,
year = 1976, month = sep,
pages = "915--921",
annote = {\bn This paper examines main-memory paging algorithms,
defining the optimal algorithm DPMIN as well as realizable
algorithms with knowledge about dead pages and PRE(x)
instructions inserted into the code. The result is compared
with DPMIN and shown to do well, up to an order of magnitude.
Compiler work has not been done. \en}
)
@inproceedings(chen-pm,
title = "Data access microarchitectures for superscalar processors
with compiler-assisted data prefetching",
author = "William Y. Chen and Scott A. Mahlke and Pohua P. Chang
and Hwu, Wen-mei W.",
booktitle = "The 24th " # ism,
year = 1991, month = nov,
pages = "69--73",
annote = {\bn This paper examines tradeoffs between fetching into
a cache and fetching into a prefetch buffer that forwards
to the cache on cache misses, using a MIPS-like
simulator on some Unix benchmarks (eqntott, espresso, tbl,
xlisp, and yacc). They compare both restricted and general
code percolation. A 32-entry prefetch buffer is shown to be
enough. Apparently they insert prefetches with the compiler. \en}
)
@inproceedings(gornish-ics,
title = "Compiler-directed data prefetching in multiprocessors
with memory hierarchies",
author = "Edward H. Gornish and Elana D. Granston and
Alexander V. Veidenbaum",
booktitle = ics,
year = 1990, month = jun,
pages = "354--368",
annote = {\bn This paper examines multiprocessor issues. They
examine the earliest time at which a reference can be prefetched,
{\em suppressing} hoisting of a reference on account of control
and data dependences. They {\em freeze} loops,
then examine them from the inside out to unfreeze and prefetch
where possible, using vector prefetching when they can. They
consider both a memory-reference overlap and a no-overlap model,
using a wide variety of sample codes in the static analyses (LINPACK,
EISPACK, REALSET). For real simulation they used a pipelined system
with memory across a switch from the processors. They show a
7\% improvement with overlapped fetches, and a further 2\%--61\%
with prefetching. They suggest better results if they had
more bandwidth. \en}
)
@mastersthesis(gornish-ms,
title = "Compile Time Analysis for Data Prefetching",
author = "Edward H. Gornish",
school = "University of Illinois at Urbana-Champaign",
year = 1989, month = dec,
note = {Also available as Illinois CSRD-949.},
annote = {Also available as Illinois CSRD-949.
\bn This is an expanded version of \cite{gornish-ics}.
No simulation results are given; for their overlapped model
they predict speedups of 1.94 to 2.74. \en}
)
@inproceedings(lee-yew-isca,
title = "Multiprocessor cache design considerations",
author = "Roland L. Lee and Pen-Chung Yew and Duncan H. Lawrie",
booktitle = "The 14th " # isca,
pages = "253--262",
year = 1987, month = jun,
annote = {\bn This paper only briefly discusses prefetching. A
data buffer is used which holds the data referenced by instructions
that are being decoded, holds memory results, and forwards
data to functional units when necessary. The paper draws two
conclusions: (a) that miss ratios are worthless
measures, particularly in multiprocessor systems (they use
speedup instead), and (b) that the optimal line size for
multiprocessors is small, only one or two words. \en}
)
@inproceedings(lee-yew-icpp,
title = "Data Prefetching in Shared Memory Multiprocessors",
author = "Roland L. Lee and Pen-Chung Yew and Duncan H. Lawrie",
booktitle = icpp,
year = 1987, month = aug,
pages = "28--31",
note = {Also as Illinois CSRD TR 639, January 1987.},
annote = {Also as Illinois CSRD TR 639, January 1987.
\bn This paper compares data prefetching with caches. Prefetching
is done with branch guessing (50\% either way) and data reference
prefetching with data forwarding. It doesn't work too well with
branches and synchronization due to non-cacheable references.
They show that prefetching works better than caching, though the
combination is still better. They use a collection of numeric
routines, and test caches of 32--32K and prefetch buffers of
0--16 words. \en}
)
@phdthesis(lee-thesis,
title = "The Effectiveness of Caches and Data Prefetch
Buffers in Large-Scale Shared Memory Multiprocessors",
author = "Roland Lun Lee",
school = "University of Illinois at Urbana-Champaign",
year = 1987, month = may,
note = {Also available as Illinois CSRD 670.},
annote = {Also available as Illinois CSRD 670.
\bn This work is basically the same as what is in
\cite{lee-yew-isca,lee-yew-icpp}. \en}
)
@article(johnson-acm,
title = "Working set prefetching for cache memories",
author = "Eric E. Johnson",
journal = can,
volume = 17, number = 6,
year = 1989, month = dec,
pages = "137--141",
annote = {\bn This paper discusses ordinary prefetching, both
history-based and {\em prescient}, i.e. compiler-assisted. It
then turns to {\em working set prefetching}, where the compiler
determines working sets and then whole sets are prefetched
when the program goes through phase transitions.
Multiprocessors are considered, where sets are tagged and broadcast
on request to reduce the bandwidth likely to be required. He also
suggests a sequence number so that data-flow style cache
updating of working sets can be done. No numbers are presented. \en}
)
@inproceedings(jouppi-isca,
title = "Improving direct-mapped cache performance by the addition of
a small fully-associative cache and prefetch buffers",
author = "Norman P. Jouppi",
booktitle = "The 17th " # ISCA,
year = 1990, month = may,
pages = "364--373",
annote = {\bn Jouppi discusses {\em victim caches}, which are
small (4--8 entry) fully associative caches which hold lines that
are evicted from the main cache, and can then supply them with
low latency if they are requested again. It removes about 10\%
of cache misses, but does better with larger cache lines. He
also suggests {\em stream buffers}. When a miss occurs, a
stream starts prefetching successive lines, providing fetched
data to the cache and pipelining reads. Multiple streams are
also used with new streams selected at miss-time in LRU
fashion. The multi-way streams can remove about 43\% of the
misses in the six benchmark programs. The combined techniques
give 143\% improvement in system performance on average. \en}
)
@article(smith-ieee,
title = "Sequential program prefetching in memory hierarchies",
author = "Alan Jay Smith",
journal = comp,
volume = 11, number = 12,
year = 1978, month = dec,
pages = "7--21",
annote = {\bn This article deals just with sequential
prefetching. He finds prefetching works best for small cache
lines (about 32--64 bytes), and not well for larger lines. The
paper examines the Amdahl 470V/6 and IBM 370/168. He suggests
improvements on the order of 10--25\%. \en}
)
@article(bennett-ibm,
title = "Prefetching in a multilevel hierarchy",
author = "B. T. Bennett and J. H. Pomerene and T. R. Puzak and
R. N. Rechtschaffen",
journal = ibmtech,
volume = 25, number = 1,
year = 1982, month = jun,
pages = 88,
annote = {\bn Short blurb about incorporating a {\em prefetching engine}
for L1 cache in L2 cache. \en}
)
@article(gindele-ibm,
title = "Buffer block prefetching method",
author = "J. D. Gindele",
journal = ibmtech,
volume = 20, number = 2,
year = 1977, month = jul,
pages = "696--697",
annote = {\bn Sets forth the next-block hardware prefetch strategy,
and claims 32-128 byte lines are good and allow for caches of
1/2 to 1/4 capacity to maintain the same hit ratio via prefetching.
\en}
)
@inproceedings(decoupled-isca,
title = "Decoupled access/execute computer architecture",
author = "James E. Smith",
booktitle = "The 9th " # isca,
year = 1982, month = apr,
pages = "112--119",
annote = {\bn This paper gives a different slant on prefetch; the
prefetching is handled by a decoupled {\em address} unit which
communicates with queues back and forth with the {\em execute}
unit. No speculation is performed; the A unit is assumed to
handle most of the conditional branching directly and provide
the result to the E unit on a branch queue. Results are given
for 14 Livermore Loops using semi-Cray instructions, and
speedups of up to 2.5 are seen, average 1.71, with Cray-1 style
latencies, e.g. 11-cycle loads. \en}
)
@inproceedings(kroft-isca,
title = "Lockup-free instruction fetch/prefetch cache organization",
author = "David Kroft",
booktitle = "The 8th " # ISCA,
year = 1981, month = may,
pages = "81--87",
annote = {\bn Kroft documents how to build a cache which can have
multiple outstanding requests (up to some hardware limit, which
he suggests should be 4). Miss information/status holding registers
(MSHRs) are the key resources. \en}
)
@article(gannon-jpdc,
title = "Strategies for cache and local memory management by
global program transformation",
author = "Dennis Gannon and William Jalby and Kyle Gallivan",
journal = jpdc,
volume = 5, number = 5,
year = 1988, month = oct,
pages = "587--616",
note = {Also in the Proceedings of ICS '87.},
annote = {Also in the Proceedings of ICS '87.
\bn This paper is very mathematical, so I only skimmed it.
Each variable is associated with a `reference window' which is
used to determine what should be kept in cache (when controlled
in software) or what the hit rate will be (for hardware
control). They consider program restructuring \`a la
\cite{abu-sufah-79}, plus loop blocking. \en}
)
@inproceedings(farrens-isca,
title = "Improving performance of small on-chip instruction caches",
author = "Matthew K. Farrens and Andrew R. Pleszkun",
booktitle = "The 16th " # isca,
year = 1989, month = jun,
pages = "234--241",
annote = {\bn This paper does simple instruction prefetching by
putting two queues between the instruction register and the
cache; the queues force prefetches as they drain. This gives
speedups of up to $2\times$ over conventional caches, although
they only simulate caches up to 512 bytes in size. They also
document PIPE chip features such as PBR (prepare to branch) and
exposed load/store pipelines. \en}
)
@inproceedings(smith-isca-12,
title = "Cache evaluation and the impact of workload choice",
author = "Alan Jay Smith",
booktitle = "12th " # isca,
year = 1985, month = jun,
pages = "64--73",
annote = {\bn This article only touches on prefetching, where it
looks at the simple $i+1$ prefetch model. He finds that prefetching
is increasingly useful with increasing cache size, that it always
cuts the ifetch miss ratio, and that it cuts the data miss ratio
for data caches of 8K or more. He notes icache traffic goes up
by 30--40\% with prefetching, and seemingly by almost
a factor of 2 for data prefetching. \en}
)
@inproceedings(scheurich-dubois,
title = "Concurrent miss resolution in multiprocessor caches",
author = "Christoph Scheurich and Michel Dubois",
booktitle = icpp,
year = 1988, month = aug,
volume = "I",
pages = "118--125",
annote = {\bn This article describes a good cache architecture:
one-word cache lines allow writes to avoid writebacks, but the
lockup-free nature of the cache allows multiple outstanding
reads to avoid taking a big hit from the higher miss rate.
A compiler-enforced weak consistency model also simplifies the
hardware. Prefetching is suggested as helpful in this situation.
Some analytical curves are given. \en}
)
@article(anant-tocs,
title = "An Analytical Cache Model",
author = "Anant Agarwal and Mark Horowitz and John Hennessy",
journal = tocs,
year = 1989, month = may,
volume = 7, number = 2,
pages = "184--215",
annote = {\bn A very interesting paper in which an analytical
model for cache performance is derived and compared with
results. The model yields hit rate as a function of
cache size, associativity, block size, sub-block
size, multiprogramming level, task switch interval, and
observation interval. Measured factors include the number of
blocks referenced per interval, the total number of blocks
referenced, and the {\em collision rate}. \en}
)
@article(smith-cache,
title = "Cache memories",
author = "Alan Jay Smith",
journal = acmcs,
volume = 14, number = 3,
year = 1982, month = sep,
pages = "473--530",
annote = {\bn This giant paper steps through many features of
caches and comments on them. One of the initial sections gives
a good overview of his views on prefetching, specifically for
OBL (one-block lookahead); he compares always-prefetch with
tagged prefetch, which prefetches block $i+1$ on a demand miss or
on the first reference to a prefetched block (sketched in the
comment following this entry). \en}
)
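% A toy sketch in C of tagged one-block-lookahead prefetch (one common
% formulation, not code from the survey): every line carries a
% "referenced" tag bit, and block i+1 is prefetched on a demand miss for
% block i or on the first reference to a line that arrived via prefetch.
%
%   #include <stdbool.h>
%
%   #define NLINES 64                   /* toy direct-mapped cache */
%
%   struct line { unsigned blk; bool valid; bool referenced; };
%   static struct line cache[NLINES];
%
%   static void fill(unsigned blk, bool demand) {
%       struct line *ln = &cache[blk % NLINES];
%       ln->blk = blk;
%       ln->valid = true;
%       ln->referenced = demand;        /* prefetched lines arrive untagged */
%   }
%
%   static void reference(unsigned blk) {
%       struct line *ln = &cache[blk % NLINES];
%       if (!ln->valid || ln->blk != blk) {   /* demand miss */
%           fill(blk, true);
%           fill(blk + 1, false);       /* one-block lookahead; a real
%                                          cache would check residency */
%       } else if (!ln->referenced) {   /* first use of a prefetched line */
%           ln->referenced = true;
%           fill(blk + 1, false);
%       }
%   }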
@article(driscoll-ibm,
title = "Cache miss directory---a means of prefetching cache `missed'
lines",
author = "G. C. Driscoll and J. J. Losq and T. R. Puzak and G. S. Rao
and H. E. Sachar and R. D. Villani",
journal = ibmtech,
volume = 25, number = "3A",
year = 1982, month = aug,
pages = "1286",
annote = {\bn This one-page paper outlines a scheme where a number of
{\em trigger} addresses are associated with {\em target} prefetch
addresses; referencing the trigger causes a prefetch of the
target, issued at the lowest priority and aborted by any other access.
The CMD is loaded by an initial run with a hardware monitor. \en}
)
@article(rechtschaffen-ibm,
title = "Using a branch history table to prefetch cache lines",
author = "R. N. Rechtschaffen",
journal = ibmtech,
volume = 22, number = 12,
year = 1980, month = may,
pages = "5539",
annote = {\bn The BHT is suggested as a way to prefetch instruction
lines. \en}
)
@techreport(chi-purdue,
title = "Compiler-driven cache policy (known reference string)",
author = "Chi-Hung Chi and Henry G. Dietz",
institution = "Purdue University",
number = "TR-EE 87-21",
year = 1987, month = jun,
annote = {\bn This paper doesn't say too much about prefetching.
Their main focus is showing how you can take a known reference
string of cache lines, add weights to transitions between cache
states, and do a shortest-path search, then allow tags on your
instructions for fetch-bypass-cache and fetch-evict-line-NN
when the data is not already in cache. They give the compiler
performance as $\Theta(k^{-1}mn)$, and tiny examples of improved
run-time performance, but no real results yet. More work
coming. \en}
)
@unpublished(selvidge-isca,
title = "Compilation-based Data Prefetching",
author = "Charles Selvidge",
year = 1992, month = may,
note = {May, 1992. Draft of paper for submission.},
annote = {May, 1992. Draft of paper for submission.
\bn This paper employs good prefetching techniques.
References are divided into {\em badrefs} which need
prefetching, and {\em goodrefs}, which can be used to add
parallelism as necessary. Prefetching is done both on {\em
RAAD}s and also in a trace-driven manner, both proving useful
for different codes. The SPEC benchmarks are simulated on SPARC
with a prefetch instruction and both stalling and interlocked memory;
interlocked memory with a 160-cycle latency shows a 2--6$\times$
improvement with prefetching. \en}
)
@phdthesis(selvidge,
title = "Compilation-Based Prefetching for Memory Latency Tolerance",
author = "Charles Selvidge",
school = "MIT",
year = 1992, month = may,
annote = {\bn This thesis is the extended version of
\cite{selvidge-isca}; it provides implementation specifics and more
detailed results. \en}
)
@article(comp-survey,
title = "A survey of {RISC} processors and computers of the mid-1980's",
author = "Charles E. Gimarc and Veljko M. Milutinovi\'c",
journal = comp,
volume = 20, number = 9,
year = 1987, month = sep,
pages = "59--69",
annote = {\bn A general survey of RISC chips. It doesn't discuss
prefetching much. \en}
)
@techreport(rau-stanford,
title = "Sequential Prefetch Strategies for Instructions and Data",
author = "B. R. Rau",
institution = "Stanford University, Digital Systems Laboratory",
number = "CSL-TR-77-131",
year = 1977,
annote = {\bn Unread. \en}
)
@inproceedings(bpt-asplos,
title = "Improving the accuracy of dynamic branch prediction
using branch correlation",
author = "Pan, Shien-Tai and Kimming So and Joseph T. Rahmeh",
booktitle = "The Fifth " #asplos,
year = 1992, month = oct,
pages = "76--84",
annote = {\bn A handy reference for some prefetching techniques. \en}
)
@article(hill-assoc,
title = "Evaluating associativity in {CPU} caches",
author = "Mark D. Hill and Alan Jay Smith",
journal = tcomps,
volume = 38, number = 12,
year = 1989, month = dec,
pages = "1612--1630",
annote = {\bn Information on compulsory/capacity/conflict misses. \en}
)
@unpublished(alpha,
title = "Alpha Architecture Technical Summary",
author = "Dick Sites and Rich Witek",
year = 1992,
note = "Available for anonymous FTP from
{\tt gatekeeper.dec.com} as {\tt pub/DEC/Alpha/technical-summary.txt}",
annote = {\bn Not read. \en}
)
@book(rs6000,
title = "IBM Journal of Research and Development, Special
Issue on RISC System/6000",
author = "IBM",
publisher = "IBM",
year = 1990, month = jan,
annote = {\bn Not read. \en}
)
@book(moto88100,
title = "MCS88100: RISC Microprocessor User's Manual",
author = "Motorola",
publisher = "Motorola Inc.",
year = 1988,
annote = {\bn Not read. \en}
)
@inproceedings(fp-micro,
title = "Stride directed prefetching in scalar processors",
author = "J. W. C. Fu and J. H. Patel",
booktitle = "Proceedings of the 25th MICRO",
pages = "102--110",
year = 1992,
annote = {\bn Expands on their previous paper with an SPT (stride
prediction table), with no IP prediction component. They give
a good discussion of miss rates and prefetch overhead without
actually showing performance figures. \en}
)
@phdthesis(chen-thesis,
title = "Data Prefetching for High-Performance Processors",
author = "Tien-Fu Chen",
school = "University of Washington at Seattle",
year = 1993, month = jul,
note = {Also available as TR 93-07-01.},
annote = {\bn Gives interesting results beyond \cite{chen-baer-asplos};
software prefetching is compared to their hardware-only scheme,
as well as a combined scheme. Presents as good an argument as I've
seen for the all-hardware approach. \en}
)
@article(spec-cache,
title = "Cache performance of the {SPEC92} benchmark suite",
author = "Jeffrey D. Gee and Mark D. Hill and Dionisis N.
Pnevmatikatos and Alan Jay Smith",
journal = micro,
year = 1993, month = aug,
volume = 13, number = 4,
pages = "17--27",
annote = {\bn Interesting reading. \en}
)
@inproceedings(spaniol,
title = "Demand prepaging algorithms based on a model
of locality of programs",
author = "Otto Spaniol",
booktitle = "Computer Architectures and Networks",
year = 1974, month = aug,
editor = "E. Gelenbe and R. Mahl",
pages = "515--527",
publisher = "North-Holland",
annote = {\bn Not too directly relevant to prefetching, but
interesting. \en}
)
@article(sklenar-can,
title = "Prefetch unit for vector operations on scalar computers",
author = "Ivan Sklen\'a\v{r}",
journal = can,
volume = 20, number = 4,
year = 1992, month = sep,
pages = "31--37",
annote = {\bn Very similar to the constant-stride prefetch described
in \cite{baer-sc}, so not very interesting. No prediction attempts. \en}
)
@article(smith-bib,
title = "Bibliography and Readings on {CPU} Cache Memories and
Related Topics",
author = "Alan Jay Smith",
journal = can,
volume = 14, number = 1,
year = 1986, month = jan,
pages = "22--42",
annote = {\bn Didn't really read through it. \en}
)
@phdthesis(anant-su,
title = "Analysis of Cache Performance for Operating Systems and
Multiprogramming",
author = "Anant Agarwal",
school = "Stanford University",
year = 1987, month = may,
annote = {\bn Available as Technical Report CSL-TR-87-332.
Didn't read. \en}
)
@article(bennett,
title = "Cache Memory with Prefetching of Data by Priority",
author = "B. T. Bennett and P. A. Franaczek",
journal = ibmtech,
volume = 18, number = 12,
year = 1976, month = may,
pages = "4231--4232",
annote = {\bn Couldn't find. \en}
)
@article(enger-ibm,
title = "Paged control store prefetch mechanism",
author = "T. A. Enger",
journal = ibmtech,
volume = 16, number = 7,
year = 1973, month = dec,
pages = "2140--2141",
annote = {\bn Couldn't find. \en}
)
@inproceedings(fu-patel-ipps,
title = "Data prefetching strategies for vector cache memories",
author = "John W. C. Fu and Janak H. Patel",
booktitle = "5th " # ipps,
year = 1991, month = may,
pages = "555--560",
annote = {\bn See \cite{fu-patel-isca} for all details; this paper
has identical content. \en}
)