% Partial bibliography on data prefetching.
@string(acmcs = "ACM Computing Surveys")
@string(alewife = "Alewife group, MIT LCS")
@string(asplos = "International Conference on Architectural Support for Programming Languages and Operating Systems")
@string(can = "ACM Computer Architecture News")
@string(comp = "IEEE Computer")
@string(concurrency = "Concurrency: Practice and Experience")
@string(cva = "Concurrent VLSI Architecture group, MIT AI Lab")
@string(dc = "Distributed Computing")
@string(dmcc = "Distributed Memory Computing Conference")
@string(fgcs = "Future Generation Computer Systems")
@string(focs = "Symposium on the Foundations of Computer Science")
@string(ibmtech = "IBM Technical Disclosure Bulletin")
@string(ic = "Information and Computation")
@string(ics = "ACM International Conference on Supercomputing")
@string(icpp = "International Conference on Parallel Processing")
@string(ieee = "Journal of the IEEE")
@string(ijpp = "International Journal of Parallel Programming")
@string(ipps = "International Parallel Processing Symposium")
@string(isca = "Annual International Symposium on Computer Architecture")
@string(ism = "Annual International Symposium on Microarchitecture")
@string(jpdc = "Journal of Parallel and Distributed Computing")
@string(js = "The Journal of Supercomputing")
@string(jvlsics = "Journal of VLSI and Computer Systems")
@string(jvlsisp = "Journal of VLSI Signal Processing")
@string(micro = "IEEE Micro")
@string(ncc = "AFIPS Proceedings of the National Computer Conference")
@string(parcomp = "Parallel Computing")
@string(pieee = "Proceedings of the IEEE")
@string(ppopp = "ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming")
@string(sc = "Proceedings of Supercomputing")
@string(scjapan = "Systems and Computers in Japan")
@string(sfmpc = "Symposium on the Frontiers of Massively Parallel Computation")
@string(siamcomp = "SIAM Journal on Computing")
@string(spaa = "ACM Symposium on Parallel Architectures and Algorithms")
@string(stoc = "ACM Symposium on Theory of Computing")
@string(tcj = "The Computer Journal")
@string(tcomms = "IEEE Transactions on Communications")
@string(tcomps = "IEEE Transactions on Computers")
@string(tds = "ACM Transactions on Database Systems")
@string(tocs = "ACM Transactions on Computer Systems")
@string(tpami = "IEEE Transactions on Pattern Analysis and Machine Intelligence")
@string(tpds = "IEEE Transactions on Parallel and Distributed Systems")
@string(ureport = "Microprocessor Report")
@string(wlcpc = "Workshop on Languages and Compilers for Parallel Computing")
@inproceedings(rogers-asplos,
title = "Software support for speculative loads",
author = "Anne Rogers and Kai Li",
booktitle = "Fifth " # asplos,
year = 1992, month = oct,
pages = "38--50",
annote = {\bn {\bf Article \#2.} This article uses {\em
speculative loads} as its key technology, replacing normal
loads to registers, and augmenting the register file with
presence and poison bits \`a la scoreboarding. SPIM is used, with
no second-level cache and a write-through first-level cache.
Lifting speculative loads in various ways is described, though
no compiler support exists. They show various Livermore Loops
kernels, applying unrolling and lifting for the loads. Results
range from 10\% to about a 4$\times$ improvement. They mention
problems caused by extra register pressure. \en}
)
@inproceedings(chen-baer-asplos,
title = "Reducing memory latency via non-blocking and prefetching
caches",
author = "Tien-Fu Chen and Jean-Loup Baer",
booktitle = "Fifth " # asplos,
year = 1992, month = oct,
pages = "51--61",
note = {Also available as U. Washington CS TR 92-06-03.},
annote = {Also available as U. Washington CS TR 92-06-03.
\bn {\bf Article \#3.} This paper explores two hardware-based
solutions to memory latency in a variety of memory systems.
They run SPEC on a MIPS architecture. Starting with
non-blocking writes with bypass, they add prefetching using a
branch-prediction plus write-prediction lookahead in hardware
to reduce the read penalty by 35\%, and use non-blocking caches
to reduce it by 16\%. They also show a hybrid scheme that works
well. Instruction scheduling and register renaming are used to
improve the results by up to 24\%. \en}
)
@inproceedings(baer-sc,
title = "An effective on-chip preloading scheme to reduce data access
penalty",
author = "Jean-Loup Baer and Tien-Fu Chen",
booktitle = sc # " '91",
year = 1991, month = nov,
pages = "176--186",
note = {Also available as U. Washington CS TR 91-03-07.},
annote = {Also available as U. Washington CS TR 91-03-07.
\bn This article explores the LA-PC and RPT \cite{chen-baer-asplos}
technique in more detail, with good comparisons of hardware
and software techniques. The hardware is keyed to the load's PC,
detects zero-stride and irregular-stride accesses using
a standard branch-prediction technique, and
only prefetches constant-stride references (sketched in the
comment following this entry). They use {\em pixie} on MIPS
with some scientific and Unix programs, and nonoverlapped,
overlapped, and pipelined memory. They vary the LA distance and
conclude that a lookahead of 1--2$\times$ the memory latency is good. \en}
)
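% A minimal sketch in C (a reader's illustration, not the paper's exact
% design) of the constant-stride RPT idea above: a small table keyed by
% the load's PC remembers the last address and stride, and predicts a
% prefetch address once the same stride repeats. The real RPT also keeps
% a transient/steady state machine per entry, omitted here.
%
%   #define RPT_SIZE 256
%
%   struct rpt_entry { unsigned pc, last_addr; int stride; };
%   static struct rpt_entry rpt[RPT_SIZE];
%
%   /* Called on every load; returns a predicted address to prefetch,
%    * or 0 when no stable nonzero stride has been seen yet. */
%   static unsigned rpt_access(unsigned pc, unsigned addr) {
%       struct rpt_entry *e = &rpt[(pc >> 2) % RPT_SIZE];
%       unsigned predict = 0;
%       if (e->pc == pc && e->stride != 0 &&
%           (int)(addr - e->last_addr) == e->stride)
%           predict = addr + e->stride;   /* same stride seen twice */
%       e->stride = (e->pc == pc) ? (int)(addr - e->last_addr) : 0;
%       e->pc = pc;
%       e->last_addr = addr;
%       return predict;
%   }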
@inproceedings(mowry-lam-asplos,
title = "Design and evaluation of a compiler algorithm for prefetching",
author = "Todd C. Mowry and Monica S. Lam and Anoop Gupta",
booktitle = "Fifth " # asplos,
year = 1992, month = oct,
pages = "62--73",
annote = {\bn This paper presents a strong compiler prefetch
scheme (MIPS with a mixed suite of apps). Prefetch instructions
are issued after performing locality analysis and loop peeling
and splitting. They show selective prefetch is better than
indiscriminate prefetch, and that their scheme is robust across
various parameters. It also interacts well with locality
optimizers, covering the misses that blocking leaves behind (the
usual prefetch-distance computation is noted in the comment
following this entry). It has a good related-work section. \en}
)
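% Reader's note on the scheme above: the prefetch distance (the number of
% loop iterations by which a prefetch precedes the use it covers) is
% commonly computed as $d = \lceil l / s \rceil$, where $l$ is the
% expected memory latency in cycles and $s$ is the cycle count of the
% shortest path through one loop iteration. Stated from memory as an
% aid, not quoted from the paper.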
@inproceedings(przybylski,
title = "The performance impact of block sizes and fetch strategies",
author = "Steven Przybylski",
booktitle = "The 17th " # isca,
year = 1990, month = jun,
pages = "160--169",
annote = {\bn Good simulation methodology: compares simple
fetching against some simple hardware prefetching techniques, and
concludes that it isn't worthwhile to work hard on prefetching
(a gain of about 2--5\%). Investigates cycle-time penalties for
larger cache blocks, which quickly yield small optimal block sizes,
and maps out the useful prefetching space in terms of bus utilization. \en}
)
@inproceedings(klaiber-91,
title = "An architecture for software-controlled data prefetching",
author = "Alexander C. Klaiber and Henry M. Levy",
booktitle = "The 18th " # isca,
year = 1991, month = may,
pages = "43--53",
annote = {\bn {\tt FETCH} instructions are used via fetch queues to
a fetch unit that loads a
fully-associative {\em fetchbuffer}, and a {\em prewriteback}
technique is discussed to avoid replacement writeback stalls.
Data is either in the cache or the fetchbuffer, not both. There
is an analysis of optimal prefetch distance in terms of number
of references; no compiler support yet. Simulation is via a simple
pessimistic MIPS simulator, on extended Livermore Loops. They get
up to 10-fold speedup on some loops, much less on others. \en}
)
@inproceedings(fu-patel-isca,
title = "Data prefetching in multiprocessor vector cache memories",
author = "John W. C. Fu and Janak H. Patel",
booktitle = "The 18th " # isca,
year = 1991, month = may,
pages = "54--63",
annote = {\bn Using the Perfect Club (vectorized numeric) benchmarks,
the paper shows how two simple prefetching schemes can outperform
a non-prefetching cache when dealing with long stride vector
accesses and multiprocessor block invalidations in vector caches.
Simulation was with the authors' `cheap' simulator, with Alliant
FX/8 traces. Stride distance is assumed available to the cache.
{\em Sequential prefetch} only prefetches scalar and short stride
vectors, and works well. {\em Stride-prefetch} prefetches with the
stride for long stride vectors, and is better. The paper discusses
both miss ratio and completion time. \en}
)
@inproceedings(callahan-sc,
title = "Data cache performance of supercomputer applications",
author = "David Callahan and Allan Porterfield",
booktitle = sc # " '90",
year = 1990, month = nov,
pages = "564--572",
annote = {\bn This paper describes PFC-Sim and RiCEPS in detail.
Cache performance is examined (size, associativity). Simple
line-size prefetching (successful overall) and next-block prefetching
(generally a big win with little overhead) are assessed. Hardware
prefetching is ``very effective'' for column-oriented programs;
the paper looks to software for further improvements. \en}
)
@inproceedings(callahan-asplos,
title = "Software prefetching",
author = "David Callahan and Ken Kennedy and Allan Porterfield",
booktitle = "Fourth " # asplos,
year = 1991, month = apr,
pages = "40--52",
annote = {\bn A nonblocking prefetch instruction with a simple
compiler algorithm is used, tested on the RiCEPS suite (large
scientific codes) with PFC-Sim. Long line lengths and simple
hardware prefetching are examined and found adequate. Software
prefetching is done by prefetching the next iteration of the loop,
including the indexed elements of sparse matrices (sketched in the
comment following this entry). Simulation gave results on how far
ahead the value was returned. The paper assesses the address-generation
overhead of prefetching, and looks to multiple
issue, register allocation, and compiler technology to reduce it.
The best average overhead is estimated at 12\%, which is suggested to
cover latencies of 20+ cycles. \en}
)
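% A minimal sketch in C (a hypothetical illustration, not the paper's
% notation) of next-iteration software prefetching as described above;
% GCC's __builtin_prefetch stands in for the nonblocking prefetch
% instruction.
%
%   /* Dense case: prefetch the operands of iteration i+1 while doing
%    * iteration i. __builtin_prefetch is non-faulting, so the
%    * one-past-the-end prefetch on the last iteration is harmless. */
%   void daxpy(int n, double a, const double *x, double *y) {
%       for (int i = 0; i < n; i++) {
%           __builtin_prefetch(&x[i + 1]);
%           __builtin_prefetch(&y[i + 1]);
%           y[i] += a * x[i];
%       }
%   }
%
%   /* Sparse case: the index array is also used to prefetch the indexed
%    * element one iteration ahead. */
%   void gather_add(int n, const int *idx, const double *v, double *out) {
%       for (int i = 0; i < n; i++) {
%           if (i + 1 < n)
%               __builtin_prefetch(&v[idx[i + 1]]);
%           out[i] += v[idx[i]];
%       }
%   }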
@phdthesis(porterfield-rice,
title = "Software Methods for Improvement of Cache Performance
on Supercomputer Applications",
author = "A. K. Porterfield",
school = "Rice University",
year = 1989, month = may,
note = {Also available as Rice COMP TR 89-93.},
annote = {Also available as Rice COMP TR 89-93.
\bn This thesis expands on \cite{callahan-asplos}, giving most
of the corroborating detail. Simple hardware prefetching is
shown to work well. Various compiler transformations are
discussed. Cache prefetching is considered, as well as
overhead, and methods to reduce it. \en}
)
@inproceedings(gupta-isca,
title = "Comparative evaluation of latency reducing and tolerating
techniques",
author = "Anoop Gupta and John Hennessy and Kourosh Gharachorloo
and Todd Mowry and Wolf-Dietrich Weber",
booktitle = "The 18th " # ISCA,
year = 1991, month = may,
pages = "254--263",
annote = {\bn This article is an overview of four techniques: hardware
coherency, relaxed memory consistency, software prefetching, and
multiple contexts. They modelled a prefetch buffer and introduced
read and read-exclusive prefetching commands manually, getting
generally good results, though less so where prefetch overhead was
high. They found prefetching did not work well with multiple
contexts: if one technique alone would suffice, you still pay both
overheads, and context switching may invalidate prefetched
data. \en}
)
@article(trivedi-tcomps,
title = "Prepaging and applications to array algorithms",
author = "Kishor S. Trivedi",
journal = tcomps,
volume = "C-25", number = 9,
year = 1976, month = sep,
pages = "915--921",
annote = {\bn This paper examines main-memory paging algorithms,
defining the optimal algorithm DPMIN as well as realizable
algorithms with knowledge about dead pages and PRE(x)
instructions inserted into the code. The result is compared
with DPMIN and shown to do well, up to an order of magnitude.
Compiler work has not been done. \en}
)
@inproceedings(chen-pm,
title = "Data access microarchitectures for superscalar processors
with compiler-assisted data prefetching",
author = "William Y. Chen and Scott A. Mahlke and Pohua P. Chang
and Hwu, Wen-mei W.",
booktitle = "The 24th " # ism,
year = 1991, month = nov,
pages = "69--73",
annote = {\bn This paper examines tradeoffs between fetching into
a cache and fetching into a prefetch buffer that forwards
to the cache on cache misses, using a MIPS-like
simulator on some Unix benchmarks (eqntott, espresso, tbl,
xlisp, and yacc). They compare both restricted and general
code percolation. A 32-entry prefetch buffer is shown to be
enough. Apparently they insert prefetches with the compiler. \en}
)
@inproceedings(gornish-ics,
title = "Compiler-directed data prefetching in multiprocessors
with memory hierarchies",
author = "Edward H. Gornish and Elana D. Granston and
Alexander V. Veidenbaum",
booktitle = ics,
year = 1990, month = jun,
pages = "354--368",
annote = {\bn This paper examines multiprocessor issues. They
examine the earliest time at which a reference can be prefetched,
{\em suppressing} hoisting of a reference on account of control
and data dependences. They {\em freeze} loops,
then examine them from the inside out to unfreeze and prefetch
where possible, using vector prefetching when they can. They
consider both a memory-reference overlap and a no-overlap model,
using a wide variety of sample codes in the static analyses (LINPACK,
EISPACK, REALSET). For real simulation they used a pipelined system
with memory across a switch from the processors. They show a
7\% improvement with overlapped fetches, and a further 2\%--61\%
with prefetching. They suggest better results if they had
more bandwidth. \en}
)
@mastersthesis(gornish-ms,
title = "Compile Time Analysis for Data Prefetching",
author = "Edward H. Gornish",
school = "University of Illinois at Urbana-Champaign",
year = 1989, month = dec,
note = {Also available as Illinois CSRD-949.},
annote = {Also available as Illinois CSRD-949.
\bn This is an expanded version of \cite{gornish-ics}.
No simulation results are given; for their overlapped model
they predict speedups of 1.94 to 2.74. \en}
)
@inproceedings(lee-yew-isca,
title = "Multiprocessor cache design considerations",
author = "Roland L. Lee and Pen-Chung Yew and Duncan H. Lawrie",
booktitle = "The 14th " # isca,
pages = "253--262",
year = 1987, month = jun,
annote = {\bn This paper only briefly discusses prefetching. A
data buffer is used which holds the data referenced by instructions
that are being decoded, holds memory results, and forwards
data to functional units when necessary. The paper draws two
conclusions: (a) that miss ratios are worthless
measures, particularly in multiprocessor systems (they use
speedup instead), and (b) that the optimal line size for
multiprocessors is small, only one or two words. \en}
)
@inproceedings(lee-yew-icpp,
title = "Data Prefetching in Shared Memory Multiprocessors",
author = "Roland L. Lee and Pen-Chung Yew and Duncan H. Lawrie",
booktitle = icpp,
year = 1987, month = aug,
pages = "28--31",
note = {Also as Illinois CSRD TR 639, January 1987.},
annote = {Also as Illinois CSRD TR 639, January 1987.
\bn This paper compares data prefetching with caches. Prefetching
is done with branch guessing (50\% either way) and data reference
prefetching with data forwarding. It doesn't work too well with
branches and synchronization due to non-cacheable references.
They show that prefetching works better than caching, though the
combination is still better. They use a collection of numeric
routines, and test caches of 32--32K and prefetch buffers of
0--16 words. \en}
)
@phdthesis(lee-thesis,
title = "The Effectiveness of Caches and Data Prefetch
Buffers in Large-Scale Shared Memory Multiprocessors",
author = "Roland Lun Lee",
school = "University of Illinois at Urbana-Champaign",
year = 1987, month = may,
note = {Also available as Illinois CSRD 670.},
annote = {Also available as Illinois CSRD 670.
\bn This work is basically the same as what is in
\cite{lee-yew-isca,lee-yew-icpp}. \en}
)
@article(johnson-acm,
title = "Working set prefetching for cache memories",
author = "Eric E. Johnson",
journal = can,
volume = 17, number = 6,
year = 1989, month = dec,
pages = "137--141",
annote = {\bn This paper discusses ordinary prefetching, both
history-based and {\em prescient}, i.e. compiler-assisted. It
then turns to {\em working set prefetching}, where the compiler
determines working sets and then whole sets are prefetched
when the program goes through phase transitions.
Multiprocessors are considered, where sets are tagged and broadcast
on request to reduce the bandwidth likely to be required. He also
suggests a sequence number so that data-flow style cache
updating of working sets can be done. No numbers are presented. \en}
)
@inproceedings(jouppi-isca,
title = "Improving direct-mapped cache performance by the addition of
a small fully-associative cache and prefetch buffers",
author = "Norman P. Jouppi",
booktitle = "The 17th " # ISCA,
year = 1990, month = may,
pages = "364--373",
annote = {\bn Jouppi discusses {\em victim caches}, which are
small (4--8 entry) fully associative caches which hold lines that
are evicted from the main cache, and can then supply them with
low latency if they are requested again. It removes about 10\%
of cache misses, but does better with larger cache lines. He
also suggests {\em stream buffers}. When a miss occurs, a
stream starts prefetching successive lines, providing fetched
data to the cache and pipelining reads. Multiple streams are
also used with new streams selected at miss-time in LRU
fashion. The multi-way streams can remove about 43\% of the
misses in the six benchmark programs. The combined techniques
give 143\% improvement in system performance on average. \en}
)
@article(smith-ieee,
title = "Sequential program prefetching in memory hierarchies",
author = "Alan Jay Smith",
journal = comp,
volume = 11, number = 12,
year = 1978, month = dec,
pages = "7--21",
annote = {\bn This article deals just with sequential
prefetching. He finds prefetching works best for small cache
lines (about 32--64 bytes), and not well for larger lines. The
paper examines the Amdahl 470V/6 and IBM 370/168. He suggests
improvements on the order of 10--25\%. \en}
)
@article(bennett-ibm,
title = "Prefetching in a multilevel hierarchy",
author = "B. T. Bennett and J. H. Pomerene and T. R. Puzak and
R. N. Rechtschaffen",
journal = ibmtech,
volume = 25, number = 1,
year = 1982, month = jun,
pages = 88,
annote = {\bn Short blurb about incorporating a {\em prefetching engine}
for L1 cache in L2 cache. \en}
)
@article(gindele-ibm,
title = "Buffer block prefetching method",
author = "J. D. Gindele",
journal = ibmtech,
volume = 20, number = 2,
year = 1977, month = jul,
pages = "696--697",
annote = {\bn Sets forth the next-block hardware prefetch strategy,
and claims 32-128 byte lines are good and allow for caches of
1/2 to 1/4 capacity to maintain the same hit ratio via prefetching.
\en}
)
@inproceedings(decoupled-isca,
title = "Decoupled access/execute computer architecture",
author = "James E. Smith",
booktitle = "The 9th " # isca,
year = 1982, month = apr,
pages = "112--119",
annote = {\bn This paper gives a different slant on prefetch; the
prefetching is handled by a decoupled {\em address} unit which
communicates with queues back and forth with the {\em execute}
unit. No speculation is performed; the A unit is assumed to
handle most of the conditional branching directly and provide
the result to the E unit on a branch queue. Results are given
for 14 Livermore Loops using semi-Cray instructions, and
speedups of up to 2.5 are seen, average 1.71, with Cray-1 style
latencies, e.g. 11-cycle loads. \en}
)
@inproceedings(kroft-isca,
title = "Lockup-free instruction fetch/prefetch cache organization",
author = "David Kroft",
booktitle = "The 8th " # ISCA,
year = 1981, month = may,
pages = "81--87",
annote = {\bn Kroft documents how to build a cache which can have
multiple outstanding requests (up to some hardware limit, which
he suggests should be 4). Miss information/status holding registers
(MSHRs) are the key resources. \en}
)
@article(gannon-jpdc,
title = "Strategies for cache and local memory management by
global program transformation",
author = "Dennis Gannon and William Jalby and Kyle Gallivan",
journal = jpdc,
volume = 5, number = 5,
year = 1988, month = oct,
pages = "587--616",
note = {Also in the Proceedings of ICS '87.},
annote = {Also in the Proceedings of ICS '87.
\bn This paper is very mathematical, so I only skimmed it.
Each variable is associated with a `reference window' which is
used to determine what should be kept in cache (when controlled
in software) or what the hit rate will be (for hardware
control). They consider program restructuring \`a la
\cite{abu-sufah-79}, plus loop blocking. \en}
)
@inproceedings(farrens-isca,
title = "Improving performance of small on-chip instruction caches",
author = "Matthew K. Farrens and Andrew R. Pleszkun",
booktitle = "The 16th " # isca,
year = 1989, month = jun,
pages = "234--241",
annote = {\bn This paper does simple instruction prefetching by
putting two queues between the instruction register and the
cache; the queues force prefetches as they drain. This gives
speedups of up to $2\times$ over conventional caches, although
they only simulate caches up to 512 bytes in size. They also
document PIPE chip features such as PBR (prepare to branch) and
exposed load/store pipelines. \en}
)
@inproceedings(smith-isca-12,
title = "Cache evaluation and the impact of workload choice",
author = "Alan Jay Smith",
booktitle = "12th " # isca,
year = 1985, month = jun,
pages = "64--73",
annote = {\bn This article only touches on prefetching, where it
looks at the simple $i+1$ prefetch model. He finds that prefetching
is increasingly useful with increasing cache size, that it always
cuts the ifetch miss ratio, and that it cuts the data miss ratio
for data caches of 8K or more. He notes icache traffic goes up
by 30--40\% with prefetching, and seemingly by almost
a factor of 2 for data prefetching. \en}
)
@inproceedings(scheurich-dubois,
title = "Concurrent miss resolution in multiprocessor caches",
author = "Christoph Scheurich and Michel Dubois",
booktitle = icpp,
year = 1988, month = aug,
volume = "I",
pages = "118--125",
annote = {\bn This article describes a good cache architecture:
one-word cache lines allow writes to avoid writebacks, but the
lockup-free nature of the cache allows multiple outstanding
reads to avoid taking a big hit from the higher miss rate.
A compiler-enforced weak consistency model also simplifies the
hardware. Prefetching is suggested as helpful in this situation.
Some analytical curves are given. \en}
)
@article(anant-tocs,
title = "An Analytical Cache Model",
author = "Anant Agarwal and Mark Horowitz and John Hennessy",
journal = tocs,
year = 1989, month = may,
volume = 7, number = 2,
pages = "184--215",
annote = {\bn A very interesting paper in which an analytical
model for cache performance is derived and compared with
results. The model yields hit rate as a function of
cache size, associativity, block size, sub-block
size, multiprogramming level, task switch interval, and
observation interval. Measured factors include the number of
blocks referenced per interval, the total number of blocks
referenced, and the {\em collision rate}. \en}
)
@article(smith-cache,
title = "Cache memories",
author = "Alan Jay Smith",
journal = acmcs,
volume = 14, number = 3,
year = 1982, month = sep,
pages = "473--530",
annote = {\bn This giant paper steps through many features of
caches and comments on them. One of the initial sections gives
a good overview of his views on prefetching, specifically for
OBL (one-block lookahead); he compares always-prefetch with
tagged prefetch, which prefetches block $i+1$ on a demand miss or
on the first reference to a prefetched block (sketched in the
comment following this entry). \en}
)
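% A toy sketch in C of tagged one-block-lookahead prefetch (one common
% formulation, not code from the survey): every line carries a
% "referenced" tag bit, and block i+1 is prefetched on a demand miss for
% block i or on the first reference to a line that arrived via prefetch.
%
%   #include <stdbool.h>
%
%   #define NLINES 64                   /* toy direct-mapped cache */
%
%   struct line { unsigned blk; bool valid; bool referenced; };
%   static struct line cache[NLINES];
%
%   static void fill(unsigned blk, bool demand) {
%       struct line *ln = &cache[blk % NLINES];
%       ln->blk = blk;
%       ln->valid = true;
%       ln->referenced = demand;        /* prefetched lines arrive untagged */
%   }
%
%   static void reference(unsigned blk) {
%       struct line *ln = &cache[blk % NLINES];
%       if (!ln->valid || ln->blk != blk) {   /* demand miss */
%           fill(blk, true);
%           fill(blk + 1, false);       /* one-block lookahead; a real
%                                          cache would check residency */
%       } else if (!ln->referenced) {   /* first use of a prefetched line */
%           ln->referenced = true;
%           fill(blk + 1, false);
%       }
%   }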
@article(driscoll-ibm,
title = "Cache miss directory---a means of prefetching cache `missed'
lines",
author = "G. C. Driscoll and J. J. Losq and T. R. Puzak and G. S. Rao
and H. E. Sachar and R. D. Villani",
journal = ibmtech,
volume = 25, number = "3A",
year = 1982, month = aug,
pages = "1286",
annote = {\bn This one-page paper outlines a scheme where a number of
{\em trigger} addresses are associated with {\em target} prefetch
addresses; referencing the trigger causes a prefetch of the
target, issued at the lowest priority and aborted by any other access.
The CMD is loaded by an initial run with a hardware monitor. \en}
)
@article(rechtschaffen-ibm,
title = "Using a branch history table to prefetch cache lines",
author = "R. N. Rechtschaffen",
journal = ibmtech,
volume = 22, number = 12,
year = 1980, month = may,
pages = "5539",
annote = {\bn The BHT is suggested as a way to prefetch instruction
lines. \en}
)
@techreport(chi-purdue,
title = "Compiler-driven cache policy (known reference string)",
author = "Chi-Hung Chi and Henry G. Dietz",
institution = "Purdue University",
number = "TR-EE 87-21",
year = 1987, month = jun,
annote = {\bn This paper doesn't say too much about prefetching.
Their main focus is showing how you can take a known reference
string of cache lines, add weights to transitions between cache
states, and do a shortest-path search, then allow tags on your
instructions for fetch-bypass-cache and fetch-evict-line-NN
when the data is not already in cache. They give the compiler
performance as $\Theta(k^{-1}mn)$, and tiny examples of improved
run-time performance, but no real results yet. More work
coming. \en}
)
@unpublished(selvidge-isca,
title = "Compilation-based Data Prefetching",
author = "Charles Selvidge",
year = 1992, month = may,
note = {May, 1992. Draft of paper for submission.},
annote = {May, 1992. Draft of paper for submission.
\bn This paper employs good prefetching techniques.
References are divided into {\em badrefs} which need
prefetching, and {\em goodrefs}, which can be used to add
parallelism as necessary. Prefetching is done both on {\em
RAAD}s and also in a trace-driven manner, both proving useful
for different codes. The SPEC benchmarks are simulated on SPARC
with a prefetch instruction and both stalling and interlocked memory;
interlocked memory with a 160-cycle latency shows a 2--6$\times$
improvement with prefetching. \en}
)
@phdthesis(selvidge,
title = "Compilation-Based Prefetching for Memory Latency Tolerance",
author = "Charles Selvidge",
school = "MIT",
year = 1992, month = may,
annote = {\bn This thesis is the extended version of
\cite{selvidge-isca}; it provides implementation specifics and more
detailed results. \en}
)
@article(comp-survey,
title = "A survey of {RISC} processors and computers of the mid-1980's",
author = "Charles E. Gimarc and Veljko M. Milutinovi\'c",
journal = comp,
volume = 20, number = 9,
year = 1987, month = sep,
pages = "59--69",
annote = {\bn A general survey of RISC chips. It doesn't discuss
prefetching much. \en}
)
@techreport(rau-stanford,
title = "Sequential Prefetch Strategies for Instructions and Data",
author = "B. R. Rau",
institution = "Stanford University, Digital Systems Laboratory",
number = "CSL-TR-77-131",
year = 1977,
annote = {\bn Unread. \en}
)
@inproceedings(bpt-asplos,
title = "Improving the accuracy of dynamic branch prediction
using branch correlation",
author = "Pan, Shien-Tai and Kimming So and Joseph T. Rahmeh",
booktitle = "The Fifth " #asplos,
year = 1992, month = oct,
pages = "76--84",
annote = {\bn A handy reference for some prefetching techniques. \en}
)
@article(hill-assoc,
title = "Evaluating associativity in {CPU} caches",
author = "Mark D. Hill and Alan Jay Smith",
journal = tcomps,
volume = 38, number = 12,
year = 1989, month = dec,
pages = "1612--1630",
annote = {\bn Information on compulsory/capacity/conflict misses. \en}
)
@unpublished(alpha,
title = "Alpha Architecture Technical Summary",
author = "Dick Sites and Rich Witek",
year = 1992,
note = "Available for anonymous FTP from
{\tt gatekeeper.dec.com} as {\tt pub/DEC/Alpha/technical-summary.txt}",
annote = {\bn Not read. \en}
)
@book(rs6000,
title = "IBM Journal of Research and Development, Special
Issue on RISC System/6000",
author = "IBM",
publisher = "IBM",
year = 1990, month = jan,
annote = {\bn Not read. \en}
)
@book(moto88100,
title = "MCS88100: RISC Microprocessor User's Manual",
author = "Motorola",
publisher = "Motorola Inc.",
year = 1988,
annote = {\bn Not read. \en}
)
@inproceedings(fp-micro,
title = "Stride directed prefetching in scalar processors",
author = "J. W. C. Fu and J. H. Patel",
booktitle = "Proceedings of the 25th MICRO",
pages = "102--110",
year = 1992,
annote = {\bn Expands on their previous paper with an SPT (stride
prediction table), with no IP prediction component. They give
a good discussion of miss rates and prefetch overhead without
actually showing performance figures. \en}
)
@phdthesis(chen-thesis,
title = "Data Prefetching for High-Performance Processors",
author = "Tien-Fu Chen",
school = "University of Washington at Seattle",
year = 1993, month = jul,
note = {Also available as TR 93-07-01.},
annote = {\bn Gives interesting results beyond \cite{chen-baer-asplos};
software prefetching is compared to their hardware-only scheme,
as well as a combined scheme. Presents as good an argument as I've
seen for the all-hardware approach. \en}
)
@article(spec-cache,
title = "Cache performance of the {SPEC92} benchmark suite",
author = "Jeffrey D. Gee and Mark D. Hill and Dionisis N.
Pnevmatikatos and Alan Jay Smith",
journal = micro,
year = 1993, month = aug,
volume = 13, number = 4,
pages = "17--27",
annote = {\bn Interesting reading. \en}
)
@inproceedings(spaniol,
title = "Demand prepaging algorithms based on a model
of locality of programs",
author = "Otto Spaniol",
booktitle = "Computer Architectures and Networks",
year = 1974, month = aug,
editor = "E. Gelenbe and R. Mahl",
pages = "515--527",
publisher = "North-Holland",
annote = {\bn Not too directly relevant to prefetching, but
interesting. \en}
)
@article(sklenar-can,
title = "Prefetch unit for vector operations on scalar computers",
author = "Ivan Sklen\'a\v{r}",
journal = can,
volume = 20, number = 4,
year = 1992, month = sep,
pages = "31--37",
annote = {\bn Very similar to the constant-stride prefetch described
in \cite{baer-sc}, so not very interesting. No prediction attempts. \en}
)
@article(smith-bib,
title = "Bibliography and Readings on {CPU} Cache Memories and
Related Topics",
author = "Alan Jay Smith",
journal = can,
volume = 14, number = 1,
year = 1986, month = jan,
pages = "22--42",
annote = {\bn Didn't really read through it. \en}
)
@phdthesis(anant-su,
title = "Analysis of Cache Performance for Operating Systems and
Multiprogramming",
author = "Anant Agarwal",
school = "Stanford University",
year = 1987, month = may,
annote = {\bn Available as Technical Report CSL-TR-87-332.
Didn't read. \en}
)
@article(bennett,
title = "Cache Memory with Prefetching of Data by Priority",
author = "B. T. Bennett and P. A. Franaczek",
journal = ibmtech,
volume = 18, number = 12,
year = 1976, month = may,
pages = "4231--4232",
annote = {\bn Couldn't find. \en}
)
@article(enger-ibm,
title = "Paged control store prefetch mechanism",
author = "T. A. Enger",
journal = ibmtech,
volume = 16, number = 7,
year = 1973, month = dec,
pages = "2140--2141",
annote = {\bn Couldn't find. \en}
)
@inproceedings(fu-patel-ipps,
title = "Data prefetching strategies for vector cache memories",
author = "John W. C. Fu and Janak H. Patel",
booktitle = "5th " # ipps,
year = 1991, month = may,
pages = "555--560",
annote = {\bn See \cite{fu-patel-isca} for all details; this paper
has identical content. \en}
)