-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: Extend jump tables #107831
JIT: Extend jump tables #107831
Conversation
1926677
to
998a205
Compare
@EgorBo, is it a bug in the diff calculation that this is showing up as regression and not an improvement: @@ -171,7 +171,7 @@ G_M23203_IG02: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=180000 {x19 x20
movk x0, #0xD1FFAB1E LSL #16
movk x0, #1 LSL #32
ldapr w0, [x0]
- tbz w0, #0, G_M23203_IG80
+ tbz w0, #0, G_M23203_IG88
;; size=48 bbWeight=1 PerfScore 9.50
G_M23203_IG03: ; bbWeight=1, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref, isz
movz x0, #0xD1FFAB1E // data for Microsoft.CodeAnalysis.PooledObjects.PooledStringBuilder:s_poolInstance
@@ -199,8 +199,7 @@ G_M23203_IG03: ; bbWeight=1, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRe
; gcrRegs -[x0]
cmp w3, w0
bge G_M23203_IG07
- b G_M23203_IG04
- ;; size=80 bbWeight=1 PerfScore 30.00
+ ;; size=76 bbWeight=1 PerfScore 29.00
G_M23203_IG04: ; bbWeight=4, gcVars=00000000000000000000800000000002 {V12 V13}, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, gcvars, byref, isz
; gcrRegs -[x4-x5]
ldr x6, [x19]
@@ -211,7 +210,7 @@ G_M23203_IG04: ; bbWeight=4, gcVars=00000000000000000000800000000002 {V12
movk x3, #0xD1FFAB1E LSL #16
movk x3, #1 LSL #32
ldapr w3, [x3]
- tbz w3, #0, G_M23203_IG81
+ tbz w3, #0, G_M23203_IG89
;; size=28 bbWeight=4 PerfScore 38.00
G_M23203_IG05: ; bbWeight=4, gcVars=00000000000000000000800000000012 {V12 V13 V33}, gcrefRegs=B800040 {x6 x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, gcvars, byref, isz
movz x3, #0xD1FFAB1E // data for Microsoft.CodeAnalysis.MetadataHelpers+SerializedTypeDecoder:s_typeNameDelimiters
@@ -232,14 +231,14 @@ G_M23203_IG05: ; bbWeight=4, gcVars=00000000000000000000800000000012 {V12
; GC ptr vars -{V33}
blr x6
; gcrRegs -[x0-x1]
- tbnz w0, #31, G_M23203_IG10
+ tbnz w0, #31, G_M23203_IG08
ldr x3, [x19]
; gcrRegs +[x3]
mov x2, x3
; gcrRegs +[x2]
ldr w1, [x2, #0x08]
cmp w0, w1
- bhs G_M23203_IG86
+ bhs G_M23203_IG94
add x2, x2, #12
; gcrRegs -[x2]
; byrRegs +[x2]
@@ -248,58 +247,36 @@ G_M23203_IG05: ; bbWeight=4, gcVars=00000000000000000000800000000012 {V12
ldr w1, [x19, #0x08]
sub w2, w0, w1
; byrRegs -[x2]
- cbnz w2, G_M23203_IG16
+ cbnz w2, G_M23203_IG14
;; size=100 bbWeight=4 PerfScore 140.00
G_M23203_IG06: ; bbWeight=2, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref
; gcrRegs -[x3]
movz x5, #8
movk x5, #0xD1FFAB1E LSL #16
movk x5, #1 LSL #32
- b G_M23203_IG18
+ b G_M23203_IG16
;; size=16 bbWeight=2 PerfScore 5.00
G_M23203_IG07: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
; byrRegs -[x19]
- ldr x6, [fp, #0x28] // [V13 loc9]
- ; gcrRegs +[x6]
- ;; size=4 bbWeight=0.50 PerfScore 1.00
-G_M23203_IG08: ; bbWeight=1, gcrefRegs=B800040 {x6 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref, isz
- mov x0, x6
- ; gcrRegs +[x0]
- movz x1, #0xD1FFAB1E // code for <unknown method>
- movk x1, #0xD1FFAB1E LSL #16
- movk x1, #1 LSL #32
- ldr x1, [x1]
- ldr wzr, [x0]
- ; GC ptr vars -{V13}
- blr x1
- ; gcrRegs -[x6]
- mov x19, x0
- ; gcrRegs +[x19]
- ldr w0, [x19, #0x08]
- ; gcrRegs -[x0]
- cbz w0, G_M23203_IG72
- ;; size=40 bbWeight=1 PerfScore 13.50
-G_M23203_IG09: ; bbWeight=0.50, gcrefRegs=B880000 {x19 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref, isz
- cbz w28, G_M23203_IG68
- mov x23, x19
- b G_M23203_IG72
- ;; size=12 bbWeight=0.50 PerfScore 1.25
-G_M23203_IG10: ; bbWeight=0.50, gcVars=00000000000000000000800000000002 {V12 V13}, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, gcvars, byref, isz
- ; gcrRegs -[x19]
+ ldr x5, [fp, #0x28] // [V13 loc9]
+ ; gcrRegs +[x5]
+ b G_M23203_IG69
+ ;; size=8 bbWeight=0.50 PerfScore 1.50
+G_M23203_IG08: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref, isz
+ ; gcrRegs -[x5]
; byrRegs +[x19]
- ; GC ptr vars +{V01 V13}
ldr x3, [x19]
; gcrRegs +[x3]
ldr w2, [x3, #0x08]
ldr w1, [x19, #0x08]
sub w0, w2, w1
- cbz w0, G_M23203_IG12
+ cbz w0, G_M23203_IG10
ldr w0, [x3, #0x08]
cmp w0, w2
- blt G_M23203_IG11
+ blt G_M23203_IG09
str w2, [x19, #0x08]
;; size=36 bbWeight=0.50 PerfScore 8.00
-G_M23203_IG11: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref
+G_M23203_IG09: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref
; gcrRegs -[x3]
ldr w2, [x19, #0x08]
sub w2, w2, w1
@@ -312,22 +289,22 @@ G_M23203_IG11: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byre
ldr wzr, [x0]
blr x3
; byrRegs -[x19]
- b G_M23203_IG13
+ b G_M23203_IG11
;; size=40 bbWeight=0.50 PerfScore 8.00
-G_M23203_IG12: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
+G_M23203_IG10: ; bbWeight=0.50, gcrefRegs=B800000 {x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
; gcrRegs -[x0]
movz x0, #8
movk x0, #0xD1FFAB1E LSL #16
movk x0, #1 LSL #32
;; size=12 bbWeight=0.50 PerfScore 0.75
-G_M23203_IG13: ; bbWeight=0.50, gcrefRegs=B800001 {x0 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref, isz
+G_M23203_IG11: ; bbWeight=0.50, gcrefRegs=B800001 {x0 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref, isz
; gcrRegs +[x0]
ldr x22, [fp, #0x28] // [V13 loc9]
; gcrRegs +[x22]
ldrsb wzr, [x22]
- cbz x0, G_M23203_IG15
+ cbz x0, G_M23203_IG13
;; size=12 bbWeight=0.50 PerfScore 3.00
-G_M23203_IG14: ; bbWeight=0.50, gcrefRegs=BC00001 {x0 x22 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
+G_M23203_IG12: ; bbWeight=0.50, gcrefRegs=BC00001 {x0 x22 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
ldr w2, [x0, #0x08]
add x1, x0, #12
; byrRegs +[x1]
@@ -339,25 +316,25 @@ G_M23203_IG14: ; bbWeight=0.50, gcrefRegs=BC00001 {x0 x22 x23 x24 x25 x27
blr x3
; gcrRegs -[x0]
; byrRegs -[x1]
- mov x6, x22
- ; gcrRegs +[x6]
- b G_M23203_IG08
+ mov x5, x22
+ ; gcrRegs +[x5]
+ b G_M23203_IG69
;; size=40 bbWeight=0.50 PerfScore 5.49
-G_M23203_IG15: ; bbWeight=0.00, gcrefRegs=BC00000 {x22 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
- ; gcrRegs -[x6]
- mov x6, x22
- ; gcrRegs +[x6]
- b G_M23203_IG08
+G_M23203_IG13: ; bbWeight=0.00, gcrefRegs=BC00000 {x22 x23 x24 x25 x27}, byrefRegs=100000 {x20}, byref
+ ; gcrRegs -[x5]
+ mov x5, x22
+ ; gcrRegs +[x5]
+ b G_M23203_IG69
;; size=8 bbWeight=0.00 PerfScore 0.00
-G_M23203_IG16: ; bbWeight=2, gcrefRegs=B800008 {x3 x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref, isz
- ; gcrRegs -[x6 x22] +[x3]
+G_M23203_IG14: ; bbWeight=2, gcrefRegs=B800008 {x3 x23 x24 x25 x27}, byrefRegs=180000 {x19 x20}, byref, isz
+ ; gcrRegs -[x5 x22] +[x3]
; byrRegs +[x19]
... Looks like it has ignored |
PTAL @AndyAyersMS @dotnet/jit-contrib Diffs. It seems that there are some additional diffs from merging two switches (on top of the same variable) or expanding value tests on all edges, not just the edge pointing to the The only tricky thing was to re-distribute weights. It seems that currently we don't support having separate likelihoods for different edges pointing to the same target block for switch (at least, the profile weight validation does not). There are a few cases where by expanding the default target we break the bit-test optimization (because that requires switch to have just 2 unique successors), but I presume a jump-table is still nice to have and there are not too many of them to worry about memory impact. |
This happens when a diff is too large and is trimmed, so the part where it regressed was trimmed out here. We normally inspect such diffs locally by hands. |
FlowEdge* edgeToExpand = switchTargets->getDefault(); | ||
BasicBlock* edgeToExpandDst = edgeToExpand->getDestinationBlock(); | ||
|
||
// IsConstantTestCondBlock will tell us if the default target is a value test block (e.g. "x == cns") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you also need to check if edgeToExpandDst
has other statements, and if so, bail out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, oops, good point! I thought that IsConstantTestCondBlock
checked for that but it asks caller for that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
if (block->KindIs(BBJ_COND) && !block->isRunRarely() && optSwitchDetectAndConvert(block)) | ||
{ | ||
JITDUMP("Converted block " FMT_BB " to switch\n", block->bbNum) | ||
modified = true; | ||
} | ||
#endif | ||
|
||
// See if we can merge BBJ_COND into an existing switch block |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I noticed for tier1-instr code this causes a lot of code expansion -- maybe skip doing this if we're instrumenting?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
weight_t newTestLikelihood = edgeToExpandLikelihood * testPasses->getLikelihood(); | ||
weight_t newDefLikelihood = edgeToExpandLikelihood * testFails->getLikelihood(); | ||
|
||
// However, it is tricky if testPasses or testFails are already presented in the jump table. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This might create a new dominant case for switch peeling (which happens later during block layout). If the default case was the most likely case before, and the new non-default case is now the most likely case and is reachable by just one case value, consider setting bbsHasDominantCase
and bbsDominantCase
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This might create a new dominant case for switch peeling (which happens later during block layout)
I don't think this is true. We currently run switch recognition after optOptimizeLayout
, and the late reordering pass in the backend only reorders blocks, without any of the additional opt passes like fgOptimizeSwitchJumps
. Perhaps we should consider moving switch recognition before layout?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we should consider moving switch recognition before layout?
Afair, the order of when to do it was driven by diffs, I can check again
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW, I plan to move layout completely to the backend, and leave all the flow opts (like switch peeling) where they are now, so it might be easier to flip optOptimizeLayout
and optSwitchRecognition
after that change goes in.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! I've re-ordered the phases, diffs didn't change much
src/coreclr/jit/error.cpp
Outdated
@@ -268,9 +268,10 @@ extern "C" void __cdecl assertAbort(const char* why, const char* file, unsigned | |||
{ | |||
phaseName = PhaseNames[env->compiler->mostRecentlyActivePhase]; | |||
_snprintf_s(buff, BUFF_SIZE, _TRUNCATE, | |||
"Assertion failed '%s' in '%s' during '%s' (IL size %d; hash 0x%08x; %s)\n", why, | |||
"Assertion failed '%s' in '%s' during '%s' (IL size %d; BBs: %d; hash 0x%08x; %s)\n", why, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we really want this in the assert? That seems like an internal JIT detail that can be found by the JIT developer when analyzing the assert. The other data here helps clarify exactly which function the assert hits in. (The "during" helps triage the failure)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@BruceForstall sure, I can remove this change, but for me personally, when I run SPMI, I try to pick the most trivial repro to investigate first, and ILSize doesn't really help since it doesn't take inlining into account, e.g. 200 bytes of IL could be in fact 300 basic blocks 🙂
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
} | ||
|
||
// We're less conservative than Roslyn, but we still have some limits | ||
if (static_cast<size_t>(cns + switchTargetOffset) > SWITCH_MAX_DISTANCE) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we have a stress mode without a limit? I.e., is this correctness or profitability?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, good idea to enable it under stress, it's purely a heuristic
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
@AndyAyersMS @amanasifkhalid @BruceForstall feedback is addressed |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since you added a new stress mode, maybe run stress?
/azp list |
This comment was marked as resolved.
This comment was marked as resolved.
/azp run runtime-coreclr outerloop, runtime-coreclr jitstress, Fuzzlyn |
Azure Pipelines successfully started running 3 pipeline(s). |
Failures are unrelated |
This reverts commit a35f722.
This reverts commit a35f722.
Closes #107801
Roslyn may create jump-tables (switch IL opcode), however, it uses fairly conservative heuristics on when to emit those. Let's consider the following example:
Here we have values (int32):
9, 10, 13, 32
, or when normalized (minimal value is subtracted from all values):0, 1, 4, 23
. Roslyn decides (see sharplab) to use the switch only for 0,1,4 and perform a regular if-else for 32. JIT can afford being less conservative and still include it, due to:Codegen example:
Current codegen:
New codegen: