KEMBAR78
Improve constant folding for some frozen objects by EgorBo · Pull Request #86318 · dotnet/runtime · GitHub
Skip to content

Conversation

@EgorBo
Copy link
Member

@EgorBo EgorBo commented May 16, 2023

Just an exercise to improve constant folding in VN around frozen objects. As a demonstration:

void Foo() => Bar("TruE");

void Bar(string str)
{
    if (bool.TryParse(str, out bool value))
        Console.WriteLine(value);
}

New codegen:

; Method Program:Foo():this
       sub      rsp, 56
       xor      eax, eax
       mov      qword ptr [rsp+20H], rax
       mov      byte  ptr [rsp+30H], 0
       mov      dword ptr [rsp+30H], 1
       movzx    rcx, byte  ptr [rsp+30H]
       call     [System.Console:WriteLine(bool)]
       nop      
       add      rsp, 56
       ret      
; Total bytes of code: 41
old codegen was terrible
; Method Program:Foo():this
G_M52879_IG01:  ;; offset=0000H
       push     rsi
       sub      rsp, 112
       vzeroupper 
       vxorps   xmm4, xmm4, xmm4
       vmovdqa  xmmword ptr [rsp+20H], xmm4
       vmovdqa  xmmword ptr [rsp+30H], xmm4
       vmovdqa  xmmword ptr [rsp+40H], xmm4
       vmovdqa  xmmword ptr [rsp+50H], xmm4
       vmovdqa  xmmword ptr [rsp+60H], xmm4
						;; size=42 bbWeight=1 PerfScore 12.58

G_M52879_IG02:  ;; offset=002AH
       mov      rdx, 0x21000209280      ; 'TruE'
       add      rdx, 12
       mov      bword ptr [rsp+60H], rdx
       mov      dword ptr [rsp+68H], 4
       lea      rdx, bword ptr [rsp+60H]
       cmp      dword ptr [rdx+08H], 4
       jne      SHORT G_M52879_IG05
						;; size=38 bbWeight=1 PerfScore 7.00

G_M52879_IG03:  ;; offset=0050H
       mov      rdx, bword ptr [rdx]
       mov      rcx, 0x20002000200020
       or       rcx, qword ptr [rdx]
       mov      rdx, 0x65007500720074
       cmp      rcx, rdx
       jne      SHORT G_M52879_IG05
						;; size=31 bbWeight=0.25 PerfScore 1.69

G_M52879_IG04:  ;; offset=006FH
       mov      ecx, 1
       jmp      G_M52879_IG24
						;; size=10 bbWeight=0.50 PerfScore 1.12

G_M52879_IG05:  ;; offset=0079H
       lea      rcx, bword ptr [rsp+60H]
       mov      rdx, bword ptr [rcx]
       cmp      dword ptr [rcx+08H], 5
       jne      SHORT G_M52879_IG08
						;; size=14 bbWeight=0.50 PerfScore 3.25

G_M52879_IG06:  ;; offset=0087H
       mov      rcx, 0x20002000200020
       or       rcx, qword ptr [rdx]
       mov      rax, 0x73006C00610066
       xor      rcx, rax
       mov      edx, dword ptr [rdx+06H]
       or       edx, 0x200020
       xor      edx, 0x650073
       or       rdx, rcx
       jne      SHORT G_M52879_IG08
						;; size=46 bbWeight=0.25 PerfScore 1.88

G_M52879_IG07:  ;; offset=00B5H
       xor      ecx, ecx
       jmp      G_M52879_IG24
						;; size=7 bbWeight=0.50 PerfScore 1.12

G_M52879_IG08:  ;; offset=00BCH
       vmovdqu  xmm0, xmmword ptr [rsp+60H]
       vmovdqu  xmmword ptr [rsp+50H], xmm0
						;; size=12 bbWeight=0.50 PerfScore 2.00

G_M52879_IG09:  ;; offset=00C8H
       mov      esi, dword ptr [rsp+58H]
       cmp      dword ptr [rsp+58H], 5
       jl       G_M52879_IG22
						;; size=15 bbWeight=0.50 PerfScore 2.00

G_M52879_IG10:  ;; offset=00D7H
       vmovdqu  xmm0, xmmword ptr [rsp+50H]
       vmovdqu  xmmword ptr [rsp+20H], xmm0
						;; size=12 bbWeight=0.50 PerfScore 2.00

G_M52879_IG11:  ;; offset=00E3H
       lea      rdx, [rsp+20H]
       lea      rcx, [rsp+50H]
       call     [System.Boolean:TrimWhiteSpaceAndNull(System.ReadOnlySpan`1[ushort]):System.ReadOnlySpan`1[ushort]]
       cmp      dword ptr [rsp+58H], esi
       je       G_M52879_IG22
						;; size=26 bbWeight=0.50 PerfScore 3.50

G_M52879_IG12:  ;; offset=00FDH
       vmovdqu  xmm0, xmmword ptr [rsp+50H]
       vmovdqu  xmmword ptr [rsp+40H], xmm0
						;; size=12 bbWeight=0.50 PerfScore 2.00

G_M52879_IG13:  ;; offset=0109H
       lea      rcx, bword ptr [rsp+40H]
       cmp      dword ptr [rcx+08H], 4
       jne      SHORT G_M52879_IG16
						;; size=11 bbWeight=0.50 PerfScore 2.25

G_M52879_IG14:  ;; offset=0114H
       mov      rcx, bword ptr [rcx]
       mov      rax, 0x20002000200020
       or       rax, qword ptr [rcx]
       mov      rcx, 0x65007500720074
       cmp      rax, rcx
       jne      SHORT G_M52879_IG16
						;; size=31 bbWeight=0.25 PerfScore 1.69

G_M52879_IG15:  ;; offset=0133H
       mov      ecx, 1
       mov      eax, 1
       jmp      SHORT G_M52879_IG23
						;; size=12 bbWeight=0.50 PerfScore 1.25

G_M52879_IG16:  ;; offset=013FH
       xor      ecx, ecx
						;; size=2 bbWeight=0.50 PerfScore 0.12

G_M52879_IG17:  ;; offset=0141H
       vmovdqu  xmm0, xmmword ptr [rsp+50H]
       vmovdqu  xmmword ptr [rsp+30H], xmm0
						;; size=12 bbWeight=0.50 PerfScore 2.00

G_M52879_IG18:  ;; offset=014DH
       lea      rax, bword ptr [rsp+30H]
       mov      rdx, bword ptr [rax]
       cmp      dword ptr [rax+08H], 5
       jne      SHORT G_M52879_IG20
						;; size=14 bbWeight=0.50 PerfScore 3.25

G_M52879_IG19:  ;; offset=015BH
       mov      rax, 0x20002000200020
       or       rax, qword ptr [rdx]
       mov      r8, 0x73006C00610066
       xor      rax, r8
       mov      edx, dword ptr [rdx+06H]
       or       edx, 0x200020
       xor      edx, 0x650073
       or       rax, rdx
       sete     al
       movzx    rax, al
       jmp      SHORT G_M52879_IG21
						;; size=52 bbWeight=0.25 PerfScore 2.44

G_M52879_IG20:  ;; offset=018FH
       xor      eax, eax
						;; size=2 bbWeight=0.25 PerfScore 0.06

G_M52879_IG21:  ;; offset=0191H
       jmp      SHORT G_M52879_IG23
						;; size=2 bbWeight=0.50 PerfScore 1.00

G_M52879_IG22:  ;; offset=0193H
       xor      ecx, ecx
       xor      eax, eax
						;; size=4 bbWeight=0.50 PerfScore 0.25

G_M52879_IG23:  ;; offset=0197H
       test     eax, eax
       je       SHORT G_M52879_IG25
						;; size=4 bbWeight=0.50 PerfScore 0.62

G_M52879_IG24:  ;; offset=019BH
       call     [System.Console:WriteLine(bool)]
						;; size=6 bbWeight=0.50 PerfScore 1.50

G_M52879_IG25:  ;; offset=01A1H
       nop      
						;; size=1 bbWeight=1 PerfScore 0.25

G_M52879_IG26:  ;; offset=01A2H
       add      rsp, 112
       pop      rsi
       ret      
						;; size=6 bbWeight=1 PerfScore 1.75
; Total bytes of code: 424

@ghost ghost added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label May 16, 2023
@ghost ghost assigned EgorBo May 16, 2023
@ghost
Copy link

ghost commented May 16, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Just an exercise to improve constant folding in VN around frozen objects (can be useful for similar cases). As a demonstration:

void Foo() => Bar("TruE");

void Bar(string str)
{
    if (bool.TryParse(str, out bool value))
        Console.WriteLine(value);
}

New codegen:

; Method Program:Foo():this
       sub      rsp, 56
       xor      eax, eax
       mov      qword ptr [rsp+20H], rax
       mov      byte  ptr [rsp+30H], 0
       mov      dword ptr [rsp+30H], 1
       movzx    rcx, byte  ptr [rsp+30H]
       call     [System.Console:WriteLine(bool)]
       nop      
       add      rsp, 56
       ret      
; Total bytes of code: 41

(old codegen was terrible)

Author: EgorBo
Assignees: -
Labels:

area-CodeGen-coreclr

Milestone: -

@ShreyasJejurkar
Copy link
Contributor

This is great improvement.

Also, why is the old codegen this bad, given that the value passed to the function is a constant string and not even a dynamic one! 🙄😕

@EgorBo
Copy link
Member Author

EgorBo commented May 16, 2023

This is great improvement.

Also, why is the old codegen this bad, given that the value passed to the function is a constant string and not even a dynamic one! 🙄😕

It's not bad, it's just too big (a lot of stuff is inlined). Boolean parsing is done via two OrdinalIgnoreCase comparisons, e.g.:

internal static bool IsFalseStringIgnoreCase(ReadOnlySpan<char> value)
{
    return value.Equals("False", StringComparison.OrdinalIgnoreCase);
}

emits:

; Method Program:IsFalseStringIgnoreCase(System.ReadOnlySpan`1[ushort]):bool
G_M35886_IG01:  ;; offset=0000H
						;; size=0 bbWeight=1 PerfScore 0.00

G_M35886_IG02:  ;; offset=0000H
       mov      rax, bword ptr [rcx]
       cmp      dword ptr [rcx+08H], 5
       jne      SHORT G_M35886_IG04
						;; size=9 bbWeight=1 PerfScore 6.00

G_M35886_IG03:  ;; offset=0009H
       mov      rdx, 0x20002000200020
       or       rdx, qword ptr [rax]
       mov      rcx, 0x73006C00610066
       xor      rdx, rcx
       mov      eax, dword ptr [rax+06H]
       or       eax, 0x200020
       xor      eax, 0x650073
       or       rax, rdx
       sete     al
       movzx    rax, al
       jmp      SHORT G_M35886_IG05
						;; size=50 bbWeight=0.25 PerfScore 2.44

G_M35886_IG04:  ;; offset=003BH
       xor      eax, eax
						;; size=2 bbWeight=0.25 PerfScore 0.06

G_M35886_IG05:  ;; offset=003DH
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 62

@EgorBo
Copy link
Member Author

EgorBo commented May 16, 2023

@jakobbotsch @SingleAccretion PTAL since you're familiar with this pattern (when we accumulate a final byte offset out of ADD(ADD(ADD(...))) trees). @dotnet/jit-contrib

There are jit-diff improvements, but SPMI is empty because of missing contexts (this optimization opens new opportunities to fold frozen objects, but it needs a JIT-VM API call)

@EgorBo EgorBo requested a review from jakobbotsch May 16, 2023 13:48
return TryParseUncommon(value, out result);

[MethodImpl(MethodImplOptions.NoInlining)]
static bool TryParseUncommon(ReadOnlySpan<char> value, out bool result)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TryParseUncommon used to be inlineable in some cases (e.g. without pgo data)

{
if (!TryParse(value, out bool result))
{
ThrowHelper.ThrowFormatException_BadBoolean(value);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replaced throw with a throw helper to make it inlineable (at least for constant input)

@EgorBo
Copy link
Member Author

EgorBo commented May 16, 2023

I wanted to give it a try in Rust with the same code to see how it looks, and there is a significant difference. Not sure if it's related to limitations on our side that prevent an optimization like this, or if that's just how it is.

here is the code (C# vs Rust) https://godbolt.org/z/ddnYh1Mc5

image

Judging by your snippet, Rust doesn't support case-insensitive parsing, so the codegen you see is just returning a parsing error for you

@ShreyasJejurkar
Copy link
Contributor

Yeah, my mistake, I did not look correctly, that it gave an error! Sometimes compiler explorer windows are so confusing! 😅😅

@ShreyasJejurkar
Copy link
Contributor

@EgorBo I have one doubt: in the example below, can't we generate the same codegen for the WithParse method as we generated for WithoutTryParse, given that the inputs to both are constant strings that are available at compile time? Can't the JIT see that and optimize further so that we can avoid those unwanted jump instructions?

https://sharplab.io/#v2:EYLgtghglgdgNAFxBAzmAPgAQEwEYCwAUJgMwAEOZAwmQN5FFlMXnAD2bANmQOpQIALNgFcEAFQBOATwAKECSgCmACgCUdRsy2YA7GXZcAdHIUqARAgnDFZ1QG5NTAL6Oyr0vo7c+gk0rUahFpaBtwSiijCnAgOQcFMUABmZMqhhpKy8v4WVjZwZCIIZOGR0aqqrlr0cfHaeiVRMZXMLjXBumSJEJxKsVqtTkA==

@EgorBo
Copy link
Member Author

EgorBo commented May 16, 2023

@EgorBo I have one doubt: in the example below, can't we generate the same codegen for the WithParse method as we generated for WithoutTryParse, given that the inputs to both are constant strings that are available at compile time? Can't the JIT see that and optimize further so that we can avoid those unwanted jump instructions?

https://sharplab.io/#v2:EYLgtghglgdgNAFxBAzmAPgAQEwEYCwAUJgMwAEOZAwmQN5FFlMXnAD2bANmQOpQIALNgFcEAFQBOATwAKECSgCmACgCUdRsy2YA7GXZcAdHIUqARAgnDFZ1QG5NTAL6Oyr0vo7c+gk0rUahFpaBtwSiijCnAgOQcFMUABmZMqhhpKy8v4WVjZwZCIIZOGR0aqqrlr0cfHaeiVRMZXMLjXBumSJEJxKsVqtTkA==

these are two different methods which inline differently. Also, this PR fixes constant folding

@ShreyasJejurkar
Copy link
Contributor

@EgorBo I have one doubt: in the example below, can't we generate the same codegen for the WithParse method as we generated for WithoutTryParse, given that the inputs to both are constant strings that are available at compile time? Can't the JIT see that and optimize further so that we can avoid those unwanted jump instructions?
https://sharplab.io/#v2:EYLgtghglgdgNAFxBAzmAPgAQEwEYCwAUJgMwAEOZAwmQN5FFlMXnAD2bANmQOpQIALNgFcEAFQBOATwAKECSgCmACgCUdRsy2YA7GXZcAdHIUqARAgnDFZ1QG5NTAL6Oyr0vo7c+gk0rUahFpaBtwSiijCnAgOQcFMUABmZMqhhpKy8v4WVjZwZCIIZOGR0aqqrlr0cfHaeiVRMZXMLjXBumSJEJxKsVqtTkA==

these are two different methods which inline differently. Also, this PR fixes constant folding

Ohh, I see. I thought it would be the same, since those constant strings are valid boolean values, so we could optimize the codegen for TryParse, or maybe short-circuit the call to Parse directly from that method once we recognize that specific input string. But it's OK, there might be some reason for not doing that.

Also, this PR fixes constant folding

I wish, there was an easier way to see codegen of the given program against the PR on browser itself😅😅

Copy link
Contributor

@SingleAccretion SingleAccretion left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jit changes LGTM.

@EgorBo
Copy link
Member Author

EgorBo commented May 16, 2023

cc @dotnet/jit-contrib for a green sign off

@EgorBo
Copy link
Member Author

EgorBo commented May 17, 2023

@dotnet/jit-contrib @jakobbotsch Ping, this PR also fixes a cross-compile bug @yowl hit in the NativeAOT-LLVM branch

if (vnStore->IsVNConstantNonHandle(funcApp.m_args[0]) && (vnStore->TypeOfVN(funcApp.m_args[0]) == TYP_I_IMPL))
{
*pObj = vnStore->ConstantObjHandle(funcApp.m_args[0]);
*byteOffset = vnStore->ConstantValue<ssize_t>(funcApp.m_args[1]);
Copy link
Member

@jakobbotsch jakobbotsch May 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised we haven't hit this assert with this previous code:

T val1 = reinterpret_cast<T*>(c->m_defs)[offset];
T val2 = SafeGetConstantValue<T>(c, offset);
// Detect if there is a mismatch between the VN storage type and explicitly
// passed-in type T.
bool mismatch = false;
if (varTypeIsFloating(c->m_typ))
{
mismatch = (memcmp(&val1, &val2, sizeof(val1)) != 0);
}
else
{
mismatch = (val1 != val2);
}
if (mismatch)
{
assert(
!"Called ConstantValue<T>(vn), but type(T) != type(vn); Use CoercedConstantValue instead.");
}

I would expect it to hit for 64-bit to 32-bit crossgen2 compilations. It seems to indicate that this code is never hit for crossgen2 32-bit compilations? Is that expected?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would expect it to hit for 64-bit to 32-bit crossgen2 compilations.

That's correct - it will be hit if you replay a 32 bit collection with a cross-targeting Jit today. Evidently, we just don't have people (or machines) doing that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

crossgen doesn't have frozen objects, so this path has indeed never been executed there. We could see it on NativeAOT, but I'm not sure we cover the cross-compilation case for it on CI

@EgorBo EgorBo merged commit 83f71b5 into dotnet:main May 17, 2023
@EgorBo EgorBo deleted the fold-bool-parse branch May 17, 2023 18:00
@yowl
Copy link
Contributor

yowl commented May 17, 2023

Thanks!

@ghost ghost locked as resolved and limited conversation to collaborators Jun 16, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants