KEMBAR78
[JIT] Fold some bitwise operations to vpternlog by Ruihan-Yin · Pull Request #91227 · dotnet/runtime · GitHub
Skip to content

Conversation

@Ruihan-Yin
Copy link
Member

@Ruihan-Yin Ruihan-Yin commented Aug 28, 2023

Description

This PR attempts to resolve #84534.

We implemented the optimization by tracking the use-def chain during lowering and attempting to fold two adjacent binary bitwise operations on the same chain into a single ternary node (emitted as `vpternlog`) when AVX-512 is available.

In our internal testing, we observed some code size reduction in the SuperPMI asmdiffs and no throughput (TP) regression. In addition, for the tests where a codegen difference was detected, we ran the related microbenchmarks; the results are attached below.

Asmdiff collected locally

Diffs are based on 1,809,204 contexts (477,269 MinOpts, 1,331,935 FullOpts).

MISSED contexts: 6 (0.00%)

Overall (-1,192 bytes)
Collection Base size (bytes) Diff size (bytes)
aspnet.run.windows.x64.checked.mch 37,409,551 +1,229
benchmarks.run.windows.x64.checked.mch 8,733,672 -3
benchmarks.run_pgo.windows.x64.checked.mch 32,571,286 +55
benchmarks.run_tiered.windows.x64.checked.mch 12,695,406 +12
coreclr_tests.run.windows.x64.checked.mch 388,982,917 -242
libraries.pmi.windows.x64.checked.mch 61,238,365 -622
libraries_tests.pmi.windows.x64.checked.mch 124,971,846 -404
realworld.run.windows.x64.checked.mch 13,931,576 -1,217
MinOpts (-31 bytes)
Collection Base size (bytes) Diff size (bytes)
aspnet.run.windows.x64.checked.mch 9,613,163 -58
benchmarks.run_pgo.windows.x64.checked.mch 13,679,370 +5
benchmarks.run_tiered.windows.x64.checked.mch 9,151,525 -2
coreclr_tests.run.windows.x64.checked.mch 273,478,506 +24
FullOpts (-1,161 bytes)
Collection Base size (bytes) Diff size (bytes)
aspnet.run.windows.x64.checked.mch 27,796,388 +1,287
benchmarks.run.windows.x64.checked.mch 8,274,452 -3
benchmarks.run_pgo.windows.x64.checked.mch 18,891,916 +50
benchmarks.run_tiered.windows.x64.checked.mch 3,543,881 +14
coreclr_tests.run.windows.x64.checked.mch 115,504,411 -266
libraries.pmi.windows.x64.checked.mch 59,709,387 -622
libraries_tests.pmi.windows.x64.checked.mch 118,468,217 -404
realworld.run.windows.x64.checked.mch 12,782,968 -1,217
Example diffs
aspnet.run.windows.x64.checked.mch
-5 (-2.38%) : 17633.dasm - Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure.StringUtilities+<>c:<.cctor>b__25_3(System.Span`1[ushort],System.ValueTuple`3[System.String,ushort,uint]):this (Instrumented Tier1)
@@ -80,7 +80,7 @@ G_M41617_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0104 {rdx r8},
        test     rdi, rdi
        je       SHORT G_M41617_IG04
 						;; size=26 bbWeight=1 PerfScore 11.50
-G_M41617_IG03:        ; bbWeight=0.50, gcrefRegs=0080 {rdi}, byrefRegs=0008 {rbx}, byref
+G_M41617_IG03:        ; bbWeight=0.50, gcrefRegs=0080 {rdi}, byrefRegs=0008 {rbx}, byref, isz
        ; byrRegs -[rdx r8]
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
@@ -89,7 +89,7 @@ G_M41617_IG03:        ; bbWeight=0.50, gcrefRegs=0080 {rdi}, byrefRegs=0008 {rbx
        ; byrRegs +[rdx]
        mov      r8d, dword ptr [rdi+0x08]
        cmp      r8d, esi
-       ja       G_M41617_IG06
+       ja       SHORT G_M41617_IG06
        add      r8, r8
        mov      rcx, rbx
        ; byrRegs +[rcx]
@@ -97,7 +97,7 @@ G_M41617_IG03:        ; bbWeight=0.50, gcrefRegs=0080 {rdi}, byrefRegs=0008 {rbx
        ; byrRegs -[rcx rdx]
        ; gcr arg pop 0
        mov      r15d, dword ptr [rdi+0x08]
-						;; size=48 bbWeight=0.50 PerfScore 5.25
+						;; size=44 bbWeight=0.50 PerfScore 5.25
 G_M41617_IG04:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byref, isz
        ; gcrRegs -[rdi]
        cmp      r15d, esi
@@ -112,14 +112,13 @@ G_M41617_IG04:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byr
        vpshufb  xmm0, xmm0, xmmword ptr [reloc @RWD00]
        vpsrldq  xmm1, xmm0, 2
        vpsrld   xmm1, xmm1, 4
-       vpor     xmm0, xmm0, xmm1
-       vpand    xmm0, xmm0, xmmword ptr [reloc @RWD16]
+       vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD16], -88
        vmovups  xmm1, xmmword ptr [reloc @RWD32]
        vpshufb  xmm0, xmm1, xmm0
        vpand    xmm0, xmm0, xmmword ptr [reloc @RWD48]
        movsxd   rax, r15d
        vmovups  xmmword ptr [rbx+2*rax], xmm0
-						;; size=95 bbWeight=1 PerfScore 20.58
+						;; size=94 bbWeight=1 PerfScore 20.25
 G_M41617_IG05:        ; bbWeight=1, epilog, nogc, extend
        add      rsp, 40
        pop      rbx
@@ -147,7 +146,7 @@ RWD32  	dq	3736353433323130h, 4645444342413938h
 RWD48  	dq	00FF00FF00FF00FFh, 00FF00FF00FF00FFh
 
 
-; Total bytes of code 210, prolog size 15, PerfScore 69.83, instruction count 56, allocated bytes for code 210 (MethodHash=507a5d6e) for method Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure.StringUtilities+<>c:<.cctor>b__25_3(System.Span`1[ushort],System.ValueTuple`3[System.String,ushort,uint]):this (Instrumented Tier1)
+; Total bytes of code 205, prolog size 15, PerfScore 69.00, instruction count 55, allocated bytes for code 205 (MethodHash=507a5d6e) for method Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure.StringUtilities+<>c:<.cctor>b__25_3(System.Span`1[ushort],System.ValueTuple`3[System.String,ushort,uint]):this (Instrumented Tier1)
 ; ============================================================
 
 Unwind Info:
-28 (-0.66%) : 51072.dasm - System.SpanHelpers:IndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,short,short,int):int (Instrumented Tier0)
@@ -627,19 +627,17 @@ G_M50250_IG44:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        vmovups  zmm1, zmmword ptr [rbp-0x170]
        vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
        vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
-       vmovups  zmm1, zmmword ptr [rbp-0x1B0]
-       vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
-       vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
+       vmovups  zmm2, zmmword ptr [rbp-0x1B0]
+       vpcmpeqw k1, zmm2, zmmword ptr [rbp-0xF0]
+       vpmovm2w zmm2, k1
+       vpternlogd zmm0, zmm1, zmm2, -2
        vmovups  zmm1, zmmword ptr [rbp-0x1F0]
        vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
        vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
-       vmovups  zmm1, zmmword ptr [rbp-0x230]
-       vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
-       vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
+       vmovups  zmm2, zmmword ptr [rbp-0x230]
+       vpcmpeqw k1, zmm2, zmmword ptr [rbp-0xF0]
+       vpmovm2w zmm2, k1
+       vpternlogd zmm0, zmm1, zmm2, -2
        vmovups  zmmword ptr [rbp-0x430], zmm0
        lea      rdx, [rbp-0x430]
        ; byrRegs -[rdx]
@@ -656,7 +654,7 @@ G_M50250_IG44:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ; byrRegs +[rcx]
        mov      bword ptr [rbp-0x238], rcx
        jmp      SHORT G_M50250_IG47
-						;; size=259 bbWeight=1 PerfScore 63.08
+						;; size=249 bbWeight=1 PerfScore 62.75
 G_M50250_IG45:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs -[rcx]
        mov      rcx, 0xD1FFAB1E
@@ -706,19 +704,17 @@ G_M50250_IG47:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        vmovups  zmm1, zmmword ptr [rbp-0x170]
        vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
        vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
-       vmovups  zmm1, zmmword ptr [rbp-0x1B0]
-       vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
-       vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
+       vmovups  zmm2, zmmword ptr [rbp-0x1B0]
+       vpcmpeqw k1, zmm2, zmmword ptr [rbp-0xF0]
+       vpmovm2w zmm2, k1
+       vpternlogd zmm0, zmm1, zmm2, -2
        vmovups  zmm1, zmmword ptr [rbp-0x1F0]
        vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
        vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
-       vmovups  zmm1, zmmword ptr [rbp-0x230]
-       vpcmpeqw k1, zmm1, zmmword ptr [rbp-0xF0]
-       vpmovm2w zmm1, k1
-       vpord    zmm0, zmm0, zmm1
+       vmovups  zmm2, zmmword ptr [rbp-0x230]
+       vpcmpeqw k1, zmm2, zmmword ptr [rbp-0xF0]
+       vpmovm2w zmm2, k1
+       vpternlogd zmm0, zmm1, zmm2, -2
        vmovups  zmmword ptr [rbp-0x4F0], zmm0
        lea      rdx, [rbp-0x4F0]
        ; byrRegs -[rdx]
@@ -748,7 +744,7 @@ G_M50250_IG47:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ; gcr arg pop 0
        mov      dword ptr [rbp-0x3E4], eax
        jmp      G_M50250_IG72
-						;; size=368 bbWeight=1 PerfScore 81.83
+						;; size=358 bbWeight=1 PerfScore 81.50
 G_M50250_IG48:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
@@ -823,16 +819,14 @@ G_M50250_IG54:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        vpcmpeqw ymm0, ymm0, ymmword ptr [rbp-0x290]
        vmovups  ymm1, ymmword ptr [rbp-0x2D0]
        vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
-       vmovups  ymm1, ymmword ptr [rbp-0x2F0]
-       vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
+       vmovups  ymm2, ymmword ptr [rbp-0x2F0]
+       vpcmpeqw ymm2, ymm2, ymmword ptr [rbp-0x290]
+       vpternlogd ymm0, ymm1, ymm2, -2
        vmovups  ymm1, ymmword ptr [rbp-0x310]
        vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
-       vmovups  ymm1, ymmword ptr [rbp-0x330]
-       vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
+       vmovups  ymm2, ymmword ptr [rbp-0x330]
+       vpcmpeqw ymm2, ymm2, ymmword ptr [rbp-0x290]
+       vpternlogd ymm0, ymm1, ymm2, -2
        vmovups  ymmword ptr [rbp-0x570], ymm0
        lea      rdx, [rbp-0x570]
        ; byrRegs -[rdx]
@@ -848,7 +842,7 @@ G_M50250_IG54:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ; byrRegs +[rcx]
        mov      bword ptr [rbp-0x338], rcx
        jmp      SHORT G_M50250_IG57
-						;; size=187 bbWeight=1 PerfScore 61.08
+						;; size=185 bbWeight=1 PerfScore 60.75
 G_M50250_IG55:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs -[rcx]
        mov      rcx, 0xD1FFAB1E
@@ -896,16 +890,14 @@ G_M50250_IG57:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        vpcmpeqw ymm0, ymm0, ymmword ptr [rbp-0x290]
        vmovups  ymm1, ymmword ptr [rbp-0x2D0]
        vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
-       vmovups  ymm1, ymmword ptr [rbp-0x2F0]
-       vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
+       vmovups  ymm2, ymmword ptr [rbp-0x2F0]
+       vpcmpeqw ymm2, ymm2, ymmword ptr [rbp-0x290]
+       vpternlogd ymm0, ymm1, ymm2, -2
        vmovups  ymm1, ymmword ptr [rbp-0x310]
        vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
-       vmovups  ymm1, ymmword ptr [rbp-0x330]
-       vpcmpeqw ymm1, ymm1, ymmword ptr [rbp-0x290]
-       vpor     ymm0, ymm0, ymm1
+       vmovups  ymm2, ymmword ptr [rbp-0x330]
+       vpcmpeqw ymm2, ymm2, ymmword ptr [rbp-0x290]
+       vpternlogd ymm0, ymm1, ymm2, -2
        vmovups  ymmword ptr [rbp-0x5D0], ymm0
        lea      rdx, [rbp-0x5D0]
        ; byrRegs -[rdx]
@@ -934,7 +926,7 @@ G_M50250_IG57:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ; gcr arg pop 0
        mov      dword ptr [rbp-0x3E4], eax
        jmp      G_M50250_IG72
-						;; size=292 bbWeight=1 PerfScore 80.83
+						;; size=290 bbWeight=1 PerfScore 80.50
 G_M50250_IG58:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
@@ -1007,16 +999,14 @@ G_M50250_IG64:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        vpcmpeqw xmm0, xmm0, xmmword ptr [rbp-0x360]
        vmovaps  xmm1, xmmword ptr [rbp-0x380]
        vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
-       vmovaps  xmm1, xmmword ptr [rbp-0x390]
-       vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
+       vmovaps  xmm2, xmmword ptr [rbp-0x390]
+       vpcmpeqw xmm2, xmm2, xmmword ptr [rbp-0x360]
+       vpternlogd xmm0, xmm1, xmm2, -2
        vmovaps  xmm1, xmmword ptr [rbp-0x3A0]
        vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
-       vmovaps  xmm1, xmmword ptr [rbp-0x3B0]
-       vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
+       vmovaps  xmm2, xmmword ptr [rbp-0x3B0]
+       vpcmpeqw xmm2, xmm2, xmmword ptr [rbp-0x360]
+       vpternlogd xmm0, xmm1, xmm2, -2
        vmovaps  xmmword ptr [rbp-0x610], xmm0
        lea      rdx, [rbp-0x610]
        ; byrRegs -[rdx]
@@ -1032,7 +1022,7 @@ G_M50250_IG64:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ; byrRegs +[rcx]
        mov      bword ptr [rbp-0x3B8], rcx
        jmp      SHORT G_M50250_IG66
-						;; size=187 bbWeight=1 PerfScore 52.08
+						;; size=185 bbWeight=1 PerfScore 51.75
 G_M50250_IG65:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs -[rcx]
        mov      rcx, 0xD1FFAB1E
@@ -1074,16 +1064,14 @@ G_M50250_IG66:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        vpcmpeqw xmm0, xmm0, xmmword ptr [rbp-0x360]
        vmovaps  xmm1, xmmword ptr [rbp-0x380]
        vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
-       vmovaps  xmm1, xmmword ptr [rbp-0x390]
-       vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
+       vmovaps  xmm2, xmmword ptr [rbp-0x390]
+       vpcmpeqw xmm2, xmm2, xmmword ptr [rbp-0x360]
+       vpternlogd xmm0, xmm1, xmm2, -2
        vmovaps  xmm1, xmmword ptr [rbp-0x3A0]
        vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
-       vmovaps  xmm1, xmmword ptr [rbp-0x3B0]
-       vpcmpeqw xmm1, xmm1, xmmword ptr [rbp-0x360]
-       vpor     xmm0, xmm0, xmm1
+       vmovaps  xmm2, xmmword ptr [rbp-0x3B0]
+       vpcmpeqw xmm2, xmm2, xmmword ptr [rbp-0x360]
+       vpternlogd xmm0, xmm1, xmm2, -2
        vmovaps  xmmword ptr [rbp-0x640], xmm0
        lea      rdx, [rbp-0x640]
        ; byrRegs -[rdx]
@@ -1112,7 +1100,7 @@ G_M50250_IG66:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ; gcr arg pop 0
        mov      dword ptr [rbp-0x3E4], eax
        jmp      SHORT G_M50250_IG72
-						;; size=289 bbWeight=1 PerfScore 70.83
+						;; size=287 bbWeight=1 PerfScore 70.50
 G_M50250_IG67:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
@@ -1150,7 +1138,7 @@ G_M50250_IG73:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=12 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 4252, prolog size 84, PerfScore 1368.22, instruction count 775, allocated bytes for code 4258 (MethodHash=a1d73bb5) for method System.SpanHelpers:IndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,short,short,int):int (Instrumented Tier0)
+; Total bytes of code 4224, prolog size 84, PerfScore 1363.42, instruction count 763, allocated bytes for code 4230 (MethodHash=a1d73bb5) for method System.SpanHelpers:IndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,short,short,int):int (Instrumented Tier0)
 ; ============================================================
 
 Unwind Info:
-1 (-0.65%) : 105.dasm - System.Guid:FormatGuidVector128Utf8(System.Guid,bool):System.ValueTuple`3[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]] (FullOpts)
@@ -18,7 +18,7 @@
 ;  V06 loc3         [V06,T09] (  4,  2.93)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V07 loc4         [V07,T12] (  2,  1.85)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V08 loc5         [V08,T13] (  2,  1.85)  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V09 loc6         [V09,T14] (  2,  1.85)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V09 loc6         [V09,T14] (  2,  1.85)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V10 loc7         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V11 loc8         [V11    ] (  0,  0   )     ref  ->  zero-ref    class-hnd <<unknown class>>
 ;* V12 loc9         [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
@@ -47,7 +47,7 @@
 ;* V35 tmp20        [V35,T22] (  0,  0   )  simd16  ->  zero-ref    "field V17.Item3 (fldOffset=0x20)" P-INDEP
 ;  V36 tmp21        [V36,T15] (  2,  1.85)  simd16  ->  mm2         "field V18.Item1 (fldOffset=0x0)" P-INDEP
 ;  V37 tmp22        [V37,T16] (  2,  1.85)  simd16  ->  mm3         "field V18.Item2 (fldOffset=0x10)" P-INDEP
-;  V38 tmp23        [V38,T17] (  2,  1.85)  simd16  ->  mm0         "field V18.Item3 (fldOffset=0x20)" P-INDEP
+;  V38 tmp23        [V38,T17] (  2,  1.85)  simd16  ->  mm1         "field V18.Item3 (fldOffset=0x20)" P-INDEP
 ;* V39 tmp24        [V39    ] (  0,  0   )  simd16  ->  zero-ref    "field V23.Item1 (fldOffset=0x0)" P-INDEP
 ;* V40 tmp25        [V40    ] (  0,  0   )  simd16  ->  zero-ref    "field V23.Item2 (fldOffset=0x10)" P-INDEP
 ;  V41 cse0         [V41,T07] (  3,  3   )  simd16  ->  mm1         "CSE - aggressive"
@@ -81,12 +81,11 @@ G_M39076_IG03:        ; bbWeight=0.93, gcrefRegs=0000 {}, byrefRegs=0001 {rax},
        vpshufb  xmm3, xmm0, xmmword ptr [reloc @RWD64]
        vpshufb  xmm1, xmm1, xmmword ptr [reloc @RWD80]
        vpshufb  xmm0, xmm0, xmmword ptr [reloc @RWD96]
-       vpor     xmm0, xmm1, xmm0
-       vpor     xmm0, xmm0, xmmword ptr [reloc @RWD112]
+       vpternlogd xmm1, xmm0, xmmword ptr [reloc @RWD112], -2
        vmovups  xmmword ptr [rax], xmm2
        vmovups  xmmword ptr [rax+0x10], xmm3
-       vmovups  xmmword ptr [rax+0x20], xmm0
-						;; size=62 bbWeight=0.93 PerfScore 15.12
+       vmovups  xmmword ptr [rax+0x20], xmm1
+						;; size=61 bbWeight=0.93 PerfScore 14.81
 G_M39076_IG04:        ; bbWeight=0.93, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=0.93 PerfScore 0.93
@@ -109,7 +108,7 @@ RWD96  	dq	FFFFFFFFFFFFFFFFh, FF03020100FFFFFFh
 RWD112 	dq	00002D000000002Dh, 2D000000002D0000h
 
 
-; Total bytes of code 153, prolog size 3, PerfScore 52.06, instruction count 30, allocated bytes for code 153 (MethodHash=915f675b) for method System.Guid:FormatGuidVector128Utf8(System.Guid,bool):System.ValueTuple`3[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]] (FullOpts)
+; Total bytes of code 152, prolog size 3, PerfScore 51.65, instruction count 29, allocated bytes for code 152 (MethodHash=915f675b) for method System.Guid:FormatGuidVector128Utf8(System.Guid,bool):System.ValueTuple`3[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]] (FullOpts)
 ; ============================================================
 
 Unwind Info:
+10 (+5.32%) : 40640.dasm - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
@@ -14,18 +14,19 @@
 ;* V01 arg1         [V01    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;  V03 loc0         [V03,T01] (  2,200   )   byref  ->  rax        
-;  V04 loc1         [V04,T04] (  6,295.65)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V04 loc1         [V04,T05] (  6,295.65)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;  V05 loc2         [V05,T02] (  2, 95.65)   byref  ->  rcx        
 ;* V06 loc3         [V06    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V07 loc4         [V07,T00] ( 10,600.00)    long  ->  [rsp+0x220]  tier0-frame
 ;  V08 loc5         [V08,T03] (  1, 95.65)    long  ->  [rsp+0x218]  tier0-frame
-;  V09 loc6         [V09,T05] (  3,295.65)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V09 loc6         [V09,T06] (  3,295.65)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V10 loc7         [V10    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;# V11 OutArgs      [V11    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V12 tmp1         [V12    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V13 tmp2         [V13    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V14 tmp3         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V15 tmp4         [V15    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V16 rat0         [V16,T04] (  3,600   )  simd64  ->  mm2         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
@@ -40,11 +41,12 @@ G_M60939_IG02:        ; bbWeight=100, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx
        mov      rdx, qword ptr [rsp+0x220]
        vmovups  zmm0, zmmword ptr [rax+2*rdx]
        vmovups  zmm1, zmmword ptr [rax+2*rdx+0x40]
-       vpord    zmm2, zmm0, zmm1
-       vptestmw k1, zmm2, zmmword ptr [reloc @RWD00]
+       vmovaps  zmm2, zmm0
+       vpternlogd zmm2, zmm1, zmmword ptr [reloc @RWD00], -88
+       vptestmw k1, zmm2, zmm2
        kortestd k1, k1
        jne      SHORT G_M60939_IG07
-						;; size=46 bbWeight=100 PerfScore 1633.33
+						;; size=53 bbWeight=100 PerfScore 1625.00
 G_M60939_IG03:        ; bbWeight=95.65, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, byref, isz
        vpmovwb  zmm0, zmm0
        vpmovwb  zmm1, zmm1
@@ -66,11 +68,11 @@ G_M60939_IG05:        ; bbWeight=17.39, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=12 bbWeight=17.39 PerfScore 47.83
-G_M60939_IG06:        ; bbWeight=47.83, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, gcvars, byref, isz
+G_M60939_IG06:        ; bbWeight=47.83, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, gcvars, byref
        ; byrRegs +[rax rcx]
        mov      qword ptr [rsp+0x220], rdx
-       jmp      SHORT G_M60939_IG02
-						;; size=10 bbWeight=47.83 PerfScore 143.48
+       jmp      G_M60939_IG02
+						;; size=13 bbWeight=47.83 PerfScore 143.48
 G_M60939_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byref, isz
        ; byrRegs -[rax]
        vptestmw k1, zmm0, zmmword ptr [reloc @RWD00]
@@ -86,7 +88,7 @@ G_M60939_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byr
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 188, prolog size 19, PerfScore 3639.69, instruction count 35, allocated bytes for code 194 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
+; Total bytes of code 198, prolog size 19, PerfScore 3632.36, instruction count 36, allocated bytes for code 204 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
 ; ============================================================
 
 Unwind Info:
+10 (+5.32%) : 11481.dasm - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
@@ -14,18 +14,19 @@
 ;* V01 arg1         [V01    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;  V03 loc0         [V03,T01] (  2,200   )   byref  ->  rax        
-;  V04 loc1         [V04,T04] (  6,297.22)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V04 loc1         [V04,T05] (  6,297.22)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;  V05 loc2         [V05,T02] (  2, 97.22)   byref  ->  rcx        
 ;* V06 loc3         [V06    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V07 loc4         [V07,T00] ( 10,597.22)    long  ->  [rsp+0x220]  tier0-frame
 ;  V08 loc5         [V08,T03] (  1, 97.22)    long  ->  [rsp+0x218]  tier0-frame
-;  V09 loc6         [V09,T05] (  3,297.22)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V09 loc6         [V09,T06] (  3,297.22)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V10 loc7         [V10    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;# V11 OutArgs      [V11    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V12 tmp1         [V12    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V13 tmp2         [V13    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V14 tmp3         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V15 tmp4         [V15    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V16 rat0         [V16,T04] (  3,600   )  simd64  ->  mm2         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
@@ -40,11 +41,12 @@ G_M60939_IG02:        ; bbWeight=100, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx
        mov      rdx, qword ptr [rsp+0x220]
        vmovups  zmm0, zmmword ptr [rax+2*rdx]
        vmovups  zmm1, zmmword ptr [rax+2*rdx+0x40]
-       vpord    zmm2, zmm0, zmm1
-       vptestmw k1, zmm2, zmmword ptr [reloc @RWD00]
+       vmovaps  zmm2, zmm0
+       vpternlogd zmm2, zmm1, zmmword ptr [reloc @RWD00], -88
+       vptestmw k1, zmm2, zmm2
        kortestd k1, k1
        jne      SHORT G_M60939_IG07
-						;; size=46 bbWeight=100 PerfScore 1633.33
+						;; size=53 bbWeight=100 PerfScore 1625.00
 G_M60939_IG03:        ; bbWeight=97.22, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, byref, isz
        vpmovwb  zmm0, zmm0
        vpmovwb  zmm1, zmm1
@@ -66,11 +68,11 @@ G_M60939_IG05:        ; bbWeight=8.33, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=12 bbWeight=8.33 PerfScore 22.92
-G_M60939_IG06:        ; bbWeight=48.61, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, gcvars, byref, isz
+G_M60939_IG06:        ; bbWeight=48.61, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, gcvars, byref
        ; byrRegs +[rax rcx]
        mov      qword ptr [rsp+0x220], rdx
-       jmp      SHORT G_M60939_IG02
-						;; size=10 bbWeight=48.61 PerfScore 145.83
+       jmp      G_M60939_IG02
+						;; size=13 bbWeight=48.61 PerfScore 145.83
 G_M60939_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byref, isz
        ; byrRegs -[rax]
        vptestmw k1, zmm0, zmmword ptr [reloc @RWD00]
@@ -86,7 +88,7 @@ G_M60939_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byr
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 188, prolog size 19, PerfScore 3636.07, instruction count 35, allocated bytes for code 194 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
+; Total bytes of code 198, prolog size 19, PerfScore 3628.73, instruction count 36, allocated bytes for code 204 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
 ; ============================================================
 
 Unwind Info:
+10 (+5.32%) : 31137.dasm - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
@@ -14,18 +14,19 @@
 ;* V01 arg1         [V01    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;  V03 loc0         [V03,T01] (  2,200   )   byref  ->  rax        
-;  V04 loc1         [V04,T04] (  6,295.45)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V04 loc1         [V04,T05] (  6,295.45)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;  V05 loc2         [V05,T02] (  2, 95.45)   byref  ->  rcx        
 ;* V06 loc3         [V06    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V07 loc4         [V07,T00] ( 10,600.00)    long  ->  [rsp+0x220]  tier0-frame
 ;  V08 loc5         [V08,T03] (  1, 95.45)    long  ->  [rsp+0x218]  tier0-frame
-;  V09 loc6         [V09,T05] (  3,295.45)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V09 loc6         [V09,T06] (  3,295.45)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V10 loc7         [V10    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;# V11 OutArgs      [V11    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V12 tmp1         [V12    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V13 tmp2         [V13    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V14 tmp3         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V15 tmp4         [V15    ] (  0,  0   )  simd64  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V16 rat0         [V16,T04] (  3,600   )  simd64  ->  mm2         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
@@ -40,11 +41,12 @@ G_M60939_IG02:        ; bbWeight=100, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx
        mov      rdx, qword ptr [rsp+0x220]
        vmovups  zmm0, zmmword ptr [rax+2*rdx]
        vmovups  zmm1, zmmword ptr [rax+2*rdx+0x40]
-       vpord    zmm2, zmm0, zmm1
-       vptestmw k1, zmm2, zmmword ptr [reloc @RWD00]
+       vmovaps  zmm2, zmm0
+       vpternlogd zmm2, zmm1, zmmword ptr [reloc @RWD00], -88
+       vptestmw k1, zmm2, zmm2
        kortestd k1, k1
        jne      SHORT G_M60939_IG07
-						;; size=46 bbWeight=100 PerfScore 1633.33
+						;; size=53 bbWeight=100 PerfScore 1625.00
 G_M60939_IG03:        ; bbWeight=95.45, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, byref, isz
        vpmovwb  zmm0, zmm0
        vpmovwb  zmm1, zmm1
@@ -66,11 +68,11 @@ G_M60939_IG05:        ; bbWeight=18.18, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=12 bbWeight=18.18 PerfScore 50.00
-G_M60939_IG06:        ; bbWeight=47.73, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, gcvars, byref, isz
+G_M60939_IG06:        ; bbWeight=47.73, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0003 {rax rcx}, gcvars, byref
        ; byrRegs +[rax rcx]
        mov      qword ptr [rsp+0x220], rdx
-       jmp      SHORT G_M60939_IG02
-						;; size=10 bbWeight=47.73 PerfScore 143.18
+       jmp      G_M60939_IG02
+						;; size=13 bbWeight=47.73 PerfScore 143.18
 G_M60939_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byref, isz
        ; byrRegs -[rax]
        vptestmw k1, zmm0, zmmword ptr [reloc @RWD00]
@@ -86,7 +88,7 @@ G_M60939_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byr
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 188, prolog size 19, PerfScore 3639.10, instruction count 35, allocated bytes for code 194 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
+; Total bytes of code 198, prolog size 19, PerfScore 3631.76, instruction count 36, allocated bytes for code 204 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)
 ; ============================================================
 
 Unwind Info:
benchmarks.run.windows.x64.checked.mch
-16 (-2.96%) : 17307.dasm - System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte]):bool (FullOpts)
@@ -14,14 +14,14 @@
 ;  V03 loc1         [V03,T08] (  3,  5.50)    long  ->  rcx        
 ;* V04 loc2         [V04,T22] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V05 loc3         [V05,T23] (  0,  0   )   byref  ->  zero-ref    single-def
-;  V06 loc4         [V06,T24] (  3, 24   )  simd16  ->  mm16         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V07 loc5         [V07,T25] (  3, 24   )  simd16  ->  mm17         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V08 loc6         [V08,T26] (  3, 24   )  simd16  ->  mm18         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V06 loc4         [V06,T25] (  3, 24   )  simd16  ->  mm6         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V07 loc5         [V07,T26] (  3, 24   )  simd16  ->  mm17         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V08 loc6         [V08,T27] (  3, 24   )  simd16  ->  mm18         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V09 loc7         [V09    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V10 loc8         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V11 loc9         [V11    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V12 loc10        [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V13 loc11        [V13,T27] (  3, 16   )  simd16  ->  mm18         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V13 loc11        [V13,T28] (  3, 16   )  simd16  ->  mm18         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V14 loc12        [V14    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V15 loc13        [V15    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V16 loc14        [V16    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
@@ -74,17 +74,18 @@
 ;* V63 tmp45        [V63    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref" <System.Span`1[ubyte]>
 ;  V64 tmp46        [V64    ] (  3,  3   )  struct (16) [rsp+0x38]  do-not-enreg[XSF] must-init addr-exposed "by-value struct argument" <System.ReadOnlySpan`1[ushort]>
 ;  V65 tmp47        [V65    ] (  3,  3   )  struct (16) [rsp+0x28]  do-not-enreg[XSF] must-init addr-exposed "by-value struct argument" <System.Span`1[ubyte]>
-;  V66 cse0         [V66,T28] (  2,  9   )  simd16  ->  mm0         "CSE - aggressive"
-;  V67 cse1         [V67,T29] (  2,  9   )  simd16  ->  mm1         "CSE - aggressive"
-;  V68 cse2         [V68,T30] (  2,  9   )  simd16  ->  mm2         "CSE - aggressive"
-;  V69 cse3         [V69,T31] (  2,  9   )  simd16  ->  mm3         "CSE - aggressive"
-;  V70 cse4         [V70,T32] (  2,  9   )  simd16  ->  mm4         "CSE - aggressive"
-;  V71 cse5         [V71,T33] (  2,  9   )  simd16  ->  mm5         "CSE - aggressive"
-;  V72 cse6         [V72,T34] (  2,  9   )  simd16  ->  mm6         "CSE - aggressive"
+;  V66 cse0         [V66,T29] (  2,  9   )  simd16  ->  mm0         "CSE - aggressive"
+;  V67 cse1         [V67,T30] (  2,  9   )  simd16  ->  mm1         "CSE - aggressive"
+;  V68 cse2         [V68,T31] (  2,  9   )  simd16  ->  mm2         "CSE - aggressive"
+;  V69 cse3         [V69,T32] (  2,  9   )  simd16  ->  mm3         "CSE - aggressive"
+;  V70 cse4         [V70,T33] (  2,  9   )  simd16  ->  mm4         "CSE - aggressive"
+;  V71 cse5         [V71,T34] (  2,  9   )  simd16  ->  mm5         "CSE - aggressive"
+;  V72 cse6         [V72,T35] (  2,  9   )  simd16  ->  mm16         "CSE - aggressive"
 ;  V73 cse7         [V73,T04] ( 15,  7.50)     ref  ->  rdx         "CSE - aggressive"
 ;  V74 cse8         [V74,T07] (  3,  6   )    long  ->  rax         "CSE - aggressive"
+;  V75 rat0         [V75,T24] (  3, 48   )  simd16  ->  mm6         "ReplaceWithLclVar is creating a new local variable"
 ;
-; Lcl frame size = 112
+; Lcl frame size = 96
 
 G_M30689_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     r14
@@ -92,16 +93,15 @@ G_M30689_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 112
+       sub      rsp, 96
        vzeroupper 
-       vmovaps  xmmword ptr [rsp+0x60], xmm6
-       vmovaps  xmmword ptr [rsp+0x50], xmm7
+       vmovaps  xmmword ptr [rsp+0x50], xmm6
        xor      eax, eax
        mov      qword ptr [rsp+0x28], rax
        vxorps   xmm4, xmm4, xmm4
        vmovdqa  xmmword ptr [rsp+0x30], xmm4
        mov      qword ptr [rsp+0x40], rax
-						;; size=47 bbWeight=1 PerfScore 14.83
+						;; size=41 bbWeight=1 PerfScore 12.83
 G_M30689_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0006 {rcx rdx}, byref, isz
        ; byrRegs +[rcx rdx]
        mov      rbx, bword ptr [rdx]
@@ -161,12 +161,12 @@ G_M30689_IG08:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi},
        vmovups  xmm3, xmmword ptr [reloc @RWD48]
        vmovups  xmm4, xmmword ptr [reloc @RWD64]
        vmovups  xmm5, xmmword ptr [reloc @RWD80]
-       vmovups  xmm6, xmmword ptr [reloc @RWD96]
-						;; size=65 bbWeight=1 PerfScore 22.00
+       vmovups  xmm16, xmmword ptr [reloc @RWD96]
+						;; size=67 bbWeight=1 PerfScore 22.00
 G_M30689_IG09:        ; bbWeight=8, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref, isz
-       vmovups  xmm16, xmmword ptr [rdi+2*r14]
+       vmovups  xmm6, xmmword ptr [rdi+2*r14]
        vmovups  xmm17, xmmword ptr [rdi+2*r14+0x10]
-       vpmovwb  xmm16, xmm18
+       vpmovwb  xmm6, xmm18
        vpmovwb  xmm17, xmm19
        vmovlhps xmm18, xmm18, xmm19
        vpaddb   xmm19, xmm18, xmm0
@@ -176,20 +176,20 @@ G_M30689_IG09:        ; bbWeight=8, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi},
        vpsubb   xmm18, xmm18, xmm4
        vpaddusb xmm18, xmm18, xmm5
        vpminub  xmm18, xmm19, xmm18
-       vpord    xmm7, xmm16, xmm17
-       vptest   xmm7, xmm6
+       vpternlogd xmm6, xmm17, xmm16, -88
+       vptest   xmm6, xmm6
        jne      SHORT G_M30689_IG14
-						;; size=88 bbWeight=8 PerfScore 158.67
+						;; size=88 bbWeight=8 PerfScore 160.00
 G_M30689_IG10:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref, isz
-       vpaddusb xmm7, xmm18, xmmword ptr [reloc @RWD112]
-       vpmovmskb edx, xmm7
+       vpaddusb xmm6, xmm18, xmmword ptr [reloc @RWD112]
+       vpmovmskb edx, xmm6
        test     edx, edx
        jne      SHORT G_M30689_IG14
-       vpmaddubsw xmm16, xmm18, xmmword ptr [reloc @RWD128]
-       vpshufb  xmm16, xmm16, xmmword ptr [reloc @RWD144]
+       vpmaddubsw xmm17, xmm18, xmmword ptr [reloc @RWD128]
+       vpshufb  xmm17, xmm17, xmmword ptr [reloc @RWD144]
        mov      rdx, r14
        shr      rdx, 1
-       vmovd    qword ptr [rbx+rdx], xmm16
+       vmovd    qword ptr [rbx+rdx], xmm17
        add      r14, 16
        cmp      r14, rax
        je       SHORT G_M30689_IG12
@@ -205,16 +205,15 @@ G_M30689_IG12:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M30689_IG13:        ; bbWeight=0.50, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x60]
-       vmovaps  xmm7, xmmword ptr [rsp+0x50]
-       add      rsp, 112
+       vmovaps  xmm6, xmmword ptr [rsp+0x50]
+       add      rsp, 96
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
        ret      
-						;; size=23 bbWeight=0.50 PerfScore 5.88
+						;; size=17 bbWeight=0.50 PerfScore 3.88
 G_M30689_IG14:        ; bbWeight=0.50, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, gcvars, byref, isz
        ; byrRegs +[rbx rdi]
        mov      ecx, r14d
@@ -263,16 +262,15 @@ G_M30689_IG16:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        nop      
 						;; size=40 bbWeight=0.50 PerfScore 4.38
 G_M30689_IG17:        ; bbWeight=0.50, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x60]
-       vmovaps  xmm7, xmmword ptr [rsp+0x50]
-       add      rsp, 112
+       vmovaps  xmm6, xmmword ptr [rsp+0x50]
+       add      rsp, 96
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
        ret      
-						;; size=23 bbWeight=0.50 PerfScore 5.88
+						;; size=17 bbWeight=0.50 PerfScore 3.88
 G_M30689_IG18:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
        call     [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
        ; gcr arg pop 0
@@ -290,7 +288,7 @@ RWD128 	dq	0110011001100110h, 0110011001100110h
 RWD144 	dq	0E0C0A0806040200h, 0000000000000000h
 
 
-; Total bytes of code 541, prolog size 47, PerfScore 366.60, instruction count 126, allocated bytes for code 541 (MethodHash=d4b1881e) for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte]):bool (FullOpts)
+; Total bytes of code 525, prolog size 41, PerfScore 360.33, instruction count 123, allocated bytes for code 525 (MethodHash=d4b1881e) for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte]):bool (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -298,16 +296,14 @@ Unwind Info:
   >>   End offset   : 0xd1ffab1e (not in unwind data)
   Version           : 1
   Flags             : 0x00
-  SizeOfProlog      : 0x19
-  CountOfUnwindCodes: 10
+  SizeOfProlog      : 0x13
+  CountOfUnwindCodes: 8
   FrameRegister     : none (0)
   FrameOffset       : N/A (no FrameRegister) (Value=0)
   UnwindCodes       :
-    CodeOffset: 0x19 UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM7 (7)
-      Scaled Small Offset: 5 * 16 = 80 = 0x00050
     CodeOffset: 0x13 UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM6 (6)
-      Scaled Small Offset: 6 * 16 = 96 = 0x00060
-    CodeOffset: 0x0A UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 13 * 8 + 8 = 112 = 0x70
+      Scaled Small Offset: 5 * 16 = 80 = 0x00050
+    CodeOffset: 0x0A UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 11 * 8 + 8 = 96 = 0x60
     CodeOffset: 0x06 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbx (3)
     CodeOffset: 0x05 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbp (5)
     CodeOffset: 0x04 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rsi (6)
-11 (-1.95%) : 11088.dasm - System.Runtime.Serialization.DataContracts.DataContract:CheckExplicitDataContractNamespaceUri(System.String,System.Type) (FullOpts)
@@ -202,10 +202,8 @@ G_M24827_IG09:        ; bbWeight=0.50, gcrefRegs=0040 {rsi}, byrefRegs=0008 {rbx
        vmovups  zmm0, zmmword ptr [rbx]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rbx+0x26]
-       vmovups  zmm3, zmmword ptr [rcx+0x26]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x26]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     cl
@@ -213,7 +211,7 @@ G_M24827_IG09:        ; bbWeight=0.50, gcrefRegs=0040 {rsi}, byrefRegs=0008 {rbx
        movzx    rcx, cl
        test     ecx, ecx
        jne      SHORT G_M24827_IG14
-						;; size=97 bbWeight=0.50 PerfScore 15.00
+						;; size=86 bbWeight=0.50 PerfScore 14.25
 G_M24827_IG10:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        ; gcrRegs -[rsi]
        ; byrRegs -[rbx]
@@ -312,7 +310,7 @@ G_M24827_IG15:        ; bbWeight=0, gcrefRegs=0048 {rbx rsi}, byrefRegs=0000 {},
        int3     
 						;; size=54 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 563, prolog size 44, PerfScore 129.33, instruction count 138, allocated bytes for code 565 (MethodHash=74cf9f04) for method System.Runtime.Serialization.DataContracts.DataContract:CheckExplicitDataContractNamespaceUri(System.String,System.Type) (FullOpts)
+; Total bytes of code 552, prolog size 44, PerfScore 127.48, instruction count 136, allocated bytes for code 554 (MethodHash=74cf9f04) for method System.Runtime.Serialization.DataContracts.DataContract:CheckExplicitDataContractNamespaceUri(System.String,System.Type) (FullOpts)
 ; ============================================================
 
 Unwind Info:
-4 (-0.79%) : 757.dasm - System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (FullOpts)
@@ -189,8 +189,7 @@ G_M35621_IG03:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        vmovups  xmm10, xmmword ptr [reloc @RWD32]
        vpsubw   xmm0, xmm6, xmm8
        vpcmpgtw xmm0, xmm9, xmm0
-       vpand    xmm0, xmm0, xmm10
-       vpxor    xmm0, xmm6, xmm0
+       vpternlogd xmm0, xmm10, xmm6, 106
        vmovups  xmmword ptr [rsi], xmm0
        mov      ecx, esi
        and      ecx, 15
@@ -208,7 +207,7 @@ G_M35621_IG03:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        call     [<unknown method>]
        ; gcrRegs -[rcx rdx]
        ; gcr arg pop 0
-						;; size=123 bbWeight=0.50 PerfScore 14.38
+						;; size=121 bbWeight=0.50 PerfScore 14.29
 G_M35621_IG04:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        cmp      rbp, rbx
        jbe      SHORT G_M35621_IG06
@@ -240,12 +239,11 @@ G_M35621_IG08:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        jne      SHORT G_M35621_IG09
        vpsubw   xmm0, xmm6, xmm8
        vpcmpgtw xmm0, xmm9, xmm0
-       vpand    xmm0, xmm0, xmm10
-       vpxor    xmm0, xmm6, xmm0
+       vpternlogd xmm0, xmm10, xmm6, 106
        vmovups  xmmword ptr [rsi+2*rbp], xmm0
        add      rbp, 8
        jmp      SHORT G_M35621_IG04
-						;; size=41 bbWeight=4 PerfScore 55.00
+						;; size=39 bbWeight=4 PerfScore 54.33
 G_M35621_IG09:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        mov      rax, rbx
        sub      rax, rbp
@@ -327,7 +325,7 @@ RWD16  	dq	801A801A801A801Ah, 801A801A801A801Ah
 RWD32  	dq	0020002000200020h, 0020002000200020h
 
 
-; Total bytes of code 509, prolog size 50, PerfScore 224.65, instruction count 125, allocated bytes for code 509 (MethodHash=e97274da) for method System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 505, prolog size 50, PerfScore 223.50, instruction count 123, allocated bytes for code 505 (MethodHash=e97274da) for method System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (FullOpts)
 ; ============================================================
 
 Unwind Info:
+14 (+1.65%) : 22121.dasm - System.SpanHelpers:IndexOfAnyValueType[ubyte,System.SpanHelpers+DontNegate`1[ubyte]](byref,ubyte,ubyte,ubyte,ubyte,int):int (FullOpts)
@@ -75,7 +75,7 @@
 ;  V64 cse4         [V64,T08] ( 11, 16   )     int  ->   r9         "CSE - aggressive"
 ;  V65 cse5         [V65,T09] ( 11, 16   )     int  ->  r11         "CSE - aggressive"
 ;
-; Lcl frame size = 56
+; Lcl frame size = 72
 
 G_M58324_IG01:        ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref, nogc <-- Prolog IG
        push     r15
@@ -84,17 +84,18 @@ G_M58324_IG01:        ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 56
+       sub      rsp, 72
        vzeroupper 
-       vmovaps  xmmword ptr [rsp+0x20], xmm6
+       vmovaps  xmmword ptr [rsp+0x30], xmm6
+       vmovaps  xmmword ptr [rsp+0x20], xmm7
        mov      rsi, rcx
        ; byrRegs +[rsi]
        mov      r14d, edx
        mov      edi, r8d
        mov      ebp, r9d
-       mov      r15d, dword ptr [rsp+0x90]
-       mov      ebx, dword ptr [rsp+0x98]
-						;; size=48 bbWeight=1 PerfScore 12.25
+       mov      r15d, dword ptr [rsp+0xA0]
+       mov      ebx, dword ptr [rsp+0xA8]
+						;; size=54 bbWeight=1 PerfScore 14.25
 G_M58324_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0040 {rsi}, byref, isz
        test     ebx, ebx
        jge      SHORT G_M58324_IG04
@@ -266,9 +267,8 @@ G_M58324_IG25:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0241 {rax rsi r
        vmovups  ymm4, ymmword ptr [rax]
        vpcmpeqb ymm5, ymm0, ymm4
        vpcmpeqb ymm6, ymm1, ymm4
-       vpor     ymm5, ymm5, ymm6
-       vpcmpeqb ymm6, ymm2, ymm4
-       vpor     ymm5, ymm5, ymm6
+       vpcmpeqb ymm7, ymm2, ymm4
+       vpternlogd ymm5, ymm6, ymm7, -2
        vpcmpeqb ymm4, ymm3, ymm4
        vpor     ymm4, ymm5, ymm4
        vptest   ymm4, ymm4
@@ -276,7 +276,7 @@ G_M58324_IG25:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0241 {rax rsi r
        add      rax, 32
        cmp      rax, r9
        jbe      SHORT G_M58324_IG25
-						;; size=48 bbWeight=4 PerfScore 62.00
+						;; size=47 bbWeight=4 PerfScore 61.33
 G_M58324_IG26:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0200 {r9}, byref
        ; byrRegs -[rax rsi]
        mov      eax, ebx
@@ -285,9 +285,8 @@ G_M58324_IG26:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0200 {r9}, b
        vmovups  ymm4, ymmword ptr [r9]
        vpcmpeqb ymm0, ymm0, ymm4
        vpcmpeqb ymm1, ymm1, ymm4
-       vpor     ymm0, ymm0, ymm1
-       vpcmpeqb ymm1, ymm2, ymm4
-       vpor     ymm0, ymm0, ymm1
+       vpcmpeqb ymm2, ymm2, ymm4
+       vpternlogd ymm0, ymm1, ymm2, -2
        vpcmpeqb ymm1, ymm3, ymm4
        vpor     ymm4, ymm0, ymm1
        vptest   ymm4, ymm4
@@ -297,7 +296,7 @@ G_M58324_IG26:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0200 {r9}, b
        tzcnt    eax, r11d
        add      eax, r10d
        jmp      G_M58324_IG32
-						;; size=73 bbWeight=0.50 PerfScore 11.50
+						;; size=72 bbWeight=0.50 PerfScore 11.42
 G_M58324_IG27:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0041 {rax rsi}, byref
        ; byrRegs -[r9] +[rax rsi]
        sub      rax, rsi
@@ -327,9 +326,8 @@ G_M58324_IG29:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0045 {rax rdx r
        vmovups  xmm4, xmmword ptr [rax]
        vpcmpeqb xmm5, xmm0, xmm4
        vpcmpeqb xmm6, xmm1, xmm4
-       vpor     xmm5, xmm5, xmm6
-       vpcmpeqb xmm6, xmm2, xmm4
-       vpor     xmm5, xmm5, xmm6
+       vpcmpeqb xmm7, xmm2, xmm4
+       vpternlogd xmm5, xmm6, xmm7, -2
        vpcmpeqb xmm4, xmm3, xmm4
        vpor     xmm4, xmm5, xmm4
        vptest   xmm4, xmm4
@@ -337,7 +335,7 @@ G_M58324_IG29:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0045 {rax rdx r
        add      rax, 16
        cmp      rax, rdx
        jbe      SHORT G_M58324_IG29
-						;; size=48 bbWeight=4 PerfScore 50.00
+						;; size=47 bbWeight=4 PerfScore 49.33
 G_M58324_IG30:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0004 {rdx}, byref, isz
        ; byrRegs -[rax rsi]
        mov      eax, ebx
@@ -346,9 +344,8 @@ G_M58324_IG30:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0004 {rdx},
        vmovups  xmm4, xmmword ptr [rdx]
        vpcmpeqb xmm0, xmm0, xmm4
        vpcmpeqb xmm1, xmm1, xmm4
-       vpor     xmm0, xmm0, xmm1
-       vpcmpeqb xmm1, xmm2, xmm4
-       vpor     xmm0, xmm0, xmm1
+       vpcmpeqb xmm2, xmm2, xmm4
+       vpternlogd xmm0, xmm1, xmm2, -2
        vpcmpeqb xmm1, xmm3, xmm4
        vpor     xmm4, xmm0, xmm1
        vptest   xmm4, xmm4
@@ -357,7 +354,7 @@ G_M58324_IG30:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0004 {rdx},
        tzcnt    eax, eax
        add      eax, ecx
        jmp      SHORT G_M58324_IG32
-						;; size=57 bbWeight=0.50 PerfScore 9.38
+						;; size=56 bbWeight=0.50 PerfScore 9.29
 G_M58324_IG31:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0041 {rax rsi}, byref
        ; byrRegs -[rdx] +[rax rsi]
        sub      rax, rsi
@@ -368,9 +365,10 @@ G_M58324_IG31:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0041 {rax rs
 						;; size=13 bbWeight=0.50 PerfScore 2.25
 G_M58324_IG32:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
        ; byrRegs -[rsi]
-       vmovaps  xmm6, xmmword ptr [rsp+0x20]
+       vmovaps  xmm6, xmmword ptr [rsp+0x30]
+       vmovaps  xmm7, xmmword ptr [rsp+0x20]
        vzeroupper 
-       add      rsp, 56
+       add      rsp, 72
        pop      rbx
        pop      rbp
        pop      rsi
@@ -378,14 +376,15 @@ G_M58324_IG32:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        pop      r14
        pop      r15
        ret      
-						;; size=22 bbWeight=0.50 PerfScore 4.62
+						;; size=28 bbWeight=0.50 PerfScore 6.62
 G_M58324_IG33:        ; bbWeight=0.50, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
        mov      eax, -1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M58324_IG34:        ; bbWeight=0.50, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x20]
+       vmovaps  xmm6, xmmword ptr [rsp+0x30]
+       vmovaps  xmm7, xmmword ptr [rsp+0x20]
        vzeroupper 
-       add      rsp, 56
+       add      rsp, 72
        pop      rbx
        pop      rbp
        pop      rsi
@@ -393,9 +392,9 @@ G_M58324_IG34:        ; bbWeight=0.50, epilog, nogc, extend
        pop      r14
        pop      r15
        ret      
-						;; size=22 bbWeight=0.50 PerfScore 4.62
+						;; size=28 bbWeight=0.50 PerfScore 6.62
 
-; Total bytes of code 847, prolog size 48, PerfScore 402.08, instruction count 229, allocated bytes for code 847 (MethodHash=ee5b1c2b) for method System.SpanHelpers:IndexOfAnyValueType[ubyte,System.SpanHelpers+DontNegate`1[ubyte]](byref,ubyte,ubyte,ubyte,ubyte,int):int (FullOpts)
+; Total bytes of code 861, prolog size 54, PerfScore 407.98, instruction count 228, allocated bytes for code 861 (MethodHash=ee5b1c2b) for method System.SpanHelpers:IndexOfAnyValueType[ubyte,System.SpanHelpers+DontNegate`1[ubyte]](byref,ubyte,ubyte,ubyte,ubyte,int):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -403,14 +402,16 @@ Unwind Info:
   >>   End offset   : 0xd1ffab1e (not in unwind data)
   Version           : 1
   Flags             : 0x00
-  SizeOfProlog      : 0x15
-  CountOfUnwindCodes: 9
+  SizeOfProlog      : 0x1B
+  CountOfUnwindCodes: 11
   FrameRegister     : none (0)
   FrameOffset       : N/A (no FrameRegister) (Value=0)
   UnwindCodes       :
-    CodeOffset: 0x15 UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM6 (6)
+    CodeOffset: 0x1B UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM7 (7)
       Scaled Small Offset: 2 * 16 = 32 = 0x00020
-    CodeOffset: 0x0C UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 6 * 8 + 8 = 56 = 0x38
+    CodeOffset: 0x15 UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM6 (6)
+      Scaled Small Offset: 3 * 16 = 48 = 0x00030
+    CodeOffset: 0x0C UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 8 * 8 + 8 = 72 = 0x48
     CodeOffset: 0x08 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbx (3)
     CodeOffset: 0x07 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbp (5)
     CodeOffset: 0x06 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rsi (6)
+9 (+1.97%) : 3798.dasm - System.Text.Ascii:IsValidCore[ushort](byref,int):bool (FullOpts)
@@ -150,8 +150,8 @@ G_M42463_IG10:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        cmp      edx, 64
        jle      G_M42463_IG14
        vmovups  ymm0, ymmword ptr [rbx]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x40]
+       vmovups  ymm1, ymmword ptr [rbx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rbx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rbx+0x60]
        vptest   ymm0, ymmword ptr [reloc @RWD32]
        jne      G_M42463_IG17
@@ -170,7 +170,7 @@ G_M42463_IG10:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        call     [<unknown method>]
        ; gcrRegs -[rcx rdx]
        ; gcr arg pop 0
-						;; size=87 bbWeight=0.50 PerfScore 14.88
+						;; size=90 bbWeight=0.50 PerfScore 15.88
 G_M42463_IG11:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref, isz
        add      rsi, -64
        cmp      rbp, rsi
@@ -180,23 +180,23 @@ G_M42463_IG12:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi},
        lea      rcx, bword ptr [rbx+2*rbp]
        ; byrRegs +[rcx]
        vmovups  ymm0, ymmword ptr [rcx]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x40]
+       vmovups  ymm1, ymmword ptr [rcx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rcx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rcx+0x60]
        vptest   ymm0, ymmword ptr [reloc @RWD32]
        jne      SHORT G_M42463_IG17
        add      rbp, 64
        cmp      rbp, rsi
        jb       SHORT G_M42463_IG12
-						;; size=43 bbWeight=4 PerfScore 96.00
+						;; size=46 bbWeight=4 PerfScore 104.00
 G_M42463_IG13:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref
        ; byrRegs -[rcx]
        lea      rbx, bword ptr [rbx+2*rsi]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M42463_IG14:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref
        vmovups  ymm0, ymmword ptr [rbx]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rdi-0x40]
+       vmovups  ymm1, ymmword ptr [rbx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rdi-0x40], -2
        vpor     ymm6, ymm0, ymmword ptr [rdi-0x20]
        mov      ecx, 1
        vextractf128 xmm7, ymm6, 1
@@ -207,7 +207,7 @@ G_M42463_IG14:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        vptest   ymm6, ymmword ptr [reloc @RWD32]
        sete     al
        movzx    rax, al
-						;; size=57 bbWeight=0.50 PerfScore 14.75
+						;; size=60 bbWeight=0.50 PerfScore 15.75
 G_M42463_IG15:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        movzx    rax, al
 						;; size=3 bbWeight=0.50 PerfScore 0.12
@@ -241,7 +241,7 @@ RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 456, prolog size 26, PerfScore 248.10, instruction count 122, allocated bytes for code 456 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (FullOpts)
+; Total bytes of code 465, prolog size 26, PerfScore 259.00, instruction count 122, allocated bytes for code 465 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+3.53%) : 22146.dasm - System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
@@ -44,12 +44,12 @@ G_M50163_IG03:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0006 {rcx rdx},
        vcvtps2pd ymm2, ymm2
        vextractf128 xmm1, ymm1, 1
        vcvtps2pd ymm1, ymm1
-       vxorpd   ymm1, ymm2, ymm1
-       vxorpd   ymm0, ymm0, ymm1
+       vpternlogq ymm2, ymm1, ymm0, -106
+       vmovaps  ymm0, ymm2
        add      rdx, 32
        dec      eax
        jns      SHORT G_M50163_IG03
-						;; size=38 bbWeight=4 PerfScore 85.67
+						;; size=41 bbWeight=4 PerfScore 86.00
 G_M50163_IG04:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0002 {rcx}, byref
        ; byrRegs -[rdx]
        vmovups  ymmword ptr [rcx], ymm0
@@ -68,7 +68,7 @@ G_M50163_IG06:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
        int3     
 						;; size=6 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 85, prolog size 7, PerfScore 104.75, instruction count 24, allocated bytes for code 85 (MethodHash=b7133c0c) for method System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
+; Total bytes of code 88, prolog size 7, PerfScore 105.38, instruction count 24, allocated bytes for code 88 (MethodHash=b7133c0c) for method System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
 ; ============================================================
 
 Unwind Info:
benchmarks.run_pgo.windows.x64.checked.mch
-22 (-2.55%) : 81880.dasm - System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
@@ -26,14 +26,14 @@
 ;* V14 loc9         [V14    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V15 loc10        [V15    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V16 loc11        [V16    ] (  0,  0   )   byref  ->  zero-ref   
-;  V17 loc12        [V17,T09] (  8,  6.09)  simd32  ->  mm6         <System.Runtime.Intrinsics.Vector256`1[short]>
+;  V17 loc12        [V17,T09] (  8,  6.09)  simd32  ->  mm4         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V18 loc13        [V18,T08] (  8,  6.79)  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V19 loc14        [V19,T10] (  3,  2.70)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V20 loc15        [V20,T11] (  3,  2.70)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V21 loc16        [V21,T12] (  3,  2.70)  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V22 loc17        [V22,T00] (  7,  6.79)   byref  ->  rax        
 ;  V23 loc18        [V23,T07] (  4,  1.70)   byref  ->  r15         single-def
-;  V24 loc19        [V24,T15] (  8,  0   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V24 loc19        [V24,T15] (  8,  0   )  simd16  ->  mm4         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V25 loc20        [V25,T16] (  8,  0   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V26 loc21        [V26,T20] (  3,  0   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V27 loc22        [V27,T21] (  3,  0   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
@@ -67,7 +67,7 @@
 ;* V55 tmp24        [V55    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
 ;  V56 tmp25        [V56,T06] (  9,  4.00)     int  ->  rax         "Single return block return value"
 ;
-; Lcl frame size = 56
+; Lcl frame size = 40
 
 G_M11069_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     r15
@@ -76,16 +76,15 @@ G_M11069_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 56
+       sub      rsp, 40
        vzeroupper 
-       vmovaps  xmmword ptr [rsp+0x20], xmm6
        mov      rbx, rcx
        ; byrRegs +[rbx]
        mov      esi, edx
        mov      edi, r8d
        mov      ebp, r9d
-       mov      r14d, dword ptr [rsp+0x90]
-						;; size=40 bbWeight=1 PerfScore 11.25
+       mov      r14d, dword ptr [rsp+0x80]
+						;; size=34 bbWeight=1 PerfScore 9.25
 G_M11069_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byref
        test     r14d, r14d
        jl       G_M11069_IG12
@@ -112,18 +111,17 @@ G_M11069_IG04:        ; bbWeight=1.70, gcrefRegs=0000 {}, byrefRegs=8009 {rax rb
        vmovups  ymm3, ymmword ptr [rax]
        vpcmpeqw ymm4, ymm0, ymm3
        vpcmpeqw ymm5, ymm1, ymm3
-       vpor     ymm4, ymm4, ymm5
        vpcmpeqw ymm3, ymm2, ymm3
-       vpor     ymm6, ymm4, ymm3
-       vptest   ymm6, ymm6
+       vpternlogd ymm4, ymm5, ymm3, -2
+       vptest   ymm4, ymm4
        je       SHORT G_M11069_IG09
-						;; size=31 bbWeight=1.70 PerfScore 22.35
+						;; size=30 bbWeight=1.70 PerfScore 22.06
 G_M11069_IG05:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0009 {rax rbx}, byref, isz
        ; byrRegs -[r15]
        sub      rax, rbx
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  ymm0, ymm6, ymmword ptr [reloc @RWD00]
+       vpshufb  ymm0, ymm4, ymmword ptr [reloc @RWD00]
        vpermq   ymm0, ymm0, -40
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
@@ -135,9 +133,8 @@ G_M11069_IG06:        ; bbWeight=0.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        mov      eax, -1
 						;; size=5 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG07:        ; bbWeight=0.00, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x20]
        vzeroupper 
-       add      rsp, 56
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
@@ -145,11 +142,10 @@ G_M11069_IG07:        ; bbWeight=0.00, epilog, nogc, extend
        pop      r14
        pop      r15
        ret      
-						;; size=22 bbWeight=0.00 PerfScore 0.00
+						;; size=16 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG08:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
-       vmovaps  xmm6, xmmword ptr [rsp+0x20]
        vzeroupper 
-       add      rsp, 56
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
@@ -157,7 +153,7 @@ G_M11069_IG08:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        pop      r14
        pop      r15
        ret      
-						;; size=22 bbWeight=1.00 PerfScore 9.25
+						;; size=16 bbWeight=1.00 PerfScore 5.25
 G_M11069_IG09:        ; bbWeight=0.70, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=8009 {rax rbx r15}, gcvars, byref, isz
        ; byrRegs +[rax rbx r15]
        add      rax, 32
@@ -170,19 +166,18 @@ G_M11069_IG10:        ; bbWeight=0.00, gcrefRegs=0000 {}, byrefRegs=8008 {rbx r1
        test     al, 15
        je       SHORT G_M11069_IG06
        vmovups  ymm3, ymmword ptr [r15]
-       vpcmpeqw ymm0, ymm0, ymm3
-       vpcmpeqw ymm1, ymm1, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpcmpeqw ymm4, ymm0, ymm3
+       vpcmpeqw ymm0, ymm1, ymm3
        vpcmpeqw ymm1, ymm2, ymm3
-       vpor     ymm6, ymm0, ymm1
-       vptest   ymm6, ymm6
+       vpternlogd ymm4, ymm0, ymm1, -2
+       vptest   ymm4, ymm4
        je       SHORT G_M11069_IG06
-						;; size=39 bbWeight=0.00 PerfScore 0.00
+						;; size=38 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG11:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=8008 {rbx r15}, byref, isz
        sub      r15, rbx
        ; byrRegs -[r15]
        shr      r15, 1
-       vpshufb  ymm0, ymm6, ymmword ptr [reloc @RWD00]
+       vpshufb  ymm0, ymm4, ymmword ptr [reloc @RWD00]
        vpermq   ymm0, ymm0, -40
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
@@ -355,20 +350,19 @@ G_M11069_IG32:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000B {rax rcx r
        vmovups  xmm3, xmmword ptr [rax]
        vpcmpeqw xmm4, xmm0, xmm3
        vpcmpeqw xmm5, xmm1, xmm3
-       vpor     xmm4, xmm4, xmm5
        vpcmpeqw xmm3, xmm2, xmm3
-       vpor     xmm3, xmm4, xmm3
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm5, xmm3, -2
+       vptest   xmm4, xmm4
        jne      SHORT G_M11069_IG33
        add      rax, 16
        jmp      SHORT G_M11069_IG34
-						;; size=37 bbWeight=0 PerfScore 0.00
+						;; size=36 bbWeight=0 PerfScore 0.00
 G_M11069_IG33:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0009 {rax rbx}, byref
        ; byrRegs -[rcx]
        sub      rax, rbx
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
        add      eax, ecx
@@ -383,26 +377,25 @@ G_M11069_IG34:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000B {rax rcx r
        test     al, 7
        je       G_M11069_IG06
        vmovups  xmm3, xmmword ptr [rcx]
-       vpcmpeqw xmm0, xmm0, xmm3
-       vpcmpeqw xmm1, xmm1, xmm3
-       vpor     xmm0, xmm0, xmm1
+       vpcmpeqw xmm4, xmm0, xmm3
+       vpcmpeqw xmm0, xmm1, xmm3
        vpcmpeqw xmm1, xmm2, xmm3
-       vpor     xmm3, xmm0, xmm1
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm0, xmm1, -2
+       vptest   xmm4, xmm4
        je       G_M11069_IG06
        sub      rcx, rbx
        ; byrRegs -[rcx]
        shr      rcx, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
        add      eax, ecx
        jmp      G_M11069_IG08
-						;; size=81 bbWeight=0 PerfScore 0.00
+						;; size=80 bbWeight=0 PerfScore 0.00
 RWD00  	dq	0F0D0B0907050301h, 8080808080808080h, 0F0D0B0907050301h, 8080808080808080h
 
 
-; Total bytes of code 863, prolog size 40, PerfScore 153.19, instruction count 238, allocated bytes for code 863 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
+; Total bytes of code 841, prolog size 34, PerfScore 144.71, instruction count 231, allocated bytes for code 841 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
 ; ============================================================
 
 Unwind Info:
@@ -410,14 +403,12 @@ Unwind Info:
   >>   End offset   : 0xd1ffab1e (not in unwind data)
   Version           : 1
   Flags             : 0x00
-  SizeOfProlog      : 0x15
-  CountOfUnwindCodes: 9
+  SizeOfProlog      : 0x0C
+  CountOfUnwindCodes: 7
   FrameRegister     : none (0)
   FrameOffset       : N/A (no FrameRegister) (Value=0)
   UnwindCodes       :
-    CodeOffset: 0x15 UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM6 (6)
-      Scaled Small Offset: 2 * 16 = 32 = 0x00020
-    CodeOffset: 0x0C UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 6 * 8 + 8 = 56 = 0x38
+    CodeOffset: 0x0C UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 4 * 8 + 8 = 40 = 0x28
     CodeOffset: 0x08 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbx (3)
     CodeOffset: 0x07 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbp (5)
     CodeOffset: 0x06 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rsi (6)
-22 (-2.55%) : 88557.dasm - System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
@@ -26,14 +26,14 @@
 ;* V14 loc9         [V14    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V15 loc10        [V15    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V16 loc11        [V16    ] (  0,  0   )   byref  ->  zero-ref   
-;  V17 loc12        [V17,T09] (  8,  6.06)  simd32  ->  mm6         <System.Runtime.Intrinsics.Vector256`1[short]>
+;  V17 loc12        [V17,T09] (  8,  6.06)  simd32  ->  mm4         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V18 loc13        [V18,T08] (  8,  6.74)  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V19 loc14        [V19,T10] (  3,  2.69)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V20 loc15        [V20,T11] (  3,  2.69)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V21 loc16        [V21,T12] (  3,  2.69)  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V22 loc17        [V22,T00] (  7,  6.74)   byref  ->  rax        
 ;  V23 loc18        [V23,T07] (  4,  1.69)   byref  ->  r15         single-def
-;  V24 loc19        [V24,T15] (  8,  0   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V24 loc19        [V24,T15] (  8,  0   )  simd16  ->  mm4         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V25 loc20        [V25,T16] (  8,  0   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V26 loc21        [V26,T20] (  3,  0   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V27 loc22        [V27,T21] (  3,  0   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
@@ -67,7 +67,7 @@
 ;* V55 tmp24        [V55    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
 ;  V56 tmp25        [V56,T06] (  9,  4.00)     int  ->  rax         "Single return block return value"
 ;
-; Lcl frame size = 56
+; Lcl frame size = 40
 
 G_M11069_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     r15
@@ -76,16 +76,15 @@ G_M11069_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 56
+       sub      rsp, 40
        vzeroupper 
-       vmovaps  xmmword ptr [rsp+0x20], xmm6
        mov      rbx, rcx
        ; byrRegs +[rbx]
        mov      esi, edx
        mov      edi, r8d
        mov      ebp, r9d
-       mov      r14d, dword ptr [rsp+0x90]
-						;; size=40 bbWeight=1 PerfScore 11.25
+       mov      r14d, dword ptr [rsp+0x80]
+						;; size=34 bbWeight=1 PerfScore 9.25
 G_M11069_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byref
        test     r14d, r14d
        jl       G_M11069_IG12
@@ -112,18 +111,17 @@ G_M11069_IG04:        ; bbWeight=1.69, gcrefRegs=0000 {}, byrefRegs=8009 {rax rb
        vmovups  ymm3, ymmword ptr [rax]
        vpcmpeqw ymm4, ymm0, ymm3
        vpcmpeqw ymm5, ymm1, ymm3
-       vpor     ymm4, ymm4, ymm5
        vpcmpeqw ymm3, ymm2, ymm3
-       vpor     ymm6, ymm4, ymm3
-       vptest   ymm6, ymm6
+       vpternlogd ymm4, ymm5, ymm3, -2
+       vptest   ymm4, ymm4
        je       SHORT G_M11069_IG09
-						;; size=31 bbWeight=1.69 PerfScore 22.20
+						;; size=30 bbWeight=1.69 PerfScore 21.92
 G_M11069_IG05:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0009 {rax rbx}, byref, isz
        ; byrRegs -[r15]
        sub      rax, rbx
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  ymm0, ymm6, ymmword ptr [reloc @RWD00]
+       vpshufb  ymm0, ymm4, ymmword ptr [reloc @RWD00]
        vpermq   ymm0, ymm0, -40
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
@@ -135,9 +133,8 @@ G_M11069_IG06:        ; bbWeight=0.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        mov      eax, -1
 						;; size=5 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG07:        ; bbWeight=0.00, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x20]
        vzeroupper 
-       add      rsp, 56
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
@@ -145,11 +142,10 @@ G_M11069_IG07:        ; bbWeight=0.00, epilog, nogc, extend
        pop      r14
        pop      r15
        ret      
-						;; size=22 bbWeight=0.00 PerfScore 0.00
+						;; size=16 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG08:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
-       vmovaps  xmm6, xmmword ptr [rsp+0x20]
        vzeroupper 
-       add      rsp, 56
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
@@ -157,7 +153,7 @@ G_M11069_IG08:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        pop      r14
        pop      r15
        ret      
-						;; size=22 bbWeight=1.00 PerfScore 9.25
+						;; size=16 bbWeight=1.00 PerfScore 5.25
 G_M11069_IG09:        ; bbWeight=0.69, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=8009 {rax rbx r15}, gcvars, byref, isz
        ; byrRegs +[rax rbx r15]
        add      rax, 32
@@ -170,19 +166,18 @@ G_M11069_IG10:        ; bbWeight=0.00, gcrefRegs=0000 {}, byrefRegs=8008 {rbx r1
        test     al, 15
        je       SHORT G_M11069_IG06
        vmovups  ymm3, ymmword ptr [r15]
-       vpcmpeqw ymm0, ymm0, ymm3
-       vpcmpeqw ymm1, ymm1, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpcmpeqw ymm4, ymm0, ymm3
+       vpcmpeqw ymm0, ymm1, ymm3
        vpcmpeqw ymm1, ymm2, ymm3
-       vpor     ymm6, ymm0, ymm1
-       vptest   ymm6, ymm6
+       vpternlogd ymm4, ymm0, ymm1, -2
+       vptest   ymm4, ymm4
        je       SHORT G_M11069_IG06
-						;; size=39 bbWeight=0.00 PerfScore 0.00
+						;; size=38 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG11:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=8008 {rbx r15}, byref, isz
        sub      r15, rbx
        ; byrRegs -[r15]
        shr      r15, 1
-       vpshufb  ymm0, ymm6, ymmword ptr [reloc @RWD00]
+       vpshufb  ymm0, ymm4, ymmword ptr [reloc @RWD00]
        vpermq   ymm0, ymm0, -40
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
@@ -355,20 +350,19 @@ G_M11069_IG32:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000B {rax rcx r
        vmovups  xmm3, xmmword ptr [rax]
        vpcmpeqw xmm4, xmm0, xmm3
        vpcmpeqw xmm5, xmm1, xmm3
-       vpor     xmm4, xmm4, xmm5
        vpcmpeqw xmm3, xmm2, xmm3
-       vpor     xmm3, xmm4, xmm3
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm5, xmm3, -2
+       vptest   xmm4, xmm4
        jne      SHORT G_M11069_IG33
        add      rax, 16
        jmp      SHORT G_M11069_IG34
-						;; size=37 bbWeight=0 PerfScore 0.00
+						;; size=36 bbWeight=0 PerfScore 0.00
 G_M11069_IG33:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0009 {rax rbx}, byref
        ; byrRegs -[rcx]
        sub      rax, rbx
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
        add      eax, ecx
@@ -383,26 +377,25 @@ G_M11069_IG34:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000B {rax rcx r
        test     al, 7
        je       G_M11069_IG06
        vmovups  xmm3, xmmword ptr [rcx]
-       vpcmpeqw xmm0, xmm0, xmm3
-       vpcmpeqw xmm1, xmm1, xmm3
-       vpor     xmm0, xmm0, xmm1
+       vpcmpeqw xmm4, xmm0, xmm3
+       vpcmpeqw xmm0, xmm1, xmm3
        vpcmpeqw xmm1, xmm2, xmm3
-       vpor     xmm3, xmm0, xmm1
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm0, xmm1, -2
+       vptest   xmm4, xmm4
        je       G_M11069_IG06
        sub      rcx, rbx
        ; byrRegs -[rcx]
        shr      rcx, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
        add      eax, ecx
        jmp      G_M11069_IG08
-						;; size=81 bbWeight=0 PerfScore 0.00
+						;; size=80 bbWeight=0 PerfScore 0.00
 RWD00  	dq	0F0D0B0907050301h, 8080808080808080h, 0F0D0B0907050301h, 8080808080808080h
 
 
-; Total bytes of code 863, prolog size 40, PerfScore 153.03, instruction count 238, allocated bytes for code 863 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
+; Total bytes of code 841, prolog size 34, PerfScore 144.55, instruction count 231, allocated bytes for code 841 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
 ; ============================================================
 
 Unwind Info:
@@ -410,14 +403,12 @@ Unwind Info:
   >>   End offset   : 0xd1ffab1e (not in unwind data)
   Version           : 1
   Flags             : 0x00
-  SizeOfProlog      : 0x15
-  CountOfUnwindCodes: 9
+  SizeOfProlog      : 0x0C
+  CountOfUnwindCodes: 7
   FrameRegister     : none (0)
   FrameOffset       : N/A (no FrameRegister) (Value=0)
   UnwindCodes       :
-    CodeOffset: 0x15 UnwindOp: UWOP_SAVE_XMM128 (8)     OpInfo: XMM6 (6)
-      Scaled Small Offset: 2 * 16 = 32 = 0x00020
-    CodeOffset: 0x0C UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 6 * 8 + 8 = 56 = 0x38
+    CodeOffset: 0x0C UnwindOp: UWOP_ALLOC_SMALL (2)     OpInfo: 4 * 8 + 8 = 40 = 0x28
     CodeOffset: 0x08 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbx (3)
     CodeOffset: 0x07 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rbp (5)
     CodeOffset: 0x06 UnwindOp: UWOP_PUSH_NONVOL (0)     OpInfo: rsi (6)
-4 (-0.47%) : 24666.dasm - System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
@@ -26,14 +26,14 @@
 ;* V14 loc9         [V14    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V15 loc10        [V15    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V16 loc11        [V16    ] (  0,  0   )   byref  ->  zero-ref   
-;  V17 loc12        [V17,T09] (  8,  6.08)  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[short]>
+;  V17 loc12        [V17,T09] (  8,  6.08)  simd32  ->  mm4         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V18 loc13        [V18,T08] (  8,  6.78)  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V19 loc14        [V19,T10] (  3,  2.69)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V20 loc15        [V20,T11] (  3,  2.69)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V21 loc16        [V21,T12] (  3,  2.69)  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V22 loc17        [V22,T00] (  7,  6.78)   byref  ->  rax        
 ;  V23 loc18        [V23,T07] (  4,  1.69)   byref  ->  rcx         single-def
-;  V24 loc19        [V24,T15] (  8,  0   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V24 loc19        [V24,T15] (  8,  0   )  simd16  ->  mm4         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V25 loc20        [V25,T16] (  8,  0   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V26 loc21        [V26,T20] (  3,  0   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V27 loc22        [V27,T21] (  3,  0   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
@@ -110,19 +110,18 @@ G_M11069_IG04:        ; bbWeight=1.69, gcrefRegs=0000 {}, byrefRegs=000B {rax rc
        vmovups  ymm3, ymmword ptr [rax]
        vpcmpeqw ymm4, ymm0, ymm3
        vpcmpeqw ymm5, ymm1, ymm3
-       vpor     ymm4, ymm4, ymm5
        vpcmpeqw ymm3, ymm2, ymm3
-       vpor     ymm3, ymm4, ymm3
-       vptest   ymm3, ymm3
+       vpternlogd ymm4, ymm5, ymm3, -2
+       vptest   ymm4, ymm4
        je       SHORT G_M11069_IG09
-						;; size=31 bbWeight=1.69 PerfScore 22.30
+						;; size=30 bbWeight=1.69 PerfScore 22.02
 G_M11069_IG05:        ; bbWeight=1.00, gcrefRegs=0000 {}, byrefRegs=0009 {rax rbx}, byref, isz
        ; byrRegs -[rcx]
        sub      rax, rbx
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  ymm3, ymm3, ymmword ptr [reloc @RWD00]
-       vpermq   ymm0, ymm3, -40
+       vpshufb  ymm4, ymm4, ymmword ptr [reloc @RWD00]
+       vpermq   ymm0, ymm4, -40
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
        add      eax, ecx
@@ -164,15 +163,14 @@ G_M11069_IG10:        ; bbWeight=0.00, gcrefRegs=0000 {}, byrefRegs=000A {rcx rb
        test     al, 15
        je       SHORT G_M11069_IG06
        vmovups  ymm3, ymmword ptr [rcx]
-       vpcmpeqw ymm0, ymm0, ymm3
-       vpcmpeqw ymm1, ymm1, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpcmpeqw ymm4, ymm0, ymm3
+       vpcmpeqw ymm0, ymm1, ymm3
        vpcmpeqw ymm1, ymm2, ymm3
-       vpor     ymm3, ymm0, ymm1
-       vptest   ymm3, ymm3
+       vpternlogd ymm4, ymm0, ymm1, -2
+       vptest   ymm4, ymm4
        jne      G_M11069_IG28
        jmp      SHORT G_M11069_IG06
-						;; size=44 bbWeight=0.00 PerfScore 0.00
+						;; size=43 bbWeight=0.00 PerfScore 0.00
 G_M11069_IG11:        ; bbWeight=0.00, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byref
        ; byrRegs -[rcx]
        mov      rcx, 0xD1FFAB1E
@@ -305,7 +303,7 @@ G_M11069_IG28:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000A {rcx rbx},
        sub      rcx, rbx
        ; byrRegs -[rcx]
        shr      rcx, 1
-       vpshufb  ymm0, ymm3, ymmword ptr [reloc @RWD00]
+       vpshufb  ymm0, ymm4, ymmword ptr [reloc @RWD00]
        vpermq   ymm0, ymm0, -40
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
@@ -330,20 +328,19 @@ G_M11069_IG30:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000B {rax rcx r
        vmovups  xmm3, xmmword ptr [rax]
        vpcmpeqw xmm4, xmm0, xmm3
        vpcmpeqw xmm5, xmm1, xmm3
-       vpor     xmm4, xmm4, xmm5
        vpcmpeqw xmm3, xmm2, xmm3
-       vpor     xmm3, xmm4, xmm3
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm5, xmm3, -2
+       vptest   xmm4, xmm4
        jne      SHORT G_M11069_IG31
        add      rax, 16
        jmp      SHORT G_M11069_IG32
-						;; size=37 bbWeight=0 PerfScore 0.00
+						;; size=36 bbWeight=0 PerfScore 0.00
 G_M11069_IG31:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0009 {rax rbx}, byref
        ; byrRegs -[rcx]
        sub      rax, rbx
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
        add      eax, ecx
@@ -358,22 +355,21 @@ G_M11069_IG32:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=000B {rax rcx r
        test     al, 7
        je       G_M11069_IG06
        vmovups  xmm3, xmmword ptr [rcx]
-       vpcmpeqw xmm0, xmm0, xmm3
-       vpcmpeqw xmm1, xmm1, xmm3
-       vpor     xmm0, xmm0, xmm1
+       vpcmpeqw xmm4, xmm0, xmm3
+       vpcmpeqw xmm0, xmm1, xmm3
        vpcmpeqw xmm1, xmm2, xmm3
-       vpor     xmm3, xmm0, xmm1
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm0, xmm1, -2
+       vptest   xmm4, xmm4
        je       G_M11069_IG06
        sub      rcx, rbx
        ; byrRegs -[rcx]
        shr      rcx, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
        add      eax, ecx
        jmp      G_M11069_IG08
-						;; size=81 bbWeight=0 PerfScore 0.00
+						;; size=80 bbWeight=0 PerfScore 0.00
 G_M11069_IG33:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byref
        xor      ecx, ecx
        cmp      r14d, 4
@@ -399,7 +395,7 @@ G_M11069_IG34:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0008 {rbx}, byr
 RWD00  	dq	0F0D0B0907050301h, 8080808080808080h, 0F0D0B0907050301h, 8080808080808080h
 
 
-; Total bytes of code 852, prolog size 29, PerfScore 144.55, instruction count 233, allocated bytes for code 852 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
+; Total bytes of code 848, prolog size 29, PerfScore 143.87, instruction count 229, allocated bytes for code 848 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
 ; ============================================================
 
 Unwind Info:
+9 (+1.62%) : 88451.dasm - System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
@@ -123,8 +123,8 @@ G_M42463_IG09:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
 						;; size=13 bbWeight=0 PerfScore 0.00
 G_M42463_IG10:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=00C0 {rsi rdi}, byref
        vmovups  ymm0, ymmword ptr [rsi]
-       vpor     ymm0, ymm0, ymmword ptr [rsi+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rdi-0x40]
+       vmovups  ymm1, ymmword ptr [rsi+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rdi-0x40], -2
        vpor     ymm6, ymm0, ymmword ptr [rdi-0x20]
        mov      ecx, 1
        vextractf128 xmm7, ymm6, 1
@@ -135,7 +135,7 @@ G_M42463_IG10:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=00C0 {rsi rdi},
        vptest   ymm6, ymmword ptr [reloc @RWD00]
        sete     al
        movzx    rax, al
-						;; size=57 bbWeight=0 PerfScore 0.00
+						;; size=60 bbWeight=0 PerfScore 0.00
 G_M42463_IG11:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        movzx    rax, al
 						;; size=3 bbWeight=0 PerfScore 0.00
@@ -205,8 +205,8 @@ G_M42463_IG16:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=00C0 {rsi rdi},
        cmp      ebx, 64
        jle      G_M42463_IG10
        vmovups  ymm0, ymmword ptr [rsi]
-       vpor     ymm0, ymm0, ymmword ptr [rsi+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rsi+0x40]
+       vmovups  ymm1, ymmword ptr [rsi+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rsi+0x40], -2
        vpor     ymm6, ymm0, ymmword ptr [rsi+0x60]
        mov      ecx, 1
        vextractf128 xmm7, ymm6, 1
@@ -229,13 +229,13 @@ G_M42463_IG16:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=00C0 {rsi rdi},
        movsxd   rbx, ebx
        add      rbx, -64
        jmp      G_M42463_IG09
-						;; size=112 bbWeight=0 PerfScore 0.00
+						;; size=115 bbWeight=0 PerfScore 0.00
 G_M42463_IG17:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=00C0 {rsi rdi}, byref
        lea      rcx, bword ptr [rsi+2*rbp]
        ; byrRegs +[rcx]
        vmovups  ymm0, ymmword ptr [rcx]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x40]
+       vmovups  ymm1, ymmword ptr [rcx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rcx+0x40], -2
        vpor     ymm6, ymm0, ymmword ptr [rcx+0x60]
        mov      ecx, 1
        ; byrRegs -[rcx]
@@ -247,11 +247,11 @@ G_M42463_IG17:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=00C0 {rsi rdi},
        jne      G_M42463_IG07
        add      rbp, 64
        jmp      G_M42463_IG09
-						;; size=70 bbWeight=0 PerfScore 0.00
+						;; size=73 bbWeight=0 PerfScore 0.00
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 556, prolog size 28, PerfScore 96.80, instruction count 140, allocated bytes for code 556 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
+; Total bytes of code 565, prolog size 28, PerfScore 97.70, instruction count 140, allocated bytes for code 565 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
 ; ============================================================
 
 Unwind Info:
+3 (+4.11%) : 64196.dasm - System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float]):System.Runtime.Intrinsics.Vector128`1[float] (Tier0)
@@ -29,22 +29,22 @@ G_M21070_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs +[rax]
        vmovups  xmm0, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x18]
-       vandps   xmm0, xmm0, xmmword ptr [rax]
-       mov      rax, bword ptr [rbp+0x18]
        vmovups  xmm1, xmmword ptr [rax]
+       mov      rax, bword ptr [rbp+0x18]
+       vmovups  xmm2, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x28]
-       vandnps  xmm1, xmm1, xmmword ptr [rax]
-       vorps    xmm0, xmm0, xmm1
+       vandnps  xmm2, xmm2, xmmword ptr [rax]
+       vpternlogd xmm0, xmm1, xmm2, -22
        mov      rax, bword ptr [rbp+0x10]
        vmovups  xmmword ptr [rax], xmm0
        mov      rax, bword ptr [rbp+0x10]
-						;; size=48 bbWeight=1 PerfScore 22.33
+						;; size=51 bbWeight=1 PerfScore 23.50
 G_M21070_IG04:        ; bbWeight=1, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=2 bbWeight=1 PerfScore 1.50
 
-; Total bytes of code 73, prolog size 7, PerfScore 37.38, instruction count 22, allocated bytes for code 73 (MethodHash=1f63adb1) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float]):System.Runtime.Intrinsics.Vector128`1[float] (Tier0)
+; Total bytes of code 76, prolog size 7, PerfScore 38.85, instruction count 22, allocated bytes for code 76 (MethodHash=1f63adb1) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float]):System.Runtime.Intrinsics.Vector128`1[float] (Tier0)
 ; ============================================================
 
 Unwind Info:
+3 (+4.11%) : 48668.dasm - System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (Tier0)
@@ -29,22 +29,22 @@ G_M49358_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs +[rax]
        vmovups  xmm0, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x18]
-       vandpd   xmm0, xmm0, xmmword ptr [rax]
-       mov      rax, bword ptr [rbp+0x18]
        vmovups  xmm1, xmmword ptr [rax]
+       mov      rax, bword ptr [rbp+0x18]
+       vmovups  xmm2, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x28]
-       vandnpd  xmm1, xmm1, xmmword ptr [rax]
-       vorpd    xmm0, xmm0, xmm1
+       vandnpd  xmm2, xmm2, xmmword ptr [rax]
+       vpternlogq xmm0, xmm1, xmm2, -22
        mov      rax, bword ptr [rbp+0x10]
        vmovups  xmmword ptr [rax], xmm0
        mov      rax, bword ptr [rbp+0x10]
-						;; size=48 bbWeight=1 PerfScore 22.33
+						;; size=51 bbWeight=1 PerfScore 23.50
 G_M49358_IG04:        ; bbWeight=1, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=2 bbWeight=1 PerfScore 1.50
 
-; Total bytes of code 73, prolog size 7, PerfScore 37.38, instruction count 22, allocated bytes for code 73 (MethodHash=9f023f31) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (Tier0)
+; Total bytes of code 76, prolog size 7, PerfScore 38.85, instruction count 22, allocated bytes for code 76 (MethodHash=9f023f31) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (Tier0)
 ; ============================================================
 
 Unwind Info:
benchmarks.run_tiered.windows.x64.checked.mch
-4 (-0.79%) : 41558.dasm - System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (Tier1)
@@ -189,8 +189,7 @@ G_M35621_IG03:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        vmovups  xmm10, xmmword ptr [reloc @RWD32]
        vpsubw   xmm0, xmm6, xmm8
        vpcmpgtw xmm0, xmm9, xmm0
-       vpand    xmm0, xmm0, xmm10
-       vpxor    xmm0, xmm6, xmm0
+       vpternlogd xmm0, xmm10, xmm6, 106
        vmovups  xmmword ptr [rsi], xmm0
        mov      ecx, esi
        and      ecx, 15
@@ -208,7 +207,7 @@ G_M35621_IG03:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        call     [<unknown method>]
        ; gcrRegs -[rcx rdx]
        ; gcr arg pop 0
-						;; size=123 bbWeight=0.50 PerfScore 14.38
+						;; size=121 bbWeight=0.50 PerfScore 14.29
 G_M35621_IG04:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        cmp      rbp, rbx
        jbe      SHORT G_M35621_IG06
@@ -240,12 +239,11 @@ G_M35621_IG08:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        jne      SHORT G_M35621_IG09
        vpsubw   xmm0, xmm6, xmm8
        vpcmpgtw xmm0, xmm9, xmm0
-       vpand    xmm0, xmm0, xmm10
-       vpxor    xmm0, xmm6, xmm0
+       vpternlogd xmm0, xmm10, xmm6, 106
        vmovups  xmmword ptr [rsi+2*rbp], xmm0
        add      rbp, 8
        jmp      SHORT G_M35621_IG04
-						;; size=41 bbWeight=4 PerfScore 55.00
+						;; size=39 bbWeight=4 PerfScore 54.33
 G_M35621_IG09:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        mov      rax, rbx
        sub      rax, rbp
@@ -327,7 +325,7 @@ RWD16  	dq	801A801A801A801Ah, 801A801A801A801Ah
 RWD32  	dq	0020002000200020h, 0020002000200020h
 
 
-; Total bytes of code 509, prolog size 50, PerfScore 224.65, instruction count 125, allocated bytes for code 509 (MethodHash=e97274da) for method System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (Tier1)
+; Total bytes of code 505, prolog size 50, PerfScore 223.50, instruction count 123, allocated bytes for code 505 (MethodHash=e97274da) for method System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (Tier1)
 ; ============================================================
 
 Unwind Info:
-1 (-0.65%) : 45177.dasm - System.Guid:FormatGuidVector128Utf8(System.Guid,bool):System.ValueTuple`3[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]] (Tier1)
@@ -17,7 +17,7 @@
 ;  V06 loc3         [V06,T09] (  4,  2.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V07 loc4         [V07,T12] (  2,  1   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V08 loc5         [V08,T13] (  2,  1   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V09 loc6         [V09,T14] (  2,  1   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V09 loc6         [V09,T14] (  2,  1   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V10 loc7         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V11 loc8         [V11    ] (  0,  0   )     ref  ->  zero-ref    class-hnd <<unknown class>>
 ;* V12 loc9         [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
@@ -52,7 +52,7 @@
 ;* V41 tmp26        [V41,T22] (  0,  0   )  simd16  ->  zero-ref    "field V18.Item3 (fldOffset=0x20)" P-INDEP
 ;  V42 tmp27        [V42,T17] (  2,  1   )  simd16  ->  mm2         "field V19.Item1 (fldOffset=0x0)" P-INDEP
 ;  V43 tmp28        [V43,T18] (  2,  1   )  simd16  ->  mm3         "field V19.Item2 (fldOffset=0x10)" P-INDEP
-;  V44 tmp29        [V44,T19] (  2,  1   )  simd16  ->  mm0         "field V19.Item3 (fldOffset=0x20)" P-INDEP
+;  V44 tmp29        [V44,T19] (  2,  1   )  simd16  ->  mm1         "field V19.Item3 (fldOffset=0x20)" P-INDEP
 ;* V45 tmp30        [V45    ] (  0,  0   )  simd16  ->  zero-ref    "field V27.Item1 (fldOffset=0x0)" P-INDEP
 ;* V46 tmp31        [V46    ] (  0,  0   )  simd16  ->  zero-ref    "field V27.Item2 (fldOffset=0x10)" P-INDEP
 ;  V47 cse0         [V47,T07] (  3,  3   )  simd16  ->  mm1         "CSE - aggressive"
@@ -86,12 +86,11 @@ G_M39076_IG03:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0001 {rax},
        vpshufb  xmm3, xmm0, xmmword ptr [reloc @RWD64]
        vpshufb  xmm1, xmm1, xmmword ptr [reloc @RWD80]
        vpshufb  xmm0, xmm0, xmmword ptr [reloc @RWD96]
-       vpor     xmm0, xmm1, xmm0
-       vpor     xmm0, xmm0, xmmword ptr [reloc @RWD112]
+       vpternlogd xmm1, xmm0, xmmword ptr [reloc @RWD112], -2
        vmovups  xmmword ptr [rax], xmm2
        vmovups  xmmword ptr [rax+0x10], xmm3
-       vmovups  xmmword ptr [rax+0x20], xmm0
-						;; size=62 bbWeight=0.50 PerfScore 8.17
+       vmovups  xmmword ptr [rax+0x20], xmm1
+						;; size=61 bbWeight=0.50 PerfScore 8.00
 G_M39076_IG04:        ; bbWeight=0.50, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=0.50 PerfScore 0.50
@@ -114,7 +113,7 @@ RWD96  	dq	FFFFFFFFFFFFFFFFh, FF03020100FFFFFFh
 RWD112 	dq	00002D000000002Dh, 2D000000002D0000h
 
 
-; Total bytes of code 153, prolog size 3, PerfScore 47.80, instruction count 30, allocated bytes for code 153 (MethodHash=915f675b) for method System.Guid:FormatGuidVector128Utf8(System.Guid,bool):System.ValueTuple`3[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]] (Tier1)
+; Total bytes of code 152, prolog size 3, PerfScore 47.53, instruction count 29, allocated bytes for code 152 (MethodHash=915f675b) for method System.Guid:FormatGuidVector128Utf8(System.Guid,bool):System.ValueTuple`3[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]] (Tier1)
 ; ============================================================
 
 Unwind Info:
-4 (-0.52%) : 17597.dasm - System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
@@ -25,14 +25,14 @@
 ;* V14 loc9         [V14    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V15 loc10        [V15    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V16 loc11        [V16    ] (  0,  0   )   byref  ->  zero-ref   
-;  V17 loc12        [V17,T25] (  8, 14.50)  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[short]>
+;  V17 loc12        [V17,T25] (  8, 14.50)  simd32  ->  mm4         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V18 loc13        [V18,T23] (  8, 18   )  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V19 loc14        [V19,T27] (  3,  5   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V20 loc15        [V20,T28] (  3,  5   )  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V21 loc16        [V21,T29] (  3,  5   )  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V22 loc17        [V22,T05] (  7, 17.50)   byref  ->  rax        
 ;  V23 loc18        [V23,T19] (  3,  5   )   byref  ->   r9         single-def
-;  V24 loc19        [V24,T26] (  8, 14.50)  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V24 loc19        [V24,T26] (  8, 14.50)  simd16  ->  mm4         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V25 loc20        [V25,T24] (  8, 18   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V26 loc21        [V26,T30] (  3,  5   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V27 loc22        [V27,T31] (  3,  5   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
@@ -246,42 +246,40 @@ G_M11069_IG25:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0241 {rax rsi r
        vmovups  ymm3, ymmword ptr [rax]
        vpcmpeqw ymm4, ymm0, ymm3
        vpcmpeqw ymm5, ymm1, ymm3
-       vpor     ymm4, ymm4, ymm5
        vpcmpeqw ymm3, ymm2, ymm3
-       vpor     ymm3, ymm4, ymm3
-       vptest   ymm3, ymm3
+       vpternlogd ymm4, ymm5, ymm3, -2
+       vptest   ymm4, ymm4
        jne      SHORT G_M11069_IG27
        add      rax, 32
        cmp      rax, r9
        jbe      SHORT G_M11069_IG25
-						;; size=40 bbWeight=4 PerfScore 58.67
+						;; size=39 bbWeight=4 PerfScore 58.00
 G_M11069_IG26:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0200 {r9}, byref
        ; byrRegs -[rax rsi]
        mov      eax, ebx
        test     al, 15
        je       G_M11069_IG33
        vmovups  ymm3, ymmword ptr [r9]
-       vpcmpeqw ymm0, ymm0, ymm3
-       vpcmpeqw ymm1, ymm1, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpcmpeqw ymm4, ymm0, ymm3
+       vpcmpeqw ymm0, ymm1, ymm3
        vpcmpeqw ymm1, ymm2, ymm3
-       vpor     ymm3, ymm0, ymm1
-       vptest   ymm3, ymm3
+       vpternlogd ymm4, ymm0, ymm1, -2
+       vptest   ymm4, ymm4
        je       G_M11069_IG33
        shr      r10, 1
-       vpshufb  ymm3, ymm3, ymmword ptr [reloc @RWD00]
-       vpermq   ymm0, ymm3, -40
+       vpshufb  ymm4, ymm4, ymmword ptr [reloc @RWD00]
+       vpermq   ymm0, ymm4, -40
        vpmovmskb eax, xmm0
        tzcnt    eax, eax
        add      eax, r10d
        jmp      G_M11069_IG32
-						;; size=80 bbWeight=0.50 PerfScore 12.71
+						;; size=79 bbWeight=0.50 PerfScore 12.62
 G_M11069_IG27:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0041 {rax rsi}, byref
        ; byrRegs -[r9] +[rax rsi]
        sub      rax, rsi
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  ymm0, ymm3, ymmword ptr [reloc @RWD00]
+       vpshufb  ymm0, ymm4, ymmword ptr [reloc @RWD00]
        vpermq   ymm0, ymm0, -40
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
@@ -307,41 +305,39 @@ G_M11069_IG29:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0045 {rax rdx r
        vmovups  xmm3, xmmword ptr [rax]
        vpcmpeqw xmm4, xmm0, xmm3
        vpcmpeqw xmm5, xmm1, xmm3
-       vpor     xmm4, xmm4, xmm5
        vpcmpeqw xmm3, xmm2, xmm3
-       vpor     xmm3, xmm4, xmm3
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm5, xmm3, -2
+       vptest   xmm4, xmm4
        jne      SHORT G_M11069_IG31
        add      rax, 16
        cmp      rax, rdx
        jbe      SHORT G_M11069_IG29
-						;; size=40 bbWeight=4 PerfScore 46.67
+						;; size=39 bbWeight=4 PerfScore 46.00
 G_M11069_IG30:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0004 {rdx}, byref, isz
        ; byrRegs -[rax rsi]
        mov      eax, ebx
        test     al, 7
        je       SHORT G_M11069_IG33
        vmovups  xmm3, xmmword ptr [rdx]
-       vpcmpeqw xmm0, xmm0, xmm3
-       vpcmpeqw xmm1, xmm1, xmm3
-       vpor     xmm0, xmm0, xmm1
+       vpcmpeqw xmm4, xmm0, xmm3
+       vpcmpeqw xmm0, xmm1, xmm3
        vpcmpeqw xmm1, xmm2, xmm3
-       vpor     xmm3, xmm0, xmm1
-       vptest   xmm3, xmm3
+       vpternlogd xmm4, xmm0, xmm1, -2
+       vptest   xmm4, xmm4
        je       SHORT G_M11069_IG33
        shr      rcx, 1
-       vpshufb  xmm3, xmm3, xmmword ptr [reloc @RWD00]
-       vpmovmskb eax, xmm3
+       vpshufb  xmm4, xmm4, xmmword ptr [reloc @RWD00]
+       vpmovmskb eax, xmm4
        tzcnt    eax, eax
        add      eax, ecx
        jmp      SHORT G_M11069_IG32
-						;; size=61 bbWeight=0.50 PerfScore 10.21
+						;; size=60 bbWeight=0.50 PerfScore 10.12
 G_M11069_IG31:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0041 {rax rsi}, byref
        ; byrRegs -[rdx] +[rax rsi]
        sub      rax, rsi
        ; byrRegs -[rax]
        shr      rax, 1
-       vpshufb  xmm0, xmm3, xmmword ptr [reloc @RWD00]
+       vpshufb  xmm0, xmm4, xmmword ptr [reloc @RWD00]
        vpmovmskb ecx, xmm0
        tzcnt    ecx, ecx
        add      eax, ecx
@@ -373,7 +369,7 @@ G_M11069_IG34:        ; bbWeight=0.50, epilog, nogc, extend
 RWD00  	dq	0F0D0B0907050301h, 8080808080808080h, 0F0D0B0907050301h, 8080808080808080h
 
 
-; Total bytes of code 774, prolog size 29, PerfScore 409.65, instruction count 209, allocated bytes for code 774 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
+; Total bytes of code 770, prolog size 29, PerfScore 407.75, instruction count 205, allocated bytes for code 770 (MethodHash=20f3d4c2) for method System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (Tier1)
 ; ============================================================
 
 Unwind Info:
+9 (+1.97%) : 17569.dasm - System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
@@ -150,8 +150,8 @@ G_M42463_IG10:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        cmp      edx, 64
        jle      G_M42463_IG14
        vmovups  ymm0, ymmword ptr [rbx]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x40]
+       vmovups  ymm1, ymmword ptr [rbx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rbx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rbx+0x60]
        vptest   ymm0, ymmword ptr [reloc @RWD32]
        jne      G_M42463_IG17
@@ -170,7 +170,7 @@ G_M42463_IG10:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        call     [<unknown method>]
        ; gcrRegs -[rcx rdx]
        ; gcr arg pop 0
-						;; size=87 bbWeight=0.50 PerfScore 14.88
+						;; size=90 bbWeight=0.50 PerfScore 15.88
 G_M42463_IG11:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref, isz
        add      rsi, -64
        cmp      rbp, rsi
@@ -180,23 +180,23 @@ G_M42463_IG12:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi},
        lea      rcx, bword ptr [rbx+2*rbp]
        ; byrRegs +[rcx]
        vmovups  ymm0, ymmword ptr [rcx]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x40]
+       vmovups  ymm1, ymmword ptr [rcx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rcx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rcx+0x60]
        vptest   ymm0, ymmword ptr [reloc @RWD32]
        jne      SHORT G_M42463_IG17
        add      rbp, 64
        cmp      rbp, rsi
        jb       SHORT G_M42463_IG12
-						;; size=43 bbWeight=4 PerfScore 96.00
+						;; size=46 bbWeight=4 PerfScore 104.00
 G_M42463_IG13:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref
        ; byrRegs -[rcx]
        lea      rbx, bword ptr [rbx+2*rsi]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M42463_IG14:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref
        vmovups  ymm0, ymmword ptr [rbx]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rdi-0x40]
+       vmovups  ymm1, ymmword ptr [rbx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rdi-0x40], -2
        vpor     ymm6, ymm0, ymmword ptr [rdi-0x20]
        mov      ecx, 1
        vextractf128 xmm7, ymm6, 1
@@ -207,7 +207,7 @@ G_M42463_IG14:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        vptest   ymm6, ymmword ptr [reloc @RWD32]
        sete     al
        movzx    rax, al
-						;; size=57 bbWeight=0.50 PerfScore 14.75
+						;; size=60 bbWeight=0.50 PerfScore 15.75
 G_M42463_IG15:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        movzx    rax, al
 						;; size=3 bbWeight=0.50 PerfScore 0.12
@@ -241,7 +241,7 @@ RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 456, prolog size 26, PerfScore 248.10, instruction count 122, allocated bytes for code 456 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
+; Total bytes of code 465, prolog size 26, PerfScore 259.00, instruction count 122, allocated bytes for code 465 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
 ; ============================================================
 
 Unwind Info:
+3 (+4.11%) : 41810.dasm - System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float]):System.Runtime.Intrinsics.Vector128`1[float] (Tier0)
@@ -29,22 +29,22 @@ G_M21070_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs +[rax]
        vmovups  xmm0, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x18]
-       vandps   xmm0, xmm0, xmmword ptr [rax]
-       mov      rax, bword ptr [rbp+0x18]
        vmovups  xmm1, xmmword ptr [rax]
+       mov      rax, bword ptr [rbp+0x18]
+       vmovups  xmm2, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x28]
-       vandnps  xmm1, xmm1, xmmword ptr [rax]
-       vorps    xmm0, xmm0, xmm1
+       vandnps  xmm2, xmm2, xmmword ptr [rax]
+       vpternlogd xmm0, xmm1, xmm2, -22
        mov      rax, bword ptr [rbp+0x10]
        vmovups  xmmword ptr [rax], xmm0
        mov      rax, bword ptr [rbp+0x10]
-						;; size=48 bbWeight=1 PerfScore 22.33
+						;; size=51 bbWeight=1 PerfScore 23.50
 G_M21070_IG04:        ; bbWeight=1, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=2 bbWeight=1 PerfScore 1.50
 
-; Total bytes of code 73, prolog size 7, PerfScore 37.38, instruction count 22, allocated bytes for code 73 (MethodHash=1f63adb1) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float]):System.Runtime.Intrinsics.Vector128`1[float] (Tier0)
+; Total bytes of code 76, prolog size 7, PerfScore 38.85, instruction count 22, allocated bytes for code 76 (MethodHash=1f63adb1) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[float]):System.Runtime.Intrinsics.Vector128`1[float] (Tier0)
 ; ============================================================
 
 Unwind Info:
+3 (+4.11%) : 32889.dasm - System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (Tier0)
@@ -29,22 +29,22 @@ G_M49358_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs +[rax]
        vmovups  xmm0, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x18]
-       vandpd   xmm0, xmm0, xmmword ptr [rax]
-       mov      rax, bword ptr [rbp+0x18]
        vmovups  xmm1, xmmword ptr [rax]
+       mov      rax, bword ptr [rbp+0x18]
+       vmovups  xmm2, xmmword ptr [rax]
        mov      rax, bword ptr [rbp+0x28]
-       vandnpd  xmm1, xmm1, xmmword ptr [rax]
-       vorpd    xmm0, xmm0, xmm1
+       vandnpd  xmm2, xmm2, xmmword ptr [rax]
+       vpternlogq xmm0, xmm1, xmm2, -22
        mov      rax, bword ptr [rbp+0x10]
        vmovups  xmmword ptr [rax], xmm0
        mov      rax, bword ptr [rbp+0x10]
-						;; size=48 bbWeight=1 PerfScore 22.33
+						;; size=51 bbWeight=1 PerfScore 23.50
 G_M49358_IG04:        ; bbWeight=1, epilog, nogc, extend
        pop      rbp
        ret      
 						;; size=2 bbWeight=1 PerfScore 1.50
 
-; Total bytes of code 73, prolog size 7, PerfScore 37.38, instruction count 22, allocated bytes for code 73 (MethodHash=9f023f31) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (Tier0)
+; Total bytes of code 76, prolog size 7, PerfScore 38.85, instruction count 22, allocated bytes for code 76 (MethodHash=9f023f31) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (Tier0)
 ; ============================================================
 
 Unwind Info:
coreclr_tests.run.windows.x64.checked.mch
-11 (-10.00%) : 222531.dasm - Tests_len34_13:Test_tst_21(System.String):bool (Tier1)
@@ -95,16 +95,14 @@ G_M60234_IG06:        ; bbWeight=0.05, gcrefRegs=0000 {}, byrefRegs=0003 {rax rc
        vmovups  zmm0, zmmword ptr [rax]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rax+0x04]
-       vmovups  zmm3, zmmword ptr [rcx+0x04]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x04]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     al
        ; byrRegs -[rax]
        movzx    rax, al
-						;; size=66 bbWeight=0.05 PerfScore 1.01
+						;; size=55 bbWeight=0.05 PerfScore 0.94
 G_M60234_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
        ; byrRegs -[rcx]
        vzeroupper 
@@ -115,7 +113,7 @@ G_M60234_IG08:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
        jmp      SHORT G_M60234_IG07
 						;; size=4 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 110, prolog size 3, PerfScore 20.46, instruction count 26, allocated bytes for code 112 (MethodHash=07d314b5) for method Tests_len34_13:Test_tst_21(System.String):bool (Tier1)
+; Total bytes of code 99, prolog size 3, PerfScore 19.29, instruction count 24, allocated bytes for code 101 (MethodHash=07d314b5) for method Tests_len34_13:Test_tst_21(System.String):bool (Tier1)
 ; ============================================================
 
 Unwind Info:
-11 (-10.00%) : 222537.dasm - Tests_len34_13:Test_tst_24(System.String):bool (Tier1)
@@ -103,16 +103,14 @@ G_M54607_IG06:        ; bbWeight=0.09, gcrefRegs=0000 {}, byrefRegs=0003 {rax rc
        vmovups  zmm0, zmmword ptr [rax]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rax+0x04]
-       vmovups  zmm3, zmmword ptr [rcx+0x04]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x04]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     al
        ; byrRegs -[rax]
        movzx    rax, al
-						;; size=66 bbWeight=0.09 PerfScore 1.90
+						;; size=55 bbWeight=0.09 PerfScore 1.77
 G_M54607_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
        ; byrRegs -[rcx]
        vzeroupper 
@@ -123,7 +121,7 @@ G_M54607_IG08:        ; bbWeight=0.05, gcVars=0000000000000000 {}, gcrefRegs=000
        jmp      SHORT G_M54607_IG07
 						;; size=4 bbWeight=0.05 PerfScore 0.11
 
-; Total bytes of code 110, prolog size 3, PerfScore 21.45, instruction count 26, allocated bytes for code 112 (MethodHash=35c22ab0) for method Tests_len34_13:Test_tst_24(System.String):bool (Tier1)
+; Total bytes of code 99, prolog size 3, PerfScore 20.22, instruction count 24, allocated bytes for code 101 (MethodHash=35c22ab0) for method Tests_len34_13:Test_tst_24(System.String):bool (Tier1)
 ; ============================================================
 
 Unwind Info:
-11 (-10.00%) : 479726.dasm - Tests_len34_13:Test_tst_21(System.String):bool (FullOpts)
@@ -95,16 +95,14 @@ G_M60234_IG06:        ; bbWeight=0.43, gcrefRegs=0000 {}, byrefRegs=0003 {rax rc
        vmovups  zmm0, zmmword ptr [rax]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rax+0x04]
-       vmovups  zmm3, zmmword ptr [rcx+0x04]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x04]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     al
        ; byrRegs -[rax]
        movzx    rax, al
-						;; size=66 bbWeight=0.43 PerfScore 9.62
+						;; size=55 bbWeight=0.43 PerfScore 8.97
 G_M60234_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
        ; byrRegs -[rcx]
        vzeroupper 
@@ -115,7 +113,7 @@ G_M60234_IG08:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
        jmp      SHORT G_M60234_IG07
 						;; size=4 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 110, prolog size 3, PerfScore 29.07, instruction count 26, allocated bytes for code 112 (MethodHash=07d314b5) for method Tests_len34_13:Test_tst_21(System.String):bool (FullOpts)
+; Total bytes of code 99, prolog size 3, PerfScore 27.32, instruction count 24, allocated bytes for code 101 (MethodHash=07d314b5) for method Tests_len34_13:Test_tst_21(System.String):bool (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+12.50%) : 114841.dasm - System.Double:CopySign(double,double):double (Instrumented Tier1)
@@ -24,17 +24,17 @@ G_M5642_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M5642_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm2, xmmword ptr [reloc @RWD00]
-       vandpd   xmm1, xmm1, xmm2
        vandnpd  xmm0, xmm2, xmm0
-       vorpd    xmm0, xmm1, xmm0
-						;; size=20 bbWeight=1 PerfScore 4.00
+       vpternlogq xmm1, xmm2, xmm0, -22
+       vmovaps  xmm0, xmm1
+						;; size=23 bbWeight=1 PerfScore 4.08
 G_M5642_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	8000000000000000h, 8000000000000000h
 
 
-; Total bytes of code 24, prolog size 3, PerfScore 8.40, instruction count 6, allocated bytes for code 24 (MethodHash=f0a0e9f5) for method System.Double:CopySign(double,double):double (Instrumented Tier1)
+; Total bytes of code 27, prolog size 3, PerfScore 8.78, instruction count 6, allocated bytes for code 27 (MethodHash=f0a0e9f5) for method System.Double:CopySign(double,double):double (Instrumented Tier1)
 ; ============================================================
 
 Unwind Info:
+3 (+12.50%) : 119618.dasm - System.Single:CopySign(float,float):float (Instrumented Tier1)
@@ -24,17 +24,17 @@ G_M12736_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M12736_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm2, xmmword ptr [reloc @RWD00]
-       vandps   xmm1, xmm1, xmm2
        vandnps  xmm0, xmm2, xmm0
-       vorps    xmm0, xmm1, xmm0
-						;; size=20 bbWeight=1 PerfScore 4.00
+       vpternlogd xmm1, xmm2, xmm0, -22
+       vmovaps  xmm0, xmm1
+						;; size=23 bbWeight=1 PerfScore 4.08
 G_M12736_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	8000000080000000h, 8000000080000000h
 
 
-; Total bytes of code 24, prolog size 3, PerfScore 8.40, instruction count 6, allocated bytes for code 24 (MethodHash=38bcce3f) for method System.Single:CopySign(float,float):float (Instrumented Tier1)
+; Total bytes of code 27, prolog size 3, PerfScore 8.78, instruction count 6, allocated bytes for code 27 (MethodHash=38bcce3f) for method System.Single:CopySign(float,float):float (Instrumented Tier1)
 ; ============================================================
 
 Unwind Info:
+3 (+12.50%) : 119622.dasm - System.Single:CopySign(float,float):float (Tier1)
@@ -24,17 +24,17 @@ G_M12736_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M12736_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm2, xmmword ptr [reloc @RWD00]
-       vandps   xmm1, xmm1, xmm2
        vandnps  xmm0, xmm2, xmm0
-       vorps    xmm0, xmm1, xmm0
-						;; size=20 bbWeight=1 PerfScore 4.00
+       vpternlogd xmm1, xmm2, xmm0, -22
+       vmovaps  xmm0, xmm1
+						;; size=23 bbWeight=1 PerfScore 4.08
 G_M12736_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	8000000080000000h, 8000000080000000h
 
 
-; Total bytes of code 24, prolog size 3, PerfScore 8.40, instruction count 6, allocated bytes for code 24 (MethodHash=38bcce3f) for method System.Single:CopySign(float,float):float (Tier1)
+; Total bytes of code 27, prolog size 3, PerfScore 8.78, instruction count 6, allocated bytes for code 27 (MethodHash=38bcce3f) for method System.Single:CopySign(float,float):float (Tier1)
 ; ============================================================
 
 Unwind Info:
libraries.pmi.windows.x64.checked.mch
-11 (-7.48%) : 251209.dasm - System.SR:get_RangeAttribute_ValidationError_MinExclusive_MaxExclusive():System.String (FullOpts)
@@ -57,10 +57,8 @@ G_M42103_IG05:        ; bbWeight=0.50, gcrefRegs=0009 {rax rbx}, byrefRegs=0000
        vmovups  zmm0, zmmword ptr [rbx]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rbx+0x30]
-       vmovups  zmm3, zmmword ptr [rcx+0x30]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x30]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     cl
@@ -68,7 +66,7 @@ G_M42103_IG05:        ; bbWeight=0.50, gcrefRegs=0009 {rax rbx}, byrefRegs=0000
        movzx    rcx, cl
        test     ecx, ecx
        jne      SHORT G_M42103_IG07
-						;; size=78 bbWeight=0.50 PerfScore 12.13
+						;; size=67 bbWeight=0.50 PerfScore 11.38
 G_M42103_IG06:        ; bbWeight=0.50, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byref, isz
        ; byrRegs -[rbx]
        test     rax, rax
@@ -87,7 +85,7 @@ G_M42103_IG08:        ; bbWeight=1, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byr
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 147, prolog size 8, PerfScore 40.82, instruction count 35, allocated bytes for code 149 (MethodHash=5d645b88) for method System.SR:get_RangeAttribute_ValidationError_MinExclusive_MaxExclusive():System.String (FullOpts)
+; Total bytes of code 136, prolog size 8, PerfScore 38.97, instruction count 33, allocated bytes for code 138 (MethodHash=5d645b88) for method System.SR:get_RangeAttribute_ValidationError_MinExclusive_MaxExclusive():System.String (FullOpts)
 ; ============================================================
 
 Unwind Info:
-11 (-7.48%) : 251175.dasm - System.SR:get_CustomValidationAttribute_Method_Must_Return_ValidationResult():System.String (FullOpts)
@@ -57,10 +57,8 @@ G_M1354_IG05:        ; bbWeight=0.50, gcrefRegs=0009 {rax rbx}, byrefRegs=0000 {
        vmovups  zmm0, zmmword ptr [rbx]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rbx+0x3A]
-       vmovups  zmm3, zmmword ptr [rcx+0x3A]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x3A]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     cl
@@ -68,7 +66,7 @@ G_M1354_IG05:        ; bbWeight=0.50, gcrefRegs=0009 {rax rbx}, byrefRegs=0000 {
        movzx    rcx, cl
        test     ecx, ecx
        jne      SHORT G_M1354_IG07
-						;; size=78 bbWeight=0.50 PerfScore 12.13
+						;; size=67 bbWeight=0.50 PerfScore 11.38
 G_M1354_IG06:        ; bbWeight=0.50, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byref, isz
        ; byrRegs -[rbx]
        test     rax, rax
@@ -87,7 +85,7 @@ G_M1354_IG08:        ; bbWeight=1, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byre
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 147, prolog size 8, PerfScore 40.82, instruction count 35, allocated bytes for code 149 (MethodHash=e04afab5) for method System.SR:get_CustomValidationAttribute_Method_Must_Return_ValidationResult():System.String (FullOpts)
+; Total bytes of code 136, prolog size 8, PerfScore 38.97, instruction count 33, allocated bytes for code 138 (MethodHash=e04afab5) for method System.SR:get_CustomValidationAttribute_Method_Must_Return_ValidationResult():System.String (FullOpts)
 ; ============================================================
 
 Unwind Info:
-11 (-7.48%) : 251211.dasm - System.SR:get_RegularExpressionAttribute_Empty_Pattern():System.String (FullOpts)
@@ -57,10 +57,8 @@ G_M54031_IG05:        ; bbWeight=0.50, gcrefRegs=0009 {rax rbx}, byrefRegs=0000
        vmovups  zmm0, zmmword ptr [rbx]
        vmovups  zmm1, zmmword ptr [rcx]
        vmovups  zmm2, zmmword ptr [rbx+0x10]
-       vmovups  zmm3, zmmword ptr [rcx+0x10]
-       vpxorq   zmm0, zmm0, zmm1
-       vpxorq   zmm1, zmm2, zmm3
-       vporq    zmm0, zmm0, zmm1
+       vpxorq   zmm2, zmm2, zmmword ptr [rcx+0x10]
+       vpternlogq zmm0, zmm1, zmm2, -66
        vptestmq k1, zmm0, zmm0
        kortestb k1, k1
        sete     cl
@@ -68,7 +66,7 @@ G_M54031_IG05:        ; bbWeight=0.50, gcrefRegs=0009 {rax rbx}, byrefRegs=0000
        movzx    rcx, cl
        test     ecx, ecx
        jne      SHORT G_M54031_IG07
-						;; size=78 bbWeight=0.50 PerfScore 12.13
+						;; size=67 bbWeight=0.50 PerfScore 11.38
 G_M54031_IG06:        ; bbWeight=0.50, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byref, isz
        ; byrRegs -[rbx]
        test     rax, rax
@@ -87,7 +85,7 @@ G_M54031_IG08:        ; bbWeight=1, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byr
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 147, prolog size 8, PerfScore 40.82, instruction count 35, allocated bytes for code 149 (MethodHash=c9c12cf0) for method System.SR:get_RegularExpressionAttribute_Empty_Pattern():System.String (FullOpts)
+; Total bytes of code 136, prolog size 8, PerfScore 38.97, instruction count 33, allocated bytes for code 138 (MethodHash=c9c12cf0) for method System.SR:get_RegularExpressionAttribute_Empty_Pattern():System.String (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+10.34%) : 34948.dasm - System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (FullOpts)
@@ -12,7 +12,7 @@
 ;  V02 arg1         [V02,T01] (  3,  6   )   byref  ->   r8         single-def
 ;  V03 arg2         [V03,T02] (  3,  6   )   byref  ->   r9         single-def
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V05 cse0         [V05,T04] (  3,  3   )  simd16  ->  mm0         "CSE - aggressive"
+;  V05 cse0         [V05,T04] (  3,  3   )  simd16  ->  mm1         "CSE - aggressive"
 ;
 ; Lcl frame size = 0
 
@@ -21,19 +21,19 @@ G_M49358_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M49358_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0306 {rcx rdx r8 r9}, byref
        ; byrRegs +[rcx rdx r8-r9]
-       vmovups  xmm0, xmmword ptr [rdx]
-       vandpd   xmm1, xmm0, xmmword ptr [r8]
-       vandnpd  xmm0, xmm0, xmmword ptr [r9]
-       vorpd    xmm0, xmm1, xmm0
+       vmovups  xmm0, xmmword ptr [r8]
+       vmovups  xmm1, xmmword ptr [rdx]
+       vandnpd  xmm2, xmm1, xmmword ptr [r9]
+       vpternlogq xmm0, xmm1, xmm2, -22
        vmovups  xmmword ptr [rcx], xmm0
        mov      rax, rcx
        ; byrRegs +[rax]
-						;; size=25 bbWeight=1 PerfScore 12.58
+						;; size=28 bbWeight=1 PerfScore 13.75
 G_M49358_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 
-; Total bytes of code 29, prolog size 3, PerfScore 17.48, instruction count 8, allocated bytes for code 29 (MethodHash=9f023f31) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (FullOpts)
+; Total bytes of code 32, prolog size 3, PerfScore 18.95, instruction count 8, allocated bytes for code 32 (MethodHash=9f023f31) for method System.Numerics.VectorMath:ConditionalSelectBitwise(System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double],System.Runtime.Intrinsics.Vector128`1[double]):System.Runtime.Intrinsics.Vector128`1[double] (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+11.11%) : 34504.dasm - System.Numerics.Vector:AndNot[ubyte](System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
@@ -19,19 +19,19 @@ G_M4792_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M4792_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0106 {rcx rdx r8}, byref
        ; byrRegs +[rcx rdx r8]
-       vpcmpeqd ymm0, ymm0, ymm0
-       vpxor    ymm0, ymm0, ymmword ptr [r8]
-       vpand    ymm0, ymm0, ymmword ptr [rdx]
+       vmovups  ymm0, ymmword ptr [r8]
+       vpcmpeqd ymm1, ymm1, ymm1
+       vpternlogd ymm0, ymm1, ymmword ptr [rdx], 40
        vmovups  ymmword ptr [rcx], ymm0
        mov      rax, rcx
        ; byrRegs +[rax]
-						;; size=20 bbWeight=1 PerfScore 8.75
+						;; size=23 bbWeight=1 PerfScore 10.75
 G_M4792_IG03:        ; bbWeight=1, epilog, nogc, extend
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 27, prolog size 3, PerfScore 14.45, instruction count 8, allocated bytes for code 27 (MethodHash=239eed47) for method System.Numerics.Vector:AndNot[ubyte](System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 30, prolog size 3, PerfScore 16.75, instruction count 8, allocated bytes for code 30 (MethodHash=239eed47) for method System.Numerics.Vector:AndNot[ubyte](System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; ============================================================
 
 Unwind Info:
+5 (+12.50%) : 34539.dasm - System.Numerics.Vector:ConditionalSelect[ubyte](System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
@@ -12,7 +12,7 @@
 ;  V02 arg1         [V02,T01] (  3,  6   )   byref  ->   r8         single-def
 ;  V03 arg2         [V03,T02] (  3,  6   )   byref  ->   r9         single-def
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V05 cse0         [V05,T04] (  3,  3   )  simd32  ->  mm0         "CSE - aggressive"
+;  V05 cse0         [V05,T04] (  3,  3   )  simd32  ->  mm1         "CSE - aggressive"
 ;
 ; Lcl frame size = 0
 
@@ -21,22 +21,22 @@ G_M19472_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M19472_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0306 {rcx rdx r8 r9}, byref
        ; byrRegs +[rcx rdx r8-r9]
-       vmovups  ymm0, ymmword ptr [rdx]
-       vpand    ymm1, ymm0, ymmword ptr [r8]
+       vmovups  ymm0, ymmword ptr [r8]
+       vmovups  ymm1, ymmword ptr [rdx]
        vpcmpeqd ymm2, ymm2, ymm2
-       vpxor    ymm0, ymm0, ymm2
-       vpand    ymm0, ymm0, ymmword ptr [r9]
-       vpor     ymm0, ymm1, ymm0
+       vmovaps  ymm3, ymm1
+       vpternlogd ymm3, ymm2, ymmword ptr [r9], 40
+       vpternlogd ymm0, ymm1, ymm3, -22
        vmovups  ymmword ptr [rcx], ymm0
        mov      rax, rcx
        ; byrRegs +[rax]
-						;; size=33 bbWeight=1 PerfScore 14.42
+						;; size=38 bbWeight=1 PerfScore 16.50
 G_M19472_IG03:        ; bbWeight=1, epilog, nogc, extend
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 40, prolog size 3, PerfScore 21.42, instruction count 11, allocated bytes for code 40 (MethodHash=daf9b3ef) for method System.Numerics.Vector:ConditionalSelect[ubyte](System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 45, prolog size 3, PerfScore 24.00, instruction count 11, allocated bytes for code 45 (MethodHash=daf9b3ef) for method System.Numerics.Vector:ConditionalSelect[ubyte](System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte],System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; ============================================================
 
 Unwind Info:
libraries_tests.pmi.windows.x64.checked.mch
-10 (-3.22%) : 230310.dasm - System.SpanTests.ReadOnlySpanTests:CtorArray2() (FullOpts)
@@ -142,15 +142,13 @@ G_M14147_IG03:        ; bbWeight=0.50, gcrefRegs=0040 {rsi}, byrefRegs=0081 {rax
        vmovups  ymm0, ymmword ptr [rdi]
        vmovups  ymm1, ymmword ptr [rax]
        vmovups  ymm2, ymmword ptr [rdi+0x08]
-       vmovups  ymm3, ymmword ptr [rax+0x08]
-       vpxor    ymm0, ymm0, ymm1
-       vpxor    ymm1, ymm2, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpxor    ymm2, ymm2, ymmword ptr [rax+0x08]
+       vpternlogq ymm0, ymm1, ymm2, -66
        vptest   ymm0, ymm0
        sete     cl
        movzx    rcx, cl
        jmp      SHORT G_M14147_IG05
-						;; size=43 bbWeight=0.50 PerfScore 14.63
+						;; size=38 bbWeight=0.50 PerfScore 13.38
 G_M14147_IG04:        ; bbWeight=0.50, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, byref
        ; byrRegs -[rax rdi]
        xor      ecx, ecx
@@ -186,10 +184,8 @@ G_M14147_IG05:        ; bbWeight=1, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, byr
        vmovups  ymm0, ymmword ptr [rsi]
        vmovups  ymm1, ymmword ptr [rax]
        vmovups  ymm2, ymmword ptr [rsi+0x08]
-       vmovups  ymm3, ymmword ptr [rax+0x08]
-       vpxor    ymm0, ymm0, ymm1
-       vpxor    ymm1, ymm2, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpxor    ymm2, ymm2, ymmword ptr [rax+0x08]
+       vpternlogq ymm0, ymm1, ymm2, -66
        vptest   ymm0, ymm0
        sete     cl
        movzx    rcx, cl
@@ -202,7 +198,7 @@ G_M14147_IG05:        ; bbWeight=1, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, byr
        ; byrRegs -[rax rsi]
        ; gcr arg pop 0
        nop      
-						;; size=132 bbWeight=1 PerfScore 51.75
+						;; size=127 bbWeight=1 PerfScore 49.25
 G_M14147_IG06:        ; bbWeight=1, epilog, nogc, extend
        vzeroupper 
        add      rsp, 40
@@ -213,7 +209,7 @@ G_M14147_IG06:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=12 bbWeight=1 PerfScore 4.25
 
-; Total bytes of code 311, prolog size 11, PerfScore 133.35, instruction count 79, allocated bytes for code 311 (MethodHash=99f1c8bc) for method System.SpanTests.ReadOnlySpanTests:CtorArray2() (FullOpts)
+; Total bytes of code 301, prolog size 11, PerfScore 128.60, instruction count 75, allocated bytes for code 301 (MethodHash=99f1c8bc) for method System.SpanTests.ReadOnlySpanTests:CtorArray2() (FullOpts)
 ; ============================================================
 
 Unwind Info:
-10 (-3.22%) : 230864.dasm - System.SpanTests.SpanTests:CtorArray2() (FullOpts)
@@ -142,15 +142,13 @@ G_M55557_IG03:        ; bbWeight=0.50, gcrefRegs=0040 {rsi}, byrefRegs=0081 {rax
        vmovups  ymm0, ymmword ptr [rdi]
        vmovups  ymm1, ymmword ptr [rax]
        vmovups  ymm2, ymmword ptr [rdi+0x08]
-       vmovups  ymm3, ymmword ptr [rax+0x08]
-       vpxor    ymm0, ymm0, ymm1
-       vpxor    ymm1, ymm2, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpxor    ymm2, ymm2, ymmword ptr [rax+0x08]
+       vpternlogq ymm0, ymm1, ymm2, -66
        vptest   ymm0, ymm0
        sete     cl
        movzx    rcx, cl
        jmp      SHORT G_M55557_IG05
-						;; size=43 bbWeight=0.50 PerfScore 14.63
+						;; size=38 bbWeight=0.50 PerfScore 13.38
 G_M55557_IG04:        ; bbWeight=0.50, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, byref
        ; byrRegs -[rax rdi]
        xor      ecx, ecx
@@ -186,10 +184,8 @@ G_M55557_IG05:        ; bbWeight=1, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, byr
        vmovups  ymm0, ymmword ptr [rsi]
        vmovups  ymm1, ymmword ptr [rax]
        vmovups  ymm2, ymmword ptr [rsi+0x08]
-       vmovups  ymm3, ymmword ptr [rax+0x08]
-       vpxor    ymm0, ymm0, ymm1
-       vpxor    ymm1, ymm2, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpxor    ymm2, ymm2, ymmword ptr [rax+0x08]
+       vpternlogq ymm0, ymm1, ymm2, -66
        vptest   ymm0, ymm0
        sete     cl
        movzx    rcx, cl
@@ -202,7 +198,7 @@ G_M55557_IG05:        ; bbWeight=1, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, byr
        ; byrRegs -[rax rsi]
        ; gcr arg pop 0
        nop      
-						;; size=132 bbWeight=1 PerfScore 51.75
+						;; size=127 bbWeight=1 PerfScore 49.25
 G_M55557_IG06:        ; bbWeight=1, epilog, nogc, extend
        vzeroupper 
        add      rsp, 40
@@ -213,7 +209,7 @@ G_M55557_IG06:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=12 bbWeight=1 PerfScore 4.25
 
-; Total bytes of code 311, prolog size 11, PerfScore 133.35, instruction count 79, allocated bytes for code 311 (MethodHash=373326fa) for method System.SpanTests.SpanTests:CtorArray2() (FullOpts)
+; Total bytes of code 301, prolog size 11, PerfScore 128.60, instruction count 75, allocated bytes for code 301 (MethodHash=373326fa) for method System.SpanTests.SpanTests:CtorArray2() (FullOpts)
 ; ============================================================
 
 Unwind Info:
-5 (-2.76%) : 230342.dasm - System.SpanTests.ReadOnlySpanTests:EndsWithMatchDifferentSpans_Long() (FullOpts)
@@ -92,10 +92,8 @@ G_M11328_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm0, xmmword ptr [rsi]
        vmovups  xmm1, xmmword ptr [rax]
        vmovups  xmm2, xmmword ptr [rsi+0x08]
-       vmovups  xmm3, xmmword ptr [rax+0x08]
-       vpxor    xmm0, xmm0, xmm1
-       vpxor    xmm1, xmm2, xmm3
-       vpor     xmm0, xmm0, xmm1
+       vpxor    xmm2, xmm2, xmmword ptr [rax+0x08]
+       vpternlogq xmm0, xmm1, xmm2, -66
        vptest   xmm0, xmm0
        sete     cl
        movzx    rcx, cl
@@ -109,7 +107,7 @@ G_M11328_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        ; byrRegs -[rax rsi]
        ; gcr arg pop 0
        nop      
-						;; size=165 bbWeight=1 PerfScore 50.25
+						;; size=160 bbWeight=1 PerfScore 48.75
 G_M11328_IG03:        ; bbWeight=1, epilog, nogc, extend
        add      rsp, 40
        pop      rbx
@@ -117,7 +115,7 @@ G_M11328_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=7 bbWeight=1 PerfScore 2.25
 
-; Total bytes of code 181, prolog size 9, PerfScore 73.85, instruction count 44, allocated bytes for code 181 (MethodHash=72b9d3bf) for method System.SpanTests.ReadOnlySpanTests:EndsWithMatchDifferentSpans_Long() (FullOpts)
+; Total bytes of code 176, prolog size 9, PerfScore 71.85, instruction count 42, allocated bytes for code 176 (MethodHash=72b9d3bf) for method System.SpanTests.ReadOnlySpanTests:EndsWithMatchDifferentSpans_Long() (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+12.50%) : 130554.dasm - System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
@@ -24,17 +24,17 @@ G_M58904_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M58904_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm2, xmmword ptr [reloc @RWD00]
-       vandpd   xmm1, xmm1, xmm2
        vandnpd  xmm0, xmm2, xmm0
-       vorpd    xmm0, xmm1, xmm0
-						;; size=20 bbWeight=1 PerfScore 4.00
+       vpternlogq xmm1, xmm2, xmm0, -22
+       vmovaps  xmm0, xmm1
+						;; size=23 bbWeight=1 PerfScore 4.08
 G_M58904_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	8000000000000000h, 8000000000000000h
 
 
-; Total bytes of code 24, prolog size 3, PerfScore 8.40, instruction count 6, allocated bytes for code 24 (MethodHash=320d19e7) for method System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
+; Total bytes of code 27, prolog size 3, PerfScore 8.78, instruction count 6, allocated bytes for code 27 (MethodHash=320d19e7) for method System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+12.50%) : 327844.dasm - System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
@@ -24,17 +24,17 @@ G_M58904_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M58904_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm2, xmmword ptr [reloc @RWD00]
-       vandpd   xmm1, xmm1, xmm2
        vandnpd  xmm0, xmm2, xmm0
-       vorpd    xmm0, xmm1, xmm0
-						;; size=20 bbWeight=1 PerfScore 4.00
+       vpternlogq xmm1, xmm2, xmm0, -22
+       vmovaps  xmm0, xmm1
+						;; size=23 bbWeight=1 PerfScore 4.08
 G_M58904_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	8000000000000000h, 8000000000000000h
 
 
-; Total bytes of code 24, prolog size 3, PerfScore 8.40, instruction count 6, allocated bytes for code 24 (MethodHash=320d19e7) for method System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
+; Total bytes of code 27, prolog size 3, PerfScore 8.78, instruction count 6, allocated bytes for code 27 (MethodHash=320d19e7) for method System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
 ; ============================================================
 
 Unwind Info:
+3 (+12.50%) : 330292.dasm - System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
@@ -24,17 +24,17 @@ G_M58904_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=3 bbWeight=1 PerfScore 1.00
 G_M58904_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovups  xmm2, xmmword ptr [reloc @RWD00]
-       vandpd   xmm1, xmm1, xmm2
        vandnpd  xmm0, xmm2, xmm0
-       vorpd    xmm0, xmm1, xmm0
-						;; size=20 bbWeight=1 PerfScore 4.00
+       vpternlogq xmm1, xmm2, xmm0, -22
+       vmovaps  xmm0, xmm1
+						;; size=23 bbWeight=1 PerfScore 4.08
 G_M58904_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	8000000000000000h, 8000000000000000h
 
 
-; Total bytes of code 24, prolog size 3, PerfScore 8.40, instruction count 6, allocated bytes for code 24 (MethodHash=320d19e7) for method System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
+; Total bytes of code 27, prolog size 3, PerfScore 8.78, instruction count 6, allocated bytes for code 27 (MethodHash=320d19e7) for method System.NumberHelper`1[double]:CopySign(double,double):double (FullOpts)
 ; ============================================================
 
 Unwind Info:
realworld.run.windows.x64.checked.mch
-5 (-3.45%) : 13178.dasm - FSharp.Compiler.Lexer+Ranges+isInt32BadMax@42:Invoke(System.String):bool:this (FullOpts)
@@ -74,14 +74,12 @@ G_M52301_IG06:        ; bbWeight=0.50, gcrefRegs=0048 {rbx rsi}, byrefRegs=0000
        vmovups  xmm0, xmmword ptr [rsi]
        vmovups  xmm1, xmmword ptr [rbx]
        vmovups  xmm2, xmmword ptr [rsi+0x04]
-       vmovups  xmm3, xmmword ptr [rbx+0x04]
-       vpxor    xmm0, xmm0, xmm1
-       vpxor    xmm1, xmm2, xmm3
-       vpor     xmm0, xmm0, xmm1
+       vpxor    xmm2, xmm2, xmmword ptr [rbx+0x04]
+       vpternlogq xmm0, xmm1, xmm2, -66
        vptest   xmm0, xmm0
        sete     al
        movzx    rax, al
-						;; size=86 bbWeight=0.50 PerfScore 17.75
+						;; size=81 bbWeight=0.50 PerfScore 17.00
 G_M52301_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
        ; byrRegs -[rbx rsi]
        add      rsp, 40
@@ -90,7 +88,7 @@ G_M52301_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ret      
 						;; size=7 bbWeight=1 PerfScore 2.25
 
-; Total bytes of code 145, prolog size 9, PerfScore 46.38, instruction count 41, allocated bytes for code 145 (MethodHash=a3e433b2) for method FSharp.Compiler.Lexer+Ranges+isInt32BadMax@42:Invoke(System.String):bool:this (FullOpts)
+; Total bytes of code 140, prolog size 9, PerfScore 45.13, instruction count 39, allocated bytes for code 140 (MethodHash=a3e433b2) for method FSharp.Compiler.Lexer+Ranges+isInt32BadMax@42:Invoke(System.String):bool:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
-5 (-3.38%) : 14927.dasm - FSharp.Compiler.Lexer+Ranges+isInt64BadMax@45:Invoke(System.String):bool:this (FullOpts)
@@ -74,14 +74,12 @@ G_M32073_IG06:        ; bbWeight=0.50, gcrefRegs=0048 {rbx rsi}, byrefRegs=0000
        vmovups  ymm0, ymmword ptr [rsi]
        vmovups  ymm1, ymmword ptr [rbx]
        vmovups  ymm2, ymmword ptr [rsi+0x06]
-       vmovups  ymm3, ymmword ptr [rbx+0x06]
-       vpxor    ymm0, ymm0, ymm1
-       vpxor    ymm1, ymm2, ymm3
-       vpor     ymm0, ymm0, ymm1
+       vpxor    ymm2, ymm2, ymmword ptr [rbx+0x06]
+       vpternlogq ymm0, ymm1, ymm2, -66
        vptest   ymm0, ymm0
        sete     al
        movzx    rax, al
-						;; size=86 bbWeight=0.50 PerfScore 20.75
+						;; size=81 bbWeight=0.50 PerfScore 19.50
 G_M32073_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
        ; byrRegs -[rbx rsi]
        vzeroupper 
@@ -91,7 +89,7 @@ G_M32073_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        ret      
 						;; size=10 bbWeight=1 PerfScore 3.25
 
-; Total bytes of code 148, prolog size 9, PerfScore 50.68, instruction count 42, allocated bytes for code 148 (MethodHash=78f682b6) for method FSharp.Compiler.Lexer+Ranges+isInt64BadMax@45:Invoke(System.String):bool:this (FullOpts)
+; Total bytes of code 143, prolog size 9, PerfScore 48.93, instruction count 40, allocated bytes for code 143 (MethodHash=78f682b6) for method FSharp.Compiler.Lexer+Ranges+isInt64BadMax@45:Invoke(System.String):bool:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
-23 (-2.06%) : 1501.dasm - BepuPhysics.CollisionDetection.CollisionTasks.BoxTriangleTester:ClipTriangleEdgeAgainstBoxFace(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,int) (FullOpts)
@@ -33,7 +33,7 @@
 ;* V22 loc8         [V22    ] (  0,  0   )  struct (96) zero-ref    ld-addr-op <BepuUtilities.Vector3Wide>
 ;* V23 loc9         [V23    ] (  0,  0   )  struct (96) zero-ref    ld-addr-op <BepuUtilities.Vector3Wide>
 ;* V24 loc10        [V24,T48] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op <System.Numerics.Vector`1[int]>
-;  V25 loc11        [V25,T22] (  3,  6   )  simd32  ->  mm19         ld-addr-op <System.Numerics.Vector`1[int]>
+;  V25 loc11        [V25,T22] (  3,  6   )  simd32  ->  mm6         ld-addr-op <System.Numerics.Vector`1[int]>
 ;  V26 loc12        [V26,T23] (  3,  6   )  simd32  ->  mm0         ld-addr-op <System.Numerics.Vector`1[int]>
 ;  V27 loc13        [V27,T47] (  2,  2   )  simd32  ->  mm1         ld-addr-op <System.Numerics.Vector`1[int]>
 ;  V28 OutArgs      [V28    ] (  1,  1   )  struct (48) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
@@ -75,7 +75,7 @@
 ;  V64 tmp36        [V64,T31] (  3,  3   )  simd32  ->  mm3         "field V34.X (fldOffset=0x0)" P-INDEP
 ;  V65 tmp37        [V65,T32] (  3,  3   )  simd32  ->  mm5         "field V34.Y (fldOffset=0x20)" P-INDEP
 ;  V66 tmp38        [V66,T33] (  3,  3   )  simd32  ->  mm17         "field V34.Z (fldOffset=0x40)" P-INDEP
-;  V67 tmp39        [V67,T41] (  2,  3   )  simd32  ->  mm20         "field V35.X (fldOffset=0x0)" P-INDEP
+;  V67 tmp39        [V67,T41] (  2,  3   )  simd32  ->  mm19         "field V35.X (fldOffset=0x0)" P-INDEP
 ;  V68 tmp40        [V68,T42] (  2,  3   )  simd32  ->  mm3         "field V35.Y (fldOffset=0x20)" P-INDEP
 ;* V69 tmp41        [V69    ] (  0,  0   )  simd32  ->  zero-ref    "field V35.Depth (fldOffset=0x40)" P-INDEP
 ;  V70 tmp42        [V70,T43] (  2,  3   )  simd32  ->  mm5         "field V35.FeatureId (fldOffset=0x60)" P-INDEP
@@ -204,21 +204,19 @@ G_M44304_IG03:        ; bbWeight=1, extend
        vmovups  ymm19, ymmword ptr [rbx]
        vmovups  ymm7, ymmword ptr [reloc @RWD64]
        vpcmpgtd ymm7, ymm7, ymmword ptr [rsi]
-       vpandd   ymm19, ymm19, ymm7
-       vpandd   ymm19, ymm19, ymm6
+       vpternlogd ymm19, ymm7, ymm6, -128
        vcmpps   ymm6, ymm0, ymm2, 1
        vxorps   ymm7, ymm7, ymm7
        vcmpps   ymm7, ymm0, ymm7, 14
-       vpandd   ymm20, ymm6, ymm7
-       vpandd   ymm19, ymm19, ymm20
+       vpternlogd ymm6, ymm7, ymm19, -128
        vsubps   ymm3, ymm3, ymmword ptr [r14]
        vsubps   ymm5, ymm5, ymmword ptr [r14+0x20]
        vsubps   ymm17, ymm17, ymmword ptr [r14+0x40]
-       vmulps   ymm20, ymm3, ymmword ptr [r15]
-       vmulps   ymm21, ymm5, ymmword ptr [r15+0x20]
-       vaddps   ymm20, ymm20, ymm21
-       vmulps   ymm21, ymm17, ymmword ptr [r15+0x40]
-       vaddps   ymm20, ymm20, ymm21
+       vmulps   ymm19, ymm3, ymmword ptr [r15]
+       vmulps   ymm20, ymm5, ymmword ptr [r15+0x20]
+       vaddps   ymm19, ymm19, ymm20
+       vmulps   ymm20, ymm17, ymmword ptr [r15+0x40]
+       vaddps   ymm19, ymm19, ymm20
        vmulps   ymm3, ymm3, ymmword ptr [r13]
        vmulps   ymm5, ymm5, ymmword ptr [r13+0x20]
        vaddps   ymm3, ymm3, ymm5
@@ -228,17 +226,17 @@ G_M44304_IG03:        ; bbWeight=1, extend
        xor      eax, eax
        test     ebp, ebp
        jle      G_M44304_IG07
-						;; size=182 bbWeight=1 PerfScore 105.17
+						;; size=172 bbWeight=1 PerfScore 104.83
 G_M44304_IG04:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r12 r13 r14 r15}, byref, isz
        ; byrRegs -[rdi]
        cmp      eax, 8
        jae      G_M44304_IG13
        mov      ecx, eax
-       vmovups  ymmword ptr [rsp+0x30], ymm19
+       vmovups  ymmword ptr [rsp+0x30], ymm6
        mov      edx, dword ptr [rsp+4*rcx+0x30]
        test     edx, edx
        jge      SHORT G_M44304_IG06
-						;; size=30 bbWeight=4 PerfScore 23.00
+						;; size=25 bbWeight=4 PerfScore 23.00
 G_M44304_IG05:        ; bbWeight=2, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r12 r13 r14 r15}, byref
        movsxd   rdx, dword ptr [rsi+4*rcx]
        shl      rdx, 7
@@ -248,7 +246,7 @@ G_M44304_IG05:        ; bbWeight=2, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r
        ; byrRegs +[rdx]
        movsxd   r8, eax
        lea      rdx, bword ptr [rdx+4*r8]
-       vmovups  ymmword ptr [rsp+0x30], ymm20
+       vmovups  ymmword ptr [rsp+0x30], ymm19
        vmovss   xmm17, dword ptr [rsp+4*rcx+0x30]
        vmovss   dword ptr [rdx], xmm17
        vmovups  ymmword ptr [rsp+0x30], ymm3
@@ -259,28 +257,26 @@ G_M44304_IG05:        ; bbWeight=2, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r
        mov      dword ptr [rdx+0x60], ecx
        mov      bword ptr [rsp+0x268], rdi
 						;; size=93 bbWeight=2 PerfScore 51.00
-G_M44304_IG06:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r12 r13 r14 r15}, byref
+G_M44304_IG06:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r12 r13 r14 r15}, byref, isz
        ; byrRegs -[rdx rdi]
        inc      eax
        cmp      eax, ebp
-       jl       G_M44304_IG04
-						;; size=10 bbWeight=4 PerfScore 6.00
+       jl       SHORT G_M44304_IG04
+						;; size=6 bbWeight=4 PerfScore 6.00
 G_M44304_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r12 r13 r14 r15}, byref, isz
        vmovups  ymm3, ymmword ptr [rsi]
        vpaddd   ymm3, ymm3, dword ptr [reloc @RWD96] {1to8}
-       vpternlogd ymm19, ymm3, ymmword ptr [rsi], -54
-       vmovups  ymmword ptr [rsi], ymm19
+       vpternlogd ymm6, ymm3, ymmword ptr [rsi], -54
+       vmovups  ymmword ptr [rsi], ymm6
        vcmpps   ymm0, ymm1, ymm0, 13
        vmovups  ymm3, ymmword ptr [rbx]
        vmovups  ymm5, ymmword ptr [reloc @RWD64]
        vpcmpgtd ymm5, ymm5, ymmword ptr [rsi]
-       vpand    ymm3, ymm3, ymm5
-       vpand    ymm0, ymm3, ymm0
-       vcmpps   ymm2, ymm1, ymm2, 2
-       vxorps   ymm3, ymm3, ymm3
-       vcmpps   ymm1, ymm1, ymm3, 13
-       vpand    ymm1, ymm2, ymm1
-       vpand    ymm0, ymm0, ymm1
+       vpternlogd ymm3, ymm5, ymm0, -128
+       vcmpps   ymm0, ymm1, ymm2, 2
+       vxorps   ymm2, ymm2, ymm2
+       vcmpps   ymm1, ymm1, ymm2, 13
+       vpternlogd ymm0, ymm1, ymm3, -128
        vmovups  ymm1, ymmword ptr [r12]
        vpaddd   ymm1, ymm1, dword ptr [reloc @RWD100] {1to8}
        vsubps   ymm2, ymm18, ymmword ptr [r14]
@@ -299,7 +295,7 @@ G_M44304_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=F048 {rbx rsi r
        xor      eax, eax
        test     ebp, ebp
        jle      SHORT G_M44304_IG11
-						;; size=176 bbWeight=1 PerfScore 109.17
+						;; size=172 bbWeight=1 PerfScore 108.83
 G_M44304_IG08:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0040 {rsi}, byref, isz
        ; byrRegs -[rbx r12-r15]
        cmp      eax, 8
@@ -370,7 +366,7 @@ RWD96  	dd	00000001h
 RWD100 	dd	00000008h
 
 
-; Total bytes of code 1115, prolog size 96, PerfScore 707.58, instruction count 213, allocated bytes for code 1115 (MethodHash=2cba52ef) for method BepuPhysics.CollisionDetection.CollisionTasks.BoxTriangleTester:ClipTriangleEdgeAgainstBoxFace(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,int) (FullOpts)
+; Total bytes of code 1092, prolog size 96, PerfScore 704.62, instruction count 209, allocated bytes for code 1092 (MethodHash=2cba52ef) for method BepuPhysics.CollisionDetection.CollisionTasks.BoxTriangleTester:ClipTriangleEdgeAgainstBoxFace(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,int) (FullOpts)
 ; ============================================================
 
 Unwind Info:
+9 (+1.97%) : 19339.dasm - System.Text.Ascii:IsValidCore[ushort](byref,int):bool (FullOpts)
@@ -150,8 +150,8 @@ G_M42463_IG10:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        cmp      edx, 64
        jle      G_M42463_IG14
        vmovups  ymm0, ymmword ptr [rbx]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x40]
+       vmovups  ymm1, ymmword ptr [rbx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rbx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rbx+0x60]
        vptest   ymm0, ymmword ptr [reloc @RWD32]
        jne      G_M42463_IG17
@@ -170,7 +170,7 @@ G_M42463_IG10:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        call     [<unknown method>]
        ; gcrRegs -[rcx rdx]
        ; gcr arg pop 0
-						;; size=87 bbWeight=0.50 PerfScore 14.88
+						;; size=90 bbWeight=0.50 PerfScore 15.88
 G_M42463_IG11:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref, isz
        add      rsi, -64
        cmp      rbp, rsi
@@ -180,23 +180,23 @@ G_M42463_IG12:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi},
        lea      rcx, bword ptr [rbx+2*rbp]
        ; byrRegs +[rcx]
        vmovups  ymm0, ymmword ptr [rcx]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rcx+0x40]
+       vmovups  ymm1, ymmword ptr [rcx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rcx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rcx+0x60]
        vptest   ymm0, ymmword ptr [reloc @RWD32]
        jne      SHORT G_M42463_IG17
        add      rbp, 64
        cmp      rbp, rsi
        jb       SHORT G_M42463_IG12
-						;; size=43 bbWeight=4 PerfScore 96.00
+						;; size=46 bbWeight=4 PerfScore 104.00
 G_M42463_IG13:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref
        ; byrRegs -[rcx]
        lea      rbx, bword ptr [rbx+2*rsi]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M42463_IG14:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rdi}, byref
        vmovups  ymm0, ymmword ptr [rbx]
-       vpor     ymm0, ymm0, ymmword ptr [rbx+0x20]
-       vpor     ymm0, ymm0, ymmword ptr [rdi-0x40]
+       vmovups  ymm1, ymmword ptr [rbx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rdi-0x40], -2
        vpor     ymm6, ymm0, ymmword ptr [rdi-0x20]
        mov      ecx, 1
        vextractf128 xmm7, ymm6, 1
@@ -207,7 +207,7 @@ G_M42463_IG14:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0088 {rbx rd
        vptest   ymm6, ymmword ptr [reloc @RWD32]
        sete     al
        movzx    rax, al
-						;; size=57 bbWeight=0.50 PerfScore 14.75
+						;; size=60 bbWeight=0.50 PerfScore 15.75
 G_M42463_IG15:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        movzx    rax, al
 						;; size=3 bbWeight=0.50 PerfScore 0.12
@@ -241,7 +241,7 @@ RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 456, prolog size 26, PerfScore 248.10, instruction count 122, allocated bytes for code 456 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (FullOpts)
+; Total bytes of code 465, prolog size 26, PerfScore 259.00, instruction count 122, allocated bytes for code 465 (MethodHash=42925a20) for method System.Text.Ascii:IsValidCore[ushort](byref,int):bool (FullOpts)
 ; ============================================================
 
 Unwind Info:
+47 (+2.68%) : 1504.dasm - BepuPhysics.CollisionDetection.CollisionTasks.TrianglePairTester:ClipBEdgeAgainstABounds(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,System.Numerics.Vector`1[int],byref,byref,int) (FullOpts)
@@ -38,15 +38,15 @@
 ;  V27 arg27        [V27,T17] (  2,  4   )   byref  ->  [rbp+0xE8]  single-def
 ;  V28 arg28        [V28,T04] ( 10, 12   )   byref  ->  rbx         single-def
 ;  V29 arg29        [V29,T07] (  4, 10   )     int  ->  rsi         single-def
-;  V30 loc0         [V30    ] (  3,  3   )  simd32  ->  [rbp-0x90]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[int]>
-;  V31 loc1         [V31    ] (  5,  5   )  simd32  ->  [rbp-0xB0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
-;  V32 loc2         [V32    ] (  3,  3   )  simd32  ->  [rbp-0xD0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
-;  V33 loc3         [V33    ] (  3,  3   )  simd32  ->  [rbp-0xF0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[int]>
-;  V34 loc4         [V34    ] (  5,  5   )  simd32  ->  [rbp-0x110]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
-;  V35 loc5         [V35    ] (  3,  3   )  simd32  ->  [rbp-0x130]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
-;  V36 loc6         [V36    ] (  3,  3   )  simd32  ->  [rbp-0x150]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[int]>
-;  V37 loc7         [V37    ] (  3,  3   )  simd32  ->  [rbp-0x170]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
-;  V38 loc8         [V38    ] (  3,  3   )  simd32  ->  [rbp-0x190]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
+;  V30 loc0         [V30    ] (  3,  3   )  simd32  ->  [rbp-0xB0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[int]>
+;  V31 loc1         [V31    ] (  5,  5   )  simd32  ->  [rbp-0xD0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
+;  V32 loc2         [V32    ] (  3,  3   )  simd32  ->  [rbp-0xF0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
+;  V33 loc3         [V33    ] (  3,  3   )  simd32  ->  [rbp-0x110]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[int]>
+;  V34 loc4         [V34    ] (  5,  5   )  simd32  ->  [rbp-0x130]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
+;  V35 loc5         [V35    ] (  3,  3   )  simd32  ->  [rbp-0x150]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
+;  V36 loc6         [V36    ] (  3,  3   )  simd32  ->  [rbp-0x170]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[int]>
+;  V37 loc7         [V37    ] (  3,  3   )  simd32  ->  [rbp-0x190]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
+;  V38 loc8         [V38    ] (  3,  3   )  simd32  ->  [rbp-0x1B0]  do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[float]>
 ;* V39 loc9         [V39,T78] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op <System.Numerics.Vector`1[float]>
 ;* V40 loc10        [V40,T79] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op <System.Numerics.Vector`1[float]>
 ;  V41 loc11        [V41,T64] (  2,  2   )  simd32  ->  mm0         <System.Numerics.Vector`1[float]>
@@ -116,9 +116,9 @@
 ;  V105 cse11       [V105,T63] (  3,  3   )  simd32  ->  mm17         "CSE - conservative"
 ;  V106 cse12       [V106,T45] (  4,  4   )  simd32  ->  mm20         "CSE - aggressive"
 ;  V107 cse13       [V107,T46] (  4,  4   )  simd32  ->  mm3         "CSE - aggressive"
-;  V108 rat0        [V108    ] (  1,  1   )  simd32  ->  [rbp-0x1B0]  "SIMDInitTempVar"
+;  V108 rat0        [V108    ] (  1,  1   )  simd32  ->  [rbp-0x1D0]  "SIMDInitTempVar"
 ;
-; Lcl frame size = 456
+; Lcl frame size = 488
 
 G_M22781_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     rbp
@@ -129,11 +129,14 @@ G_M22781_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        push     rdi
        push     rsi
        push     rbx
-       sub      rsp, 456
+       sub      rsp, 488
        vzeroupper 
-       vmovaps  xmmword ptr [rsp+0x1B0], xmm6
-       vmovaps  xmmword ptr [rsp+0x1A0], xmm7
-       lea      rbp, [rsp+0x200]
+       vmovaps  xmmword ptr [rsp+0x1D0], xmm6
+       vmovaps  xmmword ptr [rsp+0x1C0], xmm7
+       vmovaps  xmmword ptr [rsp+0x1B0], xmm8
+       vmovaps  xmmword ptr [rsp+0x1A0], xmm9
+       vmovaps  xmmword ptr [rsp+0x190], xmm10
+       lea      rbp, [rsp+0x220]
        mov      bword ptr [rbp+0x20], r8
        ; GC ptr vars +{V02}
        mov      rax, rcx
@@ -151,7 +154,7 @@ G_M22781_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        mov      rbx, bword ptr [rbp+0xF0]
        ; byrRegs +[rbx]
        mov      esi, dword ptr [rbp+0xF8]
-						;; size=99 bbWeight=1 PerfScore 21.25
+						;; size=126 bbWeight=1 PerfScore 27.25
 G_M22781_IG02:        ; bbWeight=1, gcVars=00000000000000000000000000000800 {V02}, gcrefRegs=0000 {}, byrefRegs=728D {rax rdx rbx rdi r9 r12 r13 r14}, gcvars, byref
        ; byrRegs +[r9]
        mov      r15, bword ptr [rbp+0x40]
@@ -164,12 +167,12 @@ G_M22781_IG02:        ; bbWeight=1, gcVars=00000000000000000000000000000800 {V02
        mov      r15, bword ptr [rbp+0x70]
        mov      bword ptr [rsp+0x30], r15
        ; byr arg write
-       lea      r15, [rbp-0x90]
+       lea      r15, [rbp-0xB0]
        ; byrRegs -[r15]
        mov      qword ptr [rsp+0x38], r15
-       lea      r15, [rbp-0xB0]
-       mov      qword ptr [rsp+0x40], r15
        lea      r15, [rbp-0xD0]
+       mov      qword ptr [rsp+0x40], r15
+       lea      r15, [rbp-0xF0]
        mov      qword ptr [rsp+0x48], r15
        mov      bword ptr [rbp+0x88], rdx
        mov      rcx, rdx
@@ -190,11 +193,11 @@ G_M22781_IG02:        ; bbWeight=1, gcVars=00000000000000000000000000000800 {V02
        mov      r15, bword ptr [rbp+0x78]
        mov      bword ptr [rsp+0x30], r15
        ; byr arg write
-       lea      rcx, [rbp-0xF0]
-       mov      qword ptr [rsp+0x38], rcx
        lea      rcx, [rbp-0x110]
-       mov      qword ptr [rsp+0x40], rcx
+       mov      qword ptr [rsp+0x38], rcx
        lea      rcx, [rbp-0x130]
+       mov      qword ptr [rsp+0x40], rcx
+       lea      rcx, [rbp-0x150]
        mov      qword ptr [rsp+0x48], rcx
        mov      rcx, bword ptr [rbp+0x88]
        ; byrRegs +[rcx]
@@ -217,11 +220,11 @@ G_M22781_IG02:        ; bbWeight=1, gcVars=00000000000000000000000000000800 {V02
        mov      r14, bword ptr [rbp+0x80]
        mov      bword ptr [rsp+0x30], r14
        ; byr arg write
-       lea      rcx, [rbp-0x150]
-       mov      qword ptr [rsp+0x38], rcx
        lea      rcx, [rbp-0x170]
-       mov      qword ptr [rsp+0x40], rcx
+       mov      qword ptr [rsp+0x38], rcx
        lea      rcx, [rbp-0x190]
+       mov      qword ptr [rsp+0x40], rcx
+       lea      rcx, [rbp-0x1B0]
        mov      qword ptr [rsp+0x48], rcx
        mov      rcx, bword ptr [rbp+0x88]
        ; byrRegs +[rcx]
@@ -235,46 +238,45 @@ G_M22781_IG02:        ; bbWeight=1, gcVars=00000000000000000000000000000800 {V02
        call     [<unknown method>]
        ; byrRegs -[rcx rdx r8-r9 r14]
        ; gcr arg pop 0
-       vmovups  ymm0, ymmword ptr [rbp-0x90]
-       vmovups  ymm1, ymmword ptr [rbp-0xB0]
+       vmovups  ymm0, ymmword ptr [rbp-0xB0]
+       vmovups  ymm1, ymmword ptr [rbp-0xD0]
        vpternlogd ymm0, ymm1, ymmword ptr [reloc @RWD00], -54
-       vmovups  ymm1, ymmword ptr [rbp-0xF0]
+       vmovups  ymm1, ymmword ptr [rbp-0x110]
 						;; size=308 bbWeight=1 PerfScore 64.25
 G_M22781_IG03:        ; bbWeight=1, extend
-       vmovups  ymm2, ymmword ptr [rbp-0x110]
+       vmovups  ymm2, ymmword ptr [rbp-0x130]
        vpternlogd ymm1, ymm2, ymmword ptr [reloc @RWD00], -54
-       vmovups  ymm2, ymmword ptr [rbp-0x150]
-       vmovups  ymm3, ymmword ptr [rbp-0x170]
+       vmovups  ymm2, ymmword ptr [rbp-0x170]
+       vmovups  ymm3, ymmword ptr [rbp-0x190]
        vpternlogd ymm2, ymm3, ymmword ptr [reloc @RWD00], -54
-       vmovups  ymm3, ymmword ptr [rbp-0x90]
-       vmovups  ymm4, ymmword ptr [rbp-0xB0]
+       vmovups  ymm3, ymmword ptr [rbp-0xB0]
+       vmovups  ymm4, ymmword ptr [rbp-0xD0]
        vpternlogd ymm3, ymm4, ymmword ptr [reloc @RWD32], -54
-       vmovups  ymm4, ymmword ptr [rbp-0xF0]
-       vmovups  ymm5, ymmword ptr [rbp-0x110]
+       vmovups  ymm4, ymmword ptr [rbp-0x110]
+       vmovups  ymm5, ymmword ptr [rbp-0x130]
        vpternlogd ymm4, ymm5, ymmword ptr [reloc @RWD32], -54
-       vmovups  ymm5, ymmword ptr [rbp-0x150]
-       vmovups  ymm16, ymmword ptr [rbp-0x170]
+       vmovups  ymm5, ymmword ptr [rbp-0x170]
+       vmovups  ymm16, ymmword ptr [rbp-0x190]
        vpternlogd ymm5, ymm16, ymmword ptr [reloc @RWD32], -54
        vminps   ymm1, ymm1, ymm2
        vminps   ymm0, ymm0, ymm1
        vmaxps   ymm1, ymm4, ymm5
        vmaxps   ymm1, ymm3, ymm1
-       vcmpps   ymm2, ymm0, ymmword ptr [rbp-0xB0], 0
-       vcmpps   ymm3, ymm0, ymmword ptr [rbp-0x110], 0
-       vcmpps   ymm4, ymm1, ymmword ptr [rbp-0xB0], 0
-       vcmpps   ymm5, ymm1, ymmword ptr [rbp-0x110], 0
-       vmovups  ymm16, ymmword ptr [rbp-0xD0]
-       vmovups  ymm17, ymmword ptr [rbp-0x130]
-       vpternlogd ymm3, ymm17, ymmword ptr [rbp-0x190], -54
+       vcmpps   ymm2, ymm0, ymmword ptr [rbp-0xD0], 0
+       vcmpps   ymm3, ymm0, ymmword ptr [rbp-0x130], 0
+       vcmpps   ymm4, ymm1, ymmword ptr [rbp-0xD0], 0
+       vcmpps   ymm5, ymm1, ymmword ptr [rbp-0x130], 0
+       vmovups  ymm16, ymmword ptr [rbp-0xF0]
+       vmovups  ymm17, ymmword ptr [rbp-0x150]
+       vpternlogd ymm3, ymm17, ymmword ptr [rbp-0x1B0], -54
        vpternlogd ymm2, ymm16, ymm3, -54
-       vmovups  ymm3, ymmword ptr [rbp-0xD0]
-       vmovups  ymm16, ymmword ptr [rbp-0x130]
-       vpternlogd ymm5, ymm16, ymmword ptr [rbp-0x190], -54
+       vmovups  ymm3, ymmword ptr [rbp-0xF0]
+       vmovups  ymm16, ymmword ptr [rbp-0x150]
+       vpternlogd ymm5, ymm16, ymmword ptr [rbp-0x1B0], -54
        vpternlogd ymm4, ymm3, ymm5, -54
        vcmpps   ymm3, ymm0, ymmword ptr [reloc @RWD32], 0
        vcmpps   ymm5, ymm1, ymmword ptr [reloc @RWD00], 0
-       vpor     ymm3, ymm3, ymm5
-       vpandn   ymm3, ymm3, ymmword ptr [rdi]
+       vpternlogd ymm3, ymm5, ymmword ptr [rdi], 87
        vmovups  ymmword ptr [rdi], ymm3
        vxorps   ymm3, ymm3, ymm3
        vmaxps   ymm0, ymm3, ymm0
@@ -302,9 +304,9 @@ G_M22781_IG03:        ; bbWeight=1, extend
        vmovups  ymm21, ymmword ptr [r15+0x40]
        vmulps   ymm19, ymm21, ymm19
        vaddps   ymm17, ymm17, ymm19
-						;; size=432 bbWeight=1 PerfScore 190.67
-G_M22781_IG04:        ; bbWeight=1, extend
        vmulps   ymm19, ymm0, ymm17
+						;; size=437 bbWeight=1 PerfScore 193.33
+G_M22781_IG04:        ; bbWeight=1, extend
        vaddps   ymm19, ymm5, ymm19
        vmulps   ymm17, ymm1, ymm17
        vaddps   ymm5, ymm5, ymm17
@@ -339,20 +341,19 @@ G_M22781_IG04:        ; bbWeight=1, extend
        vmulps   ymm17, ymm17, ymm21
        vaddps   ymm17, ymm20, ymm17
        vsubps   ymm2, ymm19, ymm2
+       vmovups  ymm19, ymmword ptr [rdi]
        mov      r14, bword ptr [rbp+0xD8]
        vcmpps   ymm6, ymm2, ymmword ptr [r14], 13
-       vpandd   ymm19, ymm6, ymmword ptr [rdi]
-       vsubps   ymm6, ymm1, ymm0
-       vcmpps   ymm6, ymm6, ymmword ptr [reloc @RWD96], 13
-       vmovups  ymm7, ymmword ptr [reloc @RWD128]
-       vpcmpgtd ymm7, ymm7, ymmword ptr [rbx]
-       vpandd   ymm20, ymm7, ymm6
-       vcmpps   ymm6, ymm0, ymm3, 1
-       vxorps   ymm7, ymm7, ymm7
-       vcmpps   ymm7, ymm0, ymm7, 14
-       vpandd   ymm21, ymm6, ymm7
-       vpandd   ymm20, ymm20, ymm21
-       vpandd   ymm19, ymm19, ymm20
+       vsubps   ymm7, ymm1, ymm0
+       vcmpps   ymm7, ymm7, ymmword ptr [reloc @RWD96], 13
+       vmovups  ymm8, ymmword ptr [reloc @RWD128]
+       vpcmpgtd ymm8, ymm8, ymmword ptr [rbx]
+       vcmpps   ymm9, ymm0, ymm3, 1
+       vxorps   ymm10, ymm10, ymm10
+       vcmpps   ymm10, ymm0, ymm10, 14
+       vpandd   ymm20, ymm9, ymm10
+       vpternlogd ymm8, ymm7, ymm20, -128
+       vpternlogd ymm19, ymm6, ymm8, -128
        vmulps   ymm20, ymm0, ymm18
        vaddps   ymm20, ymm20, ymm23
        vmulps   ymm21, ymm0, ymm17
@@ -362,14 +363,14 @@ G_M22781_IG04:        ; bbWeight=1, extend
        xor      eax, eax
        test     esi, esi
        jle      G_M22781_IG08
-						;; size=352 bbWeight=1 PerfScore 179.17
+						;; size=344 bbWeight=1 PerfScore 178.17
 G_M22781_IG05:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=C088 {rbx rdi r14 r15}, byref
        ; byrRegs -[r12-r13]
        cmp      eax, 8
        jae      G_M22781_IG14
        mov      ecx, eax
-       vmovups  ymmword ptr [rbp-0x1B0], ymm19
-       mov      edx, dword ptr [rbp+4*rcx-0x1B0]
+       vmovups  ymmword ptr [rbp-0x1D0], ymm19
+       mov      edx, dword ptr [rbp+4*rcx-0x1D0]
        test     edx, edx
        jge      G_M22781_IG07
 						;; size=36 bbWeight=4 PerfScore 23.00
@@ -382,17 +383,17 @@ G_M22781_IG06:        ; bbWeight=2, gcrefRegs=0000 {}, byrefRegs=C088 {rbx rdi r
        ; byrRegs +[rdx]
        movsxd   r8, eax
        lea      rdx, bword ptr [rdx+4*r8]
-       vmovups  ymmword ptr [rbp-0x1B0], ymm20
-       vmovss   xmm24, dword ptr [rbp+4*rcx-0x1B0]
+       vmovups  ymmword ptr [rbp-0x1D0], ymm20
...
+14 (+2.82%) : 19110.dasm - Microsoft.ML.Internal.CpuMath.AvxIntrinsics:Scale(float,System.Span`1[float]) (FullOpts)
@@ -131,18 +131,19 @@ G_M32706_IG07:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        vmulps   ymm2, ymm0, ymm1
        lea      r9d, [8*rax]
        movsxd   r9, r9d
-       vandps   ymm2, ymm2, ymmword ptr [rcx+4*r9]
+       vmovups  ymm3, ymmword ptr [rcx+4*r9]
        mov      r9d, eax
        neg      r9d
        lea      r9d, [8*r9+0x40]
        movsxd   r9, r9d
        vandps   ymm1, ymm1, ymmword ptr [rdx+4*r9]
-       vorps    ymm1, ymm2, ymm1
+       vpternlogd ymm2, ymm3, ymm1, -22
+       vmovaps  ymm1, ymm2
        vmovups  ymmword ptr [r8], ymm1
        movsxd   r9, eax
        lea      r8, [r8+4*r9]
        sub      r10d, eax
-						;; size=80 bbWeight=0.50 PerfScore 10.79
+						;; size=87 bbWeight=0.50 PerfScore 12.00
 G_M32706_IG08:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        cmp      r10d, 7
        jle      SHORT G_M32706_IG11
@@ -187,15 +188,16 @@ G_M32706_IG13:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byr
        vmulps   ymm0, ymm0, ymm1
        lea      eax, [8*r9]
        cdqe     
-       vandps   ymm0, ymm0, ymmword ptr [rdx+4*rax]
+       vmovups  ymm2, ymmword ptr [rdx+4*rax]
        mov      edx, r9d
        neg      edx
        lea      edx, [8*rdx+0x40]
        movsxd   rdx, edx
        vandps   ymm1, ymm1, ymmword ptr [rcx+4*rdx]
-       vorps    ymm1, ymm0, ymm1
+       vpternlogd ymm0, ymm2, ymm1, -22
+       vmovaps  ymm1, ymm0
        vmovups  ymmword ptr [r8], ymm1
-						;; size=70 bbWeight=0.50 PerfScore 10.04
+						;; size=77 bbWeight=0.50 PerfScore 11.25
 G_M32706_IG14:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        xor      ecx, ecx
        mov      bword ptr [rsp+0x28], rcx
@@ -229,7 +231,7 @@ G_M32706_IG19:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        int3     
 						;; size=6 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 497, prolog size 28, PerfScore 196.62, instruction count 127, allocated bytes for code 497 (MethodHash=efee803d) for method Microsoft.ML.Internal.CpuMath.AvxIntrinsics:Scale(float,System.Span`1[float]) (FullOpts)
+; Total bytes of code 511, prolog size 28, PerfScore 200.43, instruction count 129, allocated bytes for code 511 (MethodHash=efee803d) for method Microsoft.ML.Internal.CpuMath.AvxIntrinsics:Scale(float,System.Span`1[float]) (FullOpts)
 ; ============================================================
 
 Unwind Info:
Details

Improvements/regressions per collection

Collection Contexts with diffs Improvements Regressions Same size Improvements (bytes) Regressions (bytes)
aspnet.run.windows.x64.checked.mch 312 81 230 1 -555 +1,784
benchmarks.run.windows.x64.checked.mch 25 14 8 3 -60 +57
benchmarks.run_pgo.windows.x64.checked.mch 37 15 17 5 -96 +151
benchmarks.run_tiered.windows.x64.checked.mch 30 15 14 1 -51 +63
coreclr_tests.run.windows.x64.checked.mch 100 80 20 0 -475 +233
libraries.crossgen2.windows.x64.checked.mch 0 0 0 0 -0 +0
libraries.pmi.windows.x64.checked.mch 90 72 18 0 -769 +147
libraries_tests.pmi.windows.x64.checked.mch 39 36 3 0 -413 +9
realworld.run.windows.x64.checked.mch 41 28 13 0 -1,453 +236
674 341 323 10 -3,872 +2,680

Context information

Collection Diffed contexts MinOpts FullOpts Missed, base Missed, diff
aspnet.run.windows.x64.checked.mch 96,372 33,539 62,833 6 (0.01%) 6 (0.01%)
benchmarks.run.windows.x64.checked.mch 27,865 1,362 26,503 0 (0.00%) 0 (0.00%)
benchmarks.run_pgo.windows.x64.checked.mch 94,253 48,494 45,759 0 (0.00%) 0 (0.00%)
benchmarks.run_tiered.windows.x64.checked.mch 55,018 36,527 18,491 0 (0.00%) 0 (0.00%)
coreclr_tests.run.windows.x64.checked.mch 559,250 339,170 220,080 0 (0.00%) 0 (0.00%)
libraries.crossgen2.windows.x64.checked.mch 272,446 15 272,431 0 (0.00%) 0 (0.00%)
libraries.pmi.windows.x64.checked.mch 303,987 5,705 298,282 0 (0.00%) 0 (0.00%)
libraries_tests.pmi.windows.x64.checked.mch 360,912 7,969 352,943 0 (0.00%) 0 (0.00%)
realworld.run.windows.x64.checked.mch 39,101 4,488 34,613 0 (0.00%) 0 (0.00%)
1,809,204 477,269 1,331,935 6 (0.00%) 6 (0.00%)

jit-analyze output

Selected micros

System.Text.Perf_Ascii.IsValid*: +9 (+1.97%) : 17569.dasm - System.Text.Ascii:IsValidCore[ushort](byref,int):bool (Tier1)
System.Text.Perf_Ascii.ToUpper_Chars: -4 (-0.79%) : 41558.dasm - System.Text.Ascii:ChangeCase[ushort,ushort,System.Text.Ascii+ToUpperConversion](ulong,ulong,ulong):ulong (Tier1)
System.Text.Perf_Ascii.FromUtf16: +10 (+5.32%) : 40640.dasm - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (Tier1-OSR)

System.Memory.Span.IndexOf* : -22 (-2.55%) : 81880.dasm - System.SpanHelpers:NonPackedIndexOfAnyValueType[short,System.SpanHelpers+DontNegate1[short]](byref,short,short,short,int):int (Tier1)

@ghost ghost added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI community-contribution Indicates that the PR has been added by a community member labels Aug 28, 2023
@ghost
Copy link

ghost commented Aug 28, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Description

This PR is trying to solve #84534.

We implemented the optimization by tracking the use-def chain during lowering, and trying to fold 2 adjacent binary bitwise operations on the same chain into a single ternary node when AVX512 is available.

In our internal testing, we observed some code size reduction in the SuperPMI asmdiffs and no throughput (TP) regression. In addition, for the tests where codegen differences were detected, we ran the related microbenchmarks; the results are attached below.

Author: Ruihan-Yin
Assignees: -
Labels:

area-CodeGen-coreclr, community-contribution

Milestone: -

@Ruihan-Yin
Copy link
Member Author

Ruihan-Yin commented Aug 28, 2023

Micros performance collected on IceLake server platform

Method Job Toolchain Size Mean Error StdDev Median Min Max Ratio Allocated Alloc Ratio
IsValid_Bytes Job-SGGNTM \base\Core_Root\corerun.exe 128 2.165 ns 0.0263 ns 0.0233 ns 2.167 ns 2.130 ns 2.217 ns 1.00 - NA
IsValid_Bytes Job-GBHENQ \diff\Core_Root\corerun.exe 128 1.858 ns 0.0195 ns 0.0182 ns 1.848 ns 1.841 ns 1.892 ns 0.86 - NA
IsValid_Chars Job-SGGNTM \base\Core_Root\corerun.exe 128 2.669 ns 0.0159 ns 0.0124 ns 2.668 ns 2.649 ns 2.691 ns 1.00 - NA
IsValid_Chars Job-GBHENQ \diff\Core_Root\corerun.exe 128 2.734 ns 0.0183 ns 0.0153 ns 2.737 ns 2.709 ns 2.758 ns 1.02 - NA
ToUpper_Chars Job-SGGNTM \base\Core_Root\corerun.exe 128 26.098 ns 0.2764 ns 0.2450 ns 25.994 ns 25.873 ns 26.607 ns 1.00 - NA
ToUpper_Chars Job-GBHENQ \diff\Core_Root\corerun.exe 128 23.453 ns 0.0814 ns 0.0680 ns 23.425 ns 23.373 ns 23.577 ns 0.90 - NA
FromUtf16 Job-SGGNTM \base\Core_Root\corerun.exe 128 19.440 ns 0.1853 ns 0.1643 ns 19.396 ns 19.273 ns 19.802 ns 1.00 - NA
FromUtf16 Job-GBHENQ \diff\Core_Root\corerun.exe 128 19.592 ns 0.0903 ns 0.0754 ns 19.576 ns 19.513 ns 19.794 ns 1.01 - NA
IsValid_Bytes Job-SGGNTM \base\Core_Root\corerun.exe 512 4.571 ns 0.0255 ns 0.0213 ns 4.572 ns 4.538 ns 4.603 ns 1.00 - NA
IsValid_Bytes Job-GBHENQ \diff\Core_Root\corerun.exe 512 4.360 ns 0.0057 ns 0.0053 ns 4.360 ns 4.351 ns 4.369 ns 0.95 - NA
IsValid_Chars Job-SGGNTM \base\Core_Root\corerun.exe 512 9.794 ns 0.0135 ns 0.0113 ns 9.794 ns 9.771 ns 9.811 ns 1.00 - NA
IsValid_Chars Job-GBHENQ \diff\Core_Root\corerun.exe 512 9.518 ns 0.0226 ns 0.0200 ns 9.522 ns 9.465 ns 9.538 ns 0.97 - NA
ToUpper_Chars Job-SGGNTM \base\Core_Root\corerun.exe 512 83.291 ns 0.0746 ns 0.0698 ns 83.301 ns 83.154 ns 83.386 ns 1.00 - NA
ToUpper_Chars Job-GBHENQ \diff\Core_Root\corerun.exe 512 79.522 ns 0.0958 ns 0.0896 ns 79.528 ns 79.376 ns 79.664 ns 0.95 - NA
FromUtf16 Job-SGGNTM \base\Core_Root\corerun.exe 512 26.948 ns 0.0522 ns 0.0463 ns 26.935 ns 26.893 ns 27.039 ns 1.00 - NA
FromUtf16 Job-GBHENQ \diff\Core_Root\corerun.exe 512 27.531 ns 0.1927 ns 0.1802 ns 27.519 ns 27.260 ns 27.854 ns 1.02 - NA
IsValid_Bytes Job-SGGNTM \base\Core_Root\corerun.exe 10000 79.193 ns 0.5639 ns 0.4998 ns 78.993 ns 78.653 ns 80.167 ns 1.00 - NA
IsValid_Bytes Job-GBHENQ \diff\Core_Root\corerun.exe 10000 76.127 ns 0.4324 ns 0.4044 ns 76.139 ns 75.494 ns 76.990 ns 0.96 - NA
IsValid_Chars Job-SGGNTM \base\Core_Root\corerun.exe 10000 156.119 ns 1.1006 ns 1.0295 ns 156.024 ns 154.753 ns 158.070 ns 1.00 - NA
IsValid_Chars Job-GBHENQ \diff\Core_Root\corerun.exe 10000 141.222 ns 1.1802 ns 1.1040 ns 140.717 ns 140.115 ns 143.425 ns 0.90 - NA
ToUpper_Chars Job-SGGNTM \base\Core_Root\corerun.exe 10000 1,396.589 ns 9.5042 ns 8.8903 ns 1,395.314 ns 1,386.125 ns 1,412.198 ns 1.00 - NA
ToUpper_Chars Job-GBHENQ \diff\Core_Root\corerun.exe 10000 1,259.837 ns 10.0903 ns 9.4385 ns 1,255.931 ns 1,250.916 ns 1,277.422 ns 0.90 - NA
FromUtf16 Job-SGGNTM \base\Core_Root\corerun.exe 10000 301.512 ns 1.8862 ns 1.7644 ns 300.743 ns 299.807 ns 305.576 ns 1.00 - NA
FromUtf16 Job-GBHENQ \diff\Core_Root\corerun.exe 10000 307.764 ns 1.5660 ns 1.4648 ns 308.032 ns 304.686 ns 309.845 ns 1.02 - NA
Method Job Toolchain Size Mean Error StdDev Median Min Max Ratio RatioSD Allocated Alloc Ratio
IndexOfValue Job-VFJRIM \base\Core_Root\corerun.exe 512 12.09 ns 0.316 ns 0.364 ns 12.01 ns 11.65 ns 12.70 ns 1.00 0.00 - NA
IndexOfValue Job-HTWODF \diff\Core_Root\corerun.exe 512 12.01 ns 0.257 ns 0.252 ns 11.99 ns 11.65 ns 12.53 ns 0.99 0.04 - NA
IndexOfAnyTwoValues Job-VFJRIM \base\Core_Root\corerun.exe 512 101.25 ns 0.347 ns 0.325 ns 101.07 ns 100.85 ns 101.72 ns 1.00 0.00 - NA
IndexOfAnyTwoValues Job-HTWODF \diff\Core_Root\corerun.exe 512 101.58 ns 0.711 ns 0.665 ns 101.34 ns 100.83 ns 102.92 ns 1.00 0.01 - NA
IndexOfAnyThreeValues Job-VFJRIM \base\Core_Root\corerun.exe 512 140.43 ns 1.121 ns 1.048 ns 140.10 ns 139.47 ns 142.95 ns 1.00 0.00 - NA
IndexOfAnyThreeValues Job-HTWODF \diff\Core_Root\corerun.exe 512 140.21 ns 0.632 ns 0.561 ns 140.39 ns 139.17 ns 140.78 ns 1.00 0.01 - NA
IndexOfAnyFourValues Job-VFJRIM \base\Core_Root\corerun.exe 512 477.66 ns 2.594 ns 2.300 ns 477.21 ns 474.89 ns 483.08 ns 1.00 0.00 - NA
IndexOfAnyFourValues Job-HTWODF \diff\Core_Root\corerun.exe 512 477.70 ns 3.102 ns 2.902 ns 476.25 ns 474.63 ns 482.16 ns 1.00 0.01 - NA
IndexOfAnyFiveValues Job-VFJRIM \base\Core_Root\corerun.exe 512 546.38 ns 3.349 ns 3.133 ns 545.77 ns 542.51 ns 551.66 ns 1.00 0.00 - NA
IndexOfAnyFiveValues Job-HTWODF \diff\Core_Root\corerun.exe 512 546.28 ns 4.452 ns 3.946 ns 544.19 ns 543.15 ns 556.37 ns 1.00 0.01 - NA
LastIndexOfValue Job-VFJRIM \base\Core_Root\corerun.exe 512 13.16 ns 0.091 ns 0.080 ns 13.15 ns 13.05 ns 13.27 ns 1.00 0.00 - NA
LastIndexOfValue Job-HTWODF \diff\Core_Root\corerun.exe 512 13.33 ns 0.136 ns 0.127 ns 13.29 ns 13.18 ns 13.57 ns 1.01 0.01 - NA
LastIndexOfAnyValues Job-VFJRIM \base\Core_Root\corerun.exe 512 102.91 ns 1.080 ns 0.958 ns 102.48 ns 101.93 ns 104.50 ns 1.00 0.00 - NA
LastIndexOfAnyValues Job-HTWODF \diff\Core_Root\corerun.exe 512 102.51 ns 0.603 ns 0.564 ns 102.55 ns 101.61 ns 103.34 ns 1.00 0.01 - NA
IndexOfValue Job-VFJRIM \base\Core_Root\corerun.exe 10000 188.90 ns 1.862 ns 1.650 ns 188.26 ns 186.92 ns 192.47 ns 1.00 0.00 - NA
IndexOfValue Job-HTWODF \diff\Core_Root\corerun.exe 10000 188.52 ns 1.244 ns 1.103 ns 188.36 ns 187.31 ns 190.57 ns 1.00 0.01 - NA
IndexOfAnyTwoValues Job-VFJRIM \base\Core_Root\corerun.exe 10000 1,801.95 ns 12.788 ns 11.962 ns 1,799.55 ns 1,781.30 ns 1,822.80 ns 1.00 0.00 - NA
IndexOfAnyTwoValues Job-HTWODF \diff\Core_Root\corerun.exe 10000 1,793.51 ns 9.667 ns 9.043 ns 1,793.51 ns 1,779.92 ns 1,805.97 ns 1.00 0.01 - NA
IndexOfAnyThreeValues Job-VFJRIM \base\Core_Root\corerun.exe 10000 2,454.67 ns 28.544 ns 26.700 ns 2,442.09 ns 2,420.35 ns 2,515.88 ns 1.00 0.00 - NA
IndexOfAnyThreeValues Job-HTWODF \diff\Core_Root\corerun.exe 10000 2,429.35 ns 11.429 ns 10.132 ns 2,428.73 ns 2,418.03 ns 2,451.63 ns 0.99 0.01 - NA
IndexOfAnyFourValues Job-VFJRIM \base\Core_Root\corerun.exe 10000 9,202.94 ns 133.764 ns 125.123 ns 9,214.04 ns 9,012.08 ns 9,470.96 ns 1.00 0.00 - NA
IndexOfAnyFourValues Job-HTWODF \diff\Core_Root\corerun.exe 10000 9,097.02 ns 65.918 ns 58.435 ns 9,103.08 ns 8,996.87 ns 9,213.29 ns 0.99 0.01 - NA
IndexOfAnyFiveValues Job-VFJRIM \base\Core_Root\corerun.exe 10000 10,500.31 ns 119.416 ns 105.859 ns 10,494.02 ns 10,384.50 ns 10,728.66 ns 1.00 0.00 - NA
IndexOfAnyFiveValues Job-HTWODF \diff\Core_Root\corerun.exe 10000 10,385.62 ns 44.575 ns 37.222 ns 10,374.98 ns 10,350.72 ns 10,482.70 ns 0.99 0.01 - NA
LastIndexOfValue Job-VFJRIM \base\Core_Root\corerun.exe 10000 185.26 ns 0.426 ns 0.378 ns 185.18 ns 184.80 ns 186.01 ns 1.00 0.00 - NA
LastIndexOfValue Job-HTWODF \diff\Core_Root\corerun.exe 10000 186.02 ns 0.802 ns 0.670 ns 185.96 ns 185.06 ns 187.10 ns 1.00 0.00 - NA
LastIndexOfAnyValues Job-VFJRIM \base\Core_Root\corerun.exe 10000 1,689.71 ns 9.743 ns 8.637 ns 1,691.89 ns 1,676.66 ns 1,703.77 ns 1.00 0.00 - NA
LastIndexOfAnyValues Job-HTWODF \diff\Core_Root\corerun.exe 10000 1,689.96 ns 10.835 ns 9.605 ns 1,684.88 ns 1,680.95 ns 1,709.34 ns 1.00 0.01 - NA

@Ruihan-Yin Ruihan-Yin marked this pull request as ready for review August 28, 2023 21:59
Comment on lines 442 to 447
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure whether the ANDNOT logic should be included. The ANDNOT case is not used by this PR, but it seems better to include it given the semantics this function's name implies.

@EgorBo EgorBo requested a review from tannergooding August 29, 2023 23:25
@JulieLeeMSFT JulieLeeMSFT added this to the 9.0.0 milestone Sep 1, 2023
@Ruihan-Yin
Copy link
Member Author

@tannergooding @dotnet/avx512-contrib Could we kindly get more reviews on this PR? Thanks.

Comment on lines 25295 to 25302
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be fine to be general and not XARCH only.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for reviewing and for sharing the long-term plan!

Just to confirm my understanding: we will stick with the current implementation and continue the review process, right?

//
bool GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic()
{
return HWOperGet() == GT_AND || HWOperGet() == GT_OR || HWOperGet() == GT_XOR || HWOperGet() == GT_AND_NOT;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It'd be nice to explicitly cache HWOperGet(). The compiler "should" be doing it for us, but it's not a trivial call and it's better to be safe.

case NI_AVX2_Or:
case NI_AVX512F_Or:
case NI_AVX512DQ_Or:
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to either include the { return GT_OR; } as part of the ifdef -or- more ideally we add the relevant:

#elif defined(TARGET_ARM64)
    case NI_AdvSimd_Or:
#endif

case NI_AVX2_AndNot:
case NI_AVX512F_AndNot:
case NI_AVX512DQ_AndNot:
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. We need to either include the braces as part of the ifdef or cover NI_AdvSimd_AndNot:

Comment on lines 26368 to 26369
NamedIntrinsic firstLogic = GetHWIntrinsicId();
NamedIntrinsic secondLogic = second->GetHWIntrinsicId();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These look like dead locals now.

Comment on lines 26364 to 26366
const uint8_t A = 240; // 0xF0
const uint8_t B = 204; // 0xCC
const uint8_t C = 170; // 0xAA
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just make these directly hex:

    const uint8_t A = 0xF0;
    const uint8_t B = 0xCC;
    const uint8_t C = 0xAA;

bool OperIsEmbBroadcastCompatible() const;
bool OperIsBroadcastScalar() const;
bool OperIsCreateScalarUnsafe() const;
bool OperIsBitwiseHWIntrinsic();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be bool OperIsBitwiseHWIntrinsic() const; to indicate it doesn't mutate the instance?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, you are right. Should we also consider making HWOperGet() const? That would make sense to me based on its semantics, and it would save some const_cast statements.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it should be annotated as such since it doesn't mutate (and never should)

bool OperRequiresCallFlag() const;

unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3);
uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, shouldn't this be uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const;?

@BruceForstall
Copy link
Contributor

@Ruihan-Yin Is this ready for another review? @tannergooding @EgorBo

@Ruihan-Yin
Copy link
Member Author

@Ruihan-Yin Is this ready for another review? @tannergooding @EgorBo

Yes, this PR hasn't changed after I resolved the comments from Tanner last time.

case NI_AVX_Xor:
case NI_AVX2_Xor:
case NI_AVX512F_Xor:
case NI_AVX512DQ_Xor:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any concerns around this becoming out of sync with OperIsBitwiseHWIntrinsic?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this list, we have the intrinsics that can be folded into ternary logic, while ANDNOT related intrinsics cannot be folded currently. On the other hand, OperIsBitwiseHWIntrinsic should be consistent with its name from my view, so I included ANDNOT there.

I could leave a comment there to call out this difference, if that is a better way to make things clearer.

Copy link
Member

@EgorBo EgorBo left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

@Ruihan-Yin
Copy link
Member Author

Thanks everyone for the review and help!

@ghost ghost locked as resolved and limited conversation to collaborators Nov 23, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI community-contribution Indicates that the PR has been added by a community member

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants