-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Description
EDITED BY @stephentoub on 10/26/2022 to add revised API shape:
namespace System
{
public static class IndexOfAnyValues
{
public static IndexOfAnyValues<T> Create<T>(ReadOnlySpan<T> values) where T : IEquatable<T>?;
// in the future, could add an `IndexOfAnyValues<char> Create(ReadOnlySpan<string> values) overload
}
public class IndexOfAnyValues<T> where T : IEquatable<T>?
{
// no public/protected members, including ctors
}
public static partial class MemoryExtensions
{
// These are identical to the existing overloads, just with IndexOfAnyValues<T> instead of ReadOnlySpan<T> for the values parameter type
public static int IndexOfAny<T>(this ReadOnlySpan<T> span, IndexOfAnyValues<T> values) where T : IEquatable<T>?;
public static int IndexOfAnyExcept<T>(this ReadOnlySpan<T> span, IndexOfAnyValues<T> values) where T : IEquatable<T>?;
public static int LastIndexOfAny<T>(this ReadOnlySpan<T> span, IndexOfAnyValues<T> values) where T : IEquatable<T>?;
public static int LastIndexOfAnyExcept<T>(this ReadOnlySpan<T> span, IndexOfAnyValues<T> values) where T : IEquatable<T>?;
// ... repeat for Span overloads
}
}
API Usage
internal static class HttpRuleParser
{
private static readonly IndexOfAnyValues<char> s_tokenChars =
IndexOfAnyValues.Create("!#$%&'*+-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz|~");
public static bool IsToken(ReadOnlySpan<char> input) =>
input.IndexOfAnyExcept(s_tokenChars) < 0;
}
EDITED BY @MihaZupan on 10/24/2022 to add proposed API shape:
API Proposal
namespace System
{
public static partial class MemoryExtensions
{
public static int IndexOfAny(this ReadOnlySpan<byte> span, IndexOfAnyAsciiValues asciiValues);
public static int IndexOfAnyExcept(this ReadOnlySpan<byte> span, IndexOfAnyAsciiValues asciiValues);
public static int LastIndexOfAny(this ReadOnlySpan<byte> span, IndexOfAnyAsciiValues asciiValues);
public static int LastIndexOfAnyExcept(this ReadOnlySpan<byte> span, IndexOfAnyAsciiValues asciiValues);
public static int IndexOfAny(this ReadOnlySpan<char> span, IndexOfAnyAsciiValues asciiValues);
public static int IndexOfAnyExcept(this ReadOnlySpan<char> span, IndexOfAnyAsciiValues asciiValues);
public static int LastIndexOfAny(this ReadOnlySpan<char> span, IndexOfAnyAsciiValues asciiValues);
public static int LastIndexOfAnyExcept(this ReadOnlySpan<char> span, IndexOfAnyAsciiValues asciiValues);
// ... repeat for Span overloads
}
}
namespace System.Buffers.Text
{
public sealed class IndexOfAnyAsciiValues
{
public IndexOfAnyAsciiValues(ReadOnlySpan<char> asciiValues);
// No other public members
}
}
API Usage
internal static class HttpRuleParser
{
private static readonly IndexOfAnyAsciiValues s_tokenChars =
new("!#$%&'*+-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz|~");
public static bool IsToken(ReadOnlySpan<byte> input) =>
input.IndexOfAnyExcept(s_tokenChars) < 0;
}
Alternative Designs
Static members on Ascii
instead of extension methods in MemoryExtensions
.
Based on feedback from API review on 2022-10-25.
namespace System.Buffers.Text;
public static class Ascii
{
public static int IndexOfAny(ReadOnlySpan<byte> span, IndexOfAnyAsciiValues values);
public static int IndexOfAny(ReadOnlySpan<char> span, IndexOfAnyAsciiValues values);
// ...
}
public sealed class IndexOfAnyAsciiValues
{
public IndexOfAnyAsciiValues(ReadOnlySpan<char> asciiValues);
}
Original post:
Earlier in the .NET 7 release, we combined the best of string.IndexOfAny and MemoryExtensions.IndexOfAny to provide a single implementation used by both. As part of this, whereas <= 5 characters are vectorized, > 5 characters use a "probabilistic map":
runtime/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs
Lines 1140 to 1167 in d58efd1
private static unsafe int IndexOfAnyProbabilistic(ref char searchSpace, int searchSpaceLength, ref char values, int valuesLength) | |
{ | |
Debug.Assert(searchSpaceLength >= 0); | |
Debug.Assert(valuesLength >= 0); | |
ReadOnlySpan<char> valuesSpan = new ReadOnlySpan<char>(ref values, valuesLength); | |
ProbabilisticMap map = default; | |
uint* charMap = (uint*)↦ | |
ProbabilisticMap.Initialize(charMap, valuesSpan); | |
ref char cur = ref searchSpace; | |
while (searchSpaceLength != 0) | |
{ | |
int ch = cur; | |
if (ProbabilisticMap.IsCharBitSet(charMap, (byte)ch) && | |
ProbabilisticMap.IsCharBitSet(charMap, (byte)(ch >> 8)) && | |
ProbabilisticMap.SpanContains(valuesSpan, (char)ch)) | |
{ | |
return (int)((nint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); | |
} | |
searchSpaceLength--; | |
cur = ref Unsafe.Add(ref cur, 1); | |
} | |
return -1; | |
} |
essentially a Bloom filter to quickly screen out characters that are definitely not in the set so that the more expensive check need only be done if it's likely the character is in the set. While faster than comparing every input character against every character in the set, we still have to walk the input character by character.
@MihaZupan's xoofx/markdig@da756f4 is an implementation of http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm, and highlights that it's possible to vectorize this operation instead. It would require startup overhead of determining whether all of the set characters are ASCII and generating a bitmap from them, but we're already doing such a loop in order to build the probabilistic map:
runtime/src/libraries/System.Private.CoreLib/src/System/ProbabilisticMap.cs
Lines 31 to 67 in d58efd1
public static unsafe void Initialize(uint* charMap, ReadOnlySpan<char> values) | |
{ | |
#if DEBUG | |
for (int i = 0; i < Size; i++) | |
{ | |
Debug.Assert(charMap[i] == 0, "Expected charMap to be zero-initialized."); | |
} | |
#endif | |
bool hasAscii = false; | |
uint* charMapLocal = charMap; // https://github.com/dotnet/runtime/issues/9040 | |
for (int i = 0; i < values.Length; ++i) | |
{ | |
int c = values[i]; | |
// Map low bit | |
SetCharBit(charMapLocal, (byte)c); | |
// Map high bit | |
c >>= 8; | |
if (c == 0) | |
{ | |
hasAscii = true; | |
} | |
else | |
{ | |
SetCharBit(charMapLocal, (byte)c); | |
} | |
} | |
if (hasAscii) | |
{ | |
// Common to search for ASCII symbols. Just set the high value once. | |
charMapLocal[0] |= 1u; | |
} | |
} |
We could simply augment that to also build up the ASCII bitmap, and if after initialization determine that every character was indeed ASCII, take the vectorized path of using that ASCII bitmap, otherwise fall back to using the probabilistic map as we do today.