Skip to content

Commit 0862418

Browse files
authored
Merge pull request #36 from simdutf/perf_boost
Perf boost
2 parents bd23f79 + 3d73d88 commit 0862418

File tree

6 files changed

+194
-12
lines changed

6 files changed

+194
-12
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ fully reproducible.
3232

3333
| processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
3434
|:----------------|:------------------------|:-------------------|:-------------------|
35-
| Apple M2 processor (ARM, 3.5 GHz) | 6.5 | 3.8 | 1.7 x |
36-
| AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
37-
| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
38-
| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
35+
| Apple M2 processor (ARM, 3.5 GHz) | 10 | 3.8 | 2.6 x |
36+
| AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
37+
| Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
38+
| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.9 | 3.0 | 2.3 x |
3939

4040
## Results (SimdBase64 vs. string .NET functions)
4141

benchmark/Benchmark.cs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,9 @@
22
using BenchmarkDotNet.Running;
33
using BenchmarkDotNet.Configs;
44
using BenchmarkDotNet.Reports;
5-
using BenchmarkDotNet.Filters;
65
using BenchmarkDotNet.Jobs;
76
using System.Text;
8-
using System.Runtime.InteropServices;
97
using BenchmarkDotNet.Columns;
10-
using System.Runtime.Intrinsics;
11-
using System.Runtime.Intrinsics.X86;
128

139
namespace SimdUnicodeBenchmarks
1410
{
@@ -464,7 +460,7 @@ public unsafe void RunOurDecodingBenchmarkWithAllocUTF16(string[] data, int[] le
464460

465461
if (dataoutput.Length != lengths[i])
466462
{
467-
Console.WriteLine($"Error: {dataoutput.Length } != {lengths[i]}");
463+
Console.WriteLine($"Error: {dataoutput.Length} != {lengths[i]}");
468464
#pragma warning disable CA2201
469465
throw new Exception("Error");
470466
}

src/Base64.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ public static int MaximalBinaryLengthFromBase64<T>(ReadOnlySpan<T> input)
1414
{
1515
return Scalar.Base64.MaximalBinaryLengthFromBase64Scalar(input);
1616
}
17-
public static byte[] FromBase64String(string s) {
17+
public static byte[] FromBase64String(string s)
18+
{
1819
ReadOnlySpan<char> base64 = s.AsSpan();
1920
byte[] newBytes = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64<char>(base64)];
2021
int bytesConsumed = 0;
@@ -35,7 +36,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<byte> source,
3536
//if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
3637
//{
3738
//}
38-
if (Avx2.IsSupported)
39+
if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
3940
{
4041
return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
4142
}
@@ -60,7 +61,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<char> source,
6061
//{
6162
// return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
6263
//}
63-
if (Avx2.IsSupported)
64+
if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
6465
{
6566
return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
6667
}

src/Base64ARM.cs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,65 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
219219
[MethodImpl(MethodImplOptions.AggressiveInlining)]
220220
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
221221
{
222+
// if mask is a power of 2, we can use a simpler version
223+
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
224+
{
225+
int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
226+
int pos = pos64 & 0xf;
227+
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
228+
Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
229+
switch (pos64 >> 4)
230+
{
231+
case 3:
232+
{
233+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
234+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
235+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
236+
Vector128.Store(compressed, output + 0 * 16);
237+
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
238+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
239+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
240+
}
241+
break;
242+
243+
case 2:
244+
{
245+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
246+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
247+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
248+
Vector128.Store(b.chunk0, output + 0 * 16);
249+
Vector128.Store(compressed, output + 1 * 16);
250+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
251+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
252+
}
253+
break;
254+
255+
case 1:
256+
{
257+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
258+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
259+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
260+
Vector128.Store(b.chunk0, output + 0 * 16);
261+
Vector128.Store(b.chunk1, output + 1 * 16);
262+
Vector128.Store(compressed, output + 2 * 16);
263+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
264+
}
265+
break;
266+
267+
case 0:
268+
{
269+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
270+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
271+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
272+
Vector128.Store(b.chunk0, output + 0 * 16);
273+
Vector128.Store(b.chunk1, output + 1 * 16);
274+
Vector128.Store(b.chunk2, output + 2 * 16);
275+
Vector128.Store(compressed, output + 3 * 16);
276+
}
277+
break;
278+
}
279+
return 63;
280+
}
222281
ulong nmask = ~mask;
223282
Compress(b.chunk0, (ushort)mask, output, tablePtr);
224283
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);

src/Base64AVX2UTF8.cs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
167167
[MethodImpl(MethodImplOptions.AggressiveInlining)]
168168
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
169169
{
170+
// if mask is a power of 2, we can use a simpler version
171+
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
172+
{
173+
ulong pos64 = Bmi1.X64.TrailingZeroCount(mask);
174+
ulong pos = pos64 & 0xf;
175+
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
176+
Vector128<byte> v0 = Vector128.Create((byte)(pos-1));
177+
switch (pos64 >> 4)
178+
{
179+
case 0:
180+
{
181+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
182+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
183+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
184+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
185+
Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
186+
Vector128.Store(compressed, output + 0 * 16);
187+
Vector128.Store(chunk1, output + 1 * 16 - 1);
188+
Vector256.Store(b.chunk1, output + 2 * 16 - 1);
189+
}
190+
break;
191+
192+
case 1:
193+
{
194+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
195+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
196+
Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
197+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
198+
Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
199+
Vector128.Store(chunk0, output + 0 * 16);
200+
Vector128.Store(compressed, output + 1 * 16);
201+
Vector256.Store(b.chunk1, output + 2 * 16 - 1);
202+
}
203+
break;
204+
205+
case 2:
206+
{
207+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
208+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
209+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
210+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
211+
Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
212+
Vector256.Store(b.chunk0, output + 0 * 16);
213+
Vector128.Store(compressed, output + 2 * 16);
214+
Vector128.Store(chunk1, output + 3 * 16 - 1);
215+
}
216+
break;
217+
218+
case 3:
219+
{
220+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
221+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
222+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
223+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
224+
Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
225+
Vector256.Store(b.chunk0, output + 0 * 16);
226+
Vector128.Store(chunk0, output + 2 * 16);
227+
Vector128.Store(compressed, output + 3 * 16);
228+
}
229+
break;
230+
}
231+
return 63;
232+
}
170233
ulong nmask = ~mask;
171234
Compress(b.chunk0, (UInt32)mask, output, tablePtr);
172235
Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);

src/Base64SSEUTF8.cs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Numerics;
23
using System.Runtime.Intrinsics;
34
using System.Runtime.Intrinsics.X86;
45
using System.Runtime.CompilerServices;
@@ -131,6 +132,68 @@ private static ushort ToBase64Mask(bool base64Url, ref Vector128<byte> src, ref
131132
[MethodImpl(MethodImplOptions.AggressiveInlining)]
132133
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
133134
{
135+
// if mask is a power of 2, we can use a simpler version
136+
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
137+
{
138+
int pos64 = BitOperations.TrailingZeroCount(mask);
139+
int pos = pos64 & 0xf;
140+
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
141+
Vector128<byte> v0 = Vector128.Create((byte)(pos-1));
142+
switch (pos64 >> 4)
143+
{
144+
case 0:
145+
{
146+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
147+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
148+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk0, sh);
149+
Vector128.Store(compressed, output + 0 * 16);
150+
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
151+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
152+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
153+
154+
}
155+
break;
156+
157+
case 1:
158+
{
159+
Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
160+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
161+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk1, sh);
162+
Vector128.Store(b.chunk0, output + 0 * 16);
163+
Vector128.Store(compressed, output + 1 * 16);
164+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
165+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
166+
167+
}
168+
break;
169+
170+
case 2:
171+
{
172+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
173+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
174+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk2, sh);
175+
Vector128.Store(b.chunk0, output + 0 * 16);
176+
Vector128.Store(b.chunk1, output + 1 * 16);
177+
Vector128.Store(compressed, output + 2 * 16);
178+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
179+
180+
}
181+
break;
182+
183+
case 3:
184+
{
185+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
186+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
187+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk3, sh);
188+
Vector128.Store(b.chunk0, output + 0 * 16);
189+
Vector128.Store(b.chunk1, output + 1 * 16);
190+
Vector128.Store(b.chunk2, output + 2 * 16);
191+
Vector128.Store(compressed, output + 3 * 16);
192+
}
193+
break;
194+
}
195+
return 63;
196+
}
134197
ulong nmask = ~mask;
135198
Compress(b.chunk0, (ushort)mask, output, tablePtr);
136199
Compress(b.chunk1, (ushort)(mask >> 16), output + Popcnt.X64.PopCount(nmask & 0xFFFF), tablePtr);

0 commit comments

Comments
 (0)