@@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
167167 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
168168 private unsafe static ulong CompressBlock ( ref Block64 b , ulong mask , byte * output , byte * tablePtr )
169169 {
// Fast path: the mask has exactly one bit set, i.e. exactly one byte must be
// removed from the 64-byte block. We locate its 16-byte lane, shuffle that
// lane left by one byte from the removed position onward, and store the
// surrounding lanes with one-byte-overlapping writes to close the gap.
//
// NOTE: mask == 0 must be excluded explicitly: (0 & (0 - 1)) == 0 would
// otherwise take this path, match no switch case, store nothing, and wrongly
// report 63 bytes written; the general path below handles mask == 0 correctly.
if (mask != 0 && (mask & (mask - 1)) == 0)
{
    ulong pos64 = Bmi1.X64.TrailingZeroCount(mask); // index of the removed byte (0..63)
    ulong pos = pos64 & 0xf;                        // its offset within the 16-byte lane
    Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // Broadcast (pos - 1); the compare is signed, so for pos == 0 this is -1
    // and every index shifts. Indices > pos - 1 compare to 0xFF (= -1), and
    // subtracting that adds one, yielding the shuffle [0..pos-1, pos+1..15, 16]
    // that squeezes out the removed byte (index 16 fetches a garbage byte that
    // is either overwritten by the next store or lies past the 63 valid bytes).
    // The shuffle control depends only on pos, so compute it once for all cases.
    Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
    Vector128<byte> sh = Sse2.Subtract(v1, v2);
    switch (pos64 >> 4)
    {
        case 0: // removed byte in bytes 0..15 (low lane of chunk0)
        {
            Vector128<byte> lane = Avx2.ExtractVector128(b.chunk0, 0);
            Vector128<byte> rest = Avx2.ExtractVector128(b.chunk0, 1);
            Vector128<byte> compressed = Ssse3.Shuffle(lane, sh);
            Vector128.Store(compressed, output + 0 * 16);
            Vector128.Store(rest, output + 1 * 16 - 1);
            Vector256.Store(b.chunk1, output + 2 * 16 - 1);
            break;
        }
        case 1: // removed byte in bytes 16..31 (high lane of chunk0)
        {
            Vector128<byte> head = Avx2.ExtractVector128(b.chunk0, 0);
            Vector128<byte> lane = Avx2.ExtractVector128(b.chunk0, 1);
            Vector128<byte> compressed = Ssse3.Shuffle(lane, sh);
            Vector128.Store(head, output + 0 * 16);
            Vector128.Store(compressed, output + 1 * 16);
            Vector256.Store(b.chunk1, output + 2 * 16 - 1);
            break;
        }
        case 2: // removed byte in bytes 32..47 (low lane of chunk1)
        {
            Vector128<byte> lane = Avx2.ExtractVector128(b.chunk1, 0);
            // FIX: the trailing 16 bytes are the high lane of chunk1 (input
            // bytes 48..63); the previous code extracted b.chunk0's high lane
            // (bytes 16..31), duplicating data already stored at output[16..31].
            Vector128<byte> rest = Avx2.ExtractVector128(b.chunk1, 1);
            Vector128<byte> compressed = Ssse3.Shuffle(lane, sh);
            Vector256.Store(b.chunk0, output + 0 * 16);
            Vector128.Store(compressed, output + 2 * 16);
            Vector128.Store(rest, output + 3 * 16 - 1);
            break;
        }
        case 3: // removed byte in bytes 48..63 (high lane of chunk1)
        {
            Vector128<byte> head = Avx2.ExtractVector128(b.chunk1, 0);
            // FIX: the lane being compressed is chunk1's high lane (input
            // bytes 48..63); the previous code shuffled b.chunk0's high lane
            // (bytes 16..31) instead.
            Vector128<byte> lane = Avx2.ExtractVector128(b.chunk1, 1);
            Vector128<byte> compressed = Ssse3.Shuffle(lane, sh);
            Vector256.Store(b.chunk0, output + 0 * 16);
            Vector128.Store(head, output + 2 * 16);
            Vector128.Store(compressed, output + 3 * 16);
            break;
        }
    }
    return 63; // exactly one byte removed from the 64-byte block
}
170233 ulong nmask = ~ mask ;
171234 Compress ( b . chunk0 , ( UInt32 ) mask , output , tablePtr ) ;
172235 Compress ( b . chunk1 , ( UInt32 ) ( mask >> 32 ) , output + Popcnt . X64 . PopCount ( nmask & 0xFFFFFFFF ) , tablePtr ) ;
0 commit comments