@@ -29,6 +29,7 @@ pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<Multi
29
29
( lines, multi_byte_chars)
30
30
}
31
31
32
+ #[ cfg( bootstrap) ]
32
33
cfg_match ! {
33
34
cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) => {
34
35
fn analyze_source_file_dispatch(
@@ -185,6 +186,165 @@ cfg_match! {
185
186
}
186
187
}
187
188
}
189
+
190
+ #[ cfg( not( bootstrap) ) ]
191
+ cfg_match ! {
192
+ any( target_arch = "x86" , target_arch = "x86_64" ) => {
193
+ fn analyze_source_file_dispatch(
194
+ src: & str ,
195
+ lines: & mut Vec <RelativeBytePos >,
196
+ multi_byte_chars: & mut Vec <MultiByteChar >,
197
+ ) {
198
+ if is_x86_feature_detected!( "sse2" ) {
199
+ unsafe {
200
+ analyze_source_file_sse2( src, lines, multi_byte_chars) ;
201
+ }
202
+ } else {
203
+ analyze_source_file_generic(
204
+ src,
205
+ src. len( ) ,
206
+ RelativeBytePos :: from_u32( 0 ) ,
207
+ lines,
208
+ multi_byte_chars,
209
+ ) ;
210
+ }
211
+ }
212
+
213
+ /// Checks 16 byte chunks of text at a time. If the chunk contains
214
+ /// something other than printable ASCII characters and newlines, the
215
+ /// function falls back to the generic implementation. Otherwise it uses
216
+ /// SSE2 intrinsics to quickly find all newlines.
217
+ #[ target_feature( enable = "sse2" ) ]
218
+ unsafe fn analyze_source_file_sse2(
219
+ src: & str ,
220
+ lines: & mut Vec <RelativeBytePos >,
221
+ multi_byte_chars: & mut Vec <MultiByteChar >,
222
+ ) {
223
+ #[ cfg( target_arch = "x86" ) ]
224
+ use std:: arch:: x86:: * ;
225
+ #[ cfg( target_arch = "x86_64" ) ]
226
+ use std:: arch:: x86_64:: * ;
227
+
228
+ const CHUNK_SIZE : usize = 16 ;
229
+
230
+ let src_bytes = src. as_bytes( ) ;
231
+
232
+ let chunk_count = src. len( ) / CHUNK_SIZE ;
233
+
234
+ // This variable keeps track of where we should start decoding a
235
+ // chunk. If a multi-byte character spans across chunk boundaries,
236
+ // we need to skip that part in the next chunk because we already
237
+ // handled it.
238
+ let mut intra_chunk_offset = 0 ;
239
+
240
+ for chunk_index in 0 ..chunk_count {
241
+ let ptr = src_bytes. as_ptr( ) as * const __m128i;
242
+ // We don't know if the pointer is aligned to 16 bytes, so we
243
+ // use `loadu`, which supports unaligned loading.
244
+ let chunk = unsafe { _mm_loadu_si128( ptr. add( chunk_index) ) } ;
245
+
246
+ // For character in the chunk, see if its byte value is < 0, which
247
+ // indicates that it's part of a UTF-8 char.
248
+ let multibyte_test = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 0 ) ) } ;
249
+ // Create a bit mask from the comparison results.
250
+ let multibyte_mask = unsafe { _mm_movemask_epi8( multibyte_test) } ;
251
+
252
+ // If the bit mask is all zero, we only have ASCII chars here:
253
+ if multibyte_mask == 0 {
254
+ assert!( intra_chunk_offset == 0 ) ;
255
+
256
+ // Check if there are any control characters in the chunk. All
257
+ // control characters that we can encounter at this point have a
258
+ // byte value less than 32 or ...
259
+ let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
260
+ let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
261
+
262
+ // ... it's the ASCII 'DEL' character with a value of 127.
263
+ let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
264
+ let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
265
+
266
+ let control_char_mask = control_char_mask0 | control_char_mask1;
267
+
268
+ if control_char_mask != 0 {
269
+ // Check for newlines in the chunk
270
+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
271
+ let newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
272
+
273
+ if control_char_mask == newlines_mask {
274
+ // All control characters are newlines, record them
275
+ let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
276
+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
277
+
278
+ loop {
279
+ let index = newlines_mask. trailing_zeros( ) ;
280
+
281
+ if index >= CHUNK_SIZE as u32 {
282
+ // We have arrived at the end of the chunk.
283
+ break ;
284
+ }
285
+
286
+ lines. push( RelativeBytePos ( index) + output_offset) ;
287
+
288
+ // Clear the bit, so we can find the next one.
289
+ newlines_mask &= ( !1 ) << index;
290
+ }
291
+
292
+ // We are done for this chunk. All control characters were
293
+ // newlines and we took care of those.
294
+ continue ;
295
+ } else {
296
+ // Some of the control characters are not newlines,
297
+ // fall through to the slow path below.
298
+ }
299
+ } else {
300
+ // No control characters, nothing to record for this chunk
301
+ continue ;
302
+ }
303
+ }
304
+
305
+ // The slow path.
306
+ // There are control chars in here, fallback to generic decoding.
307
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
308
+ intra_chunk_offset = analyze_source_file_generic(
309
+ & src[ scan_start..] ,
310
+ CHUNK_SIZE - intra_chunk_offset,
311
+ RelativeBytePos :: from_usize( scan_start) ,
312
+ lines,
313
+ multi_byte_chars,
314
+ ) ;
315
+ }
316
+
317
+ // There might still be a tail left to analyze
318
+ let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
319
+ if tail_start < src. len( ) {
320
+ analyze_source_file_generic(
321
+ & src[ tail_start..] ,
322
+ src. len( ) - tail_start,
323
+ RelativeBytePos :: from_usize( tail_start) ,
324
+ lines,
325
+ multi_byte_chars,
326
+ ) ;
327
+ }
328
+ }
329
+ }
330
+ _ => {
331
+ // The target (or compiler version) does not support SSE2 ...
332
+ fn analyze_source_file_dispatch(
333
+ src: & str ,
334
+ lines: & mut Vec <RelativeBytePos >,
335
+ multi_byte_chars: & mut Vec <MultiByteChar >,
336
+ ) {
337
+ analyze_source_file_generic(
338
+ src,
339
+ src. len( ) ,
340
+ RelativeBytePos :: from_u32( 0 ) ,
341
+ lines,
342
+ multi_byte_chars,
343
+ ) ;
344
+ }
345
+ }
346
+ }
347
+
188
348
// `scan_len` determines the number of bytes in `src` to scan. Note that the
189
349
// function can read past `scan_len` if a multi-byte character starts within the
190
350
// range but extends past it. The overflow is returned by the function.
0 commit comments