 use std::arch::aarch64::{
     float32x4_t, int16x8_t, int32x4_t, int8x16_t, uint16x8_t, uint32x4_t, uint8x16_t, vabsq_f32,
-    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddq_u16, vaddvq_f32, vandq_u16, vandq_u32,
-    vandq_u8, vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vbslq_u16, vceqq_f32, vceqq_s16,
-    vceqq_s32, vceqq_s8, vceqq_u16, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgeq_u16,
-    vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcleq_f32, vcleq_s16, vcleq_s8,
-    vcleq_u16, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32,
-    vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vdupq_n_u16, vfmaq_f32, vld1q_f32,
-    vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32,
-    vmulq_f32, vmulq_s16, vmulq_s32, vmulq_s8, vmulq_u16, vnegq_f32, vnegq_s16, vnegq_s32,
-    vnegq_s8, vshlq_n_s16, vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8,
-    vst1q_u16, vsubq_f32, vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16,
+    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddq_u16, vaddq_u8, vaddvq_f32, vandq_u16,
+    vandq_u32, vandq_u8, vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vbslq_u16, vbslq_u8, vceqq_f32,
+    vceqq_s16, vceqq_s32, vceqq_s8, vceqq_u16, vceqq_u8, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8,
+    vcgeq_u16, vcgeq_u8, vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcgtq_u8, vcleq_f32,
+    vcleq_s16, vcleq_s8, vcleq_u16, vcleq_u8, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcltq_u8,
+    vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8,
+    vdupq_n_u16, vdupq_n_u8, vfmaq_f32, vld1q_f32, vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16,
+    vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32, vmulq_f32, vmulq_s16, vmulq_s32, vmulq_s8,
+    vmulq_u16, vmulq_u8, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8, vshlq_n_s16, vshlq_n_s32,
+    vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8, vst1q_u16, vst1q_u8, vsubq_f32,
+    vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16, vsubq_u8,
 };
 use std::mem::transmute;
@@ -32,6 +33,7 @@ unsafe impl Isa for ArmNeonIsa {
     type I32 = int32x4_t;
     type I16 = int16x8_t;
     type I8 = int8x16_t;
+    type U8 = uint8x16_t;
     type U16 = uint16x8_t;
     type Bits = int32x4_t;

@@ -51,6 +53,10 @@ unsafe impl Isa for ArmNeonIsa {
         self
     }

+    fn u8(self) -> impl SimdOps<Self::U8> {
+        self
+    }
+
     fn u16(self) -> impl SimdOps<Self::U16> {
         self
     }
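
As with the other lanes, the new `u8()` accessor just returns `self`: `ArmNeonIsa` implements `SimdOps<uint8x16_t>` directly (the impl added further down), so calls through `Isa::u8()` dispatch statically to the NEON intrinsics. A minimal sketch of that pattern, reusing only names visible in this file (the `splat_ff` helper itself is hypothetical, not repo code):

    // Illustrative only: reach the new u8 lane through the Isa trait.
    // After monomorphization, ops.splat is just vdupq_n_u8.
    fn splat_ff(isa: ArmNeonIsa) -> uint8x16_t {
        let ops = isa.u8(); // impl SimdOps<uint8x16_t>
        ops.splat(0xff)     // broadcast 0xff to all 16 lanes
    }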
@@ -458,6 +464,76 @@ impl SimdIntOps<int8x16_t> for ArmNeonIsa {
     }
 }

+unsafe impl SimdOps<uint8x16_t> for ArmNeonIsa {
+    simd_ops_common!(uint8x16_t, uint8x16_t);
+
+    #[inline]
+    fn add(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vaddq_u8(x, y) }
+    }
+
+    #[inline]
+    fn sub(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vsubq_u8(x, y) }
+    }
+
+    #[inline]
+    fn mul(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vmulq_u8(x, y) }
+    }
+
+    #[inline]
+    fn splat(self, x: u8) -> uint8x16_t {
+        unsafe { vdupq_n_u8(x) }
+    }
+
+    #[inline]
+    fn lt(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vcltq_u8(x, y) }
+    }
+
+    #[inline]
+    fn le(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vcleq_u8(x, y) }
+    }
+
+    #[inline]
+    fn eq(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vceqq_u8(x, y) }
+    }
+
+    #[inline]
+    fn ge(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vcgeq_u8(x, y) }
+    }
+
+    #[inline]
+    fn gt(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vcgtq_u8(x, y) }
+    }
+
+    #[inline]
+    unsafe fn load_ptr(self, ptr: *const u8) -> uint8x16_t {
+        unsafe { vld1q_u8(ptr) }
+    }
+
+    #[inline]
+    fn first_n_mask(self, n: usize) -> uint8x16_t {
+        let mask: [u8; 16] = std::array::from_fn(|i| if i < n { u8::MAX } else { 0 });
+        unsafe { vld1q_u8(mask.as_ptr()) }
+    }
+
+    #[inline]
+    fn select(self, x: uint8x16_t, y: uint8x16_t, mask: <uint8x16_t as Simd>::Mask) -> uint8x16_t {
+        unsafe { vbslq_u8(mask, x, y) }
+    }
+
+    #[inline]
+    unsafe fn store_ptr(self, x: uint8x16_t, ptr: *mut u8) {
+        unsafe { vst1q_u8(ptr, x) }
+    }
+}
+
 unsafe impl SimdOps<uint16x8_t> for ArmNeonIsa {
     simd_ops_common!(uint16x8_t, uint16x8_t);

@@ -610,4 +686,5 @@ impl_simd!(float32x4_t, f32, 4, uint32x4_t);
 impl_simd!(int32x4_t, i32, 4, uint32x4_t);
 impl_simd!(int16x8_t, i16, 8, uint16x8_t);
 impl_simd!(int8x16_t, i8, 16, uint8x16_t);
+impl_simd!(uint8x16_t, u8, 16, uint8x16_t);
 impl_simd!(uint16x8_t, u16, 8, uint16x8_t);
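
For a sense of how the new lane composes end to end, here is a quick sketch that is not part of this commit: a hypothetical `clamp_max_u8` helper built only from methods added above (`splat`, `gt`, `select`, `load_ptr`, `store_ptr`). It assumes the `SimdOps` handle is `Copy`, which the by-value `self` methods suggest, and relies on the `vbslq_u8` convention that `select` takes `x` where the mask is set:

    // Clamp every byte of `data` to at most `max` with the new u8 ops.
    fn clamp_max_u8(isa: ArmNeonIsa, data: &mut [u8], max: u8) {
        let ops = isa.u8();
        let limit = ops.splat(max);
        let mut chunks = data.chunks_exact_mut(16);
        for chunk in &mut chunks {
            // SAFETY: each chunk is exactly 16 bytes, one full uint8x16_t.
            let v = unsafe { ops.load_ptr(chunk.as_ptr()) };
            let over = ops.gt(v, limit); // all-ones lanes where v > max
            let clamped = ops.select(limit, v, over);
            unsafe { ops.store_ptr(clamped, chunk.as_mut_ptr()) };
        }
        // Tail: stage the leftover bytes in a zero-padded buffer so the
        // full-width load/store stays in bounds, then copy back only the
        // live bytes.
        let tail = chunks.into_remainder();
        if !tail.is_empty() {
            let mut buf = [0u8; 16];
            buf[..tail.len()].copy_from_slice(tail);
            let v = unsafe { ops.load_ptr(buf.as_ptr()) };
            let clamped = ops.select(limit, v, ops.gt(v, limit));
            unsafe { ops.store_ptr(clamped, buf.as_mut_ptr()) };
            let n = tail.len();
            tail.copy_from_slice(&buf[..n]);
        }
    }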