@@ -1,14 +1,15 @@
 use std::arch::aarch64::{
     float32x4_t, int16x8_t, int32x4_t, int8x16_t, uint16x8_t, uint32x4_t, uint8x16_t, vabsq_f32,
-    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddvq_f32, vandq_u16, vandq_u32, vandq_u8,
-    vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vceqq_f32, vceqq_s16, vceqq_s32, vceqq_s8,
-    vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8,
-    vcleq_f32, vcleq_s16, vcleq_s8, vcltq_f32, vcltq_s16, vcltq_s8, vcvtnq_s32_f32, vcvtq_s32_f32,
-    vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vfmaq_f32, vld1q_f32, vld1q_s16,
-    vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32, vmulq_f32,
-    vmulq_s16, vmulq_s32, vmulq_s8, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8, vshlq_n_s16,
-    vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8, vsubq_f32, vsubq_s16,
-    vsubq_s32, vsubq_s8,
+    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddq_u16, vaddvq_f32, vandq_u16, vandq_u32,
+    vandq_u8, vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vbslq_u16, vceqq_f32, vceqq_s16,
+    vceqq_s32, vceqq_s8, vceqq_u16, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgeq_u16,
+    vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcleq_f32, vcleq_s16, vcleq_s8,
+    vcleq_u16, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32,
+    vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vdupq_n_u16, vfmaq_f32, vld1q_f32,
+    vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32,
+    vmulq_f32, vmulq_s16, vmulq_s32, vmulq_s8, vmulq_u16, vnegq_f32, vnegq_s16, vnegq_s32,
+    vnegq_s8, vshlq_n_s16, vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8,
+    vst1q_u16, vsubq_f32, vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16,
 };
 use std::mem::transmute;
 
@@ -31,6 +32,7 @@ unsafe impl Isa for ArmNeonIsa {
     type I32 = int32x4_t;
     type I16 = int16x8_t;
     type I8 = int8x16_t;
+    type U16 = uint16x8_t;
     type Bits = int32x4_t;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -48,6 +50,10 @@ unsafe impl Isa for ArmNeonIsa {
     fn i8(self) -> impl SimdIntOps<Self::I8> {
         self
     }
+
+    fn u16(self) -> impl SimdOps<Self::U16> {
+        self
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -452,6 +458,76 @@ impl SimdIntOps<int8x16_t> for ArmNeonIsa {
     }
 }
 
+unsafe impl SimdOps<uint16x8_t> for ArmNeonIsa {
+    simd_ops_common!(uint16x8_t, uint16x8_t);
+
+    #[inline]
+    fn add(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vaddq_u16(x, y) }
+    }
+
+    #[inline]
+    fn sub(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vsubq_u16(x, y) }
+    }
+
+    #[inline]
+    fn mul(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vmulq_u16(x, y) }
+    }
+
+    #[inline]
+    fn splat(self, x: u16) -> uint16x8_t {
+        unsafe { vdupq_n_u16(x) }
+    }
+
+    #[inline]
+    fn lt(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vcltq_u16(x, y) }
+    }
+
+    #[inline]
+    fn le(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vcleq_u16(x, y) }
+    }
+
+    #[inline]
+    fn eq(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vceqq_u16(x, y) }
+    }
+
+    #[inline]
+    fn ge(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vcgeq_u16(x, y) }
+    }
+
+    #[inline]
+    fn gt(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t {
+        unsafe { vcgtq_u16(x, y) }
+    }
+
+    #[inline]
+    unsafe fn load_ptr(self, ptr: *const u16) -> uint16x8_t {
+        unsafe { vld1q_u16(ptr) }
+    }
+
+    #[inline]
+    fn first_n_mask(self, n: usize) -> uint16x8_t {
+        let mask: [u16; 8] = std::array::from_fn(|i| if i < n { u16::MAX } else { 0 });
+        unsafe { vld1q_u16(mask.as_ptr()) }
+    }
+
+    #[inline]
+    fn select(self, x: uint16x8_t, y: uint16x8_t, mask: <uint16x8_t as Simd>::Mask) -> uint16x8_t {
+        unsafe { vbslq_u16(mask, x, y) }
+    }
+
+    #[inline]
+    unsafe fn store_ptr(self, x: uint16x8_t, ptr: *mut u16) {
+        unsafe { vst1q_u16(ptr, x) }
+    }
+}
+
 macro_rules! impl_mask {
     ($mask:ty, $elem:ty, $len:expr) => {
         impl Mask for $mask {
@@ -534,3 +610,4 @@ impl_simd!(float32x4_t, f32, 4, uint32x4_t);
 impl_simd!(int32x4_t, i32, 4, uint32x4_t);
 impl_simd!(int16x8_t, i16, 8, uint16x8_t);
 impl_simd!(int8x16_t, i8, 16, uint8x16_t);
+impl_simd!(uint16x8_t, u16, 8, uint16x8_t);
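
For context, here is a minimal standalone sketch of the lane-select pattern the new `uint16x8_t` impl wraps (`vcltq_u16` producing an all-ones/all-zeros mask, `vbslq_u16` blending by it), using only intrinsics this patch imports. The function name `clamp_floor_u16`, the clamp-to-floor use case, and the bounce-buffer tail handling are illustrative assumptions, not code from this patch or the crate's `SimdOps` trait:

```rust
// Hypothetical usage sketch (not part of the patch): raise every element of
// a u16 slice to at least `floor`, 8 lanes at a time.
#[cfg(target_arch = "aarch64")]
fn clamp_floor_u16(data: &mut [u16], floor: u16) {
    use std::arch::aarch64::{vbslq_u16, vcltq_u16, vdupq_n_u16, vld1q_u16, vst1q_u16};
    unsafe {
        let lim = vdupq_n_u16(floor);
        let mut chunks = data.chunks_exact_mut(8);
        for chunk in &mut chunks {
            let x = vld1q_u16(chunk.as_ptr());
            let below = vcltq_u16(x, lim); // all-ones lanes where x < floor
            vst1q_u16(chunk.as_mut_ptr(), vbslq_u16(below, lim, x));
        }
        // Tail shorter than 8 lanes: widen into a stack buffer so the full
        // 128-bit load/store stays in bounds, then copy back the live lanes.
        let rem = chunks.into_remainder();
        if !rem.is_empty() {
            let mut buf = [0u16; 8];
            buf[..rem.len()].copy_from_slice(rem);
            let x = vld1q_u16(buf.as_ptr());
            let below = vcltq_u16(x, lim);
            vst1q_u16(buf.as_mut_ptr(), vbslq_u16(below, lim, x));
            rem.copy_from_slice(&buf[..rem.len()]);
        }
    }
}
```

Presumably the crate's generic code handles the same ragged tail through `first_n_mask` plus `select` behind a safe API; the bounce buffer above is just the dependency-free way to show the blend.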