robertknight
diff --git a/‎rten-simd/src/safe/arch/aarch64.rs
+86-9 b/‎rten-simd/src/safe/arch/aarch64.rs
+86-9
diff --git a/‎rten-simd/src/safe/arch/generic.rs
+20-4 b/‎rten-simd/src/safe/arch/generic.rs
+20-4
diff --git a/‎rten-simd/src/safe/arch/wasm32.rs
+48-2 b/‎rten-simd/src/safe/arch/wasm32.rs
+48-2
@@ -1,14 +1,15 @@
 use std::arch::aarch64::{
     float32x4_t, int16x8_t, int32x4_t, int8x16_t, uint16x8_t, uint32x4_t, uint8x16_t, vabsq_f32,
-    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddvq_f32, vandq_u16, vandq_u32, vandq_u8,
-    vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vceqq_f32, vceqq_s16, vceqq_s32, vceqq_s8,
-    vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8,
-    vcleq_f32, vcleq_s16, vcleq_s8, vcltq_f32, vcltq_s16, vcltq_s8, vcvtnq_s32_f32, vcvtq_s32_f32,
-    vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vfmaq_f32, vld1q_f32, vld1q_s16,
-    vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32, vmulq_f32,
-    vmulq_s16, vmulq_s32, vmulq_s8, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8, vshlq_n_s16,
-    vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8, vsubq_f32, vsubq_s16,
-    vsubq_s32, vsubq_s8,
+    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddq_u16, vaddvq_f32, vandq_u16, vandq_u32,
+    vandq_u8, vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vbslq_u16, vceqq_f32, vceqq_s16,
+    vceqq_s32, vceqq_s8, vceqq_u16, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgeq_u16,
+    vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcleq_f32, vcleq_s16, vcleq_s8,
+    vcleq_u16, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32,
+    vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vdupq_n_u16, vfmaq_f32, vld1q_f32,
+    vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32,
+    vmulq_f32, vmulq_s16, vmulq_s32, vmulq_s8, vmulq_u16, vnegq_f32, vnegq_s16, vnegq_s32,
+    vnegq_s8, vshlq_n_s16, vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8,
+    vst1q_u16, vsubq_f32, vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16,
 };
 use std::mem::transmute;
 
@@ -31,6 +32,7 @@ unsafe impl Isa for ArmNeonIsa {
     type I32 = int32x4_t;
     type I16 = int16x8_t;
     type I8 = int8x16_t;
+    type U16 = uint16x8_t;
     type Bits = int32x4_t;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -48,6 +50,10 @@ unsafe impl Isa for ArmNeonIsa {
     fn i8(self) -> impl SimdIntOps<Self::I8> {
         self
     }
+
+    fn u16(self) -> impl SimdOps<Self::U16> { // u16 lanes expose only the base `SimdOps` set
+        self // the ISA token itself implements `SimdOps<uint16x8_t>`
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -452,6 +458,76 @@ impl SimdIntOps<int8x16_t> for ArmNeonIsa {
     }
 }
 
+unsafe impl SimdOps<uint16x8_t> for ArmNeonIsa { // NEON ops on vectors of 8 `u16` lanes
+    simd_ops_common!(uint16x8_t, uint16x8_t); // presumably (vector, mask); macro body not shown
+
+    #[inline]
+    fn add(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // lane-wise `x + y`
+        unsafe { vaddq_u16(x, y) } // NOTE(review): assumes NEON is guaranteed by this ISA token
+    }
+
+    #[inline]
+    fn sub(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // lane-wise `x - y`
+        unsafe { vsubq_u16(x, y) }
+    }
+
+    #[inline]
+    fn mul(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // lane-wise `x * y`
+        unsafe { vmulq_u16(x, y) }
+    }
+
+    #[inline]
+    fn splat(self, x: u16) -> uint16x8_t { // broadcast `x` into every lane
+        unsafe { vdupq_n_u16(x) }
+    }
+
+    #[inline]
+    fn lt(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // unsigned `x < y`, result is a mask
+        unsafe { vcltq_u16(x, y) }
+    }
+
+    #[inline]
+    fn le(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // unsigned `x <= y`, as a mask
+        unsafe { vcleq_u16(x, y) }
+    }
+
+    #[inline]
+    fn eq(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // `x == y`, as a mask
+        unsafe { vceqq_u16(x, y) }
+    }
+
+    #[inline]
+    fn ge(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // unsigned `x >= y`, as a mask
+        unsafe { vcgeq_u16(x, y) }
+    }
+
+    #[inline]
+    fn gt(self, x: uint16x8_t, y: uint16x8_t) -> uint16x8_t { // unsigned `x > y`, as a mask
+        unsafe { vcgtq_u16(x, y) }
+    }
+
+    #[inline]
+    unsafe fn load_ptr(self, ptr: *const u16) -> uint16x8_t { // caller: `ptr` readable for 8 x u16
+        unsafe { vld1q_u16(ptr) } // SAFETY: pointer validity is the caller's contract
+    }
+
+    #[inline]
+    fn first_n_mask(self, n: usize) -> uint16x8_t { // lanes `i < n` all-ones, rest zero
+        let mask: [u16; 8] = std::array::from_fn(|i| if i < n { u16::MAX } else { 0 });
+        unsafe { vld1q_u16(mask.as_ptr()) } // SAFETY: `mask` is a local array of exactly 8 u16
+    }
+
+    #[inline]
+    fn select(self, x: uint16x8_t, y: uint16x8_t, mask: <uint16x8_t as Simd>::Mask) -> uint16x8_t {
+        unsafe { vbslq_u16(mask, x, y) } // mask goes first: `x` where mask bits are set, else `y`
+    }
+
+    #[inline]
+    unsafe fn store_ptr(self, x: uint16x8_t, ptr: *mut u16) { // caller: `ptr` writable for 8 x u16
+        unsafe { vst1q_u16(ptr, x) } // note the intrinsic's (destination, value) argument order
+    }
+}
+
 macro_rules! impl_mask {
     ($mask:ty, $elem:ty, $len:expr) => {
         impl Mask for $mask {
@@ -534,3 +610,4 @@ impl_simd!(float32x4_t, f32, 4, uint32x4_t);
 impl_simd!(int32x4_t, i32, 4, uint32x4_t);
 impl_simd!(int16x8_t, i16, 8, uint16x8_t);
 impl_simd!(int8x16_t, i8, 16, uint8x16_t);
+impl_simd!(uint16x8_t, u16, 8, uint16x8_t); // the unsigned vector type doubles as its own mask
@@ -19,6 +19,7 @@ simd_type!(F32x4, f32, LEN_X32);
 simd_type!(I32x4, i32, LEN_X32);
 simd_type!(I16x8, i16, LEN_X32 * 2);
 simd_type!(I8x16, i8, LEN_X32 * 4);
+simd_type!(U16x8, u16, LEN_X32 * 2); // twice as many lanes as the 32-bit vectors, like I16x8
 
 // Define mask vector types. `Mn` is a mask for a vector with n-bit lanes.
 simd_type!(M32, i32, LEN_X32);
@@ -48,6 +49,7 @@ unsafe impl Isa for GenericIsa {
     type I32 = I32x4;
     type I16 = I16x8;
     type I8 = I8x16;
+    type U16 = U16x8;
     type Bits = I32x4;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -65,6 +67,10 @@ unsafe impl Isa for GenericIsa {
     fn i8(self) -> impl SimdIntOps<Self::I8> {
         self
     }
+
+    fn u16(self) -> impl SimdOps<Self::U16> { // u16 lanes expose only the base `SimdOps` set
+        self // `GenericIsa` itself implements `SimdOps<U16x8>`
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -235,7 +241,7 @@ impl SimdFloatOps<F32x4> for GenericIsa {
     }
 }
 
-macro_rules! impl_simd_int_ops {
+macro_rules! impl_simd_signed_int_ops {
     ($simd:ident, $elem:ty, $len:expr, $mask:ident) => {
         unsafe impl SimdOps<$simd> for GenericIsa {
             simd_ops_common!($simd, $elem, $len, $mask);
@@ -257,9 +263,18 @@ macro_rules! impl_simd_int_ops {
     };
 }
 
-impl_simd_int_ops!(I32x4, i32, 4, M32);
-impl_simd_int_ops!(I16x8, i16, 8, M16);
-impl_simd_int_ops!(I8x16, i8, 16, M8);
+impl_simd_signed_int_ops!(I32x4, i32, 4, M32);
+impl_simd_signed_int_ops!(I16x8, i16, 8, M16);
+impl_simd_signed_int_ops!(I8x16, i8, 16, M8);
+
+macro_rules! impl_simd_unsigned_int_ops { // unsigned counterpart of `impl_simd_signed_int_ops`
+    ($simd:ident, $elem:ty, $len:expr, $mask:ident) => { // (vector, lane type, lane count, mask)
+        unsafe impl SimdOps<$simd> for GenericIsa {
+            simd_ops_common!($simd, $elem, $len, $mask); // nothing beyond the shared ops is added
+        }
+    };
+}
+impl_simd_unsigned_int_ops!(U16x8, u16, 8, M16); // same lane count and mask type as I16x8
 
 macro_rules! impl_mask {
     ($mask:ident, $len:expr) => {
@@ -319,3 +334,4 @@ impl_simd!(F32x4, f32, M32, 4);
 impl_simd!(I32x4, i32, M32, 4);
 impl_simd!(I16x8, i16, M16, 8);
 impl_simd!(I8x16, i8, M8, 16);
+impl_simd!(U16x8, u16, M16, 8); // 8 u16 lanes; shares the M16 mask type with I16x8
@@ -5,7 +5,8 @@ use std::arch::wasm32::{
     i16x8_mul, i16x8_neg, i16x8_shl, i16x8_splat, i16x8_sub, i32x4_add, i32x4_eq, i32x4_ge,
     i32x4_gt, i32x4_mul, i32x4_neg, i32x4_shl, i32x4_shuffle, i32x4_splat, i32x4_sub,
     i32x4_trunc_sat_f32x4, i8x16_add, i8x16_eq, i8x16_ge, i8x16_gt, i8x16_neg, i8x16_shl,
-    i8x16_shuffle, i8x16_splat, i8x16_sub, v128, v128_and, v128_bitselect, v128_load, v128_store,
+    i8x16_shuffle, i8x16_splat, i8x16_sub, u16x8_add, u16x8_eq, u16x8_ge, u16x8_gt, u16x8_mul,
+    u16x8_splat, u16x8_sub, v128, v128_and, v128_bitselect, v128_load, v128_store,
 };
 use std::mem::transmute;
 
@@ -16,6 +17,7 @@ simd_type!(F32x4, v128, f32, M32, Wasm32Isa);
 simd_type!(I32x4, v128, i32, M32, Wasm32Isa);
 simd_type!(I16x8, v128, i16, M16, Wasm32Isa);
 simd_type!(I8x16, v128, i8, M8, Wasm32Isa);
+simd_type!(U16x8, v128, u16, M16, Wasm32Isa); // `v128` wrapper viewed as u16 lanes, M16 mask
 
 #[derive(Copy, Clone)]
 pub struct Wasm32Isa {
@@ -35,6 +37,7 @@ unsafe impl Isa for Wasm32Isa {
     type I32 = I32x4;
     type I16 = I16x8;
     type I8 = I8x16;
+    type U16 = U16x8;
     type Bits = I32x4;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -52,6 +55,10 @@ unsafe impl Isa for Wasm32Isa {
     fn i8(self) -> impl SimdIntOps<Self::I8> {
         self
     }
+
+    fn u16(self) -> impl SimdOps<Self::U16> { // u16 lanes expose only the base `SimdOps` set
+        self // `Wasm32Isa` itself implements `SimdOps<U16x8>`
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -69,7 +76,7 @@ macro_rules! simd_ops_common {
         #[inline]
         fn first_n_mask(self, n: usize) -> $mask {
             let mask: [$mask_elem; lanes::<$simd>()] =
-                std::array::from_fn(|i| if i < n { -1 } else { 0 });
+                std::array::from_fn(|i| if i < n { !0 } else { 0 }); // `!0` = all bits set; unlike `-1` it also type-checks for unsigned `$mask_elem`
             $mask(unsafe { v128_load(mask.as_ptr() as *const v128) })
         }
 
@@ -392,6 +399,45 @@ impl SimdIntOps<I8x16> for Wasm32Isa {
     }
 }
 
+unsafe impl SimdOps<U16x8> for Wasm32Isa { // WASM SIMD ops on vectors of 8 `u16` lanes
+    simd_ops_common!(U16x8, M16, u16); // args look like (vector, mask, mask element) - confirm
+    // NOTE(review): no `lt`/`le` here; presumably from `simd_ops_common!` or trait defaults
+    #[inline]
+    fn add(self, x: U16x8, y: U16x8) -> U16x8 { // lane-wise `x + y` on the wrapped `v128`
+        U16x8(u16x8_add(x.0, y.0))
+    }
+
+    #[inline]
+    fn sub(self, x: U16x8, y: U16x8) -> U16x8 { // lane-wise `x - y`
+        U16x8(u16x8_sub(x.0, y.0))
+    }
+
+    #[inline]
+    fn mul(self, x: U16x8, y: U16x8) -> U16x8 { // lane-wise `x * y`
+        U16x8(u16x8_mul(x.0, y.0))
+    }
+
+    #[inline]
+    fn splat(self, x: u16) -> U16x8 { // broadcast `x` into every lane
+        U16x8(u16x8_splat(x))
+    }
+
+    #[inline]
+    fn eq(self, x: U16x8, y: U16x8) -> M16 { // `x == y`, wrapped as an `M16` mask
+        M16(u16x8_eq(x.0, y.0))
+    }
+
+    #[inline]
+    fn ge(self, x: U16x8, y: U16x8) -> M16 { // unsigned `x >= y`, as an `M16` mask
+        M16(u16x8_ge(x.0, y.0))
+    }
+
+    #[inline]
+    fn gt(self, x: U16x8, y: U16x8) -> M16 { // unsigned `x > y`, as an `M16` mask
+        M16(u16x8_gt(x.0, y.0))
+    }
+}
+
 macro_rules! mask_type {
     ($mask:ident, $elem:ty, $len: expr) => {
         #[derive(Copy, Clone, Debug)]