robertknight
diff --git a/‎rten-simd/src/safe/arch/aarch64.rs
+87-10 b/‎rten-simd/src/safe/arch/aarch64.rs
+87-10
diff --git a/‎rten-simd/src/safe/arch/generic.rs
+8 b/‎rten-simd/src/safe/arch/generic.rs
+8
diff --git a/‎rten-simd/src/safe/arch/wasm32.rs
+58-2 b/‎rten-simd/src/safe/arch/wasm32.rs
+58-2
@@ -1,15 +1,16 @@
 use std::arch::aarch64::{
     float32x4_t, int16x8_t, int32x4_t, int8x16_t, uint16x8_t, uint32x4_t, uint8x16_t, vabsq_f32,
-    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddq_u16, vaddvq_f32, vandq_u16, vandq_u32,
-    vandq_u8, vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vbslq_u16, vceqq_f32, vceqq_s16,
-    vceqq_s32, vceqq_s8, vceqq_u16, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgeq_u16,
-    vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcleq_f32, vcleq_s16, vcleq_s8,
-    vcleq_u16, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32,
-    vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vdupq_n_u16, vfmaq_f32, vld1q_f32,
-    vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32,
-    vmulq_f32, vmulq_s16, vmulq_s32, vmulq_s8, vmulq_u16, vnegq_f32, vnegq_s16, vnegq_s32,
-    vnegq_s8, vshlq_n_s16, vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8,
-    vst1q_u16, vsubq_f32, vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16,
+    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddq_u16, vaddq_u8, vaddvq_f32, vandq_u16,
+    vandq_u32, vandq_u8, vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vbslq_u16, vbslq_u8, vceqq_f32,
+    vceqq_s16, vceqq_s32, vceqq_s8, vceqq_u16, vceqq_u8, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8,
+    vcgeq_u16, vcgeq_u8, vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcgtq_u8, vcleq_f32,
+    vcleq_s16, vcleq_s8, vcleq_u16, vcleq_u8, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcltq_u8,
+    vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8,
+    vdupq_n_u16, vdupq_n_u8, vfmaq_f32, vld1q_f32, vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16,
+    vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32, vmulq_f32, vmulq_s16, vmulq_s32, vmulq_s8,
+    vmulq_u16, vmulq_u8, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8, vshlq_n_s16, vshlq_n_s32,
+    vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8, vst1q_u16, vst1q_u8, vsubq_f32,
+    vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16, vsubq_u8,
 };
 use std::mem::transmute;
 
@@ -32,6 +33,7 @@ unsafe impl Isa for ArmNeonIsa {
     type I32 = int32x4_t;
     type I16 = int16x8_t;
     type I8 = int8x16_t;
+    type U8 = uint8x16_t;
     type U16 = uint16x8_t;
     type Bits = int32x4_t;
 
@@ -51,6 +53,10 @@ unsafe impl Isa for ArmNeonIsa {
         self
     }
 
+    fn u8(self) -> impl SimdOps<Self::U8> { // Hands back `self` as the ops implementation for `Self::U8` vectors.
+        self
+    }
+
     fn u16(self) -> impl SimdOps<Self::U16> {
         self
     }
@@ -458,6 +464,76 @@ impl SimdIntOps<int8x16_t> for ArmNeonIsa {
     }
 }
 
+unsafe impl SimdOps<uint8x16_t> for ArmNeonIsa { // `u8` x 16 vector ops; each method forwards to one NEON intrinsic.
+    simd_ops_common!(uint8x16_t, uint8x16_t);
+
+    #[inline]
+    fn add(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Lane-wise `x + y` (modulo 256 per lane).
+        unsafe { vaddq_u8(x, y) } // NOTE(review): no SAFETY note; presumably sound because NEON is baseline on aarch64 — confirm.
+    }
+
+    #[inline]
+    fn sub(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Lane-wise `x - y` (modulo 256 per lane).
+        unsafe { vsubq_u8(x, y) }
+    }
+
+    #[inline]
+    fn mul(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Lane-wise product, keeping the low 8 bits of each lane.
+        unsafe { vmulq_u8(x, y) }
+    }
+
+    #[inline]
+    fn splat(self, x: u8) -> uint8x16_t { // Broadcasts `x` into all 16 lanes.
+        unsafe { vdupq_n_u8(x) }
+    }
+
+    #[inline]
+    fn lt(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Mask with all bits set in lanes where `x < y` (unsigned).
+        unsafe { vcltq_u8(x, y) }
+    }
+
+    #[inline]
+    fn le(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Mask with all bits set in lanes where `x <= y` (unsigned).
+        unsafe { vcleq_u8(x, y) }
+    }
+
+    #[inline]
+    fn eq(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Mask with all bits set in lanes where `x == y`.
+        unsafe { vceqq_u8(x, y) }
+    }
+
+    #[inline]
+    fn ge(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Mask with all bits set in lanes where `x >= y` (unsigned).
+        unsafe { vcgeq_u8(x, y) }
+    }
+
+    #[inline]
+    fn gt(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t { // Mask with all bits set in lanes where `x > y` (unsigned).
+        unsafe { vcgtq_u8(x, y) }
+    }
+
+    #[inline]
+    unsafe fn load_ptr(self, ptr: *const u8) -> uint8x16_t { // Reads 16 consecutive `u8`s starting at `ptr`.
+        unsafe { vld1q_u8(ptr) } // Caller presumably must keep `ptr` readable for 16 bytes — confirm against the trait's safety docs.
+    }
+
+    #[inline]
+    fn first_n_mask(self, n: usize) -> uint8x16_t { // Mask whose first `n` lanes are set; `n >= 16` sets every lane.
+        let mask: [u8; 16] = std::array::from_fn(|i| if i < n { u8::MAX } else { 0 }); // 0xFF marks a selected lane.
+        unsafe { vld1q_u8(mask.as_ptr()) } // Reads exactly the 16 bytes of the local array built above.
+    }
+
+    #[inline]
+    fn select(self, x: uint8x16_t, y: uint8x16_t, mask: <uint8x16_t as Simd>::Mask) -> uint8x16_t {
+        unsafe { vbslq_u8(mask, x, y) } // Bitwise select: bits of `x` where `mask` is set, bits of `y` elsewhere.
+    }
+
+    #[inline]
+    unsafe fn store_ptr(self, x: uint8x16_t, ptr: *mut u8) { // Writes the 16 lanes of `x` to consecutive bytes at `ptr`.
+        unsafe { vst1q_u8(ptr, x) } // Caller presumably must keep `ptr` writable for 16 bytes — confirm against the trait's safety docs.
+    }
+}
+
 unsafe impl SimdOps<uint16x8_t> for ArmNeonIsa {
     simd_ops_common!(uint16x8_t, uint16x8_t);
 
@@ -610,4 +686,5 @@ impl_simd!(float32x4_t, f32, 4, uint32x4_t);
 impl_simd!(int32x4_t, i32, 4, uint32x4_t);
 impl_simd!(int16x8_t, i16, 8, uint16x8_t);
 impl_simd!(int8x16_t, i8, 16, uint8x16_t);
+impl_simd!(uint8x16_t, u8, 16, uint8x16_t); // Arguments presumably (vector, element, lane count, mask type) — confirm against the macro.
 impl_simd!(uint16x8_t, u16, 8, uint16x8_t);
@@ -19,6 +19,7 @@ simd_type!(F32x4, f32, LEN_X32);
 simd_type!(I32x4, i32, LEN_X32);
 simd_type!(I16x8, i16, LEN_X32 * 2);
 simd_type!(I8x16, i8, LEN_X32 * 4);
+simd_type!(U8x16, u8, LEN_X32 * 4); // `u8` vector sized at four times `LEN_X32`, matching `I8x16` above.
 simd_type!(U16x8, u16, LEN_X32 * 2);
 
 // Define mask vector types. `Mn` is a mask for a vector with n-bit lanes.
@@ -49,6 +50,7 @@ unsafe impl Isa for GenericIsa {
     type I32 = I32x4;
     type I16 = I16x8;
     type I8 = I8x16;
+    type U8 = U8x16;
     type U16 = U16x8;
     type Bits = I32x4;
 
@@ -68,6 +70,10 @@ unsafe impl Isa for GenericIsa {
         self
     }
 
+    fn u8(self) -> impl SimdOps<Self::U8> { // Hands back `self` as the ops implementation for `Self::U8` vectors.
+        self
+    }
+
     fn u16(self) -> impl SimdOps<Self::U16> {
         self
     }
@@ -274,6 +280,7 @@ macro_rules! impl_simd_unsigned_int_ops {
         }
     };
 }
+impl_simd_unsigned_int_ops!(U8x16, u8, 16, M8); // Arguments presumably (vector, element, lane count, mask type) — confirm against the macro.
 impl_simd_unsigned_int_ops!(U16x8, u16, 8, M16);
 
 macro_rules! impl_mask {
@@ -334,4 +341,5 @@ impl_simd!(F32x4, f32, M32, 4);
 impl_simd!(I32x4, i32, M32, 4);
 impl_simd!(I16x8, i16, M16, 8);
 impl_simd!(I8x16, i8, M8, 16);
+impl_simd!(U8x16, u8, M8, 16); // Same argument layout as the `I8x16` line above: mask type before lane count (presumed meanings — confirm).
 impl_simd!(U16x8, u16, M16, 8);
@@ -5,8 +5,10 @@ use std::arch::wasm32::{
     i16x8_mul, i16x8_neg, i16x8_shl, i16x8_splat, i16x8_sub, i32x4_add, i32x4_eq, i32x4_ge,
     i32x4_gt, i32x4_mul, i32x4_neg, i32x4_shl, i32x4_shuffle, i32x4_splat, i32x4_sub,
     i32x4_trunc_sat_f32x4, i8x16_add, i8x16_eq, i8x16_ge, i8x16_gt, i8x16_neg, i8x16_shl,
-    i8x16_shuffle, i8x16_splat, i8x16_sub, u16x8_add, u16x8_eq, u16x8_ge, u16x8_gt, u16x8_mul,
-    u16x8_splat, u16x8_sub, v128, v128_and, v128_bitselect, v128_load, v128_store,
+    i8x16_shuffle, i8x16_splat, i8x16_sub, u16x8_add, u16x8_eq, u16x8_extmul_high_u8x16,
+    u16x8_extmul_low_u8x16, u16x8_ge, u16x8_gt, u16x8_mul, u16x8_splat, u16x8_sub, u8x16_add,
+    u8x16_eq, u8x16_ge, u8x16_gt, u8x16_shuffle, u8x16_splat, u8x16_sub, v128, v128_and,
+    v128_bitselect, v128_load, v128_store,
 };
 use std::mem::transmute;
 
@@ -17,6 +19,7 @@ simd_type!(F32x4, v128, f32, M32, Wasm32Isa);
 simd_type!(I32x4, v128, i32, M32, Wasm32Isa);
 simd_type!(I16x8, v128, i16, M16, Wasm32Isa);
 simd_type!(I8x16, v128, i8, M8, Wasm32Isa);
+simd_type!(U8x16, v128, u8, M8, Wasm32Isa); // Arguments presumably (wrapper, inner type, element, mask, ISA) — confirm against the macro.
 simd_type!(U16x8, v128, u16, M16, Wasm32Isa);
 
 #[derive(Copy, Clone)]
@@ -37,6 +40,7 @@ unsafe impl Isa for Wasm32Isa {
     type I32 = I32x4;
     type I16 = I16x8;
     type I8 = I8x16;
+    type U8 = U8x16;
     type U16 = U16x8;
     type Bits = I32x4;
 
@@ -56,6 +60,10 @@ unsafe impl Isa for Wasm32Isa {
         self
     }
 
+    fn u8(self) -> impl SimdOps<Self::U8> { // Hands back `self` as the ops implementation for `Self::U8` vectors.
+        self
+    }
+
     fn u16(self) -> impl SimdOps<Self::U16> {
         self
     }
@@ -399,6 +407,54 @@ impl SimdIntOps<I8x16> for Wasm32Isa {
     }
 }
 
+unsafe impl SimdOps<U8x16> for Wasm32Isa { // `U8x16` vector ops built on the WebAssembly SIMD intrinsics.
+    simd_ops_common!(U8x16, M8, i8); // NOTE(review): passes `i8` where the `U16x8` impl below passes `u16` — confirm this is intended.
+
+    #[inline]
+    fn add(self, x: U8x16, y: U8x16) -> U8x16 { // Lane-wise `x + y` (modulo 256 per lane).
+        U8x16(u8x16_add(x.0, y.0))
+    }
+
+    #[inline]
+    fn sub(self, x: U8x16, y: U8x16) -> U8x16 { // Lane-wise `x - y` (modulo 256 per lane).
+        U8x16(u8x16_sub(x.0, y.0))
+    }
+
+    #[inline]
+    fn mul(self, x: U8x16, y: U8x16) -> U8x16 { // Lane-wise product truncated to 8 bits, built from two widening multiplies.
+        let prod_low = u16x8_extmul_low_u8x16(x.0, y.0); // `u16` products of lanes 0..8.
+        let prod_high = u16x8_extmul_high_u8x16(x.0, y.0); // `u16` products of lanes 8..16.
+
+        // Select even bytes from low and high products. This obtains the
+        // u8 truncated product.
+        let prod_u8 = u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(
+            prod_low, prod_high, // Indices 0..16 address `prod_low`, 16..32 address `prod_high`.
+        );
+
+        U8x16(prod_u8)
+    }
+
+    #[inline]
+    fn splat(self, x: u8) -> U8x16 { // Broadcasts `x` into all 16 lanes.
+        U8x16(u8x16_splat(x))
+    }
+
+    #[inline] // NOTE(review): no `lt`/`le` in this impl, unlike the NEON one — presumably defaulted elsewhere; confirm.
+    fn eq(self, x: U8x16, y: U8x16) -> M8 { // Mask of lanes where `x == y`.
+        M8(u8x16_eq(x.0, y.0))
+    }
+
+    #[inline]
+    fn ge(self, x: U8x16, y: U8x16) -> M8 { // Mask of lanes where `x >= y` (unsigned).
+        M8(u8x16_ge(x.0, y.0))
+    }
+
+    #[inline]
+    fn gt(self, x: U8x16, y: U8x16) -> M8 { // Mask of lanes where `x > y` (unsigned).
+        M8(u8x16_gt(x.0, y.0))
+    }
+}
+
 unsafe impl SimdOps<U16x8> for Wasm32Isa {
     simd_ops_common!(U16x8, M16, u16);