robertknight
diff --git a/‎rten-simd/src/safe/arch/aarch64.rs
+106-8 b/‎rten-simd/src/safe/arch/aarch64.rs
+106-8
diff --git a/‎rten-simd/src/safe/arch/generic.rs
+12 b/‎rten-simd/src/safe/arch/generic.rs
+12
diff --git a/‎rten-simd/src/safe/arch/wasm32.rs
+72-4 b/‎rten-simd/src/safe/arch/wasm32.rs
+72-4
@@ -1,12 +1,14 @@
 use std::arch::aarch64::{
-    float32x4_t, int16x8_t, int32x4_t, uint16x8_t, uint32x4_t, vabsq_f32, vaddq_f32, vaddq_s16,
-    vaddq_s32, vaddvq_f32, vandq_u16, vandq_u32, vbslq_f32, vbslq_s16, vbslq_s32, vceqq_f32,
-    vceqq_s16, vceqq_s32, vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgtq_f32, vcgtq_s16, vcgtq_s32,
-    vcleq_f32, vcleq_s16, vcltq_f32, vcltq_s16, vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32,
-    vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vfmaq_f32, vld1q_f32, vld1q_s16, vld1q_s32, vld1q_u16,
-    vld1q_u32, vmaxq_f32, vminq_f32, vmulq_f32, vmulq_s16, vmulq_s32, vnegq_f32, vnegq_s16,
-    vnegq_s32, vshlq_n_s16, vshlq_n_s32, vst1q_f32, vst1q_s16, vst1q_s32, vsubq_f32, vsubq_s16,
-    vsubq_s32,
+    float32x4_t, int16x8_t, int32x4_t, int8x16_t, uint16x8_t, uint32x4_t, uint8x16_t, vabsq_f32,
+    vaddq_f32, vaddq_s16, vaddq_s32, vaddq_s8, vaddvq_f32, vandq_u16, vandq_u32, vandq_u8,
+    vbslq_f32, vbslq_s16, vbslq_s32, vbslq_s8, vceqq_f32, vceqq_s16, vceqq_s32, vceqq_s8,
+    vcgeq_f32, vcgeq_s16, vcgeq_s32, vcgeq_s8, vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8,
+    vcleq_f32, vcleq_s16, vcleq_s8, vcltq_f32, vcltq_s16, vcltq_s8, vcvtnq_s32_f32, vcvtq_s32_f32,
+    vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_s8, vfmaq_f32, vld1q_f32, vld1q_s16,
+    vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32, vminq_f32, vmulq_f32,
+    vmulq_s16, vmulq_s32, vmulq_s8, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8, vshlq_n_s16,
+    vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8, vsubq_f32, vsubq_s16,
+    vsubq_s32, vsubq_s8,
 };
 use std::mem::transmute;
 
@@ -28,6 +30,7 @@ unsafe impl Isa for ArmNeonIsa {
     type F32 = float32x4_t;
     type I32 = int32x4_t;
     type I16 = int16x8_t;
+    type I8 = int8x16_t;
     type Bits = int32x4_t;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -41,6 +44,10 @@ unsafe impl Isa for ArmNeonIsa {
     fn i16(self) -> impl SimdIntOps<Self::I16> {
         self
     }
+
+    /// Returns the `SimdIntOps` implementation for `Self::I8` (`int8x16_t`)
+    /// vectors. `ArmNeonIsa` provides these operations itself, so this simply
+    /// returns `self`.
+    fn i8(self) -> impl SimdIntOps<Self::I8> {
+        self
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -363,6 +370,88 @@ impl SimdIntOps<int16x8_t> for ArmNeonIsa {
     }
 }
 
+/// `i8` vector operations for `ArmNeonIsa`, using `int8x16_t` values and
+/// `uint8x16_t` masks. Each method forwards to the matching NEON intrinsic.
+//
+// NOTE(review): the `unsafe` blocks below carry no `SAFETY:` comments.
+// Presumably they rely on NEON being available whenever an `ArmNeonIsa`
+// value exists — confirm and document.
+unsafe impl SimdOps<int8x16_t> for ArmNeonIsa {
+    // Items shared by all vector types, generated by the `simd_ops_common!`
+    // macro for this vector type and its mask type.
+    simd_ops_common!(int8x16_t, uint8x16_t);
+
+    /// Adds `x` and `y` using `vaddq_s8`.
+    #[inline]
+    fn add(self, x: int8x16_t, y: int8x16_t) -> int8x16_t {
+        unsafe { vaddq_s8(x, y) }
+    }
+
+    /// Subtracts `y` from `x` using `vsubq_s8`.
+    #[inline]
+    fn sub(self, x: int8x16_t, y: int8x16_t) -> int8x16_t {
+        unsafe { vsubq_s8(x, y) }
+    }
+
+    /// Multiplies `x` and `y` using `vmulq_s8`.
+    #[inline]
+    fn mul(self, x: int8x16_t, y: int8x16_t) -> int8x16_t {
+        unsafe { vmulq_s8(x, y) }
+    }
+
+    /// Creates a vector with every lane set to `x`, using `vdupq_n_s8`.
+    #[inline]
+    fn splat(self, x: i8) -> int8x16_t {
+        unsafe { vdupq_n_s8(x) }
+    }
+
+    /// Returns the `x < y` comparison mask, computed with `vcltq_s8`.
+    #[inline]
+    fn lt(self, x: int8x16_t, y: int8x16_t) -> uint8x16_t {
+        unsafe { vcltq_s8(x, y) }
+    }
+
+    /// Returns the `x <= y` comparison mask, computed with `vcleq_s8`.
+    #[inline]
+    fn le(self, x: int8x16_t, y: int8x16_t) -> uint8x16_t {
+        unsafe { vcleq_s8(x, y) }
+    }
+
+    /// Returns the `x == y` comparison mask, computed with `vceqq_s8`.
+    #[inline]
+    fn eq(self, x: int8x16_t, y: int8x16_t) -> uint8x16_t {
+        unsafe { vceqq_s8(x, y) }
+    }
+
+    /// Returns the `x >= y` comparison mask, computed with `vcgeq_s8`.
+    #[inline]
+    fn ge(self, x: int8x16_t, y: int8x16_t) -> uint8x16_t {
+        unsafe { vcgeq_s8(x, y) }
+    }
+
+    /// Returns the `x > y` comparison mask, computed with `vcgtq_s8`.
+    #[inline]
+    fn gt(self, x: int8x16_t, y: int8x16_t) -> uint8x16_t {
+        unsafe { vcgtq_s8(x, y) }
+    }
+
+    /// Loads a vector from `ptr` using `vld1q_s8`.
+    ///
+    /// # Safety
+    ///
+    /// Presumably `ptr` must be valid for reading 16 `i8` values; the trait's
+    /// exact contract is not visible here — confirm against the `SimdOps`
+    /// trait definition.
+    #[inline]
+    unsafe fn load_ptr(self, ptr: *const i8) -> int8x16_t {
+        unsafe { vld1q_s8(ptr) }
+    }
+
+    /// Returns a mask whose lanes at index `< n` are `u8::MAX` and whose
+    /// remaining lanes are zero. Any `n >= 16` sets every lane.
+    #[inline]
+    fn first_n_mask(self, n: usize) -> uint8x16_t {
+        // Build the lane values in a stack array, then load them as a vector.
+        let mask: [u8; 16] = std::array::from_fn(|i| if i < n { u8::MAX } else { 0 });
+        // `mask` holds 16 `u8`s, matching the 16 lanes that `vld1q_u8` reads.
+        unsafe { vld1q_u8(mask.as_ptr()) }
+    }
+
+    /// Combines `x` and `y` under `mask` using `vbslq_s8(mask, x, y)`: bits
+    /// set in `mask` take the corresponding bits of `x`, cleared bits take
+    /// those of `y`.
+    #[inline]
+    fn select(self, x: int8x16_t, y: int8x16_t, mask: <int8x16_t as Simd>::Mask) -> int8x16_t {
+        unsafe { vbslq_s8(mask, x, y) }
+    }
+
+    /// Stores `x` to `ptr` using `vst1q_s8`.
+    ///
+    /// # Safety
+    ///
+    /// Presumably `ptr` must be valid for writing 16 `i8` values; the trait's
+    /// exact contract is not visible here — confirm against the `SimdOps`
+    /// trait definition.
+    #[inline]
+    unsafe fn store_ptr(self, x: int8x16_t, ptr: *mut i8) {
+        unsafe { vst1q_s8(ptr, x) }
+    }
+}
+
+/// Integer-only operations on `int8x16_t` vectors for `ArmNeonIsa`.
+impl SimdIntOps<int8x16_t> for ArmNeonIsa {
+    /// Negates `x` using `vnegq_s8`.
+    #[inline]
+    fn neg(self, x: int8x16_t) -> int8x16_t {
+        unsafe { vnegq_s8(x) }
+    }
+
+    /// Shifts `x` left by the compile-time constant `SHIFT` using
+    /// `vshlq_n_s8`.
+    #[inline]
+    fn shift_left<const SHIFT: i32>(self, x: int8x16_t) -> int8x16_t {
+        unsafe { vshlq_n_s8::<SHIFT>(x) }
+    }
+}
+
 macro_rules! impl_mask {
     ($mask:ty, $elem:ty, $len:expr) => {
         impl Mask for $mask {
@@ -379,6 +468,7 @@ macro_rules! impl_mask {
 
 impl_mask!(uint32x4_t, u32, 4);
 impl_mask!(uint16x8_t, u16, 8);
+impl_mask!(uint8x16_t, u8, 16);
 
 unsafe impl MaskOps<uint32x4_t> for ArmNeonIsa {
     #[inline]
@@ -394,6 +484,13 @@ unsafe impl MaskOps<uint16x8_t> for ArmNeonIsa {
     }
 }
 
+/// Mask operations on `uint8x16_t`, the mask type used for `int8x16_t`
+/// vectors.
+unsafe impl MaskOps<uint8x16_t> for ArmNeonIsa {
+    /// Returns the bitwise AND of masks `x` and `y`, computed with `vandq_u8`.
+    #[inline]
+    fn and(self, x: uint8x16_t, y: uint8x16_t) -> uint8x16_t {
+        unsafe { vandq_u8(x, y) }
+    }
+}
+
 macro_rules! simd_common {
     ($mask:ty, $len:expr) => {
         type Array = [Self::Elem; $len];
@@ -436,3 +533,4 @@ macro_rules! impl_simd {
 impl_simd!(float32x4_t, f32, 4, uint32x4_t);
 impl_simd!(int32x4_t, i32, 4, uint32x4_t);
 impl_simd!(int16x8_t, i16, 8, uint16x8_t);
+impl_simd!(int8x16_t, i8, 16, uint8x16_t);
@@ -18,6 +18,10 @@ pub struct I32x4([i32; LEN_X32]);
 #[derive(Copy, Clone, Debug)]
 pub struct I16x8([i16; LEN_X32 * 2]);
 
+/// Array-backed vector of `LEN_X32 * 4` `i8` lanes, aligned to 16 bytes.
+///
+/// Used as `GenericIsa::I8`; the same type is also registered as its own
+/// mask type.
+#[repr(align(16))]
+#[derive(Copy, Clone, Debug)]
+pub struct I8x16([i8; LEN_X32 * 4]);
+
 #[derive(Copy, Clone)]
 pub struct GenericIsa {
     _private: (),
@@ -40,6 +44,7 @@ unsafe impl Isa for GenericIsa {
     type F32 = F32x4;
     type I32 = I32x4;
     type I16 = I16x8;
+    type I8 = I8x16;
     type Bits = I32x4;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -53,6 +58,10 @@ unsafe impl Isa for GenericIsa {
     fn i16(self) -> impl SimdIntOps<Self::I16> {
         self
     }
+
+    /// Returns the `SimdIntOps` implementation for `Self::I8` (`I8x16`)
+    /// vectors, which is `self`.
+    fn i8(self) -> impl SimdIntOps<Self::I8> {
+        self
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -247,6 +256,7 @@ macro_rules! impl_simd_int_ops {
 
 impl_simd_int_ops!(I32x4, i32, 4, I32x4);
 impl_simd_int_ops!(I16x8, i16, 8, I16x8);
+impl_simd_int_ops!(I8x16, i8, 16, I8x16);
 
 macro_rules! impl_mask {
     ($mask:ident, $len:expr) => {
@@ -272,6 +282,7 @@ macro_rules! impl_mask {
 
 impl_mask!(I32x4, LEN_X32);
 impl_mask!(I16x8, LEN_X32 * 2);
+impl_mask!(I8x16, LEN_X32 * 4);
 
 macro_rules! impl_simd {
     ($simd:ty, $elem:ty, $mask:ty, $len:expr) => {
@@ -304,3 +315,4 @@ macro_rules! impl_simd {
 impl_simd!(F32x4, f32, I32x4, 4);
 impl_simd!(I32x4, i32, I32x4, 4);
 impl_simd!(I16x8, i16, I16x8, 8);
+impl_simd!(I8x16, i8, I8x16, 16);
@@ -1,10 +1,11 @@
 use std::arch::wasm32::{
     f32x4_abs, f32x4_add, f32x4_div, f32x4_eq, f32x4_extract_lane, f32x4_ge, f32x4_gt, f32x4_le,
     f32x4_lt, f32x4_max, f32x4_min, f32x4_mul, f32x4_nearest, f32x4_neg, f32x4_splat, f32x4_sub,
-    i16x8_add, i16x8_eq, i16x8_ge, i16x8_gt, i16x8_mul, i16x8_neg, i16x8_shl, i16x8_splat,
-    i16x8_sub, i32x4_add, i32x4_eq, i32x4_ge, i32x4_gt, i32x4_mul, i32x4_neg, i32x4_shl,
-    i32x4_shuffle, i32x4_splat, i32x4_sub, i32x4_trunc_sat_f32x4, v128, v128_and, v128_bitselect,
-    v128_load, v128_store,
+    i16x8_add, i16x8_eq, i16x8_extmul_high_i8x16, i16x8_extmul_low_i8x16, i16x8_ge, i16x8_gt,
+    i16x8_mul, i16x8_neg, i16x8_shl, i16x8_splat, i16x8_sub, i32x4_add, i32x4_eq, i32x4_ge,
+    i32x4_gt, i32x4_mul, i32x4_neg, i32x4_shl, i32x4_shuffle, i32x4_splat, i32x4_sub,
+    i32x4_trunc_sat_f32x4, i8x16_add, i8x16_eq, i8x16_ge, i8x16_gt, i8x16_neg, i8x16_shl,
+    i8x16_shuffle, i8x16_splat, i8x16_sub, v128, v128_and, v128_bitselect, v128_load, v128_store,
 };
 use std::mem::transmute;
 
@@ -14,6 +15,7 @@ use crate::safe::{Isa, Mask, MaskOps, Simd, SimdFloatOps, SimdIntOps, SimdOps};
 simd_type!(F32x4, v128, f32, I32x4, Wasm32Isa);
 simd_type!(I32x4, v128, i32, I32x4, Wasm32Isa);
 simd_type!(I16x8, v128, i16, I16x8, Wasm32Isa);
+simd_type!(I8x16, v128, i8, I8x16, Wasm32Isa);
 
 #[derive(Copy, Clone)]
 pub struct Wasm32Isa {
@@ -32,6 +34,7 @@ unsafe impl Isa for Wasm32Isa {
     type F32 = F32x4;
     type I32 = I32x4;
     type I16 = I16x8;
+    type I8 = I8x16;
     type Bits = I32x4;
 
     fn f32(self) -> impl SimdFloatOps<Self::F32, Int = Self::I32> {
@@ -45,6 +48,10 @@ unsafe impl Isa for Wasm32Isa {
     fn i16(self) -> impl SimdIntOps<Self::I16> {
         self
     }
+
+    /// Returns the `SimdIntOps` implementation for `Self::I8` (`I8x16`)
+    /// vectors. `Wasm32Isa` provides these operations itself, so this simply
+    /// returns `self`.
+    fn i8(self) -> impl SimdIntOps<Self::I8> {
+        self
+    }
 }
 
 macro_rules! simd_ops_common {
@@ -325,6 +332,66 @@ impl SimdIntOps<I16x8> for Wasm32Isa {
     }
 }
 
+/// `i8` vector operations for `Wasm32Isa`. `I8x16` wraps a `v128` and is
+/// used both as the vector type and as its mask type.
+//
+// NOTE(review): `lt`, `le`, `select`, loads and stores are not defined in
+// this block; presumably they come from `simd_ops_common!` or from trait
+// default methods — confirm.
+unsafe impl SimdOps<I8x16> for Wasm32Isa {
+    // Items shared by all vector types, generated by the `simd_ops_common!`
+    // macro for this vector type, mask type and element type.
+    simd_ops_common!(I8x16, I8x16, i8);
+
+    /// Adds `x` and `y` using `i8x16_add`.
+    #[inline]
+    fn add(self, x: I8x16, y: I8x16) -> I8x16 {
+        I8x16(i8x16_add(x.0, y.0))
+    }
+
+    /// Subtracts `y` from `x` using `i8x16_sub`.
+    #[inline]
+    fn sub(self, x: I8x16, y: I8x16) -> I8x16 {
+        I8x16(i8x16_sub(x.0, y.0))
+    }
+
+    /// Multiplies `x` and `y`, keeping the low 8 bits of each lane's product.
+    ///
+    /// No 8-bit multiply intrinsic is used here. Instead the products are
+    /// computed at 16-bit width with the `extmul` intrinsics and then
+    /// narrowed back to 8 bits with a byte shuffle.
+    #[inline]
+    fn mul(self, x: I8x16, y: I8x16) -> I8x16 {
+        // 16-bit products of the low and high halves of the inputs.
+        let prod_low = i16x8_extmul_low_i8x16(x.0, y.0);
+        let prod_high = i16x8_extmul_high_i8x16(x.0, y.0);
+
+        // Select the even-indexed bytes of both products: shuffle indices
+        // 0-15 address `prod_low` and 16-31 address `prod_high`. Each even
+        // byte is the low byte of a 16-bit product, i.e. the product
+        // truncated to i8.
+        let prod_i8 = i8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(
+            prod_low, prod_high,
+        );
+
+        I8x16(prod_i8)
+    }
+
+    /// Creates a vector with every lane set to `x`, using `i8x16_splat`.
+    #[inline]
+    fn splat(self, x: i8) -> I8x16 {
+        I8x16(i8x16_splat(x))
+    }
+
+    /// Returns the `x == y` comparison mask, computed with `i8x16_eq`.
+    #[inline]
+    fn eq(self, x: I8x16, y: I8x16) -> I8x16 {
+        I8x16(i8x16_eq(x.0, y.0))
+    }
+
+    /// Returns the `x >= y` comparison mask, computed with `i8x16_ge`.
+    #[inline]
+    fn ge(self, x: I8x16, y: I8x16) -> I8x16 {
+        I8x16(i8x16_ge(x.0, y.0))
+    }
+
+    /// Returns the `x > y` comparison mask, computed with `i8x16_gt`.
+    #[inline]
+    fn gt(self, x: I8x16, y: I8x16) -> I8x16 {
+        I8x16(i8x16_gt(x.0, y.0))
+    }
+}
+
+/// Integer-only operations on `I8x16` vectors for `Wasm32Isa`.
+impl SimdIntOps<I8x16> for Wasm32Isa {
+    /// Negates `x` using `i8x16_neg`.
+    #[inline]
+    fn neg(self, x: I8x16) -> I8x16 {
+        I8x16(i8x16_neg(x.0))
+    }
+
+    /// Shifts `x` left by the compile-time constant `SHIFT` using
+    /// `i8x16_shl`.
+    #[inline]
+    fn shift_left<const SHIFT: i32>(self, x: I8x16) -> I8x16 {
+        // NOTE(review): the `as u32` cast turns a negative `SHIFT` into a
+        // large unsigned value. Presumably callers only pass small
+        // non-negative shifts — confirm.
+        I8x16(i8x16_shl(x.0, SHIFT as u32))
+    }
+}
+
 macro_rules! mask_type {
     ($mask:ident, $elem:ty, $len: expr) => {
         impl Mask for $mask {
@@ -348,3 +415,4 @@ macro_rules! mask_type {
 
 mask_type!(I32x4, i32, 4);
 mask_type!(I16x8, i16, 8);
+mask_type!(I8x16, i8, 16);