From f417be26b84ebd2306eef27b74c711dd9c17e99e Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 14:13:30 +0100 Subject: [PATCH 1/8] Loop unroll for common cases, fix kernel generation, added `Area` resampling method, aarch64 benchmark updates --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 26 +-- app/benches/resize_rgb/main.rs | 2 +- app/src/main.rs | 31 ++-- src/color_group.rs | 20 +-- src/compute_weights.rs | 238 +++++++++++++++++---------- src/fixed_point_horizontal.rs | 191 +++++++++++++++++----- src/floating_point_horizontal.rs | 189 +++++++++++++++++++--- src/floating_point_vertical.rs | 265 ++++++++++++++++++++++++++++--- src/handler_provider.rs | 2 + src/resize_floating_point.rs | 9 +- src/resize_nearest.rs | 8 +- src/sampler.rs | 48 ++++-- 14 files changed, 813 insertions(+), 220 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5cf0287..4f6bbb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -755,7 +755,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale-safe" -version = "0.1.2" +version = "0.1.3" dependencies = [ "num-traits", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 6de0700..bd1afeb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app"] } [package] name = "pic-scale-safe" -version = "0.1.2" +version = "0.1.3" edition = "2021" description = "Fast and safe performance image scaling" readme = "README.md" diff --git a/README.md b/README.md index b89959e..62572da 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,8 @@ cargo bench --bench resize_rgba --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |---------------------------|:--------:|:--------:| | image(aarch64) | 121.19 | 48.89 | -| pic-scale(aarch64) | 26.90 | 15.13 | -| fir(aarch64) | 25.93 | 11.30 | +| pic-scale(aarch64) | 11.89 | 8.92 | +| fir(aarch64) | 25.89 | 11.30 | | image(x86) | 192.52 | 88.63 | | pic-scale(x86) | 49.79 | 35.98 | | pic-scale(x86-cpu-native) | 27.21 | 20.48 | @@ -47,8 +47,8 @@ cargo bench --bench resize_rgb --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |--------------------|:--------:|:--------:| | image(aarch64) | 123.85 | 51.30 | -| pic-scale(aarch64) | 31.73 | 18.20 | -| fir(aarch64) | 24.04 | 11.37 | +| pic-scale(aarch64) | 17.23 | 12.32 | +| fir(aarch64) | 23.61 | 10.53 | | image(x86) | 201.52 | 90.82 | | pic-scale(x86) | 34.54 | 25.05 | | fir(x86) | 41.97 | 25.21 | @@ -61,9 +61,9 @@ cargo bench --bench resize_rgba_u16 --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |---------------------------|:--------:|:--------:| -| image(aarch64) | 123.27 | 52.91 | -| pic-scale(aarch64) | 28.041 | 18.89 | -| fir(aarch64) | 149.87 | 50.08 | +| image(aarch64) | 262.32 | 76.91 | +| pic-scale(aarch64) | 15.49 | 11.38 | +| fir(aarch64) | 141.78 | 50.08 | | image(x86) | 196.28 | 194.75 | | pic-scale(x86) | 59.89 | 57.99 | | pic-scale(x86-cpu-native) | 44.07 | 57.99 | @@ -77,9 +77,9 @@ cargo bench --bench resize_rgb_u16 --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |--------------------|:--------:|:--------:| -| image(aarch64) | 130.45 | 61.06 | -| pic-scale(aarch64) | 36.10 | 23.80 | -| fir(aarch64) | 122.01 | 43.36 | +| image(aarch64) | 130.45 | 57.38 | +| pic-scale(aarch64) | 16.17 | 12.11 | +| fir(aarch64) | 110.06 | 42.04 | Example comparison time for downscale RGBA 4928x3279 `f32` image in 4 times. 
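For context on what the `pic-scale` rows in these tables measure: the benchmarks call the crate's safe resize entry points directly, in the same way as the `app/src/main.rs` changes in this patch. Below is a minimal sketch of such a call; it reuses only names visible elsewhere in this diff (`resize_rgba8`, `ImageSize`, `ResamplingFunction::Lanczos3`), and the `Result<Vec<u8>, String>` return type is an assumption inferred from the other `resize_*` signatures touched here.

```rust
use pic_scale_safe::{resize_rgba8, ImageSize, ResamplingFunction};

// Downscale an RGBA8 buffer by 4x in each dimension, matching the benchmark setup above.
fn downscale_rgba8_by_4(rgba: &[u8], width: usize, height: usize) -> Result<Vec<u8>, String> {
    let src_size = ImageSize::new(width, height);
    let dst_size = ImageSize::new(width / 4, height / 4);
    // Lanczos3 corresponds to the first column of the benchmark tables.
    resize_rgba8(rgba, src_size, dst_size, ResamplingFunction::Lanczos3)
}
```
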
@@ -89,9 +89,9 @@ cargo bench --bench resize_rgba_f32 --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |--------------------|:--------:|:--------:| -| image(aarch64) | 100.16 | 51.21 | -| pic-scale(aarch64) | 43.04 | 19.16 | -| fir(aarch64) | 114.35 | 37.75 | +| image(aarch64) | 100.16 | 50.09 | +| pic-scale(aarch64) | 14.07 | 11.18 | +| fir(aarch64) | 105.30 | 37.75 | | image(x86) | 164.04 | 98.90 | | pic-scale(x86) | 57.39 | 43.84 | | fir(x86) | 60.30 | 29.92 | diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs index 57fb509..a1be0ed 100644 --- a/app/benches/resize_rgb/main.rs +++ b/app/benches/resize_rgb/main.rs @@ -57,7 +57,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("Image RGB: Bilinear", |b| { + c.bench_function("Image RGB: Lanczos3", |b| { b.iter(|| { _ = dyn_image.clone().resize_exact( dimensions.0 / 4, diff --git a/app/src/main.rs b/app/src/main.rs index db6299a..19a1c94 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -31,13 +31,14 @@ mod image_wrapper; use fast_image_resize::images::Image; use fast_image_resize::{CpuExtensions, FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{ - DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, ImageReader, Rgb, - RgbImage, + imageops, DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, + ImageReader, Rgb, RgbImage, }; use pic_scale_safe::{ - resize_floating_point, resize_rgb16, resize_rgb8, resize_rgba16, resize_rgba8, ImageSize, - ResamplingFunction, + resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, + resize_rgba16, resize_rgba8, ImageSize, ResamplingFunction, }; +use std::ops::{BitXor, Shr}; use std::time::Instant; fn main() { @@ -53,13 +54,16 @@ fn main() { let start = Instant::now(); let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); - let dst_size = ImageSize::new(dimensions.0 as usize * 4, dimensions.1 as usize * 4); + let dst_size = ImageSize::new( + (dimensions.0 as f32 + 1.) as usize, + (dimensions.1 as f32 + 1.) as usize, + ); let mut resized = resize_rgb8( &working_store, src_size, dst_size, - ResamplingFunction::MitchellNetravalli, + ResamplingFunction::Lanczos3, ) .unwrap(); @@ -68,10 +72,13 @@ fn main() { // let rgba_image = DynamicImage::ImageRgb16(ImageBuffer::, Vec>::from_vec(dimensions.0 * 4, dimensions.1 / 4, resized).unwrap()); // rgba_image.save_with_format("converted.png", ImageFormat::Png).unwrap(); - // let shifted = resized.iter().map(|&x| (x >> 8) as u8).collect::>(); + // let shifted = resized + // .iter() + // .map(|&x| (x * 255.) 
as u8) + // .collect::>(); image::save_buffer( - "converted.jpg", + "converted.png", &resized, dst_size.width as u32, dst_size.height as u32, @@ -83,7 +90,7 @@ fn main() { // let pixel_type: PixelType = PixelType::U8x3; // let src_image = // Image::from_slice_u8(dimensions.0, dimensions.1, &mut src_bytes, pixel_type).unwrap(); - // let mut dst_image = Image::new(dimensions.0 * 4, dimensions.1 * 4, pixel_type); + // let mut dst_image = Image::new(dimensions.0 / 8, dimensions.1 / 8, pixel_type); // // let mut resizer = Resizer::new(); // unsafe { @@ -97,7 +104,7 @@ fn main() { // &src_image, // &mut dst_image, // &ResizeOptions::new() - // .resize_alg(ResizeAlg::Convolution(FilterType::Mitchell)) + // .resize_alg(ResizeAlg::Convolution(FilterType::Bilinear)) // .use_alpha(false), // ) // .unwrap(); @@ -109,13 +116,13 @@ fn main() { // // let rgba_image = DynamicImage::ImageRgb8(RgbImage::from_raw(dst_image.width() as u32, dst_image.height() as u32, dst_image.buffer().to_vec()).unwrap()); // // rgba_image.save_with_format("fast_image.png", ImageFormat::Png).unwrap(); // image::save_buffer( - // "fast_image.jpg", + // "fast_image.png", // dst_image.buffer(), // dst_image.width(), // dst_image.height(), // image::ColorType::Rgb8, // ) - // .unwrap(); + // .unwrap(); } fn u8_to_u16(u8_buffer: &[u8]) -> &[u16] { diff --git a/src/color_group.rs b/src/color_group.rs index aa20b72..4d9daf0 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -74,30 +74,30 @@ where #[macro_export] macro_rules! fast_load_color_group { - ($store: expr, $channels: expr) => {{ + ($store: expr, $channels: expr, $vtype: ty) => {{ if $channels == 1 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: 0.as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 2 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: $store[1].as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 3 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: $store[1].as_(), b: $store[2].as_(), a: 0.as_(), } } else if $channels == 4 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: $store[1].as_(), b: $store[2].as_(), @@ -111,30 +111,30 @@ macro_rules! fast_load_color_group { #[macro_export] macro_rules! 
fast_load_color_group_with_offset { - ($store: expr, $channels: expr, $offset: expr) => {{ + ($store: expr, $channels: expr, $offset: expr, $vtype: ty) => {{ if $channels == 1 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: 0.as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 2 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: $store[$offset + 1].as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 3 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: $store[$offset + 1].as_(), b: $store[$offset + 2].as_(), a: 0.as_(), } } else if $channels == 4 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: $store[$offset + 1].as_(), b: $store[$offset + 2].as_(), diff --git a/src/compute_weights.rs b/src/compute_weights.rs index a7db42d..6022253 100644 --- a/src/compute_weights.rs +++ b/src/compute_weights.rs @@ -30,7 +30,8 @@ use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::math::{ConstPI, ConstSqrt2, Jinc}; use crate::sampler::ResamplingFunction; use num_traits::{AsPrimitive, Float, Signed}; -use std::ops::{AddAssign, MulAssign, Neg}; +use std::fmt::Debug; +use std::ops::{AddAssign, Div, MulAssign, Neg}; pub(crate) fn generate_weights( function: ResamplingFunction, @@ -47,13 +48,18 @@ where + MulAssign + AddAssign + AsPrimitive + + AsPrimitive + AsPrimitive + Jinc + ConstSqrt2 + Default - + AsPrimitive, + + AsPrimitive + + Div + + Debug, f32: AsPrimitive, f64: AsPrimitive, + i64: AsPrimitive, + i32: AsPrimitive, usize: AsPrimitive, { let resampling_filter = function.get_resampling_filter(); @@ -66,98 +72,170 @@ where let filter_base_size = resampling_filter.min_kernel_size; let resampling_function = resampling_filter.kernel; let window_func = resampling_filter.window; - let base_size: usize = (filter_base_size.as_() * filter_scale_cutoff).round().as_(); - // Kernel size must be always odd - let kernel_size = base_size * 2 + 1usize; - let filter_radius = base_size.as_(); - let filter_scale = 1f32.as_() / filter_scale_cutoff; - let mut weights: Vec = vec![T::default(); kernel_size * out_size]; - let mut local_filters = vec![T::default(); kernel_size]; - let mut filter_position = 0usize; - let blur_scale = match window_func { - None => 1f32.as_(), - Some(window) => { - if window.blur.as_() > 0f32.as_() { - 1f32.as_() / window.blur.as_() - } else { - 0f32.as_() - } - } - }; let mut bounds: Vec = vec![FilterBounds::new(0, 0); out_size]; - for (i, bound) in bounds.iter_mut().enumerate() { - let center_x = ((i.as_() + 0.5.as_()) * scale).min(in_size.as_()); - let mut weights_sum: T = 0f32.as_(); - - let start: usize = (center_x - filter_radius).floor().max(0f32.as_()).as_(); - let end: usize = (center_x + filter_radius) - .ceil() - .min(in_size.as_()) - .min(start.as_() + kernel_size.as_()) - .as_(); - - let center = center_x - 0.5.as_(); - - for (local_filter_iteration, k) in (start..end).enumerate() { - let dx = k.as_() - center; - let weight; - if let Some(resampling_window) = window_func { - let mut x = dx.abs(); - x = if resampling_window.blur.as_() > 0f32.as_() { - x * blur_scale + let is_area = resampling_filter.is_area_filter && scale < 1.as_(); + + if !is_area { + let base_size: usize = (filter_base_size.as_() * filter_scale_cutoff).round().as_(); + let kernel_size = base_size; + let filter_radius = base_size.as_() / 2.as_(); + let filter_scale = 1f32.as_() / filter_scale_cutoff; + let mut weights: Vec = vec![T::default(); kernel_size * 
out_size]; + let mut local_filters = vec![T::default(); kernel_size]; + let mut filter_position = 0usize; + let blur_scale = match window_func { + None => 1f32.as_(), + Some(window) => { + if window.blur.as_() > 0f32.as_() { + 1f32.as_() / window.blur.as_() } else { - x - }; - x = if x <= resampling_window.taper.as_() { 0f32.as_() + } + } + }; + + for (i, bound) in bounds.iter_mut().enumerate() { + let center_x = ((i.as_() + 0.5.as_()) * scale).min(in_size.as_()); + let mut weights_sum: T = 0f32.as_(); + + let start: usize = (center_x - filter_radius).floor().max(0f32.as_()).as_(); + let end: usize = (center_x + filter_radius) + .ceil() + .min(in_size.as_()) + .min(start.as_() + kernel_size.as_()) + .as_(); + + let center = center_x - 0.5.as_(); + + for (local_filter_iteration, k) in (start..end).enumerate() { + let dx = k.as_() - center; + let weight; + if let Some(resampling_window) = window_func { + let mut x = dx.abs(); + x = if resampling_window.blur.as_() > 0f32.as_() { + x * blur_scale + } else { + x + }; + x = if x <= resampling_window.taper.as_() { + 0f32.as_() + } else { + (x - resampling_window.taper.as_()) + / (1f32.as_() - resampling_window.taper.as_()) + }; + let window_producer = resampling_window.window; + let x_kernel_scaled = x * filter_scale; + let window = if x < resampling_window.window_size.as_() { + window_producer(x_kernel_scaled * resampling_window.window_size.as_()) + } else { + 0f32.as_() + }; + weight = window * resampling_function(x_kernel_scaled); } else { - (x - resampling_window.taper.as_()) - / (1f32.as_() - resampling_window.taper.as_()) - }; - let window_producer = resampling_window.window; - let x_kernel_scaled = x * filter_scale; - let window = if x < resampling_window.window_size.as_() { - window_producer(x_kernel_scaled * resampling_window.window_size.as_()) - } else { - 0f32.as_() - }; - weight = window * resampling_function(x_kernel_scaled); - } else { - let dx = dx.abs(); - weight = resampling_function(dx * filter_scale); + let dx = dx.abs(); + weight = resampling_function(dx * filter_scale); + } + weights_sum += weight; + local_filters[local_filter_iteration] = weight; + } + + let size = end - start; + + *bound = FilterBounds::new(start, size); + + if weights_sum != 0f32.as_() { + let recpeq = 1f32.as_() / weights_sum; + + for (dst, src) in weights + .iter_mut() + .skip(filter_position) + .take(size) + .zip(local_filters.iter().take(size)) + { + *dst = *src * recpeq; + } } - weights_sum += weight; - local_filters[local_filter_iteration] = weight; + + filter_position += kernel_size; } - let size = end - start; + FilterWeights::::new( + weights, + kernel_size, + kernel_size, + out_size, + filter_radius.as_(), + bounds, + ) + } else { + // Simulating INTER_AREA from OpenCV, for up scaling here, + // this is necessary because weight computation is different + // from any other func + let inv_scale: T = 1.as_() / scale; + let kernel_size = 2; + let filter_radius: T = 1.as_(); + let mut weights: Vec = vec![T::default(); kernel_size * out_size]; + let mut local_filters = vec![T::default(); kernel_size]; + let mut filter_position = 0usize; + + for (i, bound) in bounds.iter_mut().enumerate() { + let mut weights_sum: T = 0f32.as_(); + + let sx: T = (i.as_() * scale).floor(); + let fx = (i as i64 + 1).as_() - (sx + 1.as_()) * inv_scale; + let dx = if fx <= 0.as_() { + 0.as_() + } else { + fx - fx.floor() + }; + let dx = dx.abs(); + let weight0 = 1.as_() - dx; + let weight1: T = dx; + local_filters[0] = weight0; + local_filters[1] = weight1; - *bound = 
FilterBounds::new(start, size); + let start: usize = sx.floor().max(0f32.as_()).as_(); + let end: usize = (sx + kernel_size.as_()) + .ceil() + .min(in_size.as_()) + .min(start.as_() + kernel_size.as_()) + .as_(); - if weights_sum != 0f32.as_() { - let recpeq = 1f32.as_() / weights_sum; + let size = end - start; - for (dst, src) in weights - .iter_mut() - .skip(filter_position) - .take(size) - .zip(local_filters.iter().take(size)) - { - *dst = *src * recpeq; + weights_sum += weight0; + if size > 1 { + weights_sum += weight1; } + *bound = FilterBounds::new(start, size); + + if weights_sum != 0f32.as_() { + let recpeq = 1f32.as_() / weights_sum; + + for (dst, src) in weights + .iter_mut() + .skip(filter_position) + .take(size) + .zip(local_filters.iter().take(size)) + { + *dst = *src * recpeq; + } + } else { + weights[filter_position] = 1.as_(); + } + + filter_position += kernel_size; } - filter_position += kernel_size; + FilterWeights::new( + weights, + kernel_size, + kernel_size, + out_size, + filter_radius.as_(), + bounds, + ) } - - FilterWeights::::new( - weights, - kernel_size, - kernel_size, - out_size, - filter_radius.as_(), - bounds, - ) } diff --git a/src/fixed_point_horizontal.rs b/src/fixed_point_horizontal.rs index 8f90b71..92e4a77 100644 --- a/src/fixed_point_horizontal.rs +++ b/src/fixed_point_horizontal.rs @@ -30,14 +30,21 @@ use crate::color_group::ColorGroup; use crate::definitions::ROUNDING_CONST; use crate::filter_weights::FilterWeights; use crate::saturate_narrow::SaturateNarrow; -use crate::{fast_load_color_group, fast_store_color_group}; +use crate::{fast_load_color_group, fast_load_color_group_with_offset, fast_store_color_group}; use num_traits::AsPrimitive; -use std::ops::{AddAssign, Mul}; +use std::ops::{Add, AddAssign, Mul}; #[inline(always)] pub(crate) fn convolve_row_handler_fixed_point< T: Copy + 'static + AsPrimitive + Default, - J: Copy + 'static + AsPrimitive + Mul + AddAssign + SaturateNarrow + Default, + J: Copy + + 'static + + AsPrimitive + + Mul + + AddAssign + + SaturateNarrow + + Default + + Add, const CHANNELS: usize, >( src: &[T], @@ -60,19 +67,48 @@ pub(crate) fn convolve_row_handler_fixed_point< let mut sums = ColorGroup::::dup(ROUNDING_CONST.as_()); let start_x = bounds.start; + let bounds_size = bounds.size; let px = start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; - - for (&k_weight, src) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS); - sums += new_px * weight; + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + let sliced_weights = &weights[0..4]; + let weight0 = 
sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + } else { + let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + for (&k_weight, src) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); + let new_px = fast_load_color_group!(src, CHANNELS, J); + sums += new_px * weight; + } } let narrowed = sums.saturate_narrow(bit_depth); @@ -83,7 +119,14 @@ pub(crate) fn convolve_row_handler_fixed_point< #[inline(always)] pub(crate) fn convolve_row_handler_fixed_point_4< T: Copy + 'static + AsPrimitive + Default, - J: Copy + 'static + AsPrimitive + Mul + AddAssign + SaturateNarrow + Default, + J: Copy + + 'static + + AsPrimitive + + Mul + + AddAssign + + SaturateNarrow + + Default + + Add, const CHANNELS: usize, >( src: &[T], @@ -124,30 +167,102 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let start_x = bounds.start; let px = start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds.size * CHANNELS)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds.size * CHANNELS)]; - - for ((((&k_weight, src0), src1), src2), src3) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .zip(src_ptr1.chunks_exact(CHANNELS)) - .zip(src_ptr2.chunks_exact(CHANNELS)) - .zip(src_ptr3.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); - - let new_px0 = fast_load_color_group!(src0, CHANNELS); - let new_px1 = fast_load_color_group!(src1, CHANNELS); - let new_px2 = fast_load_color_group!(src2, CHANNELS); - let new_px3 = fast_load_color_group!(src3, CHANNELS); - - sums0 += new_px0 * weight; - sums1 += new_px1 * weight; - sums2 += new_px2 * weight; - sums3 += new_px3 * weight; + let bounds_size = bounds.size; + + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2 * CHANNELS)]; + + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1; + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * CHANNELS)]; + let src_ptr2 
= &src[(px + src_stride * 2)..(px + src_stride * 2 + 3 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3 * CHANNELS)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2; + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4 * CHANNELS)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3; + } else { + let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CHANNELS)]; + let src_ptr2 = + &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size * CHANNELS)]; + let src_ptr3 = + &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size * CHANNELS)]; + + for ((((&k_weight, src0), src1), src2), src3) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + 
.zip(src_ptr1.chunks_exact(CHANNELS)) + .zip(src_ptr2.chunks_exact(CHANNELS)) + .zip(src_ptr3.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); + + let new_px0 = fast_load_color_group!(src0, CHANNELS, J); + let new_px1 = fast_load_color_group!(src1, CHANNELS, J); + let new_px2 = fast_load_color_group!(src2, CHANNELS, J); + let new_px3 = fast_load_color_group!(src3, CHANNELS, J); + + sums0 += new_px0 * weight; + sums1 += new_px1 * weight; + sums2 += new_px2 * weight; + sums3 += new_px3 * weight; + } } let narrowed0 = sums0.saturate_narrow(bit_depth); diff --git a/src/floating_point_horizontal.rs b/src/floating_point_horizontal.rs index 9c9edb9..e9fcd75 100644 --- a/src/floating_point_horizontal.rs +++ b/src/floating_point_horizontal.rs @@ -29,7 +29,9 @@ use crate::color_group::ColorGroup; use crate::filter_weights::FilterWeights; use crate::mixed_storage::MixedStorage; -use crate::{fast_load_color_group, fast_mixed_store_color_group}; +use crate::{ + fast_load_color_group, fast_load_color_group_with_offset, fast_mixed_store_color_group, +}; use num_traits::{AsPrimitive, Float, MulAdd}; use std::ops::{Add, Mul}; @@ -81,7 +83,7 @@ pub(crate) fn convolve_row_handler_floating_point< .take(bounds.size) { let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS); + let new_px = fast_load_color_group!(src, CHANNELS, J); sums = sums.mul_add(new_px, weight); } @@ -143,30 +145,171 @@ pub(crate) fn convolve_row_handler_floating_point_4< let start_x = bounds.start; let px = start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds.size * CHANNELS)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds.size * CHANNELS)]; - for ((((&k_weight, src0), src1), src2), src3) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .zip(src_ptr1.chunks_exact(CHANNELS)) - .zip(src_ptr2.chunks_exact(CHANNELS)) - .zip(src_ptr3.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); + let bounds_size = bounds.size; + + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2 * CHANNELS)]; + + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ); + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3 * CHANNELS)]; 
+ let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3 * CHANNELS)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight2, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight2, + ); + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4 * CHANNELS)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight3, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight3, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ); + } else { + let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; + let src_ptr2 = + &src[(px + src_stride * 2)..(px + src_stride 
* 2 + bounds.size * CHANNELS)]; + let src_ptr3 = + &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds.size * CHANNELS)]; + + for ((((&k_weight, src0), src1), src2), src3) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + .zip(src_ptr1.chunks_exact(CHANNELS)) + .zip(src_ptr2.chunks_exact(CHANNELS)) + .zip(src_ptr3.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); - let new_px0 = fast_load_color_group!(src0, CHANNELS); - let new_px1 = fast_load_color_group!(src1, CHANNELS); - let new_px2 = fast_load_color_group!(src2, CHANNELS); - let new_px3 = fast_load_color_group!(src3, CHANNELS); + let new_px0 = fast_load_color_group!(src0, CHANNELS, J); + let new_px1 = fast_load_color_group!(src1, CHANNELS, J); + let new_px2 = fast_load_color_group!(src2, CHANNELS, J); + let new_px3 = fast_load_color_group!(src3, CHANNELS, J); - sums0 = sums0.mul_add(new_px0, weight); - sums1 = sums1.mul_add(new_px1, weight); - sums2 = sums2.mul_add(new_px2, weight); - sums3 = sums3.mul_add(new_px3, weight); + sums0 = sums0.mul_add(new_px0, weight); + sums1 = sums1.mul_add(new_px1, weight); + sums2 = sums2.mul_add(new_px2, weight); + sums3 = sums3.mul_add(new_px3, weight); + } } fast_mixed_store_color_group!(sums0, chunk0, CHANNELS, bit_depth); diff --git a/src/floating_point_vertical.rs b/src/floating_point_vertical.rs index 8872200..91cea1e 100644 --- a/src/floating_point_vertical.rs +++ b/src/floating_point_vertical.rs @@ -70,21 +70,175 @@ pub(crate) fn convolve_column_handler_floating_point_4< let v_start_px = x * CHANNELS; - for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { - let py = bounds.start + j; - let weight = k_weight.as_(); - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + CHANNELS * 4)]; - - let new_px0 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, 0); - let new_px1 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS); - let new_px2 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2); - let new_px3 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3); - - sums0 = sums0.mul_add(new_px0, weight); - sums1 = sums1.mul_add(new_px1, weight); - sums2 = sums2.mul_add(new_px2, weight); - sums3 = sums3.mul_add(new_px3, weight); + let bounds_start = bounds.start; + let bounds_size = bounds.size; + + if bounds_size == 2 { + let weights = &filter[0..2]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ); + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ); + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ); + } else if bounds_size == 3 { + let weights = &filter[0..3]; + let 
weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ); + + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight2, + ); + + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ); + + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight2, + ); + } else if bounds_size == 4 { + let weights = &filter[0..4]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ); + + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight3, + ); + + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight3, + ); + + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + 
fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ); + } else { + for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { + let py = bounds_start + j; + let weight = k_weight.as_(); + let offset = src_stride * py + v_start_px; + let src_ptr = &src[offset..(offset + CHANNELS * 4)]; + + let new_px0 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, 0, J); + let new_px1 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS, J); + let new_px2 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2, J); + let new_px3 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3, J); + + sums0 = sums0.mul_add(new_px0, weight); + sums1 = sums1.mul_add(new_px1, weight); + sums2 = sums2.mul_add(new_px2, weight); + sums3 = sums3.mul_add(new_px3, weight); + } } let v_dst = &mut dst[v_start_px..(v_start_px + CHANNELS * 4)]; @@ -142,15 +296,82 @@ pub(crate) fn convolve_column_handler_floating_point< let v_start_px = x * CHANNELS; - for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { - let py = bounds.start + j; - let weight = k_weight.as_(); - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + CHANNELS)]; + let bounds_size = bounds.size; + let bounds_start = bounds.start; + + if bounds_size == 2 { + let weights = &filter[0..2]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); + } else if bounds_size == 3 { + let weights = &filter[0..3]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ); + } else if bounds_size == 4 { + let weights = &filter[0..4]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + + sums0 = 
(fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ); + } else { + for (j, &k_weight) in filter.iter().take(bounds_size).enumerate() { + let py = bounds_start + j; + let weight = k_weight.as_(); + let offset = src_stride * py + v_start_px; + let src_ptr = &src[offset..(offset + CHANNELS)]; - let new_px0 = fast_load_color_group!(src_ptr, CHANNELS); + let new_px0 = fast_load_color_group!(src_ptr, CHANNELS, J); - sums0 = sums0.mul_add(new_px0, weight); + sums0 = sums0.mul_add(new_px0, weight); + } } fast_mixed_store_color_group!( diff --git a/src/handler_provider.rs b/src/handler_provider.rs index 3487fc5..ded4b30 100644 --- a/src/handler_provider.rs +++ b/src/handler_provider.rs @@ -90,6 +90,7 @@ where + Mul + AddAssign + SaturateNarrow + + Add + Default, i32: AsPrimitive, u8: AsPrimitive, @@ -131,6 +132,7 @@ where + Mul + AddAssign + SaturateNarrow + + Add + Default, i32: AsPrimitive, u16: AsPrimitive, diff --git a/src/resize_floating_point.rs b/src/resize_floating_point.rs index c3cfc64..a05d040 100644 --- a/src/resize_floating_point.rs +++ b/src/resize_floating_point.rs @@ -34,6 +34,7 @@ use crate::mixed_storage::MixedStorage; use crate::resize_nearest::resize_nearest; use crate::{ImageSize, ResamplingFunction}; use num_traits::{AsPrimitive, Float, MulAdd, Signed}; +use std::fmt::Debug; use std::ops::{AddAssign, MulAssign, Neg}; /// Resizing image using exact convolution @@ -82,13 +83,17 @@ where + AddAssign + AsPrimitive + AsPrimitive + + AsPrimitive + + AsPrimitive + Jinc + ConstSqrt2 + Default + AsPrimitive + Send - + Sync, - i32: AsPrimitive, + + Sync + + Debug, + i32: AsPrimitive + AsPrimitive, + i64: AsPrimitive, f32: AsPrimitive, f32: AsPrimitive, f64: AsPrimitive, diff --git a/src/resize_nearest.rs b/src/resize_nearest.rs index f302d23..dd7f10c 100644 --- a/src/resize_nearest.rs +++ b/src/resize_nearest.rs @@ -60,8 +60,12 @@ pub fn resize_nearest( iter.enumerate().for_each(|(y, dst_row)| { for (x, dst_chunk) in dst_row.chunks_exact_mut(CHANNELS).enumerate() { - let src_x = (x as f32 * x_scale + 0.5f32).min(clip_width).max(0f32) as usize; - let src_y = (y as f32 * y_scale + 0.5f32).min(clip_height).max(0f32) as usize; + let src_x = ((x as f32 + 0.5f32) * x_scale - 0.5f32) + .min(clip_width) + .max(0f32) as usize; + let src_y = ((y as f32 + 0.5f32) * y_scale - 0.5f32) + .min(clip_height) + .max(0f32) as usize; let src_offset_y = src_y * src_stride; let src_px = src_x * CHANNELS; let offset = src_offset_y + src_px; diff --git a/src/sampler.rs b/src/sampler.rs index e9f7ccb..bb30110 100644 --- a/src/sampler.rs +++ b/src/sampler.rs @@ -103,6 +103,8 @@ pub enum ResamplingFunction { Lagrange3, Lanczos6, Lanczos6Jinc, + /// This method replicates `INTER_AREA` behaviour from OpenCV + Area, } impl From for ResamplingFunction { @@ -146,6 +148,7 @@ impl From for ResamplingFunction { 35 => ResamplingFunction::Lagrange3, 36 => ResamplingFunction::Lanczos6, 37 => ResamplingFunction::Lanczos6Jinc, + 38 => ResamplingFunction::Area, _ => ResamplingFunction::Bilinear, } } @@ -176,6 +179,7 @@ pub struct ResamplingFilter { pub window: Option>, pub min_kernel_size: f32, pub is_resizable_kernel: bool, + pub is_area_filter: bool, } impl ResamplingFilter { @@ -185,6 +189,17 @@ impl ResamplingFilter { window: None, 
min_kernel_size, is_resizable_kernel: true, + is_area_filter: false, + } + } + + fn new_area(kernel: fn(T) -> T, min_kernel_size: f32) -> ResamplingFilter { + ResamplingFilter { + kernel, + window: None, + min_kernel_size, + is_resizable_kernel: true, + is_area_filter: true, } } @@ -198,6 +213,7 @@ impl ResamplingFilter { window: Some(window), min_kernel_size, is_resizable_kernel: true, + is_area_filter: false, } } @@ -207,6 +223,7 @@ impl ResamplingFilter { window: None, min_kernel_size, is_resizable_kernel: false, + is_area_filter: false, } } } @@ -232,37 +249,38 @@ impl ResamplingFunction { { match self { ResamplingFunction::Bilinear => ResamplingFilter::new(bilinear, 2f32), + ResamplingFunction::Area => ResamplingFilter::new_area(box_weight, 2f32), ResamplingFunction::Nearest => { // Just a stab for nearest - ResamplingFilter::new(bilinear, 1f32) + ResamplingFilter::new(bilinear, 2f32) } - ResamplingFunction::Cubic => ResamplingFilter::new(cubic_spline::, 2f32), + ResamplingFunction::Cubic => ResamplingFilter::new(cubic_spline, 2f32), ResamplingFunction::MitchellNetravalli => { - ResamplingFilter::new(mitchell_netravalli::, 2f32) + ResamplingFilter::new(mitchell_netravalli, 2f32) } ResamplingFunction::Lanczos3 => ResamplingFilter::new(lanczos3, 3f32), - ResamplingFunction::CatmullRom => ResamplingFilter::new(catmull_rom::, 2f32), - ResamplingFunction::Hermite => ResamplingFilter::new(hermite_spline::, 2f32), - ResamplingFunction::BSpline => ResamplingFilter::new(b_spline::, 2f32), + ResamplingFunction::CatmullRom => ResamplingFilter::new(catmull_rom, 2f32), + ResamplingFunction::Hermite => ResamplingFilter::new(hermite_spline, 2f32), + ResamplingFunction::BSpline => ResamplingFilter::new(b_spline, 2f32), ResamplingFunction::Hann => ResamplingFilter::new(hann, 3f32), - ResamplingFunction::Bicubic => ResamplingFilter::new(bicubic_spline::, 3f32), + ResamplingFunction::Bicubic => ResamplingFilter::new(bicubic_spline, 3f32), ResamplingFunction::Lanczos4 => ResamplingFilter::new(lanczos4, 4f32), ResamplingFunction::Lanczos2 => ResamplingFilter::new(lanczos2, 2f32), - ResamplingFunction::Hamming => ResamplingFilter::new(hamming, 1f32), - ResamplingFunction::Hanning => ResamplingFilter::new(hanning, 1f32), - ResamplingFunction::Welch => ResamplingFilter::new(welch, 1f32), - ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 1.5f32), + ResamplingFunction::Hamming => ResamplingFilter::new(hamming, 2f32), + ResamplingFunction::Hanning => ResamplingFilter::new(hanning, 2f32), + ResamplingFunction::Welch => ResamplingFilter::new(welch, 2f32), + ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 2f32), ResamplingFunction::Gaussian => ResamplingFilter::new(gaussian, 2f32), ResamplingFunction::Sphinx => ResamplingFilter::new(sphinx, 2f32), - ResamplingFunction::Bartlett => ResamplingFilter::new(bartlett, 1f32), - ResamplingFunction::Robidoux => ResamplingFilter::new(robidoux::, 2f32), - ResamplingFunction::RobidouxSharp => ResamplingFilter::new(robidoux_sharp::, 2f32), + ResamplingFunction::Bartlett => ResamplingFilter::new(bartlett, 2f32), + ResamplingFunction::Robidoux => ResamplingFilter::new(robidoux, 2f32), + ResamplingFunction::RobidouxSharp => ResamplingFilter::new(robidoux_sharp, 2f32), ResamplingFunction::Spline16 => ResamplingFilter::new_with_fixed_kernel(spline16, 2f32), ResamplingFunction::Spline36 => ResamplingFilter::new_with_fixed_kernel(spline36, 4f32), ResamplingFunction::Spline64 => ResamplingFilter::new_with_fixed_kernel(spline64, 6f32), 
ResamplingFunction::Kaiser => ResamplingFilter::new(kaiser, 2f32), ResamplingFunction::BartlettHann => ResamplingFilter::new(bartlett_hann, 2f32), - ResamplingFunction::Box => ResamplingFilter::new(box_weight, 0.5f32), + ResamplingFunction::Box => ResamplingFilter::new(box_weight, 2f32), ResamplingFunction::Bohman => ResamplingFilter::new(bohman, 2f32), ResamplingFunction::Lanczos2Jinc => ResamplingFilter::new(lanczos2_jinc, 2f32), ResamplingFunction::Lanczos3Jinc => ResamplingFilter::new(lanczos3_jinc, 3f32), From 69e4bdba931a019c81a5453fb6c71b3fca1d4d4c Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 26 Oct 2024 14:45:59 +0100 Subject: [PATCH 2/8] Bench updates for x86 --- README.md | 71 ++++++++++++++++++++++++++++++-------------------- command.ps1 | 4 +-- src/resizer.rs | 16 ++++++------ 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 62572da..e2d8bab 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,11 @@ cargo bench --bench resize_rgba --manifest-path ./app/Cargo.toml | pic-scale(aarch64) | 11.89 | 8.92 | | fir(aarch64) | 25.89 | 11.30 | | image(x86) | 192.52 | 88.63 | -| pic-scale(x86) | 49.79 | 35.98 | -| pic-scale(x86-cpu-native) | 27.21 | 20.48 | +| pic-scale(x86) | 25.50 | 18.37 | | fir(x86) | 42.89 | 24.13 | -| fir(x86-cpu-native) | 41.17 | 23.62 | +| image(x86-cpu-native) | 205.64 | 89.02 | +| pic-scale(x86-cpu-native) | 14.39 | 11.31 | +| fir(x86-cpu-native) | 41.21 | 22.77 | Example comparison time for downscale RGB 4928x3279 `8 bit` image in 4 times. @@ -44,14 +45,17 @@ Example comparison time for downscale RGB 4928x3279 `8 bit` image in 4 times. cargo bench --bench resize_rgb --manifest-path ./app/Cargo.toml ``` -| | Lanczos3 | Bilinear | -|--------------------|:--------:|:--------:| -| image(aarch64) | 123.85 | 51.30 | -| pic-scale(aarch64) | 17.23 | 12.32 | -| fir(aarch64) | 23.61 | 10.53 | -| image(x86) | 201.52 | 90.82 | -| pic-scale(x86) | 34.54 | 25.05 | -| fir(x86) | 41.97 | 25.21 | +| | Lanczos3 | Bilinear | +|---------------------------|:--------:|:--------:| +| image(aarch64) | 123.85 | 51.30 | +| pic-scale(aarch64) | 17.23 | 12.32 | +| fir(aarch64) | 23.61 | 10.53 | +| image(x86) | 201.52 | 90.82 | +| pic-scale(x86) | 27.17 | 21.08 | +| fir(x86) | 41.97 | 24.39 | +| image(x86-cpu-native) | 184.57 | 84.69 | +| pic-scale(x86-cpu-native) | 20.96 | 15.16 | +| fir(x86-cpu-native) | 41.49 | 20.38 | Example comparison time for downscale RGBA 4928x3279 `16 bit` image in 4 times. @@ -64,10 +68,12 @@ cargo bench --bench resize_rgba_u16 --manifest-path ./app/Cargo.toml | image(aarch64) | 262.32 | 76.91 | | pic-scale(aarch64) | 15.49 | 11.38 | | fir(aarch64) | 141.78 | 50.08 | -| image(x86) | 196.28 | 194.75 | -| pic-scale(x86) | 59.89 | 57.99 | -| pic-scale(x86-cpu-native) | 44.07 | 57.99 | -| fir(x86) | 52.73 | 28.35 | +| image(x86) | 196.28 | 107.78 | +| pic-scale(x86) | 57.48 | 50.85 | +| fir(x86) | 139.56 | 58.48 | +| image(x86-cpu-native) | 192.85 | 102.05 | +| pic-scale(x86-cpu-native) | 39.60 | 46.44 | +| fir(x86-cpu-native) | 101.48 | 52.58 | Example comparison time for downscale RGB 4928x3279 `16 bit` image in 4 times. @@ -75,11 +81,17 @@ Example comparison time for downscale RGB 4928x3279 `16 bit` image in 4 times. 
cargo bench --bench resize_rgb_u16 --manifest-path ./app/Cargo.toml ``` -| | Lanczos3 | Bilinear | -|--------------------|:--------:|:--------:| -| image(aarch64) | 130.45 | 57.38 | -| pic-scale(aarch64) | 16.17 | 12.11 | -| fir(aarch64) | 110.06 | 42.04 | +| | Lanczos3 | Bilinear | +|---------------------------|:--------:|:--------:| +| image(aarch64) | 130.45 | 57.38 | +| pic-scale(aarch64) | 16.17 | 12.11 | +| fir(aarch64) | 110.06 | 42.04 | +| image(x86) | 204.10 | 148.34 | +| pic-scale(x86) | 43.21 | 145.73 | +| fir(x86) | 210.28 | 51.29 | +| image(x86-cpu-native) | 190.21 | 98.42 | +| pic-scale(x86-cpu-native) | 33.48 | 28.50 | +| fir(x86-cpu-native) | 72.88 | 45.17 | Example comparison time for downscale RGBA 4928x3279 `f32` image in 4 times. @@ -87,14 +99,17 @@ Example comparison time for downscale RGBA 4928x3279 `f32` image in 4 times. cargo bench --bench resize_rgba_f32 --manifest-path ./app/Cargo.toml ``` -| | Lanczos3 | Bilinear | -|--------------------|:--------:|:--------:| -| image(aarch64) | 100.16 | 50.09 | -| pic-scale(aarch64) | 14.07 | 11.18 | -| fir(aarch64) | 105.30 | 37.75 | -| image(x86) | 164.04 | 98.90 | -| pic-scale(x86) | 57.39 | 43.84 | -| fir(x86) | 60.30 | 29.92 | +| | Lanczos3 | Bilinear | +|---------------------------|:--------:|:--------:| +| image(aarch64) | 100.16 | 50.09 | +| pic-scale(aarch64) | 14.07 | 11.18 | +| fir(aarch64) | 105.30 | 37.75 | +| image(x86) | 208.25 | 107.84 | +| pic-scale(x86) | 33.55 | 28.97 | +| fir(x86) | 92.38 | 74.12 | +| image(x86-cpu-native) | 162.83 | 108.54 | +| pic-scale(x86-cpu-native) | 33.13 | 28.54 | +| fir(x86-cpu-native) | 56.65 | 59.96 | This project is licensed under either of diff --git a/command.ps1 b/command.ps1 index 77258e2..f9f6155 100644 --- a/command.ps1 +++ b/command.ps1 @@ -1,2 +1,2 @@ -# $env:RUSTFLAGS = "-C target-cpu=native" -cargo bench --bench resize_rgba_u16 --manifest-path ./app/Cargo.toml \ No newline at end of file +$env:RUSTFLAGS = "-C target-cpu=native" +cargo bench --bench resize_rgba_f32 --manifest-path ./app/Cargo.toml \ No newline at end of file diff --git a/src/resizer.rs b/src/resizer.rs index 7311505..b405ba6 100644 --- a/src/resizer.rs +++ b/src/resizer.rs @@ -225,8 +225,8 @@ pub fn resize_rgba16( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( @@ -280,8 +280,8 @@ pub fn resize_rgb16( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( @@ -335,8 +335,8 @@ pub fn resize_plane16( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( @@ -390,8 +390,8 @@ pub fn resize_plane16_with_alpha( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - 
return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( From 59b19fb2b48f9bdbf84653089a92fe881e124365 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 18:12:55 +0100 Subject: [PATCH 3/8] Loop unroll for 6 kernel size, bugfix and improvement --- app/src/main.rs | 19 +-- src/fixed_point_horizontal.rs | 55 ++++++++- src/fixed_point_vertical.rs | 8 +- src/floating_point_horizontal.rs | 195 +++++++++++++++++++++++++++++-- src/floating_point_vertical.rs | 150 ++++++++++++++++++++++++ src/sampler.rs | 2 +- 6 files changed, 405 insertions(+), 24 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index 19a1c94..bc2ea6c 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -34,10 +34,7 @@ use image::{ imageops, DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, ImageReader, Rgb, RgbImage, }; -use pic_scale_safe::{ - resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, - resize_rgba16, resize_rgba8, ImageSize, ResamplingFunction, -}; +use pic_scale_safe::{premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize, ResamplingFunction}; use std::ops::{BitXor, Shr}; use std::time::Instant; @@ -47,7 +44,7 @@ fn main() { .decode() .unwrap(); let dimensions = img.dimensions(); - let transient = img.to_rgb8(); + let transient = img.to_rgba8(); let mut working_store = transient.to_vec(); @@ -55,11 +52,13 @@ fn main() { let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); let dst_size = ImageSize::new( - (dimensions.0 as f32 + 1.) as usize, - (dimensions.1 as f32 + 1.) 
as usize, + dimensions.0 as usize / 2, + dimensions.1 as usize / 2, ); - let mut resized = resize_rgb8( + premultiply_rgba8(&mut working_store); + + let mut resized = resize_rgba8( &working_store, src_size, dst_size, @@ -67,6 +66,8 @@ fn main() { ) .unwrap(); + // unpremultiply_rgba8(&mut resized); + println!("Working time {:?}", start.elapsed()); // let rgba_image = DynamicImage::ImageRgb16(ImageBuffer::, Vec>::from_vec(dimensions.0 * 4, dimensions.1 / 4, resized).unwrap()); @@ -82,7 +83,7 @@ fn main() { &resized, dst_size.width as u32, dst_size.height as u32, - image::ColorType::Rgb8, + image::ColorType::Rgba8, ) .unwrap(); diff --git a/src/fixed_point_horizontal.rs b/src/fixed_point_horizontal.rs index 92e4a77..c6ea69b 100644 --- a/src/fixed_point_horizontal.rs +++ b/src/fixed_point_horizontal.rs @@ -98,8 +98,24 @@ pub(crate) fn convolve_row_handler_fixed_point< + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; } else { - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; for (&k_weight, src) in weights .iter() .zip(src_ptr0.chunks_exact(CHANNELS)) @@ -235,6 +251,43 @@ pub(crate) fn convolve_row_handler_fixed_point_4< + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3; + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + 
fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J) * weight5; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J) * weight5; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CHANNELS)]; diff --git a/src/fixed_point_vertical.rs b/src/fixed_point_vertical.rs index 82ff83f..819c2b8 100644 --- a/src/fixed_point_vertical.rs +++ b/src/fixed_point_vertical.rs @@ -68,7 +68,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { - let py = bounds.start + j; + let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; @@ -127,7 +127,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { - let py = bounds.start + j; + let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; @@ -207,7 +207,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_four< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { - let py = bounds.start + j; + let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; @@ -305,7 +305,7 @@ pub(crate) fn column_handler_fixed_point< cx += 8; } - while cx + 1 < total_width { + while cx < total_width { convolve_column_handler_fixed_point_direct_buffer::( src, src_stride, dst, weight, bounds, bit_depth, cx, ); diff --git a/src/floating_point_horizontal.rs b/src/floating_point_horizontal.rs index e9fcd75..ccf9b63 100644 --- a/src/floating_point_horizontal.rs +++ b/src/floating_point_horizontal.rs @@ -72,19 +72,99 @@ pub(crate) fn convolve_row_handler_floating_point< let mut sums = ColorGroup::::dup(0.as_()); let start_x = bounds.start; + let bounds_size = bounds.size; let px = 
start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; - for (&k_weight, src) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS, J); - sums = sums.mul_add(new_px, weight); + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ); + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ); + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ); + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + weight5, + ); + } else { + let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + for (&k_weight, src) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); + let new_px = fast_load_color_group!(src, CHANNELS, J); + sums = sums.mul_add(new_px, weight); + } } fast_mixed_store_color_group!(sums, chunk, CHANNELS, bit_depth); @@ -282,6 +362,103 @@ pub(crate) fn convolve_row_handler_floating_point_4< fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6 * 
CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + weight5, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J), + weight5, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J), + weight5, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J), + weight5, + ); } else { let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; diff --git a/src/floating_point_vertical.rs b/src/floating_point_vertical.rs index 91cea1e..6ec9000 100644 --- a/src/floating_point_vertical.rs +++ b/src/floating_point_vertical.rs @@ -222,6 +222,114 @@ pub(crate) fn convolve_column_handler_floating_point_4< fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); + } else if bounds_size == 6 { + let weights = &filter[0..6]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let weight4 = weights[4].as_(); + let weight5 = weights[5].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + 
v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let offset4 = src_stride * (bounds_start + 4) + v_start_px; + let offset5 = src_stride * (bounds_start + 5) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + let src_ptr4 = &src[offset4..(offset4 + CHANNELS * 4)]; + let src_ptr5 = &src[offset5..(offset5 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + weight5, + ); + + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS, J), + weight5, + ); + + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 2, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 2, J), + weight5, + ); + + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 3, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 3, J), + weight5, + ); } else { for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { let py = bounds_start + j; @@ -361,6 +469,48 @@ pub(crate) fn convolve_column_handler_floating_point< fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ); + } else if bounds_size == 6 { + let weights = &filter[0..6]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let weight4 = weights[4].as_(); + let weight5 = weights[5].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * 
(bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let offset4 = src_stride * (bounds_start + 4) + v_start_px; + let offset5 = src_stride * (bounds_start + 5) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + let src_ptr4 = &src[offset4..(offset4 + CHANNELS * 4)]; + let src_ptr5 = &src[offset5..(offset5 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + weight5, + ); } else { for (j, &k_weight) in filter.iter().take(bounds_size).enumerate() { let py = bounds_start + j; diff --git a/src/sampler.rs b/src/sampler.rs index bb30110..2d90493 100644 --- a/src/sampler.rs +++ b/src/sampler.rs @@ -269,7 +269,7 @@ impl ResamplingFunction { ResamplingFunction::Hamming => ResamplingFilter::new(hamming, 2f32), ResamplingFunction::Hanning => ResamplingFilter::new(hanning, 2f32), ResamplingFunction::Welch => ResamplingFilter::new(welch, 2f32), - ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 2f32), + ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 3f32), ResamplingFunction::Gaussian => ResamplingFilter::new(gaussian, 2f32), ResamplingFunction::Sphinx => ResamplingFilter::new(sphinx, 2f32), ResamplingFunction::Bartlett => ResamplingFilter::new(bartlett, 2f32), From 2487b043e3926a28aefb14ee7b5a4c7d6f71aa49 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:09:46 +0100 Subject: [PATCH 4/8] Fix vertical fixed point bug pass, adding something beautiful to all alpha functions to gets auto-vectorization --- app/src/main.rs | 15 ++- src/alpha.rs | 179 ++++++++++++++++-------------------- src/fixed_point_vertical.rs | 3 + 3 files changed, 93 insertions(+), 104 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index bc2ea6c..6412152 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -34,7 +34,11 @@ use image::{ imageops, DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, ImageReader, Rgb, RgbImage, }; -use pic_scale_safe::{premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize, ResamplingFunction}; +use pic_scale_safe::{ + premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, + resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize, + ResamplingFunction, +}; use std::ops::{BitXor, Shr}; use std::time::Instant; @@ -51,13 +55,14 @@ fn main() { let start = Instant::now(); let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); - let dst_size = ImageSize::new( - dimensions.0 as usize / 2, - dimensions.1 as usize / 2, - ); + let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2); + + let start_mul = Instant::now(); premultiply_rgba8(&mut 
working_store); + println!("Alpha mul time {:?}", start_mul.elapsed()); + let mut resized = resize_rgba8( &working_store, src_size, diff --git a/src/alpha.rs b/src/alpha.rs index 1d178c8..fd8bb5e 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -27,6 +27,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#[inline] +fn div_by_255(v: u16) -> u8 { + ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8 +} + /// Associate alpha in place /// /// Note, for scaling alpha must be *associated* @@ -36,17 +41,14 @@ /// * `in_place`: Slice to where premultiply /// pub fn premultiply_rgba8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(4) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u16; - let mut r = chunk[0] as u16; - let mut g = chunk[1] as u16; - let mut b = chunk[2] as u16; - r = (r * a) / 255; - g = (g * a) / 255; - b = (b * a) / 255; - chunk[0] = r as u8; - chunk[1] = g as u8; - chunk[2] = b as u8; + chunk[0] = div_by_255(chunk[0] as u16 * a); + chunk[1] = div_by_255(chunk[1] as u16 * a); + chunk[2] = div_by_255(chunk[2] as u16 * a); + chunk[3] = div_by_255(a * a); } } @@ -60,23 +62,17 @@ pub fn premultiply_rgba8(in_place: &mut [u8]) { /// /// pub fn unpremultiply_rgba8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(4) { - let a = chunk[3] as u16; - let mut r = chunk[0] as u16; - let mut g = chunk[1] as u16; - let mut b = chunk[2] as u16; - if a == 0 { - r = 0; - g = 0; - b = 0; - } else { - r = (r * 255) / a; - g = (g * 255) / a; - b = (b * 255) / a; + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(4) { + let a = chunk[3]; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as f32 * 255.) * a_recip) as u8; + chunk[1] = ((chunk[1] as f32 * 255.) * a_recip) as u8; + chunk[2] = ((chunk[2] as f32 * 255.) * a_recip) as u8; + chunk[3] = ((a as f32 * 255.) * a_recip) as u8; } - chunk[0] = r as u8; - chunk[1] = g as u8; - chunk[2] = b as u8; } } @@ -89,11 +85,12 @@ pub fn unpremultiply_rgba8(in_place: &mut [u8]) { /// * `in_place`: Slice to where premultiply /// pub fn premultiply_la8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(2) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(2) { let a = chunk[1] as u16; - let mut r = chunk[0] as u16; - r = (r * a) / 255; - chunk[0] = r as u8; + chunk[0] = div_by_255(chunk[0] as u16 * a); + chunk[1] = div_by_255(chunk[1] as u16 * a); } } @@ -107,15 +104,15 @@ pub fn premultiply_la8(in_place: &mut [u8]) { /// /// pub fn unpremultiply_la8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(2) { - let a = chunk[1] as u16; - let mut r = chunk[0] as u16; - if a == 0 { - r = 0; - } else { - r = (r * 255) / a; + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(2) { + let a = chunk[1]; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as f32 * 255.) * a_recip) as u8; + chunk[1] = ((a as f32 * 255.) 
* a_recip) as u8; } - chunk[0] = r as u8; } } @@ -129,19 +126,20 @@ pub fn unpremultiply_la8(in_place: &mut [u8]) { /// * `bit_depth`: Bit-depth of the image /// pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(4) { + let recip_max_colors = 1. / max_colors as f32; + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u32; - let mut r = chunk[0] as u32; - let mut g = chunk[1] as u32; - let mut b = chunk[2] as u32; - r = (r * a) / max_colors; - g = (g * a) / max_colors; - b = (b * a) / max_colors; - chunk[0] = r as u16; - chunk[1] = g as u16; - chunk[2] = b as u16; + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[3] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; } } @@ -155,13 +153,16 @@ pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { /// * `bit_depth`: Bit-depth of the image /// pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(2) { + let recip_max_colors = 1. / max_colors as f32; + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[1] as u32; - let mut r = chunk[0] as u32; - r = (r * a) / max_colors; - chunk[0] = r as u16; + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[1] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; } } @@ -176,17 +177,17 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { /// /// pub fn unpremultiply_la16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(2) { + for chunk in in_place.chunks_exact_mut(2) { let a = chunk[1] as u32; - let mut r = chunk[0] as u32; - if a == 0 { - r = 0; - } else { - r = (r * max_colors) / a; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[1] = ((a * max_colors) as f32 * a_recip) as u16; } - chunk[0] = r as u16; } } @@ -201,25 +202,19 @@ pub fn unpremultiply_la16(in_place: &mut [u16], bit_depth: u32) { /// /// pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. 
assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(4) { + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u32; - let mut r = chunk[0] as u32; - let mut g = chunk[1] as u32; - let mut b = chunk[2] as u32; - if a == 0 { - r = 0; - g = 0; - b = 0; - } else { - r = (r * max_colors) / a; - g = (g * max_colors) / a; - b = (b * max_colors) / a; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[1] = ((chunk[1] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[2] = ((chunk[2] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[3] = ((a * max_colors) as f32 * a_recip) as u16; } - chunk[0] = r as u16; - chunk[1] = g as u16; - chunk[2] = b as u16; } } @@ -232,17 +227,12 @@ pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { /// * `in_place`: Slice to where premultiply /// pub fn premultiply_rgba_f32(in_place: &mut [f32]) { - for chunk in in_place.chunks_mut(4) { + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3]; - let mut r = chunk[0]; - let mut g = chunk[1]; - let mut b = chunk[2]; - r *= a; - g *= a; - b *= a; - chunk[0] = r; - chunk[1] = g; - chunk[2] = b; + chunk[0] = chunk[0] * a; + chunk[1] = chunk[1] * a; + chunk[2] = chunk[2] * a; + chunk[3] = a; } } @@ -256,22 +246,13 @@ pub fn premultiply_rgba_f32(in_place: &mut [f32]) { /// /// pub fn unpremultiply_rgba_f32(in_place: &mut [f32]) { - for chunk in in_place.chunks_mut(4) { + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3]; - let mut r = chunk[0]; - let mut g = chunk[1]; - let mut b = chunk[2]; - if a == 0. { - r = 0.; - g = 0.; - b = 0.; - } else { - r /= a; - g /= a; - b /= a; + if a != 0. { + let a_recip = 1. 
/ a; + chunk[0] *= a_recip; + chunk[1] *= a_recip; + chunk[2] *= a_recip; } - chunk[0] = r; - chunk[1] = g; - chunk[2] = b; } } diff --git a/src/fixed_point_vertical.rs b/src/fixed_point_vertical.rs index 819c2b8..fefc7d2 100644 --- a/src/fixed_point_vertical.rs +++ b/src/fixed_point_vertical.rs @@ -68,6 +68,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; @@ -127,6 +128,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; @@ -207,6 +209,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_four< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; From a859f0e6e88036dccd864bb4ee29e4154b69ecd1 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:12:26 +0100 Subject: [PATCH 5/8] Fix clamp in alpha --- src/alpha.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/alpha.rs b/src/alpha.rs index fd8bb5e..6c384e8 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -133,13 +133,13 @@ pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { let recip_max_colors = 1. / max_colors as f32; for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u32; - chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[3] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; + chunk[3] = (((a * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; } } @@ -160,9 +160,9 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { let recip_max_colors = 1. 
/ max_colors as f32; for chunk in in_place.chunks_exact_mut(4) { let a = chunk[1] as u32; - chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[1] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; + chunk[1] = (((a * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; } } From c296b4c9f7b203380aeb7fd3a0cae818a4cf45fd Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:13:25 +0100 Subject: [PATCH 6/8] Fix clippy lints --- src/alpha.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/alpha.rs b/src/alpha.rs index 6c384e8..f8ad5e8 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -227,11 +227,13 @@ pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { /// * `in_place`: Slice to where premultiply /// pub fn premultiply_rgba_f32(in_place: &mut [f32]) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3]; - chunk[0] = chunk[0] * a; - chunk[1] = chunk[1] * a; - chunk[2] = chunk[2] * a; + chunk[0] *= a; + chunk[1] *= a; + chunk[2] *= a; chunk[3] = a; } } @@ -253,6 +255,7 @@ pub fn unpremultiply_rgba_f32(in_place: &mut [f32]) { chunk[0] *= a_recip; chunk[1] *= a_recip; chunk[2] *= a_recip; + chunk[3] = a; } } } From 24beec4655c41be3c17b5a9d480d1e647d5b2485 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:49:04 +0100 Subject: [PATCH 7/8] Fix readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 62572da..0ed5e06 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,6 @@ let img = ImageReader::open("./assets/nasa-4928x3279.png") let dimensions = img.dimensions(); let transient = img.to_rgb8(); -let start = Instant::now(); - let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); From 2ba1f584185d310ac4778f2e7851b0a53dfc8e17 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 26 Oct 2024 20:53:35 +0100 Subject: [PATCH 8/8] Fix la16 --- src/alpha.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alpha.rs b/src/alpha.rs index f8ad5e8..813f9e1 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -158,7 +158,7 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; let recip_max_colors = 1. / max_colors as f32; - for chunk in in_place.chunks_exact_mut(4) { + for chunk in in_place.chunks_exact_mut(2) { let a = chunk[1] as u32; chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16;
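
For reference, the alpha rework in PATCH 4/8 replaces the per-channel `(x * a) / 255` integer division with the branch-free `div_by_255` helper so the premultiply loops can auto-vectorize. The sketch below is a standalone check — not part of any patch above — that the shift-and-add form used in `src/alpha.rs` is an exact rounded division by 255 over the whole premultiplication range; the `main` harness and the `f64` reference are illustrative additions only.

```rust
// Standalone sketch: verify that the shift-and-add `div_by_255` used in
// `src/alpha.rs` is an exact rounded division by 255 for every product of
// two 8-bit values (the premultiplication domain). The harness below is
// illustrative and not part of the library.

#[inline]
fn div_by_255(v: u16) -> u8 {
    // Equivalent to (v + 128 + ((v + 128) >> 8)) >> 8, written as in the patch.
    ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8
}

fn main() {
    for c in 0u16..=255 {
        for a in 0u16..=255 {
            let v = c * a; // max 65025, fits in u16 with no overflow inside the helper
            let exact = (v as f64 / 255.0).round() as u8;
            assert_eq!(div_by_255(v), exact, "mismatch at c={c}, a={a}");
        }
    }
    println!("div_by_255 matches round(v / 255) for all 8-bit products");
}
```

Because the helper is exact for products of two 8-bit values, swapping it in for the old truncating `(x * a) / 255` changes each channel by at most one unit (round-to-nearest instead of truncation), with no approximation error on top of that.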