From f417be26b84ebd2306eef27b74c711dd9c17e99e Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 14:13:30 +0100 Subject: [PATCH 1/8] Loop unroll for common cases, fix kernel generation, added `Area` resampling method, aarch64 benchmark updates --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 26 +-- app/benches/resize_rgb/main.rs | 2 +- app/src/main.rs | 31 ++-- src/color_group.rs | 20 +-- src/compute_weights.rs | 238 +++++++++++++++++---------- src/fixed_point_horizontal.rs | 191 +++++++++++++++++----- src/floating_point_horizontal.rs | 189 +++++++++++++++++++--- src/floating_point_vertical.rs | 265 ++++++++++++++++++++++++++++--- src/handler_provider.rs | 2 + src/resize_floating_point.rs | 9 +- src/resize_nearest.rs | 8 +- src/sampler.rs | 48 ++++-- 14 files changed, 813 insertions(+), 220 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5cf0287..4f6bbb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -755,7 +755,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale-safe" -version = "0.1.2" +version = "0.1.3" dependencies = [ "num-traits", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 6de0700..bd1afeb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app"] } [package] name = "pic-scale-safe" -version = "0.1.2" +version = "0.1.3" edition = "2021" description = "Fast and safe performance image scaling" readme = "README.md" diff --git a/README.md b/README.md index b89959e..62572da 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,8 @@ cargo bench --bench resize_rgba --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |---------------------------|:--------:|:--------:| | image(aarch64) | 121.19 | 48.89 | -| pic-scale(aarch64) | 26.90 | 15.13 | -| fir(aarch64) | 25.93 | 11.30 | +| pic-scale(aarch64) | 11.89 | 8.92 | +| fir(aarch64) | 25.89 | 11.30 | | image(x86) | 192.52 | 88.63 | | pic-scale(x86) | 49.79 | 35.98 | | pic-scale(x86-cpu-native) | 27.21 | 20.48 | @@ -47,8 +47,8 @@ cargo bench --bench resize_rgb --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |--------------------|:--------:|:--------:| | image(aarch64) | 123.85 | 51.30 | -| pic-scale(aarch64) | 31.73 | 18.20 | -| fir(aarch64) | 24.04 | 11.37 | +| pic-scale(aarch64) | 17.23 | 12.32 | +| fir(aarch64) | 23.61 | 10.53 | | image(x86) | 201.52 | 90.82 | | pic-scale(x86) | 34.54 | 25.05 | | fir(x86) | 41.97 | 25.21 | @@ -61,9 +61,9 @@ cargo bench --bench resize_rgba_u16 --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |---------------------------|:--------:|:--------:| -| image(aarch64) | 123.27 | 52.91 | -| pic-scale(aarch64) | 28.041 | 18.89 | -| fir(aarch64) | 149.87 | 50.08 | +| image(aarch64) | 262.32 | 76.91 | +| pic-scale(aarch64) | 15.49 | 11.38 | +| fir(aarch64) | 141.78 | 50.08 | | image(x86) | 196.28 | 194.75 | | pic-scale(x86) | 59.89 | 57.99 | | pic-scale(x86-cpu-native) | 44.07 | 57.99 | @@ -77,9 +77,9 @@ cargo bench --bench resize_rgb_u16 --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |--------------------|:--------:|:--------:| -| image(aarch64) | 130.45 | 61.06 | -| pic-scale(aarch64) | 36.10 | 23.80 | -| fir(aarch64) | 122.01 | 43.36 | +| image(aarch64) | 130.45 | 57.38 | +| pic-scale(aarch64) | 16.17 | 12.11 | +| fir(aarch64) | 110.06 | 42.04 | Example comparison time for downscale RGBA 4928x3279 `f32` image in 4 times. 
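For context on what the `pic-scale` rows in these tables measure: the benchmarks call the crate's safe resize entry points directly, in the same way as the `app/src/main.rs` changes in this patch. Below is a minimal sketch of such a call; it reuses only names visible elsewhere in this diff (`resize_rgba8`, `ImageSize`, `ResamplingFunction::Lanczos3`), and the `Result<Vec<u8>, String>` return type is an assumption inferred from the other `resize_*` signatures touched here.

```rust
use pic_scale_safe::{resize_rgba8, ImageSize, ResamplingFunction};

// Downscale an RGBA8 buffer by 4x in each dimension, matching the benchmark setup above.
fn downscale_rgba8_by_4(rgba: &[u8], width: usize, height: usize) -> Result<Vec<u8>, String> {
    let src_size = ImageSize::new(width, height);
    let dst_size = ImageSize::new(width / 4, height / 4);
    // Lanczos3 corresponds to the first column of the benchmark tables.
    resize_rgba8(rgba, src_size, dst_size, ResamplingFunction::Lanczos3)
}
```
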
@@ -89,9 +89,9 @@ cargo bench --bench resize_rgba_f32 --manifest-path ./app/Cargo.toml | | Lanczos3 | Bilinear | |--------------------|:--------:|:--------:| -| image(aarch64) | 100.16 | 51.21 | -| pic-scale(aarch64) | 43.04 | 19.16 | -| fir(aarch64) | 114.35 | 37.75 | +| image(aarch64) | 100.16 | 50.09 | +| pic-scale(aarch64) | 14.07 | 11.18 | +| fir(aarch64) | 105.30 | 37.75 | | image(x86) | 164.04 | 98.90 | | pic-scale(x86) | 57.39 | 43.84 | | fir(x86) | 60.30 | 29.92 | diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs index 57fb509..a1be0ed 100644 --- a/app/benches/resize_rgb/main.rs +++ b/app/benches/resize_rgb/main.rs @@ -57,7 +57,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("Image RGB: Bilinear", |b| { + c.bench_function("Image RGB: Lanczos3", |b| { b.iter(|| { _ = dyn_image.clone().resize_exact( dimensions.0 / 4, diff --git a/app/src/main.rs b/app/src/main.rs index db6299a..19a1c94 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -31,13 +31,14 @@ mod image_wrapper; use fast_image_resize::images::Image; use fast_image_resize::{CpuExtensions, FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{ - DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, ImageReader, Rgb, - RgbImage, + imageops, DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, + ImageReader, Rgb, RgbImage, }; use pic_scale_safe::{ - resize_floating_point, resize_rgb16, resize_rgb8, resize_rgba16, resize_rgba8, ImageSize, - ResamplingFunction, + resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, + resize_rgba16, resize_rgba8, ImageSize, ResamplingFunction, }; +use std::ops::{BitXor, Shr}; use std::time::Instant; fn main() { @@ -53,13 +54,16 @@ fn main() { let start = Instant::now(); let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); - let dst_size = ImageSize::new(dimensions.0 as usize * 4, dimensions.1 as usize * 4); + let dst_size = ImageSize::new( + (dimensions.0 as f32 + 1.) as usize, + (dimensions.1 as f32 + 1.) as usize, + ); let mut resized = resize_rgb8( &working_store, src_size, dst_size, - ResamplingFunction::MitchellNetravalli, + ResamplingFunction::Lanczos3, ) .unwrap(); @@ -68,10 +72,13 @@ fn main() { // let rgba_image = DynamicImage::ImageRgb16(ImageBuffer::, Vec>::from_vec(dimensions.0 * 4, dimensions.1 / 4, resized).unwrap()); // rgba_image.save_with_format("converted.png", ImageFormat::Png).unwrap(); - // let shifted = resized.iter().map(|&x| (x >> 8) as u8).collect::>(); + // let shifted = resized + // .iter() + // .map(|&x| (x * 255.) 
as u8) + // .collect::>(); image::save_buffer( - "converted.jpg", + "converted.png", &resized, dst_size.width as u32, dst_size.height as u32, @@ -83,7 +90,7 @@ fn main() { // let pixel_type: PixelType = PixelType::U8x3; // let src_image = // Image::from_slice_u8(dimensions.0, dimensions.1, &mut src_bytes, pixel_type).unwrap(); - // let mut dst_image = Image::new(dimensions.0 * 4, dimensions.1 * 4, pixel_type); + // let mut dst_image = Image::new(dimensions.0 / 8, dimensions.1 / 8, pixel_type); // // let mut resizer = Resizer::new(); // unsafe { @@ -97,7 +104,7 @@ fn main() { // &src_image, // &mut dst_image, // &ResizeOptions::new() - // .resize_alg(ResizeAlg::Convolution(FilterType::Mitchell)) + // .resize_alg(ResizeAlg::Convolution(FilterType::Bilinear)) // .use_alpha(false), // ) // .unwrap(); @@ -109,13 +116,13 @@ fn main() { // // let rgba_image = DynamicImage::ImageRgb8(RgbImage::from_raw(dst_image.width() as u32, dst_image.height() as u32, dst_image.buffer().to_vec()).unwrap()); // // rgba_image.save_with_format("fast_image.png", ImageFormat::Png).unwrap(); // image::save_buffer( - // "fast_image.jpg", + // "fast_image.png", // dst_image.buffer(), // dst_image.width(), // dst_image.height(), // image::ColorType::Rgb8, // ) - // .unwrap(); + // .unwrap(); } fn u8_to_u16(u8_buffer: &[u8]) -> &[u16] { diff --git a/src/color_group.rs b/src/color_group.rs index aa20b72..4d9daf0 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -74,30 +74,30 @@ where #[macro_export] macro_rules! fast_load_color_group { - ($store: expr, $channels: expr) => {{ + ($store: expr, $channels: expr, $vtype: ty) => {{ if $channels == 1 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: 0.as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 2 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: $store[1].as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 3 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: $store[1].as_(), b: $store[2].as_(), a: 0.as_(), } } else if $channels == 4 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[0].as_(), g: $store[1].as_(), b: $store[2].as_(), @@ -111,30 +111,30 @@ macro_rules! fast_load_color_group { #[macro_export] macro_rules! 
fast_load_color_group_with_offset { - ($store: expr, $channels: expr, $offset: expr) => {{ + ($store: expr, $channels: expr, $offset: expr, $vtype: ty) => {{ if $channels == 1 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: 0.as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 2 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: $store[$offset + 1].as_(), b: 0.as_(), a: 0.as_(), } } else if $channels == 3 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: $store[$offset + 1].as_(), b: $store[$offset + 2].as_(), a: 0.as_(), } } else if $channels == 4 { - ColorGroup { + ColorGroup::<$channels, $vtype> { r: $store[$offset].as_(), g: $store[$offset + 1].as_(), b: $store[$offset + 2].as_(), diff --git a/src/compute_weights.rs b/src/compute_weights.rs index a7db42d..6022253 100644 --- a/src/compute_weights.rs +++ b/src/compute_weights.rs @@ -30,7 +30,8 @@ use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::math::{ConstPI, ConstSqrt2, Jinc}; use crate::sampler::ResamplingFunction; use num_traits::{AsPrimitive, Float, Signed}; -use std::ops::{AddAssign, MulAssign, Neg}; +use std::fmt::Debug; +use std::ops::{AddAssign, Div, MulAssign, Neg}; pub(crate) fn generate_weights( function: ResamplingFunction, @@ -47,13 +48,18 @@ where + MulAssign + AddAssign + AsPrimitive + + AsPrimitive + AsPrimitive + Jinc + ConstSqrt2 + Default - + AsPrimitive, + + AsPrimitive + + Div + + Debug, f32: AsPrimitive, f64: AsPrimitive, + i64: AsPrimitive, + i32: AsPrimitive, usize: AsPrimitive, { let resampling_filter = function.get_resampling_filter(); @@ -66,98 +72,170 @@ where let filter_base_size = resampling_filter.min_kernel_size; let resampling_function = resampling_filter.kernel; let window_func = resampling_filter.window; - let base_size: usize = (filter_base_size.as_() * filter_scale_cutoff).round().as_(); - // Kernel size must be always odd - let kernel_size = base_size * 2 + 1usize; - let filter_radius = base_size.as_(); - let filter_scale = 1f32.as_() / filter_scale_cutoff; - let mut weights: Vec = vec![T::default(); kernel_size * out_size]; - let mut local_filters = vec![T::default(); kernel_size]; - let mut filter_position = 0usize; - let blur_scale = match window_func { - None => 1f32.as_(), - Some(window) => { - if window.blur.as_() > 0f32.as_() { - 1f32.as_() / window.blur.as_() - } else { - 0f32.as_() - } - } - }; let mut bounds: Vec = vec![FilterBounds::new(0, 0); out_size]; - for (i, bound) in bounds.iter_mut().enumerate() { - let center_x = ((i.as_() + 0.5.as_()) * scale).min(in_size.as_()); - let mut weights_sum: T = 0f32.as_(); - - let start: usize = (center_x - filter_radius).floor().max(0f32.as_()).as_(); - let end: usize = (center_x + filter_radius) - .ceil() - .min(in_size.as_()) - .min(start.as_() + kernel_size.as_()) - .as_(); - - let center = center_x - 0.5.as_(); - - for (local_filter_iteration, k) in (start..end).enumerate() { - let dx = k.as_() - center; - let weight; - if let Some(resampling_window) = window_func { - let mut x = dx.abs(); - x = if resampling_window.blur.as_() > 0f32.as_() { - x * blur_scale + let is_area = resampling_filter.is_area_filter && scale < 1.as_(); + + if !is_area { + let base_size: usize = (filter_base_size.as_() * filter_scale_cutoff).round().as_(); + let kernel_size = base_size; + let filter_radius = base_size.as_() / 2.as_(); + let filter_scale = 1f32.as_() / filter_scale_cutoff; + let mut weights: Vec = vec![T::default(); kernel_size * 
out_size]; + let mut local_filters = vec![T::default(); kernel_size]; + let mut filter_position = 0usize; + let blur_scale = match window_func { + None => 1f32.as_(), + Some(window) => { + if window.blur.as_() > 0f32.as_() { + 1f32.as_() / window.blur.as_() } else { - x - }; - x = if x <= resampling_window.taper.as_() { 0f32.as_() + } + } + }; + + for (i, bound) in bounds.iter_mut().enumerate() { + let center_x = ((i.as_() + 0.5.as_()) * scale).min(in_size.as_()); + let mut weights_sum: T = 0f32.as_(); + + let start: usize = (center_x - filter_radius).floor().max(0f32.as_()).as_(); + let end: usize = (center_x + filter_radius) + .ceil() + .min(in_size.as_()) + .min(start.as_() + kernel_size.as_()) + .as_(); + + let center = center_x - 0.5.as_(); + + for (local_filter_iteration, k) in (start..end).enumerate() { + let dx = k.as_() - center; + let weight; + if let Some(resampling_window) = window_func { + let mut x = dx.abs(); + x = if resampling_window.blur.as_() > 0f32.as_() { + x * blur_scale + } else { + x + }; + x = if x <= resampling_window.taper.as_() { + 0f32.as_() + } else { + (x - resampling_window.taper.as_()) + / (1f32.as_() - resampling_window.taper.as_()) + }; + let window_producer = resampling_window.window; + let x_kernel_scaled = x * filter_scale; + let window = if x < resampling_window.window_size.as_() { + window_producer(x_kernel_scaled * resampling_window.window_size.as_()) + } else { + 0f32.as_() + }; + weight = window * resampling_function(x_kernel_scaled); } else { - (x - resampling_window.taper.as_()) - / (1f32.as_() - resampling_window.taper.as_()) - }; - let window_producer = resampling_window.window; - let x_kernel_scaled = x * filter_scale; - let window = if x < resampling_window.window_size.as_() { - window_producer(x_kernel_scaled * resampling_window.window_size.as_()) - } else { - 0f32.as_() - }; - weight = window * resampling_function(x_kernel_scaled); - } else { - let dx = dx.abs(); - weight = resampling_function(dx * filter_scale); + let dx = dx.abs(); + weight = resampling_function(dx * filter_scale); + } + weights_sum += weight; + local_filters[local_filter_iteration] = weight; + } + + let size = end - start; + + *bound = FilterBounds::new(start, size); + + if weights_sum != 0f32.as_() { + let recpeq = 1f32.as_() / weights_sum; + + for (dst, src) in weights + .iter_mut() + .skip(filter_position) + .take(size) + .zip(local_filters.iter().take(size)) + { + *dst = *src * recpeq; + } } - weights_sum += weight; - local_filters[local_filter_iteration] = weight; + + filter_position += kernel_size; } - let size = end - start; + FilterWeights::::new( + weights, + kernel_size, + kernel_size, + out_size, + filter_radius.as_(), + bounds, + ) + } else { + // Simulating INTER_AREA from OpenCV, for up scaling here, + // this is necessary because weight computation is different + // from any other func + let inv_scale: T = 1.as_() / scale; + let kernel_size = 2; + let filter_radius: T = 1.as_(); + let mut weights: Vec = vec![T::default(); kernel_size * out_size]; + let mut local_filters = vec![T::default(); kernel_size]; + let mut filter_position = 0usize; + + for (i, bound) in bounds.iter_mut().enumerate() { + let mut weights_sum: T = 0f32.as_(); + + let sx: T = (i.as_() * scale).floor(); + let fx = (i as i64 + 1).as_() - (sx + 1.as_()) * inv_scale; + let dx = if fx <= 0.as_() { + 0.as_() + } else { + fx - fx.floor() + }; + let dx = dx.abs(); + let weight0 = 1.as_() - dx; + let weight1: T = dx; + local_filters[0] = weight0; + local_filters[1] = weight1; - *bound = 
FilterBounds::new(start, size); + let start: usize = sx.floor().max(0f32.as_()).as_(); + let end: usize = (sx + kernel_size.as_()) + .ceil() + .min(in_size.as_()) + .min(start.as_() + kernel_size.as_()) + .as_(); - if weights_sum != 0f32.as_() { - let recpeq = 1f32.as_() / weights_sum; + let size = end - start; - for (dst, src) in weights - .iter_mut() - .skip(filter_position) - .take(size) - .zip(local_filters.iter().take(size)) - { - *dst = *src * recpeq; + weights_sum += weight0; + if size > 1 { + weights_sum += weight1; } + *bound = FilterBounds::new(start, size); + + if weights_sum != 0f32.as_() { + let recpeq = 1f32.as_() / weights_sum; + + for (dst, src) in weights + .iter_mut() + .skip(filter_position) + .take(size) + .zip(local_filters.iter().take(size)) + { + *dst = *src * recpeq; + } + } else { + weights[filter_position] = 1.as_(); + } + + filter_position += kernel_size; } - filter_position += kernel_size; + FilterWeights::new( + weights, + kernel_size, + kernel_size, + out_size, + filter_radius.as_(), + bounds, + ) } - - FilterWeights::::new( - weights, - kernel_size, - kernel_size, - out_size, - filter_radius.as_(), - bounds, - ) } diff --git a/src/fixed_point_horizontal.rs b/src/fixed_point_horizontal.rs index 8f90b71..92e4a77 100644 --- a/src/fixed_point_horizontal.rs +++ b/src/fixed_point_horizontal.rs @@ -30,14 +30,21 @@ use crate::color_group::ColorGroup; use crate::definitions::ROUNDING_CONST; use crate::filter_weights::FilterWeights; use crate::saturate_narrow::SaturateNarrow; -use crate::{fast_load_color_group, fast_store_color_group}; +use crate::{fast_load_color_group, fast_load_color_group_with_offset, fast_store_color_group}; use num_traits::AsPrimitive; -use std::ops::{AddAssign, Mul}; +use std::ops::{Add, AddAssign, Mul}; #[inline(always)] pub(crate) fn convolve_row_handler_fixed_point< T: Copy + 'static + AsPrimitive + Default, - J: Copy + 'static + AsPrimitive + Mul + AddAssign + SaturateNarrow + Default, + J: Copy + + 'static + + AsPrimitive + + Mul + + AddAssign + + SaturateNarrow + + Default + + Add, const CHANNELS: usize, >( src: &[T], @@ -60,19 +67,48 @@ pub(crate) fn convolve_row_handler_fixed_point< let mut sums = ColorGroup::::dup(ROUNDING_CONST.as_()); let start_x = bounds.start; + let bounds_size = bounds.size; let px = start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; - - for (&k_weight, src) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS); - sums += new_px * weight; + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + let sliced_weights = &weights[0..4]; + let weight0 = 
sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + } else { + let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + for (&k_weight, src) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); + let new_px = fast_load_color_group!(src, CHANNELS, J); + sums += new_px * weight; + } } let narrowed = sums.saturate_narrow(bit_depth); @@ -83,7 +119,14 @@ pub(crate) fn convolve_row_handler_fixed_point< #[inline(always)] pub(crate) fn convolve_row_handler_fixed_point_4< T: Copy + 'static + AsPrimitive + Default, - J: Copy + 'static + AsPrimitive + Mul + AddAssign + SaturateNarrow + Default, + J: Copy + + 'static + + AsPrimitive + + Mul + + AddAssign + + SaturateNarrow + + Default + + Add, const CHANNELS: usize, >( src: &[T], @@ -124,30 +167,102 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let start_x = bounds.start; let px = start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds.size * CHANNELS)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds.size * CHANNELS)]; - - for ((((&k_weight, src0), src1), src2), src3) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .zip(src_ptr1.chunks_exact(CHANNELS)) - .zip(src_ptr2.chunks_exact(CHANNELS)) - .zip(src_ptr3.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); - - let new_px0 = fast_load_color_group!(src0, CHANNELS); - let new_px1 = fast_load_color_group!(src1, CHANNELS); - let new_px2 = fast_load_color_group!(src2, CHANNELS); - let new_px3 = fast_load_color_group!(src3, CHANNELS); - - sums0 += new_px0 * weight; - sums1 += new_px1 * weight; - sums2 += new_px2 * weight; - sums3 += new_px3 * weight; + let bounds_size = bounds.size; + + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2 * CHANNELS)]; + + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1; + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * CHANNELS)]; + let src_ptr2 
= &src[(px + src_stride * 2)..(px + src_stride * 2 + 3 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3 * CHANNELS)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2; + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4 * CHANNELS)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3; + } else { + let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CHANNELS)]; + let src_ptr2 = + &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size * CHANNELS)]; + let src_ptr3 = + &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size * CHANNELS)]; + + for ((((&k_weight, src0), src1), src2), src3) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + 
.zip(src_ptr1.chunks_exact(CHANNELS)) + .zip(src_ptr2.chunks_exact(CHANNELS)) + .zip(src_ptr3.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); + + let new_px0 = fast_load_color_group!(src0, CHANNELS, J); + let new_px1 = fast_load_color_group!(src1, CHANNELS, J); + let new_px2 = fast_load_color_group!(src2, CHANNELS, J); + let new_px3 = fast_load_color_group!(src3, CHANNELS, J); + + sums0 += new_px0 * weight; + sums1 += new_px1 * weight; + sums2 += new_px2 * weight; + sums3 += new_px3 * weight; + } } let narrowed0 = sums0.saturate_narrow(bit_depth); diff --git a/src/floating_point_horizontal.rs b/src/floating_point_horizontal.rs index 9c9edb9..e9fcd75 100644 --- a/src/floating_point_horizontal.rs +++ b/src/floating_point_horizontal.rs @@ -29,7 +29,9 @@ use crate::color_group::ColorGroup; use crate::filter_weights::FilterWeights; use crate::mixed_storage::MixedStorage; -use crate::{fast_load_color_group, fast_mixed_store_color_group}; +use crate::{ + fast_load_color_group, fast_load_color_group_with_offset, fast_mixed_store_color_group, +}; use num_traits::{AsPrimitive, Float, MulAdd}; use std::ops::{Add, Mul}; @@ -81,7 +83,7 @@ pub(crate) fn convolve_row_handler_floating_point< .take(bounds.size) { let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS); + let new_px = fast_load_color_group!(src, CHANNELS, J); sums = sums.mul_add(new_px, weight); } @@ -143,30 +145,171 @@ pub(crate) fn convolve_row_handler_floating_point_4< let start_x = bounds.start; let px = start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds.size * CHANNELS)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds.size * CHANNELS)]; - for ((((&k_weight, src0), src1), src2), src3) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .zip(src_ptr1.chunks_exact(CHANNELS)) - .zip(src_ptr2.chunks_exact(CHANNELS)) - .zip(src_ptr3.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); + let bounds_size = bounds.size; + + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2 * CHANNELS)]; + + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ); + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3 * CHANNELS)]; 
+ let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3 * CHANNELS)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight2, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight2, + ); + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4 * CHANNELS)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight3, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight3, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ); + } else { + let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; + let src_ptr2 = + &src[(px + src_stride * 2)..(px + src_stride 
* 2 + bounds.size * CHANNELS)]; + let src_ptr3 = + &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds.size * CHANNELS)]; + + for ((((&k_weight, src0), src1), src2), src3) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + .zip(src_ptr1.chunks_exact(CHANNELS)) + .zip(src_ptr2.chunks_exact(CHANNELS)) + .zip(src_ptr3.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); - let new_px0 = fast_load_color_group!(src0, CHANNELS); - let new_px1 = fast_load_color_group!(src1, CHANNELS); - let new_px2 = fast_load_color_group!(src2, CHANNELS); - let new_px3 = fast_load_color_group!(src3, CHANNELS); + let new_px0 = fast_load_color_group!(src0, CHANNELS, J); + let new_px1 = fast_load_color_group!(src1, CHANNELS, J); + let new_px2 = fast_load_color_group!(src2, CHANNELS, J); + let new_px3 = fast_load_color_group!(src3, CHANNELS, J); - sums0 = sums0.mul_add(new_px0, weight); - sums1 = sums1.mul_add(new_px1, weight); - sums2 = sums2.mul_add(new_px2, weight); - sums3 = sums3.mul_add(new_px3, weight); + sums0 = sums0.mul_add(new_px0, weight); + sums1 = sums1.mul_add(new_px1, weight); + sums2 = sums2.mul_add(new_px2, weight); + sums3 = sums3.mul_add(new_px3, weight); + } } fast_mixed_store_color_group!(sums0, chunk0, CHANNELS, bit_depth); diff --git a/src/floating_point_vertical.rs b/src/floating_point_vertical.rs index 8872200..91cea1e 100644 --- a/src/floating_point_vertical.rs +++ b/src/floating_point_vertical.rs @@ -70,21 +70,175 @@ pub(crate) fn convolve_column_handler_floating_point_4< let v_start_px = x * CHANNELS; - for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { - let py = bounds.start + j; - let weight = k_weight.as_(); - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + CHANNELS * 4)]; - - let new_px0 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, 0); - let new_px1 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS); - let new_px2 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2); - let new_px3 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3); - - sums0 = sums0.mul_add(new_px0, weight); - sums1 = sums1.mul_add(new_px1, weight); - sums2 = sums2.mul_add(new_px2, weight); - sums3 = sums3.mul_add(new_px3, weight); + let bounds_start = bounds.start; + let bounds_size = bounds.size; + + if bounds_size == 2 { + let weights = &filter[0..2]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ); + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ); + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ); + } else if bounds_size == 3 { + let weights = &filter[0..3]; + let 
weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ); + + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight2, + ); + + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ); + + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight2, + ); + } else if bounds_size == 4 { + let weights = &filter[0..4]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ); + + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight3, + ); + + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight3, + ); + + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + 
fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ); + } else { + for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { + let py = bounds_start + j; + let weight = k_weight.as_(); + let offset = src_stride * py + v_start_px; + let src_ptr = &src[offset..(offset + CHANNELS * 4)]; + + let new_px0 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, 0, J); + let new_px1 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS, J); + let new_px2 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2, J); + let new_px3 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3, J); + + sums0 = sums0.mul_add(new_px0, weight); + sums1 = sums1.mul_add(new_px1, weight); + sums2 = sums2.mul_add(new_px2, weight); + sums3 = sums3.mul_add(new_px3, weight); + } } let v_dst = &mut dst[v_start_px..(v_start_px + CHANNELS * 4)]; @@ -142,15 +296,82 @@ pub(crate) fn convolve_column_handler_floating_point< let v_start_px = x * CHANNELS; - for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { - let py = bounds.start + j; - let weight = k_weight.as_(); - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + CHANNELS)]; + let bounds_size = bounds.size; + let bounds_start = bounds.start; + + if bounds_size == 2 { + let weights = &filter[0..2]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); + } else if bounds_size == 3 { + let weights = &filter[0..3]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ); + } else if bounds_size == 4 { + let weights = &filter[0..4]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + + sums0 = 
(fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ); + } else { + for (j, &k_weight) in filter.iter().take(bounds_size).enumerate() { + let py = bounds_start + j; + let weight = k_weight.as_(); + let offset = src_stride * py + v_start_px; + let src_ptr = &src[offset..(offset + CHANNELS)]; - let new_px0 = fast_load_color_group!(src_ptr, CHANNELS); + let new_px0 = fast_load_color_group!(src_ptr, CHANNELS, J); - sums0 = sums0.mul_add(new_px0, weight); + sums0 = sums0.mul_add(new_px0, weight); + } } fast_mixed_store_color_group!( diff --git a/src/handler_provider.rs b/src/handler_provider.rs index 3487fc5..ded4b30 100644 --- a/src/handler_provider.rs +++ b/src/handler_provider.rs @@ -90,6 +90,7 @@ where + Mul + AddAssign + SaturateNarrow + + Add + Default, i32: AsPrimitive, u8: AsPrimitive, @@ -131,6 +132,7 @@ where + Mul + AddAssign + SaturateNarrow + + Add + Default, i32: AsPrimitive, u16: AsPrimitive, diff --git a/src/resize_floating_point.rs b/src/resize_floating_point.rs index c3cfc64..a05d040 100644 --- a/src/resize_floating_point.rs +++ b/src/resize_floating_point.rs @@ -34,6 +34,7 @@ use crate::mixed_storage::MixedStorage; use crate::resize_nearest::resize_nearest; use crate::{ImageSize, ResamplingFunction}; use num_traits::{AsPrimitive, Float, MulAdd, Signed}; +use std::fmt::Debug; use std::ops::{AddAssign, MulAssign, Neg}; /// Resizing image using exact convolution @@ -82,13 +83,17 @@ where + AddAssign + AsPrimitive + AsPrimitive + + AsPrimitive + + AsPrimitive + Jinc + ConstSqrt2 + Default + AsPrimitive + Send - + Sync, - i32: AsPrimitive, + + Sync + + Debug, + i32: AsPrimitive + AsPrimitive, + i64: AsPrimitive, f32: AsPrimitive, f32: AsPrimitive, f64: AsPrimitive, diff --git a/src/resize_nearest.rs b/src/resize_nearest.rs index f302d23..dd7f10c 100644 --- a/src/resize_nearest.rs +++ b/src/resize_nearest.rs @@ -60,8 +60,12 @@ pub fn resize_nearest( iter.enumerate().for_each(|(y, dst_row)| { for (x, dst_chunk) in dst_row.chunks_exact_mut(CHANNELS).enumerate() { - let src_x = (x as f32 * x_scale + 0.5f32).min(clip_width).max(0f32) as usize; - let src_y = (y as f32 * y_scale + 0.5f32).min(clip_height).max(0f32) as usize; + let src_x = ((x as f32 + 0.5f32) * x_scale - 0.5f32) + .min(clip_width) + .max(0f32) as usize; + let src_y = ((y as f32 + 0.5f32) * y_scale - 0.5f32) + .min(clip_height) + .max(0f32) as usize; let src_offset_y = src_y * src_stride; let src_px = src_x * CHANNELS; let offset = src_offset_y + src_px; diff --git a/src/sampler.rs b/src/sampler.rs index e9f7ccb..bb30110 100644 --- a/src/sampler.rs +++ b/src/sampler.rs @@ -103,6 +103,8 @@ pub enum ResamplingFunction { Lagrange3, Lanczos6, Lanczos6Jinc, + /// This method replicates `INTER_AREA` behaviour from OpenCV + Area, } impl From for ResamplingFunction { @@ -146,6 +148,7 @@ impl From for ResamplingFunction { 35 => ResamplingFunction::Lagrange3, 36 => ResamplingFunction::Lanczos6, 37 => ResamplingFunction::Lanczos6Jinc, + 38 => ResamplingFunction::Area, _ => ResamplingFunction::Bilinear, } } @@ -176,6 +179,7 @@ pub struct ResamplingFilter { pub window: Option>, pub min_kernel_size: f32, pub is_resizable_kernel: bool, + pub is_area_filter: bool, } impl ResamplingFilter { @@ -185,6 +189,17 @@ impl ResamplingFilter { window: None, 
min_kernel_size, is_resizable_kernel: true, + is_area_filter: false, + } + } + + fn new_area(kernel: fn(T) -> T, min_kernel_size: f32) -> ResamplingFilter { + ResamplingFilter { + kernel, + window: None, + min_kernel_size, + is_resizable_kernel: true, + is_area_filter: true, } } @@ -198,6 +213,7 @@ impl ResamplingFilter { window: Some(window), min_kernel_size, is_resizable_kernel: true, + is_area_filter: false, } } @@ -207,6 +223,7 @@ impl ResamplingFilter { window: None, min_kernel_size, is_resizable_kernel: false, + is_area_filter: false, } } } @@ -232,37 +249,38 @@ impl ResamplingFunction { { match self { ResamplingFunction::Bilinear => ResamplingFilter::new(bilinear, 2f32), + ResamplingFunction::Area => ResamplingFilter::new_area(box_weight, 2f32), ResamplingFunction::Nearest => { // Just a stab for nearest - ResamplingFilter::new(bilinear, 1f32) + ResamplingFilter::new(bilinear, 2f32) } - ResamplingFunction::Cubic => ResamplingFilter::new(cubic_spline::, 2f32), + ResamplingFunction::Cubic => ResamplingFilter::new(cubic_spline, 2f32), ResamplingFunction::MitchellNetravalli => { - ResamplingFilter::new(mitchell_netravalli::, 2f32) + ResamplingFilter::new(mitchell_netravalli, 2f32) } ResamplingFunction::Lanczos3 => ResamplingFilter::new(lanczos3, 3f32), - ResamplingFunction::CatmullRom => ResamplingFilter::new(catmull_rom::, 2f32), - ResamplingFunction::Hermite => ResamplingFilter::new(hermite_spline::, 2f32), - ResamplingFunction::BSpline => ResamplingFilter::new(b_spline::, 2f32), + ResamplingFunction::CatmullRom => ResamplingFilter::new(catmull_rom, 2f32), + ResamplingFunction::Hermite => ResamplingFilter::new(hermite_spline, 2f32), + ResamplingFunction::BSpline => ResamplingFilter::new(b_spline, 2f32), ResamplingFunction::Hann => ResamplingFilter::new(hann, 3f32), - ResamplingFunction::Bicubic => ResamplingFilter::new(bicubic_spline::, 3f32), + ResamplingFunction::Bicubic => ResamplingFilter::new(bicubic_spline, 3f32), ResamplingFunction::Lanczos4 => ResamplingFilter::new(lanczos4, 4f32), ResamplingFunction::Lanczos2 => ResamplingFilter::new(lanczos2, 2f32), - ResamplingFunction::Hamming => ResamplingFilter::new(hamming, 1f32), - ResamplingFunction::Hanning => ResamplingFilter::new(hanning, 1f32), - ResamplingFunction::Welch => ResamplingFilter::new(welch, 1f32), - ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 1.5f32), + ResamplingFunction::Hamming => ResamplingFilter::new(hamming, 2f32), + ResamplingFunction::Hanning => ResamplingFilter::new(hanning, 2f32), + ResamplingFunction::Welch => ResamplingFilter::new(welch, 2f32), + ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 2f32), ResamplingFunction::Gaussian => ResamplingFilter::new(gaussian, 2f32), ResamplingFunction::Sphinx => ResamplingFilter::new(sphinx, 2f32), - ResamplingFunction::Bartlett => ResamplingFilter::new(bartlett, 1f32), - ResamplingFunction::Robidoux => ResamplingFilter::new(robidoux::, 2f32), - ResamplingFunction::RobidouxSharp => ResamplingFilter::new(robidoux_sharp::, 2f32), + ResamplingFunction::Bartlett => ResamplingFilter::new(bartlett, 2f32), + ResamplingFunction::Robidoux => ResamplingFilter::new(robidoux, 2f32), + ResamplingFunction::RobidouxSharp => ResamplingFilter::new(robidoux_sharp, 2f32), ResamplingFunction::Spline16 => ResamplingFilter::new_with_fixed_kernel(spline16, 2f32), ResamplingFunction::Spline36 => ResamplingFilter::new_with_fixed_kernel(spline36, 4f32), ResamplingFunction::Spline64 => ResamplingFilter::new_with_fixed_kernel(spline64, 6f32), 
ResamplingFunction::Kaiser => ResamplingFilter::new(kaiser, 2f32), ResamplingFunction::BartlettHann => ResamplingFilter::new(bartlett_hann, 2f32), - ResamplingFunction::Box => ResamplingFilter::new(box_weight, 0.5f32), + ResamplingFunction::Box => ResamplingFilter::new(box_weight, 2f32), ResamplingFunction::Bohman => ResamplingFilter::new(bohman, 2f32), ResamplingFunction::Lanczos2Jinc => ResamplingFilter::new(lanczos2_jinc, 2f32), ResamplingFunction::Lanczos3Jinc => ResamplingFilter::new(lanczos3_jinc, 3f32), From 69e4bdba931a019c81a5453fb6c71b3fca1d4d4c Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 26 Oct 2024 14:45:59 +0100 Subject: [PATCH 2/8] Bench updates for x86 --- README.md | 71 ++++++++++++++++++++++++++++++-------------------- command.ps1 | 4 +-- src/resizer.rs | 16 ++++++------ 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 62572da..e2d8bab 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,11 @@ cargo bench --bench resize_rgba --manifest-path ./app/Cargo.toml | pic-scale(aarch64) | 11.89 | 8.92 | | fir(aarch64) | 25.89 | 11.30 | | image(x86) | 192.52 | 88.63 | -| pic-scale(x86) | 49.79 | 35.98 | -| pic-scale(x86-cpu-native) | 27.21 | 20.48 | +| pic-scale(x86) | 25.50 | 18.37 | | fir(x86) | 42.89 | 24.13 | -| fir(x86-cpu-native) | 41.17 | 23.62 | +| image(x86-cpu-native) | 205.64 | 89.02 | +| pic-scale(x86-cpu-native) | 14.39 | 11.31 | +| fir(x86-cpu-native) | 41.21 | 22.77 | Example comparison time for downscale RGB 4928x3279 `8 bit` image in 4 times. @@ -44,14 +45,17 @@ Example comparison time for downscale RGB 4928x3279 `8 bit` image in 4 times. cargo bench --bench resize_rgb --manifest-path ./app/Cargo.toml ``` -| | Lanczos3 | Bilinear | -|--------------------|:--------:|:--------:| -| image(aarch64) | 123.85 | 51.30 | -| pic-scale(aarch64) | 17.23 | 12.32 | -| fir(aarch64) | 23.61 | 10.53 | -| image(x86) | 201.52 | 90.82 | -| pic-scale(x86) | 34.54 | 25.05 | -| fir(x86) | 41.97 | 25.21 | +| | Lanczos3 | Bilinear | +|---------------------------|:--------:|:--------:| +| image(aarch64) | 123.85 | 51.30 | +| pic-scale(aarch64) | 17.23 | 12.32 | +| fir(aarch64) | 23.61 | 10.53 | +| image(x86) | 201.52 | 90.82 | +| pic-scale(x86) | 27.17 | 21.08 | +| fir(x86) | 41.97 | 24.39 | +| image(x86-cpu-native) | 184.57 | 84.69 | +| pic-scale(x86-cpu-native) | 20.96 | 15.16 | +| fir(x86-cpu-native) | 41.49 | 20.38 | Example comparison time for downscale RGBA 4928x3279 `16 bit` image in 4 times. @@ -64,10 +68,12 @@ cargo bench --bench resize_rgba_u16 --manifest-path ./app/Cargo.toml | image(aarch64) | 262.32 | 76.91 | | pic-scale(aarch64) | 15.49 | 11.38 | | fir(aarch64) | 141.78 | 50.08 | -| image(x86) | 196.28 | 194.75 | -| pic-scale(x86) | 59.89 | 57.99 | -| pic-scale(x86-cpu-native) | 44.07 | 57.99 | -| fir(x86) | 52.73 | 28.35 | +| image(x86) | 196.28 | 107.78 | +| pic-scale(x86) | 57.48 | 50.85 | +| fir(x86) | 139.56 | 58.48 | +| image(x86-cpu-native) | 192.85 | 102.05 | +| pic-scale(x86-cpu-native) | 39.60 | 46.44 | +| fir(x86-cpu-native) | 101.48 | 52.58 | Example comparison time for downscale RGB 4928x3279 `16 bit` image in 4 times. @@ -75,11 +81,17 @@ Example comparison time for downscale RGB 4928x3279 `16 bit` image in 4 times. 
cargo bench --bench resize_rgb_u16 --manifest-path ./app/Cargo.toml ``` -| | Lanczos3 | Bilinear | -|--------------------|:--------:|:--------:| -| image(aarch64) | 130.45 | 57.38 | -| pic-scale(aarch64) | 16.17 | 12.11 | -| fir(aarch64) | 110.06 | 42.04 | +| | Lanczos3 | Bilinear | +|---------------------------|:--------:|:--------:| +| image(aarch64) | 130.45 | 57.38 | +| pic-scale(aarch64) | 16.17 | 12.11 | +| fir(aarch64) | 110.06 | 42.04 | +| image(x86) | 204.10 | 148.34 | +| pic-scale(x86) | 43.21 | 145.73 | +| fir(x86) | 210.28 | 51.29 | +| image(x86-cpu-native) | 190.21 | 98.42 | +| pic-scale(x86-cpu-native) | 33.48 | 28.50 | +| fir(x86-cpu-native) | 72.88 | 45.17 | Example comparison time for downscale RGBA 4928x3279 `f32` image in 4 times. @@ -87,14 +99,17 @@ Example comparison time for downscale RGBA 4928x3279 `f32` image in 4 times. cargo bench --bench resize_rgba_f32 --manifest-path ./app/Cargo.toml ``` -| | Lanczos3 | Bilinear | -|--------------------|:--------:|:--------:| -| image(aarch64) | 100.16 | 50.09 | -| pic-scale(aarch64) | 14.07 | 11.18 | -| fir(aarch64) | 105.30 | 37.75 | -| image(x86) | 164.04 | 98.90 | -| pic-scale(x86) | 57.39 | 43.84 | -| fir(x86) | 60.30 | 29.92 | +| | Lanczos3 | Bilinear | +|---------------------------|:--------:|:--------:| +| image(aarch64) | 100.16 | 50.09 | +| pic-scale(aarch64) | 14.07 | 11.18 | +| fir(aarch64) | 105.30 | 37.75 | +| image(x86) | 208.25 | 107.84 | +| pic-scale(x86) | 33.55 | 28.97 | +| fir(x86) | 92.38 | 74.12 | +| image(x86-cpu-native) | 162.83 | 108.54 | +| pic-scale(x86-cpu-native) | 33.13 | 28.54 | +| fir(x86-cpu-native) | 56.65 | 59.96 | This project is licensed under either of diff --git a/command.ps1 b/command.ps1 index 77258e2..f9f6155 100644 --- a/command.ps1 +++ b/command.ps1 @@ -1,2 +1,2 @@ -# $env:RUSTFLAGS = "-C target-cpu=native" -cargo bench --bench resize_rgba_u16 --manifest-path ./app/Cargo.toml \ No newline at end of file +$env:RUSTFLAGS = "-C target-cpu=native" +cargo bench --bench resize_rgba_f32 --manifest-path ./app/Cargo.toml \ No newline at end of file diff --git a/src/resizer.rs b/src/resizer.rs index 7311505..b405ba6 100644 --- a/src/resizer.rs +++ b/src/resizer.rs @@ -225,8 +225,8 @@ pub fn resize_rgba16( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( @@ -280,8 +280,8 @@ pub fn resize_rgb16( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( @@ -335,8 +335,8 @@ pub fn resize_plane16( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( @@ -390,8 +390,8 @@ pub fn resize_plane16_with_alpha( bit_depth: u32, resampling_function: ResamplingFunction, ) -> Result, String> { - if bit_depth > 16 { - 
return Err("Bit depth cannot be greater than 16".parse().unwrap()); + if bit_depth > 16 || bit_depth == 0 { + return Err("Bit depth cannot be greater than 16 and not a zero".to_string()); } if bit_depth == 10 || bit_depth == 12 { resize_fixed_point::( From 59b19fb2b48f9bdbf84653089a92fe881e124365 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 18:12:55 +0100 Subject: [PATCH 3/8] Loop unroll for 6 kernel size, bugfix and improvement --- app/src/main.rs | 19 +-- src/fixed_point_horizontal.rs | 55 ++++++++- src/fixed_point_vertical.rs | 8 +- src/floating_point_horizontal.rs | 195 +++++++++++++++++++++++++++++-- src/floating_point_vertical.rs | 150 ++++++++++++++++++++++++ src/sampler.rs | 2 +- 6 files changed, 405 insertions(+), 24 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index 19a1c94..bc2ea6c 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -34,10 +34,7 @@ use image::{ imageops, DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, ImageReader, Rgb, RgbImage, }; -use pic_scale_safe::{ - resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, - resize_rgba16, resize_rgba8, ImageSize, ResamplingFunction, -}; +use pic_scale_safe::{premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize, ResamplingFunction}; use std::ops::{BitXor, Shr}; use std::time::Instant; @@ -47,7 +44,7 @@ fn main() { .decode() .unwrap(); let dimensions = img.dimensions(); - let transient = img.to_rgb8(); + let transient = img.to_rgba8(); let mut working_store = transient.to_vec(); @@ -55,11 +52,13 @@ fn main() { let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); let dst_size = ImageSize::new( - (dimensions.0 as f32 + 1.) as usize, - (dimensions.1 as f32 + 1.) 
as usize, + dimensions.0 as usize / 2, + dimensions.1 as usize / 2, ); - let mut resized = resize_rgb8( + premultiply_rgba8(&mut working_store); + + let mut resized = resize_rgba8( &working_store, src_size, dst_size, @@ -67,6 +66,8 @@ fn main() { ) .unwrap(); + // unpremultiply_rgba8(&mut resized); + println!("Working time {:?}", start.elapsed()); // let rgba_image = DynamicImage::ImageRgb16(ImageBuffer::, Vec>::from_vec(dimensions.0 * 4, dimensions.1 / 4, resized).unwrap()); @@ -82,7 +83,7 @@ fn main() { &resized, dst_size.width as u32, dst_size.height as u32, - image::ColorType::Rgb8, + image::ColorType::Rgba8, ) .unwrap(); diff --git a/src/fixed_point_horizontal.rs b/src/fixed_point_horizontal.rs index 92e4a77..c6ea69b 100644 --- a/src/fixed_point_horizontal.rs +++ b/src/fixed_point_horizontal.rs @@ -98,8 +98,24 @@ pub(crate) fn convolve_row_handler_fixed_point< + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; } else { - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; for (&k_weight, src) in weights .iter() .zip(src_ptr0.chunks_exact(CHANNELS)) @@ -235,6 +251,43 @@ pub(crate) fn convolve_row_handler_fixed_point_4< + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3; + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6 * CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + 
fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; + sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J) * weight5; + sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J) * weight5; + sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J) * weight4 + + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CHANNELS)]; diff --git a/src/fixed_point_vertical.rs b/src/fixed_point_vertical.rs index 82ff83f..819c2b8 100644 --- a/src/fixed_point_vertical.rs +++ b/src/fixed_point_vertical.rs @@ -68,7 +68,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { - let py = bounds.start + j; + let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; @@ -127,7 +127,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { - let py = bounds.start + j; + let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; @@ -207,7 +207,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_four< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { - let py = bounds.start + j; + let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; @@ -305,7 +305,7 @@ pub(crate) fn column_handler_fixed_point< cx += 8; } - while cx + 1 < total_width { + while cx < total_width { convolve_column_handler_fixed_point_direct_buffer::( src, src_stride, dst, weight, bounds, bit_depth, cx, ); diff --git a/src/floating_point_horizontal.rs b/src/floating_point_horizontal.rs index e9fcd75..ccf9b63 100644 --- a/src/floating_point_horizontal.rs +++ b/src/floating_point_horizontal.rs @@ -72,19 +72,99 @@ pub(crate) fn convolve_row_handler_floating_point< let mut sums = ColorGroup::::dup(0.as_()); let start_x = bounds.start; + let bounds_size = bounds.size; let px = 
start_x * CHANNELS; - let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2 * CHANNELS)]; - for (&k_weight, src) in weights - .iter() - .zip(src_ptr0.chunks_exact(CHANNELS)) - .take(bounds.size) - { - let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS, J); - sums = sums.mul_add(new_px, weight); + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ); + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ); + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ); + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + weight5, + ); + } else { + let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; + for (&k_weight, src) in weights + .iter() + .zip(src_ptr0.chunks_exact(CHANNELS)) + .take(bounds.size) + { + let weight: J = k_weight.as_(); + let new_px = fast_load_color_group!(src, CHANNELS, J); + sums = sums.mul_add(new_px, weight); + } } fast_mixed_store_color_group!(sums, chunk, CHANNELS, bit_depth); @@ -282,6 +362,103 @@ pub(crate) fn convolve_row_handler_floating_point_4< fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * CHANNELS)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6 * 
CHANNELS)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6 * CHANNELS)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0].as_(); + let weight1 = sliced_weights[1].as_(); + let weight2 = sliced_weights[2].as_(); + let weight3 = sliced_weights[3].as_(); + let weight4 = sliced_weights[4].as_(); + let weight5 = sliced_weights[5].as_(); + sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + weight5, + ); + sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J), + weight5, + ); + sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J), + weight5, + ); + sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J), + weight5, + ); } else { let src_ptr0 = &src[px..(px + bounds.size * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds.size * CHANNELS)]; diff --git a/src/floating_point_vertical.rs b/src/floating_point_vertical.rs index 91cea1e..6ec9000 100644 --- a/src/floating_point_vertical.rs +++ b/src/floating_point_vertical.rs @@ -222,6 +222,114 @@ pub(crate) fn convolve_column_handler_floating_point_4< fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); + } else if bounds_size == 6 { + let weights = &filter[0..6]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let weight4 = weights[4].as_(); + let weight5 = weights[5].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * (bounds_start + 1) + 
v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let offset4 = src_stride * (bounds_start + 4) + v_start_px; + let offset5 = src_stride * (bounds_start + 5) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + let src_ptr4 = &src[offset4..(offset4 + CHANNELS * 4)]; + let src_ptr5 = &src[offset5..(offset5 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + weight5, + ); + + sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS, J), + weight5, + ); + + sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 2, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 2, J), + weight5, + ); + + sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 3, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 3, J), + weight5, + ); } else { for (j, &k_weight) in filter.iter().take(bounds.size).enumerate() { let py = bounds_start + j; @@ -361,6 +469,48 @@ pub(crate) fn convolve_column_handler_floating_point< fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ); + } else if bounds_size == 6 { + let weights = &filter[0..6]; + let weight0 = weights[0].as_(); + let weight1 = weights[1].as_(); + let weight2 = weights[2].as_(); + let weight3 = weights[3].as_(); + let weight4 = weights[4].as_(); + let weight5 = weights[5].as_(); + let offset0 = src_stride * bounds_start + v_start_px; + let offset1 = src_stride * 
(bounds_start + 1) + v_start_px; + let offset2 = src_stride * (bounds_start + 2) + v_start_px; + let offset3 = src_stride * (bounds_start + 3) + v_start_px; + let offset4 = src_stride * (bounds_start + 4) + v_start_px; + let offset5 = src_stride * (bounds_start + 5) + v_start_px; + let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; + let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; + let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; + let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; + let src_ptr4 = &src[offset4..(offset4 + CHANNELS * 4)]; + let src_ptr5 = &src[offset5..(offset5 + CHANNELS * 4)]; + + sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + .mul_add( + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + weight2, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + weight3, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + weight4, + ) + .mul_add( + fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + weight5, + ); } else { for (j, &k_weight) in filter.iter().take(bounds_size).enumerate() { let py = bounds_start + j; diff --git a/src/sampler.rs b/src/sampler.rs index bb30110..2d90493 100644 --- a/src/sampler.rs +++ b/src/sampler.rs @@ -269,7 +269,7 @@ impl ResamplingFunction { ResamplingFunction::Hamming => ResamplingFilter::new(hamming, 2f32), ResamplingFunction::Hanning => ResamplingFilter::new(hanning, 2f32), ResamplingFunction::Welch => ResamplingFilter::new(welch, 2f32), - ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 2f32), + ResamplingFunction::Quadric => ResamplingFilter::new(quadric, 3f32), ResamplingFunction::Gaussian => ResamplingFilter::new(gaussian, 2f32), ResamplingFunction::Sphinx => ResamplingFilter::new(sphinx, 2f32), ResamplingFunction::Bartlett => ResamplingFilter::new(bartlett, 2f32), From 2487b043e3926a28aefb14ee7b5a4c7d6f71aa49 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:09:46 +0100 Subject: [PATCH 4/8] Fix vertical fixed point bug pass, adding something beautiful to all alpha functions to gets auto-vectorization --- app/src/main.rs | 15 ++- src/alpha.rs | 179 ++++++++++++++++-------------------- src/fixed_point_vertical.rs | 3 + 3 files changed, 93 insertions(+), 104 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index bc2ea6c..6412152 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -34,7 +34,11 @@ use image::{ imageops, DynamicImage, EncodableLayout, GenericImageView, ImageBuffer, ImageFormat, ImageReader, Rgb, RgbImage, }; -use pic_scale_safe::{premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize, ResamplingFunction}; +use pic_scale_safe::{ + premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, + resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize, + ResamplingFunction, +}; use std::ops::{BitXor, Shr}; use std::time::Instant; @@ -51,13 +55,14 @@ fn main() { let start = Instant::now(); let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); - let dst_size = ImageSize::new( - dimensions.0 as usize / 2, - dimensions.1 as usize / 2, - ); + let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2); + + let start_mul = Instant::now(); premultiply_rgba8(&mut 
working_store); + println!("Alpha mul time {:?}", start_mul.elapsed()); + let mut resized = resize_rgba8( &working_store, src_size, diff --git a/src/alpha.rs b/src/alpha.rs index 1d178c8..fd8bb5e 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -27,6 +27,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#[inline] +fn div_by_255(v: u16) -> u8 { + ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8 +} + /// Associate alpha in place /// /// Note, for scaling alpha must be *associated* @@ -36,17 +41,14 @@ /// * `in_place`: Slice to where premultiply /// pub fn premultiply_rgba8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(4) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u16; - let mut r = chunk[0] as u16; - let mut g = chunk[1] as u16; - let mut b = chunk[2] as u16; - r = (r * a) / 255; - g = (g * a) / 255; - b = (b * a) / 255; - chunk[0] = r as u8; - chunk[1] = g as u8; - chunk[2] = b as u8; + chunk[0] = div_by_255(chunk[0] as u16 * a); + chunk[1] = div_by_255(chunk[1] as u16 * a); + chunk[2] = div_by_255(chunk[2] as u16 * a); + chunk[3] = div_by_255(a * a); } } @@ -60,23 +62,17 @@ pub fn premultiply_rgba8(in_place: &mut [u8]) { /// /// pub fn unpremultiply_rgba8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(4) { - let a = chunk[3] as u16; - let mut r = chunk[0] as u16; - let mut g = chunk[1] as u16; - let mut b = chunk[2] as u16; - if a == 0 { - r = 0; - g = 0; - b = 0; - } else { - r = (r * 255) / a; - g = (g * 255) / a; - b = (b * 255) / a; + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(4) { + let a = chunk[3]; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as f32 * 255.) * a_recip) as u8; + chunk[1] = ((chunk[1] as f32 * 255.) * a_recip) as u8; + chunk[2] = ((chunk[2] as f32 * 255.) * a_recip) as u8; + chunk[3] = ((a as f32 * 255.) * a_recip) as u8; } - chunk[0] = r as u8; - chunk[1] = g as u8; - chunk[2] = b as u8; } } @@ -89,11 +85,12 @@ pub fn unpremultiply_rgba8(in_place: &mut [u8]) { /// * `in_place`: Slice to where premultiply /// pub fn premultiply_la8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(2) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(2) { let a = chunk[1] as u16; - let mut r = chunk[0] as u16; - r = (r * a) / 255; - chunk[0] = r as u8; + chunk[0] = div_by_255(chunk[0] as u16 * a); + chunk[1] = div_by_255(chunk[1] as u16 * a); } } @@ -107,15 +104,15 @@ pub fn premultiply_la8(in_place: &mut [u8]) { /// /// pub fn unpremultiply_la8(in_place: &mut [u8]) { - for chunk in in_place.chunks_mut(2) { - let a = chunk[1] as u16; - let mut r = chunk[0] as u16; - if a == 0 { - r = 0; - } else { - r = (r * 255) / a; + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. + for chunk in in_place.chunks_exact_mut(2) { + let a = chunk[1]; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as f32 * 255.) * a_recip) as u8; + chunk[1] = ((a as f32 * 255.) 
* a_recip) as u8; } - chunk[0] = r as u8; } } @@ -129,19 +126,20 @@ pub fn unpremultiply_la8(in_place: &mut [u8]) { /// * `bit_depth`: Bit-depth of the image /// pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(4) { + let recip_max_colors = 1. / max_colors as f32; + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u32; - let mut r = chunk[0] as u32; - let mut g = chunk[1] as u32; - let mut b = chunk[2] as u32; - r = (r * a) / max_colors; - g = (g * a) / max_colors; - b = (b * a) / max_colors; - chunk[0] = r as u16; - chunk[1] = g as u16; - chunk[2] = b as u16; + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[3] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; } } @@ -155,13 +153,16 @@ pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { /// * `bit_depth`: Bit-depth of the image /// pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(2) { + let recip_max_colors = 1. / max_colors as f32; + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[1] as u32; - let mut r = chunk[0] as u32; - r = (r * a) / max_colors; - chunk[0] = r as u16; + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + as u16; + chunk[1] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; } } @@ -176,17 +177,17 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { /// /// pub fn unpremultiply_la16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(2) { + for chunk in in_place.chunks_exact_mut(2) { let a = chunk[1] as u32; - let mut r = chunk[0] as u32; - if a == 0 { - r = 0; - } else { - r = (r * max_colors) / a; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[1] = ((a * max_colors) as f32 * a_recip) as u16; } - chunk[0] = r as u16; } } @@ -201,25 +202,19 @@ pub fn unpremultiply_la16(in_place: &mut [u16], bit_depth: u32) { /// /// pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. 
assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; - for chunk in in_place.chunks_mut(4) { + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u32; - let mut r = chunk[0] as u32; - let mut g = chunk[1] as u32; - let mut b = chunk[2] as u32; - if a == 0 { - r = 0; - g = 0; - b = 0; - } else { - r = (r * max_colors) / a; - g = (g * max_colors) / a; - b = (b * max_colors) / a; + if a != 0 { + let a_recip = 1. / a as f32; + chunk[0] = ((chunk[0] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[1] = ((chunk[1] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[2] = ((chunk[2] as u32 * max_colors) as f32 * a_recip) as u16; + chunk[3] = ((a * max_colors) as f32 * a_recip) as u16; } - chunk[0] = r as u16; - chunk[1] = g as u16; - chunk[2] = b as u16; } } @@ -232,17 +227,12 @@ pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { /// * `in_place`: Slice to where premultiply /// pub fn premultiply_rgba_f32(in_place: &mut [f32]) { - for chunk in in_place.chunks_mut(4) { + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3]; - let mut r = chunk[0]; - let mut g = chunk[1]; - let mut b = chunk[2]; - r *= a; - g *= a; - b *= a; - chunk[0] = r; - chunk[1] = g; - chunk[2] = b; + chunk[0] = chunk[0] * a; + chunk[1] = chunk[1] * a; + chunk[2] = chunk[2] * a; + chunk[3] = a; } } @@ -256,22 +246,13 @@ pub fn premultiply_rgba_f32(in_place: &mut [f32]) { /// /// pub fn unpremultiply_rgba_f32(in_place: &mut [f32]) { - for chunk in in_place.chunks_mut(4) { + for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3]; - let mut r = chunk[0]; - let mut g = chunk[1]; - let mut b = chunk[2]; - if a == 0. { - r = 0.; - g = 0.; - b = 0.; - } else { - r /= a; - g /= a; - b /= a; + if a != 0. { + let a_recip = 1. 
/ a; + chunk[0] *= a_recip; + chunk[1] *= a_recip; + chunk[2] *= a_recip; } - chunk[0] = r; - chunk[1] = g; - chunk[2] = b; } } diff --git a/src/fixed_point_vertical.rs b/src/fixed_point_vertical.rs index 819c2b8..fefc7d2 100644 --- a/src/fixed_point_vertical.rs +++ b/src/fixed_point_vertical.rs @@ -68,6 +68,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; @@ -127,6 +128,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; @@ -207,6 +209,7 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_four< } for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight.as_(); let offset = src_stride * py + v_start_px; From a859f0e6e88036dccd864bb4ee29e4154b69ecd1 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:12:26 +0100 Subject: [PATCH 5/8] Fix clamp in alpha --- src/alpha.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/alpha.rs b/src/alpha.rs index fd8bb5e..6c384e8 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -133,13 +133,13 @@ pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { let recip_max_colors = 1. / max_colors as f32; for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3] as u32; - chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[3] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; + chunk[3] = (((a * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; } } @@ -160,9 +160,9 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { let recip_max_colors = 1. 
/ max_colors as f32; for chunk in in_place.chunks_exact_mut(4) { let a = chunk[1] as u32; - chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) + chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; - chunk[1] = (((a * a) as f32 * recip_max_colors) as u32).max(max_colors as u32) as u16; + chunk[1] = (((a * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16; } } From c296b4c9f7b203380aeb7fd3a0cae818a4cf45fd Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:13:25 +0100 Subject: [PATCH 6/8] Fix clippy lints --- src/alpha.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/alpha.rs b/src/alpha.rs index 6c384e8..f8ad5e8 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -227,11 +227,13 @@ pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) { /// * `in_place`: Slice to where premultiply /// pub fn premultiply_rgba_f32(in_place: &mut [f32]) { + // Almost all loops are not auto-vectorised without doing anything dirty. + // So everywhere is just added something beautiful. for chunk in in_place.chunks_exact_mut(4) { let a = chunk[3]; - chunk[0] = chunk[0] * a; - chunk[1] = chunk[1] * a; - chunk[2] = chunk[2] * a; + chunk[0] *= a; + chunk[1] *= a; + chunk[2] *= a; chunk[3] = a; } } @@ -253,6 +255,7 @@ pub fn unpremultiply_rgba_f32(in_place: &mut [f32]) { chunk[0] *= a_recip; chunk[1] *= a_recip; chunk[2] *= a_recip; + chunk[3] = a; } } } From 24beec4655c41be3c17b5a9d480d1e647d5b2485 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sat, 26 Oct 2024 20:49:04 +0100 Subject: [PATCH 7/8] Fix readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 62572da..0ed5e06 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,6 @@ let img = ImageReader::open("./assets/nasa-4928x3279.png") let dimensions = img.dimensions(); let transient = img.to_rgb8(); -let start = Instant::now(); - let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); From 2ba1f584185d310ac4778f2e7851b0a53dfc8e17 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 26 Oct 2024 20:53:35 +0100 Subject: [PATCH 8/8] Fix la16 --- src/alpha.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alpha.rs b/src/alpha.rs index f8ad5e8..813f9e1 100644 --- a/src/alpha.rs +++ b/src/alpha.rs @@ -158,7 +158,7 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) { assert!(bit_depth > 0 && bit_depth <= 16); let max_colors = (1 << bit_depth) - 1; let recip_max_colors = 1. / max_colors as f32; - for chunk in in_place.chunks_exact_mut(4) { + for chunk in in_place.chunks_exact_mut(2) { let a = chunk[1] as u32; chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32) as u16;
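
For reference, the alpha rework in PATCH 4/8 replaces the per-channel `(x * a) / 255` integer division with the branch-free `div_by_255` helper so the premultiply loops can auto-vectorize. The sketch below is a standalone check — not part of any patch above — that the shift-and-add form used in `src/alpha.rs` is an exact rounded division by 255 over the whole premultiplication range; the `main` harness and the `f64` reference are illustrative additions only.

```rust
// Standalone sketch: verify that the shift-and-add `div_by_255` used in
// `src/alpha.rs` is an exact rounded division by 255 for every product of
// two 8-bit values (the premultiplication domain). The harness below is
// illustrative and not part of the library.

#[inline]
fn div_by_255(v: u16) -> u8 {
    // Equivalent to (v + 128 + ((v + 128) >> 8)) >> 8, written as in the patch.
    ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8
}

fn main() {
    for c in 0u16..=255 {
        for a in 0u16..=255 {
            let v = c * a; // max 65025, fits in u16 with no overflow inside the helper
            let exact = (v as f64 / 255.0).round() as u8;
            assert_eq!(div_by_255(v), exact, "mismatch at c={c}, a={a}");
        }
    }
    println!("div_by_255 matches round(v / 255) for all 8-bit products");
}
```

Because the helper is exact for products of two 8-bit values, swapping it in for the old truncating `(x * a) / 255` changes each channel by at most one unit (round-to-nearest instead of truncation), with no approximation error on top of that.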