- Renamed NormalizerGuard16 and NormalizerGuard32 to Normalizer16 and Normalizer32.

- Removed `unsafe` blocks from the implementation of Normalizer16 and Normalizer32.
Cykooz · Jul 7, 2022 · 8a159fc · 8a159fc
1 parent c6ce200
commit 8a159fc
Show file tree

Hide file tree

Showing 30 changed files with 341 additions and 466 deletions.
diff --git a/benchmarks.md b/benchmarks.md
@@ -32,11 +32,11 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| image      |  19.24  |  82.52   |   152.17   |  207.63  |
-| resize     |    -    |  52.19   |   103.40   |  154.15  |
-| fir rust   |  0.28   |  40.88   |   69.39    |  101.53  |
-| fir sse4.1 |  0.28   |  28.21   |   43.03    |  59.46   |
-| fir avx2   |  0.28   |   7.33   |    9.47    |  13.59   |
+| image      |  19.50  |  83.55   |   142.66   |  202.49  |
+| resize     |    -    |  52.12   |   102.98   |  153.42  |
+| fir rust   |  0.28   |  40.94   |   69.96    |  100.86  |
+| fir sse4.1 |  0.28   |  28.10   |   43.06    |  58.05   |
+| fir avx2   |  0.28   |   7.24   |    9.49    |  13.61   |
 
 ### Resize RGBA8 image (U8x4) 4928x3279 => 852x567
 
@@ -51,10 +51,10 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| resize     |    -    |  61.93   |   122.10   |  182.55  |
-| fir rust   |  0.18   |  36.57   |   52.28    |  74.14   |
-| fir sse4.1 |  0.18   |  13.14   |   17.21    |  22.44   |
-| fir avx2   |  0.18   |   9.69   |   11.99    |  16.23   |
+| resize     |    -    |  61.96   |   122.09   |  182.24  |
+| fir rust   |  0.19   |  36.38   |   52.06    |  74.03   |
+| fir sse4.1 |  0.19   |  13.37   |   17.44    |  22.63   |
+| fir avx2   |  0.19   |   9.76   |   12.18    |  16.30   |
 
 ### Resize L8 (luma) image (U8) 4928x3279 => 852x567
 
@@ -68,11 +68,11 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| image      |  15.86  |  47.17   |   74.46    |  102.53  |
-| resize     |    -    |  17.30   |   35.92    |  61.52   |
-| fir rust   |  0.15   |  14.10   |   16.20    |  24.12   |
-| fir sse4.1 |  0.15   |  11.93   |   12.13    |  18.20   |
-| fir avx2   |  0.15   |   6.30   |    4.71    |   7.62   |
+| image      |  15.95  |  47.09   |   74.65    |  103.92  |
+| resize     |    -    |  17.28   |   35.54    |  61.15   |
+| fir rust   |  0.16   |  14.33   |   16.25    |  24.35   |
+| fir sse4.1 |  0.16   |  12.17   |   12.18    |  18.41   |
+| fir avx2   |  0.16   |   6.31   |    4.66    |   7.97   |
 
 ### Resize LA8 (luma with alpha channel) image (U8x2) 4928x3279 => 852x567
 
@@ -89,9 +89,9 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| fir rust   |  0.17   |  25.73   |   30.75    |  42.34   |
-| fir sse4.1 |  0.17   |  12.81   |   14.64    |  18.06   |
-| fir avx2   |  0.17   |  11.26   |   12.42    |  15.46   |
+| fir rust   |  0.18   |  25.79   |   30.79    |  42.38   |
+| fir sse4.1 |  0.17   |  12.71   |   14.68    |  18.16   |
+| fir avx2   |  0.17   |  11.25   |   12.53    |  15.53   |
 
 ### Resize RGB16 image (U16x3) 4928x3279 => 852x567
 
@@ -105,11 +105,11 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| image      |  18.58  |  76.20   |   138.35   |  193.78  |
-| resize     |    -    |  54.72   |   106.58   |  158.30  |
-| fir rust   |  0.33   |  43.80   |   80.11    |  116.95  |
-| fir sse4.1 |  0.33   |  24.40   |   39.44    |  55.86   |
-| fir avx2   |  0.33   |  20.51   |   30.34    |  35.88   |
+| image      |  19.09  |  76.86   |   140.93   |  191.14  |
+| resize     |    -    |  55.19   |   107.76   |  159.66  |
+| fir rust   |  0.33   |  43.89   |   80.03    |  117.89  |
+| fir sse4.1 |  0.33   |  24.46   |   39.45    |  55.91   |
+| fir avx2   |  0.33   |  21.01   |   31.07    |  36.95   |
 
 ### Resize RGBA16 image (U16x4) 4928x3279 => 852x567
 
@@ -124,10 +124,10 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| resize     |    -    |  63.81   |   127.53   |  191.08  |
-| fir rust   |  0.37   |  80.36   |   118.89   |  159.05  |
-| fir sse4.1 |  0.37   |  42.70   |   63.96    |  86.08   |
-| fir avx2   |  0.37   |  25.40   |   36.62    |  47.99   |
+| resize     |    -    |  62.92   |   124.11   |  185.29  |
+| fir rust   |  0.38   |  84.98   |   123.53   |  163.92  |
+| fir sse4.1 |  0.38   |  42.35   |   63.57    |  85.62   |
+| fir avx2   |  0.38   |  23.60   |   34.29    |  45.60   |
 
 ### Resize L16 image (U16) 4928x3279 => 852x567
 
@@ -141,11 +141,11 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| image      |  16.37  |  47.13   |   74.89    |  104.83  |
-| resize     |    -    |  15.35   |   31.91    |  57.04   |
-| fir rust   |  0.17   |  19.05   |   28.02    |  37.48   |
-| fir sse4.1 |  0.17   |   7.80   |   13.16    |  19.13   |
-| fir avx2   |  0.17   |   7.07   |    9.48    |  14.80   |
+| image      |  16.43  |  48.12   |   75.85    |  104.63  |
+| resize     |    -    |  15.48   |   32.05    |  57.16   |
+| fir rust   |  0.17   |  19.31   |   26.93    |  37.76   |
+| fir sse4.1 |  0.18   |   7.87   |   13.22    |  19.26   |
+| fir avx2   |  0.18   |   7.17   |    9.62    |  14.73   |
 
 ### Resize LA16 (luma with alpha channel) image (U16x2) 4928x3279 => 852x567
 
@@ -162,6 +162,6 @@ Pipeline:
 
 |            | Nearest | Bilinear | CatmullRom | Lanczos3 |
 |------------|:-------:|:--------:|:----------:|:--------:|
-| fir rust   |  0.19   |  33.44   |   53.17    |  72.06   |
-| fir sse4.1 |  0.19   |  21.89   |   33.99    |  46.56   |
-| fir avx2   |  0.19   |  15.22   |   21.95    |  28.99   |
+| fir rust   |  0.19   |  34.70   |   54.59    |  74.64   |
+| fir sse4.1 |  0.19   |  22.71   |   34.82    |  47.46   |
+| fir avx2   |  0.20   |  15.68   |   22.42    |  29.13   |
diff --git a/src/convolution/optimisations.rs b/src/convolution/optimisations.rs
@@ -1,6 +1,5 @@
-use std::slice;
-
 use super::Bound;
+use crate::convolution::Coefficients;
 
 // This code is based on C-implementation from Pillow-SIMD package for Python
 // https://github.com/uploadcare/pillow-simd
@@ -28,25 +27,27 @@ const CLIP8_LOOKUPS: [u8; 1280] = get_clip_table();
 // two extra bits for overflow and i32 type.
 const PRECISION_BITS: u8 = 32 - 8 - 2;
 // We use i16 type to store coefficients.
-const MAX_COEFS_PRECISION: u8 = 16 - 1;
+const MAX_COEFFS_PRECISION: u8 = 16 - 1;
 
-/// Converts `Vec<f64>` into `&[i16]` without additional memory allocations.
-/// The memory buffer from `Vec<f64>` uses as `[i16]` .
-pub struct NormalizerGuard16 {
-    values: Vec<f64>,
+/// Converts `Vec<f64>` into `Vec<i16>`.
+pub(crate) struct Normalizer16 {
+    values: Vec<i16>,
     precision: u8,
+    window_size: usize,
+    bounds: Vec<Bound>,
 }
 
 #[derive(Debug, Clone, Copy)]
-pub struct CoefficientsI16Chunk<'a> {
+pub(crate) struct CoefficientsI16Chunk<'a> {
     pub start: u32,
     pub values: &'a [i16],
 }
 
-impl NormalizerGuard16 {
+impl Normalizer16 {
     #[inline]
-    pub fn new(mut values: Vec<f64>) -> Self {
-        let max_weight = values
+    pub fn new(coefficients: Coefficients) -> Self {
+        let max_weight = coefficients
+            .values
             .iter()
             .max_by(|&x, &y| x.partial_cmp(y).unwrap())
             .unwrap_or(&0.0)
@@ -57,36 +58,32 @@ impl NormalizerGuard16 {
             precision = cur_precision;
             let next_value: i32 = (max_weight * (1 << (precision + 1)) as f64).round() as i32;
             // The next value will be outside the range, so just stop
-            if next_value >= (1 << MAX_COEFS_PRECISION) {
+            if next_value >= (1 << MAX_COEFFS_PRECISION) {
                 break;
             }
         }
         debug_assert!(precision >= 4); // required for some SIMD optimisations
 
-        let len = values.len();
-        let ptr = values.as_mut_ptr();
-        // Size of `[i16]` always will be not greater than `[f64]` with same number of items
-        let values_i16 = unsafe { slice::from_raw_parts_mut(ptr as *mut i16, len) };
+        let mut values_i16 = Vec::with_capacity(coefficients.values.len());
 
         let scale = (1 << precision) as f64;
-        for (&src, dst) in values.iter().zip(values_i16.iter_mut()) {
-            *dst = (src * scale).round() as i16;
+        for src in coefficients.values.iter().copied() {
+            values_i16.push((src * scale).round() as i16);
+        }
+        Self {
+            values: values_i16,
+            precision,
+            window_size: coefficients.window_size,
+            bounds: coefficients.bounds,
         }
-        Self { values, precision }
     }
 
     #[inline]
-    pub fn normalized_chunks(
-        &self,
-        window_size: usize,
-        bounds: &[Bound],
-    ) -> Vec<CoefficientsI16Chunk> {
-        let len = self.values.len();
-        let ptr = self.values.as_ptr();
-        let mut cooefs = unsafe { slice::from_raw_parts(ptr as *const i16, len) };
-        let mut res = Vec::with_capacity(bounds.len());
-        for bound in bounds {
-            let (left, right) = cooefs.split_at(window_size);
+    pub fn normalized_chunks(&self) -> Vec<CoefficientsI16Chunk> {
+        let mut cooefs = self.values.as_slice();
+        let mut res = Vec::with_capacity(self.bounds.len());
+        for bound in self.bounds.iter() {
+            let (left, right) = cooefs.split_at(self.window_size);
             cooefs = right;
             let size = bound.size as usize;
             res.push(CoefficientsI16Chunk {
@@ -121,25 +118,27 @@ impl NormalizerGuard16 {
 // two extra bits for overflow and i64 type.
 const PRECISION16_BITS: u8 = 64 - 16 - 2;
 // We use i32 type to store coefficients.
-const MAX_COEFS_PRECISION16: u8 = 32 - 1;
+const MAX_COEFFS_PRECISION16: u8 = 32 - 1;
 
 #[derive(Debug, Clone, Copy)]
-pub struct CoefficientsI32Chunk<'a> {
+pub(crate) struct CoefficientsI32Chunk<'a> {
     pub start: u32,
     pub values: &'a [i32],
 }
 
-/// Converts `Vec<f64>` into `&[i32]` without additional memory allocations.
-/// The memory buffer from `Vec<f64>` uses as `[i32]` .
-pub struct NormalizerGuard32 {
-    values: Vec<f64>,
+/// Converts `Vec<f64>` into `Vec<i32>`.
+pub(crate) struct Normalizer32 {
+    values: Vec<i32>,
     precision: u8,
+    window_size: usize,
+    bounds: Vec<Bound>,
 }
 
-impl NormalizerGuard32 {
+impl Normalizer32 {
     #[inline]
-    pub fn new(mut values: Vec<f64>) -> Self {
-        let max_weight = values
+    pub fn new(coefficients: Coefficients) -> Self {
+        let max_weight = coefficients
+            .values
             .iter()
             .max_by(|&x, &y| x.partial_cmp(y).unwrap())
             .unwrap_or(&0.0)
@@ -150,36 +149,32 @@ impl NormalizerGuard32 {
             precision = cur_precision;
             let next_value: i64 = (max_weight * (1i64 << (precision + 1)) as f64).round() as i64;
             // The next value will be outside the range, so just stop
-            if next_value >= (1i64 << MAX_COEFS_PRECISION16) {
+            if next_value >= (1i64 << MAX_COEFFS_PRECISION16) {
                 break;
             }
         }
         debug_assert!(precision >= 4); // required for some SIMD optimisations
 
-        let len = values.len();
-        let ptr = values.as_mut_ptr();
-        // Size of `[i32]` always will be not greater than `[f64]` with same number of items
-        let values_i32 = unsafe { slice::from_raw_parts_mut(ptr as *mut i32, len) };
+        let mut values_i32 = Vec::with_capacity(coefficients.values.len());
 
         let scale = (1i64 << precision) as f64;
-        for (&src, dst) in values.iter().zip(values_i32.iter_mut()) {
-            *dst = (src * scale).round() as i32;
+        for src in coefficients.values.iter().copied() {
+            values_i32.push((src * scale).round() as i32);
+        }
+        Self {
+            values: values_i32,
+            precision,
+            window_size: coefficients.window_size,
+            bounds: coefficients.bounds,
         }
-        Self { values, precision }
     }
 
     #[inline]
-    pub fn normalized_chunks(
-        &self,
-        window_size: usize,
-        bounds: &[Bound],
-    ) -> Vec<CoefficientsI32Chunk> {
-        let len = self.values.len();
-        let ptr = self.values.as_ptr();
-        let mut cooefs = unsafe { slice::from_raw_parts(ptr as *const i32, len) };
-        let mut res = Vec::with_capacity(bounds.len());
-        for bound in bounds {
-            let (left, right) = cooefs.split_at(window_size);
+    pub fn normalized_chunks(&self) -> Vec<CoefficientsI32Chunk> {
+        let mut cooefs = self.values.as_slice();
+        let mut res = Vec::with_capacity(self.bounds.len());
+        for bound in self.bounds.iter() {
+            let (left, right) = cooefs.split_at(self.window_size);
             cooefs = right;
             let size = bound.size as usize;
             res.push(CoefficientsI32Chunk {
@@ -205,12 +200,20 @@ impl NormalizerGuard32 {
 mod tests {
     use super::*;
 
+    fn get_coefficients(value: f64) -> Coefficients {
+        Coefficients {
+            values: vec![value],
+            window_size: 0,
+            bounds: vec![],
+        }
+    }
+
     #[test]
     fn test_minimal_precision() {
         // required for some SIMD optimisations
-        assert!(NormalizerGuard16::new(vec![0.0]).precision() >= 4);
-        assert!(NormalizerGuard16::new(vec![2.0]).precision() >= 4);
-        assert!(NormalizerGuard32::new(vec![0.0]).precision() >= 4);
-        assert!(NormalizerGuard32::new(vec![2.0]).precision() >= 4);
+        assert!(Normalizer16::new(get_coefficients(0.0)).precision() >= 4);
+        assert!(Normalizer16::new(get_coefficients(2.0)).precision() >= 4);
+        assert!(Normalizer32::new(get_coefficients(0.0)).precision() >= 4);
+        assert!(Normalizer32::new(get_coefficients(2.0)).precision() >= 4);
     }
 }