Skip to content

Commit

Permalink
- Renamed NormalizerGuard into Normalizer.
Browse files Browse the repository at this point in the history
- Removed unsafe blocks from implementation of Normalizer.
  • Loading branch information
Cykooz committed Jul 7, 2022
1 parent c6ce200 commit 8a159fc
Show file tree
Hide file tree
Showing 30 changed files with 341 additions and 466 deletions.
68 changes: 34 additions & 34 deletions benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| image | 19.24 | 82.52 | 152.17 | 207.63 |
| resize | - | 52.19 | 103.40 | 154.15 |
| fir rust | 0.28 | 40.88 | 69.39 | 101.53 |
| fir sse4.1 | 0.28 | 28.21 | 43.03 | 59.46 |
| fir avx2 | 0.28 | 7.33 | 9.47 | 13.59 |
| image | 19.50 | 83.55 | 142.66 | 202.49 |
| resize | - | 52.12 | 102.98 | 153.42 |
| fir rust | 0.28 | 40.94 | 69.96 | 100.86 |
| fir sse4.1 | 0.28 | 28.10 | 43.06 | 58.05 |
| fir avx2 | 0.28 | 7.24 | 9.49 | 13.61 |

### Resize RGBA8 image (U8x4) 4928x3279 => 852x567

Expand All @@ -51,10 +51,10 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| resize | - | 61.93 | 122.10 | 182.55 |
| fir rust | 0.18 | 36.57 | 52.28 | 74.14 |
| fir sse4.1 | 0.18 | 13.14 | 17.21 | 22.44 |
| fir avx2 | 0.18 | 9.69 | 11.99 | 16.23 |
| resize | - | 61.96 | 122.09 | 182.24 |
| fir rust | 0.19 | 36.38 | 52.06 | 74.03 |
| fir sse4.1 | 0.19 | 13.37 | 17.44 | 22.63 |
| fir avx2 | 0.19 | 9.76 | 12.18 | 16.30 |

### Resize L8 (luma) image (U8) 4928x3279 => 852x567

Expand All @@ -68,11 +68,11 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| image | 15.86 | 47.17 | 74.46 | 102.53 |
| resize | - | 17.30 | 35.92 | 61.52 |
| fir rust | 0.15 | 14.10 | 16.20 | 24.12 |
| fir sse4.1 | 0.15 | 11.93 | 12.13 | 18.20 |
| fir avx2 | 0.15 | 6.30 | 4.71 | 7.62 |
| image | 15.95 | 47.09 | 74.65 | 103.92 |
| resize | - | 17.28 | 35.54 | 61.15 |
| fir rust | 0.16 | 14.33 | 16.25 | 24.35 |
| fir sse4.1 | 0.16 | 12.17 | 12.18 | 18.41 |
| fir avx2 | 0.16 | 6.31 | 4.66 | 7.97 |

### Resize LA8 (luma with alpha channel) image (U8x2) 4928x3279 => 852x567

Expand All @@ -89,9 +89,9 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| fir rust | 0.17 | 25.73 | 30.75 | 42.34 |
| fir sse4.1 | 0.17 | 12.81 | 14.64 | 18.06 |
| fir avx2 | 0.17 | 11.26 | 12.42 | 15.46 |
| fir rust | 0.18 | 25.79 | 30.79 | 42.38 |
| fir sse4.1 | 0.17 | 12.71 | 14.68 | 18.16 |
| fir avx2 | 0.17 | 11.25 | 12.53 | 15.53 |

### Resize RGB16 image (U16x3) 4928x3279 => 852x567

Expand All @@ -105,11 +105,11 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| image | 18.58 | 76.20 | 138.35 | 193.78 |
| resize | - | 54.72 | 106.58 | 158.30 |
| fir rust | 0.33 | 43.80 | 80.11 | 116.95 |
| fir sse4.1 | 0.33 | 24.40 | 39.44 | 55.86 |
| fir avx2 | 0.33 | 20.51 | 30.34 | 35.88 |
| image | 19.09 | 76.86 | 140.93 | 191.14 |
| resize | - | 55.19 | 107.76 | 159.66 |
| fir rust | 0.33 | 43.89 | 80.03 | 117.89 |
| fir sse4.1 | 0.33 | 24.46 | 39.45 | 55.91 |
| fir avx2 | 0.33 | 21.01 | 31.07 | 36.95 |

### Resize RGBA16 image (U16x4) 4928x3279 => 852x567

Expand All @@ -124,10 +124,10 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| resize | - | 63.81 | 127.53 | 191.08 |
| fir rust | 0.37 | 80.36 | 118.89 | 159.05 |
| fir sse4.1 | 0.37 | 42.70 | 63.96 | 86.08 |
| fir avx2 | 0.37 | 25.40 | 36.62 | 47.99 |
| resize | - | 62.92 | 124.11 | 185.29 |
| fir rust | 0.38 | 84.98 | 123.53 | 163.92 |
| fir sse4.1 | 0.38 | 42.35 | 63.57 | 85.62 |
| fir avx2 | 0.38 | 23.60 | 34.29 | 45.60 |

### Resize L16 image (U16) 4928x3279 => 852x567

Expand All @@ -141,11 +141,11 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| image | 16.37 | 47.13 | 74.89 | 104.83 |
| resize | - | 15.35 | 31.91 | 57.04 |
| fir rust | 0.17 | 19.05 | 28.02 | 37.48 |
| fir sse4.1 | 0.17 | 7.80 | 13.16 | 19.13 |
| fir avx2 | 0.17 | 7.07 | 9.48 | 14.80 |
| image | 16.43 | 48.12 | 75.85 | 104.63 |
| resize | - | 15.48 | 32.05 | 57.16 |
| fir rust | 0.17 | 19.31 | 26.93 | 37.76 |
| fir sse4.1 | 0.18 | 7.87 | 13.22 | 19.26 |
| fir avx2 | 0.18 | 7.17 | 9.62 | 14.73 |

### Resize LA16 (luma with alpha channel) image (U16x2) 4928x3279 => 852x567

Expand All @@ -162,6 +162,6 @@ Pipeline:

| | Nearest | Bilinear | CatmullRom | Lanczos3 |
|------------|:-------:|:--------:|:----------:|:--------:|
| fir rust | 0.19 | 33.44 | 53.17 | 72.06 |
| fir sse4.1 | 0.19 | 21.89 | 33.99 | 46.56 |
| fir avx2 | 0.19 | 15.22 | 21.95 | 28.99 |
| fir rust | 0.19 | 34.70 | 54.59 | 74.64 |
| fir sse4.1 | 0.19 | 22.71 | 34.82 | 47.46 |
| fir avx2 | 0.20 | 15.68 | 22.42 | 29.13 |
127 changes: 65 additions & 62 deletions src/convolution/optimisations.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::slice;

use super::Bound;
use crate::convolution::Coefficients;

// This code is based on the C implementation from the Pillow-SIMD package for Python
// https://github.com/uploadcare/pillow-simd
Expand Down Expand Up @@ -28,25 +27,27 @@ const CLIP8_LOOKUPS: [u8; 1280] = get_clip_table();
// two extra bits for overflow and i32 type.
const PRECISION_BITS: u8 = 32 - 8 - 2;
// We use the i16 type to store coefficients.
const MAX_COEFS_PRECISION: u8 = 16 - 1;
const MAX_COEFFS_PRECISION: u8 = 16 - 1;

/// Converts `Vec<f64>` into `&[i16]` without additional memory allocations.
/// The memory buffer from `Vec<f64>` uses as `[i16]` .
pub struct NormalizerGuard16 {
values: Vec<f64>,
/// Converts `Vec<f64>` into `Vec<i16>`.
pub(crate) struct Normalizer16 {
values: Vec<i16>,
precision: u8,
window_size: usize,
bounds: Vec<Bound>,
}

#[derive(Debug, Clone, Copy)]
pub struct CoefficientsI16Chunk<'a> {
pub(crate) struct CoefficientsI16Chunk<'a> {
pub start: u32,
pub values: &'a [i16],
}

impl NormalizerGuard16 {
impl Normalizer16 {
#[inline]
pub fn new(mut values: Vec<f64>) -> Self {
let max_weight = values
pub fn new(coefficients: Coefficients) -> Self {
let max_weight = coefficients
.values
.iter()
.max_by(|&x, &y| x.partial_cmp(y).unwrap())
.unwrap_or(&0.0)
Expand All @@ -57,36 +58,32 @@ impl NormalizerGuard16 {
precision = cur_precision;
let next_value: i32 = (max_weight * (1 << (precision + 1)) as f64).round() as i32;
// The next value will be outside the range, so just stop
if next_value >= (1 << MAX_COEFS_PRECISION) {
if next_value >= (1 << MAX_COEFFS_PRECISION) {
break;
}
}
debug_assert!(precision >= 4); // required for some SIMD optimisations

let len = values.len();
let ptr = values.as_mut_ptr();
// Size of `[i16]` always will be not greater than `[f64]` with same number of items
let values_i16 = unsafe { slice::from_raw_parts_mut(ptr as *mut i16, len) };
let mut values_i16 = Vec::with_capacity(coefficients.values.len());

let scale = (1 << precision) as f64;
for (&src, dst) in values.iter().zip(values_i16.iter_mut()) {
*dst = (src * scale).round() as i16;
for src in coefficients.values.iter().copied() {
values_i16.push((src * scale).round() as i16);
}
Self {
values: values_i16,
precision,
window_size: coefficients.window_size,
bounds: coefficients.bounds,
}
Self { values, precision }
}

#[inline]
pub fn normalized_chunks(
&self,
window_size: usize,
bounds: &[Bound],
) -> Vec<CoefficientsI16Chunk> {
let len = self.values.len();
let ptr = self.values.as_ptr();
let mut cooefs = unsafe { slice::from_raw_parts(ptr as *const i16, len) };
let mut res = Vec::with_capacity(bounds.len());
for bound in bounds {
let (left, right) = cooefs.split_at(window_size);
pub fn normalized_chunks(&self) -> Vec<CoefficientsI16Chunk> {
let mut cooefs = self.values.as_slice();
let mut res = Vec::with_capacity(self.bounds.len());
for bound in self.bounds.iter() {
let (left, right) = cooefs.split_at(self.window_size);
cooefs = right;
let size = bound.size as usize;
res.push(CoefficientsI16Chunk {
Expand Down Expand Up @@ -121,25 +118,27 @@ impl NormalizerGuard16 {
// two extra bits for overflow and i64 type.
const PRECISION16_BITS: u8 = 64 - 16 - 2;
// We use the i32 type to store coefficients.
const MAX_COEFS_PRECISION16: u8 = 32 - 1;
const MAX_COEFFS_PRECISION16: u8 = 32 - 1;

#[derive(Debug, Clone, Copy)]
pub struct CoefficientsI32Chunk<'a> {
pub(crate) struct CoefficientsI32Chunk<'a> {
pub start: u32,
pub values: &'a [i32],
}

/// Converts `Vec<f64>` into `&[i32]` without additional memory allocations.
/// The memory buffer from `Vec<f64>` uses as `[i32]` .
pub struct NormalizerGuard32 {
values: Vec<f64>,
/// Converts `Vec<f64>` into `Vec<i32>`.
pub(crate) struct Normalizer32 {
values: Vec<i32>,
precision: u8,
window_size: usize,
bounds: Vec<Bound>,
}

impl NormalizerGuard32 {
impl Normalizer32 {
#[inline]
pub fn new(mut values: Vec<f64>) -> Self {
let max_weight = values
pub fn new(coefficients: Coefficients) -> Self {
let max_weight = coefficients
.values
.iter()
.max_by(|&x, &y| x.partial_cmp(y).unwrap())
.unwrap_or(&0.0)
Expand All @@ -150,36 +149,32 @@ impl NormalizerGuard32 {
precision = cur_precision;
let next_value: i64 = (max_weight * (1i64 << (precision + 1)) as f64).round() as i64;
// The next value will be outside the range, so just stop
if next_value >= (1i64 << MAX_COEFS_PRECISION16) {
if next_value >= (1i64 << MAX_COEFFS_PRECISION16) {
break;
}
}
debug_assert!(precision >= 4); // required for some SIMD optimisations

let len = values.len();
let ptr = values.as_mut_ptr();
// Size of `[i32]` always will be not greater than `[f64]` with same number of items
let values_i32 = unsafe { slice::from_raw_parts_mut(ptr as *mut i32, len) };
let mut values_i32 = Vec::with_capacity(coefficients.values.len());

let scale = (1i64 << precision) as f64;
for (&src, dst) in values.iter().zip(values_i32.iter_mut()) {
*dst = (src * scale).round() as i32;
for src in coefficients.values.iter().copied() {
values_i32.push((src * scale).round() as i32);
}
Self {
values: values_i32,
precision,
window_size: coefficients.window_size,
bounds: coefficients.bounds,
}
Self { values, precision }
}

#[inline]
pub fn normalized_chunks(
&self,
window_size: usize,
bounds: &[Bound],
) -> Vec<CoefficientsI32Chunk> {
let len = self.values.len();
let ptr = self.values.as_ptr();
let mut cooefs = unsafe { slice::from_raw_parts(ptr as *const i32, len) };
let mut res = Vec::with_capacity(bounds.len());
for bound in bounds {
let (left, right) = cooefs.split_at(window_size);
pub fn normalized_chunks(&self) -> Vec<CoefficientsI32Chunk> {
let mut cooefs = self.values.as_slice();
let mut res = Vec::with_capacity(self.bounds.len());
for bound in self.bounds.iter() {
let (left, right) = cooefs.split_at(self.window_size);
cooefs = right;
let size = bound.size as usize;
res.push(CoefficientsI32Chunk {
Expand All @@ -205,12 +200,20 @@ impl NormalizerGuard32 {
mod tests {
use super::*;

fn get_coefficients(value: f64) -> Coefficients {
Coefficients {
values: vec![value],
window_size: 0,
bounds: vec![],
}
}

#[test]
fn test_minimal_precision() {
// required for some SIMD optimisations
assert!(NormalizerGuard16::new(vec![0.0]).precision() >= 4);
assert!(NormalizerGuard16::new(vec![2.0]).precision() >= 4);
assert!(NormalizerGuard32::new(vec![0.0]).precision() >= 4);
assert!(NormalizerGuard32::new(vec![2.0]).precision() >= 4);
assert!(Normalizer16::new(get_coefficients(0.0)).precision() >= 4);
assert!(Normalizer16::new(get_coefficients(2.0)).precision() >= 4);
assert!(Normalizer32::new(get_coefficients(0.0)).precision() >= 4);
assert!(Normalizer32::new(get_coefficients(2.0)).precision() >= 4);
}
}

0 comments on commit 8a159fc

Please sign in to comment.