Skip to content
This repository has been archived by the owner on Sep 24, 2022. It is now read-only.

Decrease deserialization complexity from quadratic to linear #349

Merged
merged 4 commits into from Oct 28, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
91 changes: 73 additions & 18 deletions src/de.rs
Expand Up @@ -5,6 +5,7 @@
//! provided at the top of the crate.

use std::borrow::Cow;
use std::collections::HashMap;
use std::error;
use std::f64;
use std::fmt;
Expand Down Expand Up @@ -214,6 +215,8 @@ impl<'de, 'b> de::Deserializer<'de> for &'b mut Deserializer<'de> {
V: de::Visitor<'de>,
{
let mut tables = self.tables()?;
let table_indices = build_table_indices(&tables);
let table_pindices = build_table_pindices(&tables);

let res = visitor.visit_map(MapVisitor {
values: Vec::new().into_iter(),
Expand All @@ -222,6 +225,8 @@ impl<'de, 'b> de::Deserializer<'de> for &'b mut Deserializer<'de> {
cur: 0,
cur_parent: 0,
max: tables.len(),
table_indices: &table_indices,
table_pindices: &table_pindices,
tables: &mut tables,
array: false,
de: self,
Expand Down Expand Up @@ -318,6 +323,28 @@ impl<'de, 'b> de::Deserializer<'de> for &'b mut Deserializer<'de> {
}
}

/// Groups table indices by their full header path.
///
/// The key is the sequence of header key strings (the second element of each
/// header segment; the first element is ignored). The value lists the index
/// of every table in `tables` whose header is exactly that sequence. Tables
/// are visited front to back, so each index list is in ascending order.
fn build_table_indices<'de>(tables: &[Table<'de>]) -> HashMap<Vec<Cow<'de, str>>, Vec<usize>> {
    let mut by_header: HashMap<Vec<Cow<'de, str>>, Vec<usize>> = HashMap::new();
    for (idx, tbl) in tables.iter().enumerate() {
        // Only the key text takes part in the lookup key.
        let path: Vec<Cow<'de, str>> = tbl.header.iter().map(|seg| seg.1.clone()).collect();
        by_header.entry(path).or_default().push(idx);
    }
    by_header
}

/// Groups table indices by every prefix of their header path.
///
/// For each table, the header key strings are collected into a path and the
/// table's index is recorded under each prefix of that path, from the empty
/// prefix up to and including the full path. Consequently the empty key lists
/// every table. Tables are visited front to back, so each index list is in
/// ascending order.
fn build_table_pindices<'de>(tables: &[Table<'de>]) -> HashMap<Vec<Cow<'de, str>>, Vec<usize>> {
    let mut by_prefix: HashMap<Vec<Cow<'de, str>>, Vec<usize>> = HashMap::new();
    for (idx, tbl) in tables.iter().enumerate() {
        let path: Vec<Cow<'de, str>> = tbl.header.iter().map(|seg| seg.1.clone()).collect();
        // `0..=path.len()` covers both the empty prefix and the whole path.
        for end in 0..=path.len() {
            by_prefix.entry(path[..end].to_vec()).or_default().push(idx);
        }
    }
    by_prefix
}

fn headers_equal<'a, 'b>(hdr_a: &[(Span, Cow<'a, str>)], hdr_b: &[(Span, Cow<'b, str>)]) -> bool {
if hdr_a.len() != hdr_b.len() {
return false;
Expand All @@ -339,6 +366,8 @@ struct MapVisitor<'de, 'b> {
cur: usize,
cur_parent: usize,
max: usize,
table_indices: &'b HashMap<Vec<Cow<'de, str>>, Vec<usize>>,
table_pindices: &'b HashMap<Vec<Cow<'de, str>>, Vec<usize>>,
tables: &'b mut [Table<'de>],
array: bool,
de: &'b mut Deserializer<'de>,
Expand All @@ -364,20 +393,27 @@ impl<'de, 'b> de::MapAccess<'de> for MapVisitor<'de, 'b> {
}

let next_table = {
let prefix = &self.tables[self.cur_parent].header[..self.depth];
self.tables[self.cur..self.max]
let prefix_stripped = self.tables[self.cur_parent].header[..self.depth]
.iter()
.enumerate()
.find(|&(_, t)| {
if t.values.is_none() {
return false;
}
match t.header.get(..self.depth) {
Some(header) => headers_equal(&header, &prefix),
None => false,
.map(|v| v.1.clone())
.collect::<Vec<_>>();
self.table_pindices
.get(&prefix_stripped)
.and_then(|entries| {
let start = entries
.binary_search(&self.cur)
.unwrap_or_else(std::convert::identity);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd personally prefer if this were `unwrap_or_else(|i| i)`.

if start == entries.len() || entries[start] < self.cur {
return None;
}
entries[start..]
.iter()
.copied()
.filter(|i| *i < self.max)
.map(|i| (i, &self.tables[i]))
.find(|(_, table)| table.values.is_some())
.map(|p| p.0)
})
.map(|(i, _)| i + self.cur)
};

let pos = match next_table {
Expand Down Expand Up @@ -469,6 +505,8 @@ impl<'de, 'b> de::MapAccess<'de> for MapVisitor<'de, 'b> {
cur: 0,
max: self.max,
array,
table_indices: &*self.table_indices,
table_pindices: &*self.table_pindices,
tables: &mut *self.tables,
de: &mut *self.de,
});
Expand All @@ -493,15 +531,30 @@ impl<'de, 'b> de::SeqAccess<'de> for MapVisitor<'de, 'b> {
return Ok(None);
}

let next = self.tables[..self.max]
let header_stripped = self.tables[self.cur_parent]
.header
.iter()
.enumerate()
.skip(self.cur_parent + 1)
.find(|&(_, table)| {
let tables_eq = headers_equal(&table.header, &self.tables[self.cur_parent].header);
table.array && tables_eq
.map(|v| v.1.clone())
.collect::<Vec<_>>();
let start_idx = self.cur_parent + 1;
let next = self
.table_indices
.get(&header_stripped)
.and_then(|entries| {
let start = entries
.binary_search(&start_idx)
.unwrap_or_else(std::convert::identity);
if start == entries.len() || entries[start] < start_idx {
return None;
}
entries[start..]
.iter()
.copied()
.filter(|i| *i < self.max)
.map(|i| (i, &self.tables[i]))
.find(|(_, table)| table.array)
.map(|p| p.0)
})
.map(|p| p.0)
.unwrap_or(self.max);

let ret = seed.deserialize(MapVisitor {
Expand All @@ -516,6 +569,8 @@ impl<'de, 'b> de::SeqAccess<'de> for MapVisitor<'de, 'b> {
max: next,
cur: 0,
array: false,
table_indices: &*self.table_indices,
table_pindices: &*self.table_pindices,
tables: &mut self.tables,
de: &mut self.de,
})?;
Expand Down
37 changes: 37 additions & 0 deletions test-suite/tests/linear.rs
@@ -0,0 +1,37 @@
use std::time::{Duration, Instant};
use toml::Value;

const TOLERANCE: f64 = 2.0;

/// Generates a TOML document with `entries` sections and returns how long it
/// takes to parse it into a `Value`.
///
/// For every `i` in `0..entries` the document contains the text returned by
/// `f(i)` followed by the line `entry = 42`. The document is assembled before
/// the clock starts, so the returned duration covers only the `parse` call
/// and not the string building.
///
/// # Panics
///
/// Panics if the generated document fails to parse.
fn measure_time(entries: usize, f: impl Fn(usize) -> String) -> Duration {
    let mut s = String::new();
    for i in 0..entries {
        s.push_str(&f(i));
        s.push_str("entry = 42\n");
    }

    // Start timing only once the input exists.
    let start = Instant::now();
    s.parse::<Value>()
        .expect("generated TOML document should parse");
    start.elapsed()
}

/// Checks that parsing time grows roughly linearly with the number of
/// `[table]` headers.
///
/// Documents with 100 and 400 tables are timed, and the larger run must take
/// between `4.0 - TOLERANCE` and `4.0 + TOLERANCE` times as long as the
/// smaller one. Failure messages include the measured durations.
#[test]
fn linear_increase_map() {
    let time_1 = measure_time(100, |i| format!("[header_no_{}]\n", i));
    let time_4 = measure_time(400, |i| format!("[header_no_{}]\n", i));

    // Four times the input should cost about four times the time when parsing
    // is linear; quadratic behaviour would push the ratio towards sixteen.
    let lower = time_1.mul_f64(4.0 - TOLERANCE);
    let upper = time_1.mul_f64(4.0 + TOLERANCE);
    assert!(
        time_4 > lower,
        "400 tables took {:?}, expected more than {:?} (100 tables took {:?})",
        time_4,
        lower,
        time_1
    );
    assert!(
        time_4 < upper,
        "400 tables took {:?}, expected less than {:?} (100 tables took {:?})",
        time_4,
        upper,
        time_1
    );
}

/// Checks that parsing time grows roughly linearly with the number of
/// `[[array-of-tables]]` headers.
///
/// Documents with 100 and 400 array tables are timed, and the larger run must
/// take between `4.0 - TOLERANCE` and `4.0 + TOLERANCE` times as long as the
/// smaller one. Failure messages include the measured durations.
#[test]
fn linear_increase_array() {
    let time_1 = measure_time(100, |i| format!("[[header_no_{}]]\n", i));
    let time_4 = measure_time(400, |i| format!("[[header_no_{}]]\n", i));

    // Four times the input should cost about four times the time when parsing
    // is linear; quadratic behaviour would push the ratio towards sixteen.
    let lower = time_1.mul_f64(4.0 - TOLERANCE);
    let upper = time_1.mul_f64(4.0 + TOLERANCE);
    assert!(
        time_4 > lower,
        "400 array tables took {:?}, expected more than {:?} (100 array tables took {:?})",
        time_4,
        lower,
        time_1
    );
    assert!(
        time_4 < upper,
        "400 array tables took {:?}, expected less than {:?} (100 array tables took {:?})",
        time_4,
        upper,
        time_1
    );
}