blobby: new storage format (#64)
newpavlov committed Jul 1, 2020
1 parent 84d3b94 commit 018c1bf
Showing 4 changed files with 321 additions and 282 deletions.
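Summarizing the diffs below: the old format prefixed every blob with a fixed-width little-endian length (1, 2, 4, or 8 bytes, chosen per file from the longest blob, after a b"blobby" magic plus a size digit). The new format drops the magic, stores a de-duplication index of blobs that occur more than once, and encodes all lengths and references as variable-length quantities (VLQ). As produced by encode in blobby/examples/convert.rs, the layout is:

    VLQ(index_len)
    index_len entries, each: VLQ(entry_len) followed by the entry bytes
    then one record per input blob, in order:
        VLQ(2*index_pos + 1)                      -- blob stored in the index
        VLQ(2*blob_len) followed by blob bytes    -- inline blob

The low bit of each per-blob value says whether it is an index reference or an inline length, so duplicated test vectors are stored once and referenced thereafter.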
3 changes: 1 addition & 2 deletions blobby/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "blobby"
-version = "0.2.0"
+version = "0.3.0"
 authors = ["RustCrypto Developers"]
 license = "MIT OR Apache-2.0"
 description = "Iterator over simple binary blob storage"
@@ -11,4 +11,3 @@ edition = "2018"

 [dev-dependencies]
 hex = "0.3"
-byteorder = { version = "1", features = ["std"] }
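(The byteorder dev-dependency was only used to write the old fixed-width little-endian length prefixes; with lengths now VLQ-encoded by hand in convert.rs, it can be dropped.)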
106 changes: 81 additions & 25 deletions blobby/examples/convert.rs
@@ -1,43 +1,92 @@
 //! Convert utility
 use std::{env, error::Error, fs::File};
+use std::collections::HashMap;
 use std::io::{self, Write, BufRead, BufReader, BufWriter};
-use std::{u8, u16, u32};
-use blobby::{BlobIterator};

-use byteorder::{WriteBytesExt, LE};
+use blobby::BlobIterator;
+const NEXT_MASK: u8 = 0b1000_0000;
+const VAL_MASK: u8 = 0b0111_1111;
+
+fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
+    macro_rules! step {
+        ($n:expr) => {
+            buf[$n] = if $n == 3 {
+                (val & (VAL_MASK as usize)) as u8
+            } else {
+                val -= 1;
+                NEXT_MASK | (val & (VAL_MASK as usize)) as u8
+            };
+            val >>= 7;
+            if val == 0 {
+                return &buf[$n..];
+            }
+        };
+    }
+
+    step!(3);
+    step!(2);
+    step!(1);
+    step!(0);
+    panic!("integer is too big")
+}

 fn encode(reader: impl BufRead, mut writer: impl Write)
     -> io::Result<usize>
 {
-    let mut res = Vec::new();
+    let mut blobs = Vec::new();
     for line in reader.lines() {
         let blob = hex::decode(line?.as_str())
             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
-        res.push(blob);
+        blobs.push(blob);
     }
-    let n = match res.iter().map(|b| b.len()).max() {
-        None => 1,
-        Some(m) if m <= u8::MAX as usize => 1,
-        Some(m) if m <= u16::MAX as usize => 2,
-        Some(m) if m <= u32::MAX as usize => 4,
-        _ => 8,
-    };

-    writer.write_all(b"blobby")?;
-    writer.write_all(format!("{}", n).as_bytes())?;
+    let mut idx_map = HashMap::new();
+    for blob in blobs.iter().filter(|b| b.len() != 0) {
+        let v = idx_map.entry(blob.as_slice()).or_insert(0);
+        *v += 1;
+    }

-    for blob in res.iter() {
-        let s = blob.len();
-        match n {
-            1 => writer.write_all(&[s as u8])?,
-            2 => writer.write_u16::<LE>(s as u16)?,
-            4 => writer.write_u32::<LE>(s as u32)?,
-            8 => writer.write_u64::<LE>(s as u64)?,
-            _ => unreachable!(),
+    let mut idx: Vec<&[u8]> = idx_map
+        .iter()
+        .filter(|(_, &v)| v > 1)
+        .map(|(&k, _)| k)
+        .collect();
+    idx.sort_by_key(|e| {
+        let k = match e {
+            &[0] => 2,
+            &[1] => 1,
+            _ => 0,
+        };
+        (k, idx_map.get(e).unwrap())
+    });
+    idx.reverse();
+
+    let rev_idx: HashMap<&[u8], usize> = idx
+        .iter()
+        .enumerate()
+        .map(|(i, &e)| (e, i))
+        .collect();
+
+    println!("Index len: {:?}", idx.len());
+    let mut buf = [0u8; 4];
+    writer.write_all(encode_vlq(idx.len(), &mut buf))?;
+    for e in idx {
+        writer.write_all(encode_vlq(e.len(), &mut buf))?;
+        writer.write_all(e)?;
+    }
+
+    for blob in blobs.iter() {
+        if let Some(dup_pos) = rev_idx.get(blob.as_slice()) {
+            let n = (dup_pos << 1) + 1;
+            writer.write_all(encode_vlq(n, &mut buf))?;
+        } else {
+            let n = blob.len() << 1;
+            writer.write_all(encode_vlq(n, &mut buf))?;
+            writer.write_all(blob)?;
         }
-        writer.write_all(blob)?;
     }
-    Ok(res.len())
+
+    Ok(blobs.len())
 }
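For reference, this is how the VLQ values written by encode_vlq are read back. The real decoder lives in blobby's src/lib.rs (one of the four changed files, not expanded on this page); the sketch below, including the name decode_vlq, is only an illustration consistent with the encoder above:

    fn decode_vlq(data: &[u8]) -> Option<(usize, &[u8])> {
        let mut val = 0usize;
        for (i, &b) in data.iter().enumerate() {
            val += (b & VAL_MASK) as usize;
            if b & NEXT_MASK == 0 {
                // high bit clear: this was the last byte
                return Some((val, &data[i + 1..]));
            }
            // continuation byte: undo the `val -= 1` bias from
            // encode_vlq, then shift to make room for the next 7 bits
            val += 1;
            val <<= 7;
        }
        None // input ended before a terminating byte
    }

The +1 here mirrors the val -= 1 in encode_vlq and makes encodings canonical: without the bias, 0x80 0x00 and 0x00 would both decode to 0, whereas here 0x80 0x00 decodes to 128.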

@@ -46,9 +95,16 @@ fn decode<R: BufRead, W: Write>(mut reader: R, mut writer: W)
     let mut data = Vec::new();
     reader.read_to_end(&mut data)?;
     let res: Vec<_> = BlobIterator::new(&data)
-        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?
+        .map_err(|e| io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("invalid blobby data: {:?}", e),
+        ))?
         .collect();
     for blob in res.iter() {
+        let blob = blob.map_err(|e| io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("invalid blobby data: {:?}", e),
+        ))?;
         writer.write_all(hex::encode(blob).as_bytes())?;
         writer.write_all(b"\n")?;
     }
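A worked example of the new format, traced by hand through encode above (hypothetical input): three hex lines 01, 0203, 01. Only the blob 01 occurs more than once, so the index holds that single entry, and the output bytes are:

    01           index length: 1 entry
    01 01        entry 0: VLQ length 1, then byte 01
    01           blob 1: VLQ(2*0 + 1), a reference to entry 0
    04 02 03     blob 2: VLQ(2*2) = 04, inline, then bytes 02 03
    01           blob 3: a reference to entry 0 again

decode reverses this through BlobIterator, which is expected to resolve index references, so the round trip reproduces the original three lines.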
