diff --git a/pageserver/src/bin/layer_map_analyzer.rs b/pageserver/src/bin/layer_map_analyzer.rs
new file mode 100644
index 000000000000..e7408794584c
--- /dev/null
+++ b/pageserver/src/bin/layer_map_analyzer.rs
@@ -0,0 +1,230 @@
+//! Tool for extracting content-dependent metadata about layers. Useful for scanning real project layer files and evaluating the effectiveness of different heuristics on them.
+//!
+//! Currently it only analyzes holes, which are regions within the layer's key range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?), but it should never return sensitive data.
+
+use anyhow::Result;
+use std::cmp::Ordering;
+use std::collections::BinaryHeap;
+use std::ops::Range;
+use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
+
+use pageserver::page_cache::PAGE_SZ;
+use pageserver::repository::{Key, KEY_SIZE};
+use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
+use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
+use pageserver::tenant::storage_layer::range_overlaps;
+use pageserver::virtual_file::VirtualFile;
+
+use utils::{bin_ser::BeSer, lsn::Lsn};
+
+const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
+const DEFAULT_MAX_HOLES: usize = 10;
+
+/// Wrapper for a key range, ordered by range length in reverse so that BinaryHeap can be used as a min-heap
+#[derive(PartialEq, Eq)]
+struct Hole(Range<Key>);
+
+impl Ord for Hole {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let other_len = other.0.end.to_i128() - other.0.start.to_i128();
+        let self_len = self.0.end.to_i128() - self.0.start.to_i128();
+        other_len.cmp(&self_len)
+    }
+}
+
+impl PartialOrd for Hole {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+struct LayerFile {
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
+    is_delta: bool,
+    holes: Vec<Hole>,
+}
+
+impl LayerFile {
+    fn skips(&self, key_range: &Range<Key>) -> bool {
+        if !range_overlaps(&self.key_range, key_range) {
+            return false;
+        }
+        let start = match self
+            .holes
+            .binary_search_by_key(&key_range.start, |hole| hole.0.start)
+        {
+            Ok(index) => index,
+            Err(index) => {
+                if index == 0 {
+                    return false;
+                }
+                index - 1
+            }
+        };
+        self.holes[start].0.end >= key_range.end
+    }
+}
+
+fn parse_filename(name: &str) -> Option<LayerFile> {
+    let split: Vec<&str> = name.split("__").collect();
+    if split.len() != 2 {
+        return None;
+    }
+    let keys: Vec<&str> = split[0].split('-').collect();
+    let mut lsns: Vec<&str> = split[1].split('-').collect();
+    let is_delta = if lsns.len() == 1 {
+        lsns.push(lsns[0]);
+        false
+    } else {
+        true
+    };
+
+    let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
+    let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
+    let holes = Vec::new();
+    Some(LayerFile {
+        key_range,
+        lsn_range,
+        is_delta,
+        holes,
+    })
+}
+
+// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
+fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+    let file = FileBlockReader::new(VirtualFile::open(path)?);
+    let summary_blk = file.read_blk(0)?;
+    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+        actual_summary.index_start_blk,
+        actual_summary.index_root_blk,
+        file,
+    );
+    // min-heap (reserve space for one more element added before eviction)
+    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+    let mut prev_key: Option<Key> = None;
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, _value| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            if let Some(prev) = prev_key {
+                if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
+                    heap.push(Hole(prev..curr));
+                    if heap.len() > max_holes {
+                        heap.pop(); // remove smallest hole
+                    }
+                }
+            }
+            prev_key = Some(curr.next());
+            true
+        },
+    )?;
+    let mut holes = heap.into_vec();
+    holes.sort_by_key(|hole| hole.0.start);
+    Ok(holes)
+}
+
+fn main() -> Result<()> {
+    let args: Vec<String> = env::args().collect();
+    if args.len() < 2 {
+        println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
+        return Ok(());
+    }
+    let storage_path = PathBuf::from_str(&args[1])?;
+    let max_holes = if args.len() > 2 {
+        args[2].parse::<usize>().unwrap()
+    } else {
+        DEFAULT_MAX_HOLES
+    };
+
+    // Initialize virtual_file (the file descriptor cache) and the page cache, which are needed to access the layers' persistent B-trees.
+    pageserver::virtual_file::init(10);
+    pageserver::page_cache::init(100);
+
+    let mut total_delta_layers = 0usize;
+    let mut total_image_layers = 0usize;
+    let mut total_excess_layers = 0usize;
+    for tenant in fs::read_dir(storage_path.join("tenants"))? {
+        let tenant = tenant?;
+        if !tenant.file_type()?.is_dir() {
+            continue;
+        }
+        for timeline in fs::read_dir(tenant.path().join("timelines"))? {
+            let timeline = timeline?;
+            if !timeline.file_type()?.is_dir() {
+                continue;
+            }
+            // Collect a sorted vec of layers and count the deltas
+            let mut layers = Vec::new();
+            let mut n_deltas = 0usize;
+
+            for layer in fs::read_dir(timeline.path())? {
+                let layer = layer?;
+                if let Some(mut layer_file) =
+                    parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if layer_file.is_delta {
+                        layer_file.holes = get_holes(&layer.path(), max_holes)?;
+                        n_deltas += 1;
+                    }
+                    layers.push(layer_file);
+                }
+            }
+            layers.sort_by_key(|layer| layer.lsn_range.end);
+
+            // Count the number of holes and the number of excess layers.
+            // An excess layer is an image layer that would be generated if holes in delta layers were not taken into account.
+            let mut n_excess_layers = 0usize;
+            let mut n_holes = 0usize;
+
+            for i in 0..layers.len() {
+                if !layers[i].is_delta {
+                    let mut n_deltas_since_last_image = 0usize;
+                    let mut n_skipped = 0usize;
+                    let img_key_range = &layers[i].key_range;
+                    for j in (0..i).rev() {
+                        if range_overlaps(img_key_range, &layers[j].key_range) {
+                            if layers[j].is_delta {
+                                n_deltas_since_last_image += 1;
+                                if layers[j].skips(img_key_range) {
+                                    n_skipped += 1;
+                                }
+                            } else {
+                                // An image layer is always dense, even though it doesn't contain every possible
+                                // key value in its range: there can be no key in the storage that belongs to the
+                                // image layer's range but is missing from the image layer.
+                                break;
+                            }
+                        }
+                    }
+                    if n_deltas_since_last_image >= 3 && n_deltas_since_last_image - n_skipped < 3 {
+                        // This is just an approximation: it doesn't take into account full image coverage.
+                        // Moreover, the new layer map doesn't count total deltas, but the maximal stack of overlapping deltas.
+                        n_excess_layers += 1;
+                    }
+                    n_holes += n_skipped;
+                }
+            }
+            println!(
+                "Tenant {} timeline {} delta layers {} image layers {} excess layers {} holes {}",
+                tenant.file_name().into_string().unwrap(),
+                timeline.file_name().into_string().unwrap(),
+                n_deltas,
+                layers.len() - n_deltas,
+                n_excess_layers,
+                n_holes
+            );
+            total_delta_layers += n_deltas;
+            total_image_layers += layers.len() - n_deltas;
+            total_excess_layers += n_excess_layers;
+        }
+    }
+    println!(
+        "Total delta layers {} image layers {} excess layers {}",
+        total_delta_layers, total_image_layers, total_excess_layers
+    );
+    Ok(())
+}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2f45fe0dfc83..1af5eb07a2f3 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -77,7 +77,7 @@ use utils::{
 
 mod blob_io;
 pub mod block_io;
-mod disk_btree;
+pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 2149fc7eb739..cb7380be2d8c 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,6 +1,6 @@
 //! Common traits and structs for layers
 
-mod delta_layer;
+pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 302ba2dc787b..e9bcb55f9b51 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -62,7 +62,7 @@ use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOr
 /// the 'index' starts at the block indicated by 'index_start_blk'
 ///
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
-struct Summary {
+pub struct Summary {
     /// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
     magic: u16,
     format_version: u16,
@@ -73,9 +73,9 @@
     lsn_range: Range<Lsn>,
 
     /// Block number where the 'index' part of the file begins.
-    index_start_blk: u32,
+    pub index_start_blk: u32,
     /// Block within the 'index', where the B-tree root page is stored
-    index_root_blk: u32,
+    pub index_root_blk: u32,
 }
 
 impl From<&DeltaLayer> for Summary {
@@ -125,7 +125,7 @@ impl BlobRef {
     }
 }
 
-const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
+pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
 struct DeltaKey([u8; DELTA_KEY_SIZE]);
 
 ///