diff --git a/Cargo.toml b/Cargo.toml index 1dc7f5685..46a57daeb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ edition = "2018" [workspace] members = [ - "bench", "regex-capi", "regex-debug", "regex-syntax", + "bench", "regex-capi", "regex-syntax", ] [lib] diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml deleted file mode 100644 index 1db4036b9..000000000 --- a/regex-debug/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -publish = false -name = "regex-debug" -version = "0.1.0" -authors = ["The Rust Project Developers"] -license = "MIT OR Apache-2.0" -repository = "https://github.com/rust-lang/regex" -documentation = "https://docs.rs/regex" -homepage = "https://github.com/rust-lang/regex" -description = "A tool useful for debugging regular expressions." -workspace = ".." -edition = "2018" - -[dependencies] -docopt = "1" -regex = { version = "1.1", path = ".." } -regex-syntax = { version = "0.6", path = "../regex-syntax" } -serde = { version = "1", features = ["derive"] } diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs deleted file mode 100644 index a7dd453e1..000000000 --- a/regex-debug/src/main.rs +++ /dev/null @@ -1,376 +0,0 @@ -use std::error; -use std::io::{self, Write}; -use std::process; -use std::result; - -use docopt::Docopt; -use regex::internal::{Compiler, LiteralSearcher}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; - -const USAGE: &'static str = " -Usage: - regex-debug [options] ast - regex-debug [options] hir - regex-debug [options] prefixes ... - regex-debug [options] suffixes ... - regex-debug [options] anchors - regex-debug [options] captures - regex-debug [options] compile ... - regex-debug [options] utf8-ranges - regex-debug [options] utf8-ranges-rev - regex-debug --help - -Options: - --help Show this usage message. - --size-limit ARG An approximate size limit on the total size (in bytes) - of a compiled regular expression program. - [default: 10485760] - --bytes Show the instruction codes for byte oriented programs. - (As opposed to Unicode oriented programs.) - --dfa Show the instruction codes for a DFA. - --dfa-reverse Show the instruction codes for a reverse DFA. - This implies --dfa. - -a, --all-literals Shows all literals extracted. - By default, only unambiguous literals are shown. - --literal-limit ARG An approximate limit on the total size (in bytes) - of all literals extracted. [default: 250] - --class-limit ARG A limit on the size of character classes used to - extract literals. [default: 10] - --literal-bytes Show raw literal bytes instead of Unicode chars. - --lcp Show the longest common prefix of all the literals - extracted. - --lcs Show the longest common suffix of all the literals - extracted. - --searcher Show the debug output for the literal searcher - constructed by the literals found. - --quiet Show less output. -"; - -#[derive(serde::Deserialize)] -struct Args { - cmd_ast: bool, - cmd_hir: bool, - cmd_prefixes: bool, - cmd_suffixes: bool, - cmd_anchors: bool, - cmd_captures: bool, - cmd_compile: bool, - cmd_utf8_ranges: bool, - cmd_utf8_ranges_rev: bool, - - arg_pattern: String, - arg_patterns: Vec, - arg_class: String, - - flag_size_limit: usize, - flag_bytes: bool, - flag_dfa: bool, - flag_dfa_reverse: bool, - flag_all_literals: bool, - flag_literal_limit: usize, - flag_class_limit: usize, - flag_literal_bytes: bool, - flag_lcp: bool, - flag_lcs: bool, - flag_searcher: bool, - flag_quiet: bool, -} - -type Result = result::Result>; - -fn main() { - let mut args: Args = Docopt::new(USAGE) - .and_then(|d| d.deserialize()) - .unwrap_or_else(|e| e.exit()); - if args.flag_dfa_reverse { - args.flag_dfa = true; - } - match run(&args) { - Ok(_) => process::exit(0), - Err(err) => { - let _ = writeln!(&mut io::stderr(), "{}", err); - process::exit(1) - } - } -} - -fn run(args: &Args) -> Result<()> { - if args.cmd_ast { - cmd_ast(args) - } else if args.cmd_hir { - cmd_hir(args) - } else if args.cmd_prefixes { - cmd_literals(args) - } else if args.cmd_suffixes { - cmd_literals(args) - } else if args.cmd_anchors { - cmd_anchors(args) - } else if args.cmd_captures { - cmd_captures(args) - } else if args.cmd_compile { - cmd_compile(args) - } else if args.cmd_utf8_ranges { - cmd_utf8_ranges(args) - } else if args.cmd_utf8_ranges_rev { - cmd_utf8_ranges_rev(args) - } else { - unreachable!() - } -} - -fn cmd_ast(args: &Args) -> Result<()> { - use regex_syntax::ast::parse::Parser; - - let mut parser = Parser::new(); - let ast = parser.parse(&args.arg_pattern)?; - println!("{:#?}", ast); - Ok(()) -} - -fn cmd_hir(args: &Args) -> Result<()> { - use regex_syntax::ParserBuilder; - - let mut parser = ParserBuilder::new().allow_invalid_utf8(false).build(); - let hir = parser.parse(&args.arg_pattern)?; - println!("{:#?}", hir); - Ok(()) -} - -fn cmd_literals(args: &Args) -> Result<()> { - let exprs = args.parse_many()?; - let mut lits = if args.cmd_prefixes { - args.literals(&exprs, |lits, e| lits.union_prefixes(e)) - } else { - args.literals(&exprs, |lits, e| lits.union_suffixes(e)) - }; - if !args.flag_all_literals { - if args.cmd_prefixes { - lits = lits.unambiguous_prefixes(); - } else { - lits = lits.unambiguous_suffixes(); - } - } - if args.flag_searcher { - if args.cmd_prefixes { - println!("{:?}", LiteralSearcher::prefixes(lits)) - } else { - println!("{:?}", LiteralSearcher::suffixes(lits)) - } - } else if args.flag_lcp { - println!("{}", escape_unicode(lits.longest_common_prefix())); - } else if args.flag_lcs { - println!("{}", escape_unicode(lits.longest_common_suffix())); - } else { - for lit in lits.literals() { - if args.flag_literal_bytes { - if lit.is_cut() { - println!("Cut({})", escape_bytes(lit)); - } else { - println!("Complete({})", escape_bytes(lit)); - } - } else { - println!("{:?}", lit); - } - } - } - Ok(()) -} - -fn cmd_anchors(args: &Args) -> Result<()> { - let expr = args.parse_one()?; - if expr.is_anchored_start() { - println!("start"); - } - if expr.is_anchored_end() { - println!("end"); - } - Ok(()) -} - -fn cmd_captures(args: &Args) -> Result<()> { - let expr = args.parse_one()?; - let prog = args.compiler().only_utf8(false).compile(&[expr])?; - for (i, name) in prog.captures.iter().enumerate() { - match *name { - None => println!("{}", i), - Some(ref name) => println!("{}:{}", i, name), - } - } - Ok(()) -} - -fn cmd_compile(args: &Args) -> Result<()> { - let exprs = args.parse_many()?; - let compiler = args - .compiler() - .bytes(args.flag_bytes) - .only_utf8(!args.flag_bytes) - .dfa(args.flag_dfa) - .reverse(args.flag_dfa_reverse); - let prog = compiler.compile(&exprs)?; - if !args.flag_quiet { - print!("{:?}", prog); - } else { - println!("instruction count: {}", prog.insts.len()); - } - Ok(()) -} - -fn cmd_utf8_ranges(args: &Args) -> Result<()> { - use regex_syntax::hir::{self, HirKind}; - use regex_syntax::utf8::Utf8Sequences; - use regex_syntax::ParserBuilder; - - let hir = ParserBuilder::new() - .build() - .parse(&format!("[{}]", args.arg_class))?; - let cls = match hir.into_kind() { - HirKind::Class(hir::Class::Unicode(cls)) => cls, - _ => { - return Err( - format!("unexpected HIR, expected Unicode class").into() - ) - } - }; - let mut char_count = 0; - for (i, range) in cls.iter().enumerate() { - if i > 0 { - println!("----------------------------"); - } - char_count += (range.end() as u32) - (range.start() as u32) + 1; - for seq in Utf8Sequences::new(range.start(), range.end()) { - for utf8_range in seq.into_iter() { - print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end); - } - println!(); - } - } - println!("codepoint count: {}", char_count); - Ok(()) -} - -fn cmd_utf8_ranges_rev(args: &Args) -> Result<()> { - use regex_syntax::hir::{self, HirKind}; - use regex_syntax::utf8::Utf8Sequences; - use regex_syntax::ParserBuilder; - - let hir = ParserBuilder::new() - .build() - .parse(&format!("[{}]", args.arg_class))?; - let cls = match hir.into_kind() { - HirKind::Class(hir::Class::Unicode(cls)) => cls, - _ => { - return Err( - format!("unexpected HIR, expected Unicode class").into() - ) - } - }; - let mut char_count = 0; - let mut seqs = vec![]; - for (_, range) in cls.iter().enumerate() { - char_count += (range.end() as u32) - (range.start() as u32) + 1; - for seq in Utf8Sequences::new(range.start(), range.end()) { - let mut seq = seq.as_slice().to_vec(); - seq.reverse(); - seqs.push(seq); - } - } - seqs.sort(); - for seq in seqs { - for utf8_range in seq.into_iter() { - print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end); - } - println!(); - } - println!("codepoint count: {}", char_count); - Ok(()) -} - -impl Args { - fn parse_one(&self) -> Result { - parse(&self.arg_pattern) - } - - fn parse_many(&self) -> Result> { - self.arg_patterns.iter().map(|s| parse(s)).collect() - } - - fn literals bool>( - &self, - exprs: &[Hir], - get_literals: F, - ) -> Literals { - let mut lits = Some(self.empty_literals()); - for e in exprs { - lits = lits.and_then(|mut lits| { - if !get_literals(&mut lits, e) { - None - } else { - Some(lits) - } - }); - } - lits.unwrap_or(self.empty_literals()) - } - - fn empty_literals(&self) -> Literals { - let mut lits = Literals::empty(); - lits.set_limit_size(self.flag_literal_limit); - lits.set_limit_class(self.flag_class_limit); - lits - } - - fn compiler(&self) -> Compiler { - Compiler::new().size_limit(self.flag_size_limit) - } -} - -fn parse(re: &str) -> Result { - use regex_syntax::ParserBuilder; - ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(re) - .map_err(From::from) -} - -fn escape_unicode(bytes: &[u8]) -> String { - let show = match ::std::str::from_utf8(bytes) { - Ok(v) => v.to_string(), - Err(_) => escape_bytes(bytes), - }; - let mut space_escaped = String::new(); - for c in show.chars() { - if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else { - if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) - } else { - format!(r"\U{{{:08x}}}", c as u32) - } - }; - space_escaped.push_str(&escaped); - } else { - space_escaped.push(c); - } - } - space_escaped -} - -fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s -} - -fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() -}