Skip to content

Commit

Permalink
[wip] Compile globs via HIR rather than text.
Browse files Browse the repository at this point in the history
This change migrates from the `regex` crate to the lower-level `regex-syntax`
and `regex-automata` crates on which it is built. This greatly improves
flexibility in encoding globs and matched text, and removes the need for an
obscure textual encoding step before compiling automata.
  • Loading branch information
olson-sean-k committed Mar 27, 2024
1 parent 46d690b commit 5872def
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 139 deletions.
10 changes: 7 additions & 3 deletions Cargo.toml
Expand Up @@ -33,7 +33,6 @@ miette = [
walk = ["dep:walkdir"]

[dependencies]
const_format = "^0.2.0"
itertools = "^0.11.0"
nom = "^7.0.0"
pori = "=0.0.0"
Expand All @@ -44,15 +43,20 @@ version = "^5.10.0"
default-features = false
optional = true

[dependencies.regex]
version = "^1.9.0"
[dependencies.regex-automata]
version = "^0.4.6"
default-features = false
features = [
"meta",
"perf",
"std",
"unicode-case"
]

[dependencies.regex-syntax]
version = "^0.8.3"
default-features = false

[dependencies.tardar]
version = "^0.1.0"
optional = true
Expand Down
134 changes: 25 additions & 109 deletions src/capture.rs
@@ -1,81 +1,28 @@
use regex::Captures as BorrowedText;
use regex_automata::meta::Regex;
use regex_automata::util::captures::Captures;
use std::borrow::Cow;
use std::str;

use crate::CandidatePath;

#[derive(Clone, Debug)]
struct OwnedText {
matched: String,
ranges: Vec<Option<(usize, usize)>>,
/// Extension trait for matching text and packaging the result as [`MatchedText`].
pub trait RegexExt {
/// Matches `text` and returns the matched text together with its captures, or `None` if
/// there is no match.
///
/// `text` may be borrowed or owned; it is converted into a `Cow` that the returned
/// [`MatchedText`] carries.
fn matched<'t>(&self, text: impl Into<Cow<'t, str>>) -> Option<MatchedText<'t>>;
}

impl OwnedText {
pub fn get(&self, index: usize) -> Option<&str> {
if index == 0 {
Some(self.matched.as_ref())
impl RegexExt for Regex {
fn matched<'t>(&self, text: impl Into<Cow<'t, str>>) -> Option<MatchedText<'t>> {
let text = text.into();
let mut captures = self.create_captures();
self.captures(text.as_ref(), &mut captures);
if captures.is_match() {
Some(MatchedText { text, captures })
}
else {
self.ranges
.get(index - 1)
.and_then(|range| range.map(|range| &self.matched[range.0..range.1]))
None
}
}
}

/// Builds an [`OwnedText`] from captures taken by value.
impl<'t> From<BorrowedText<'t>> for OwnedText {
/// Delegates to the by-reference conversion; the captures are only read, not consumed.
fn from(captures: BorrowedText<'t>) -> Self {
From::from(&captures)
}
}

/// Copies borrowed captures into an [`OwnedText`] that owns its data.
impl<'m, 't> From<&'m BorrowedText<'t>> for OwnedText {
/// Copies the text of group 0 into `matched` and records the `(start, end)` pair of each
/// remaining group in `ranges`, keeping `None` for groups that have no match.
///
/// NOTE(review): `start()`/`end()` are presumably byte offsets into the searched text rather
/// than into `matched`; the two only agree when the overall match begins at offset 0.
/// Confirm that every pattern used here is anchored at the start of the text.
fn from(captures: &'m BorrowedText<'t>) -> Self {
// Group 0 is unwrapped without a fallback; this panics if it is absent.
let matched = captures.get(0).unwrap().as_str().into();
let ranges = captures
.iter()
// Skip group 0, which is stored separately in `matched`.
.skip(1)
.map(|capture| capture.map(|capture| (capture.start(), capture.end())))
.collect();
OwnedText { matched, ranges }
}
}

/// Capture data that is either borrowed (with lifetime `'t`) or held in an owning copy.
#[derive(Debug)]
enum MaybeOwnedText<'t> {
// Captures as produced by the `regex` crate (`regex::Captures`, imported as `BorrowedText`).
Borrowed(BorrowedText<'t>),
// An owning copy of the matched text and its capture ranges.
Owned(OwnedText),
}

/// Conversions from possibly-borrowed capture data into `'static` instances.
impl<'t> MaybeOwnedText<'t> {
/// Consumes `self` and returns an `Owned` instance.
///
/// Borrowed captures are converted into an [`OwnedText`]; owned data is moved as is.
fn into_owned(self) -> MaybeOwnedText<'static> {
match self {
MaybeOwnedText::Borrowed(borrowed) => OwnedText::from(borrowed).into(),
MaybeOwnedText::Owned(owned) => owned.into(),
}
}

/// Returns an `Owned` instance without consuming `self`.
///
/// Borrowed captures are converted into an [`OwnedText`]; owned data is cloned.
// This conversion may appear to operate in place.
#[must_use]
fn to_owned(&self) -> MaybeOwnedText<'static> {
match self {
MaybeOwnedText::Borrowed(ref borrowed) => OwnedText::from(borrowed).into(),
MaybeOwnedText::Owned(ref owned) => owned.clone().into(),
}
}
}

/// Wraps borrowed captures in the `Borrowed` variant without copying them.
impl<'t> From<BorrowedText<'t>> for MaybeOwnedText<'t> {
fn from(captures: BorrowedText<'t>) -> Self {
MaybeOwnedText::Borrowed(captures)
}
}

/// Wraps owned capture data in the `Owned` variant, which is `'static`.
impl From<OwnedText> for MaybeOwnedText<'static> {
fn from(captures: OwnedText) -> Self {
MaybeOwnedText::Owned(captures)
}
}

/// Text that has been matched by a [`Program`] and its captures.
///
/// To match a [`Glob`] or other [`Program`] against a [`CandidatePath`] and get the matched text,
Expand Down Expand Up @@ -107,33 +54,19 @@ impl From<OwnedText> for MaybeOwnedText<'static> {
/// [`Glob`]: crate::Glob
/// [`Program`]: crate::Program
/// [`Program::matched`]: crate::Program::matched
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct MatchedText<'t> {
inner: MaybeOwnedText<'t>,
text: Cow<'t, str>,
captures: Captures,
}

impl<'t> MatchedText<'t> {
/// Clones any borrowed data into an owning instance.
pub fn into_owned(self) -> MatchedText<'static> {
let MatchedText { inner } = self;
MatchedText {
inner: inner.into_owned(),
}
}

/// Clones any borrowed data to an owning instance.
///
/// This function is similar to [`into_owned`], but does not consume its receiver. Due to a
/// technical limitation, `MatchedText` cannot properly implement [`Clone`], so this function
/// is provided as a stop gap that allows a distinct instance to be created that owns its data.
///
/// [`Clone`]: std::clone::Clone
/// [`into_owned`]: crate::MatchedText::into_owned
// This conversion may appear to operate in place.
#[must_use]
pub fn to_owned(&self) -> MatchedText<'static> {
let MatchedText { text, captures } = self;
MatchedText {
inner: self.inner.to_owned(),
text: text.into_owned().into(),
captures,
}
}

Expand Down Expand Up @@ -162,32 +95,15 @@ impl<'t> MatchedText<'t> {
///
/// [`Program`]: crate::Program
pub fn get(&self, index: usize) -> Option<&str> {
match self.inner {
MaybeOwnedText::Borrowed(ref captures) => {
captures.get(index).map(|capture| capture.as_str())
},
MaybeOwnedText::Owned(ref captures) => captures.get(index),
}
self.captures.get_group(index).map(|span| {
self.text
.as_ref()
.get(span.start..span.end)
.expect("match span not in text")
})
}

/// Builds a [`CandidatePath`] from the value returned by `complete`.
///
/// NOTE(review): `complete` is not visible in this hunk; presumably it returns the complete
/// matched text. Confirm against its definition.
pub fn to_candidate_path(&self) -> CandidatePath {
CandidatePath::from(self.complete())
}
}

// TODO: This probably shouldn't be part of the public API.
/// Wraps borrowed captures in a [`MatchedText`] that borrows for `'t`.
impl<'t> From<BorrowedText<'t>> for MatchedText<'t> {
fn from(captures: BorrowedText<'t>) -> Self {
MatchedText {
// Converted through the `From<BorrowedText>` impl for the type of `inner`.
inner: captures.into(),
}
}
}

/// Wraps owned capture data in a `'static` [`MatchedText`].
impl From<OwnedText> for MatchedText<'static> {
fn from(captures: OwnedText) -> Self {
MatchedText {
// Converted through the `From<OwnedText>` impl for the type of `inner`.
inner: captures.into(),
}
}
}
15 changes: 10 additions & 5 deletions src/encode.rs
Expand Up @@ -2,13 +2,18 @@ use const_format::formatcp;
use itertools::{Itertools as _, Position};
#[cfg(feature = "miette")]
use miette::Diagnostic;
use regex::{Error as RegexError, Regex};
use regex_automata::meta::BuildError;
use regex_syntax::hir::Hir;
use std::borrow::{Borrow, Cow};
#[cfg(feature = "miette")]
use std::fmt::Display;
use thiserror::Error;

use crate::token::{ConcatenationTree, Token, TokenTopology};
// TODO: Replace this file with `hir.rs`.

pub use regex_automata::meta::Regex;

use crate::token::{self, Fold, ConcatenationTree, Token, TokenTopology};

/// A regular expression that never matches.
///
Expand Down Expand Up @@ -93,13 +98,13 @@ trait Escaped {

impl Escaped for char {
fn escaped(&self) -> String {
regex::escape(&self.to_string())
regex_syntax::escape(&self.to_string())
}
}

impl Escaped for str {
fn escaped(&self) -> String {
regex::escape(self)
regex_syntax::escape(self)
}
}

Expand Down Expand Up @@ -128,7 +133,7 @@ impl Grouping {
}

pub fn case_folded_eq(left: &str, right: &str) -> bool {
let regex = Regex::new(&format!("(?i){}", regex::escape(left)))
let regex = Regex::new(&format!("(?i){}", regex_syntax::escape(left)))
.expect("failed to compile literal regular expression");
if let Some(matched) = regex.find(right) {
matched.start() == 0 && matched.end() == right.len()
Expand Down

0 comments on commit 5872def

Please sign in to comment.