Skip to content

Commit

Permalink
Refactor code against changed token APIs.
Browse files Browse the repository at this point in the history
This change begins refactoring code that interacts with the `token`
module and fixes many small errors. The refactoring is incomplete: some
of the more difficult problems remain unsolved and are currently handled
by flawed approaches. Namely, `FoldMap` and `BranchFold` cannot work
properly as designed, and `Component` cannot implement `ConcatenationTree`.
  • Loading branch information
olson-sean-k committed Jan 20, 2024
1 parent 3c95ff0 commit 3e8b2a2
Show file tree
Hide file tree
Showing 10 changed files with 625 additions and 463 deletions.
253 changes: 130 additions & 123 deletions src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::borrow::{Borrow, Cow};
use std::fmt::Display;
use thiserror::Error;

use crate::token::{Bound, Token};
use crate::token::{Bound, ConcatenationTree, Token, TokenTopology};

/// A regular expression that never matches.
///
Expand Down Expand Up @@ -138,13 +138,13 @@ pub fn case_folded_eq(left: &str, right: &str) -> bool {
}
}

pub fn compile<'t, A, T>(tokens: impl IntoIterator<Item = T>) -> Result<Regex, CompileError>
pub fn compile<'t, T>(tree: impl Borrow<T>) -> Result<Regex, CompileError>
where
T: Borrow<Token<'t, A>>,
T: ConcatenationTree<'t>,
{
let mut pattern = String::new();
pattern.push('^');
encode(Grouping::Capture, None, &mut pattern, tokens);
encode(Grouping::Capture, None, &mut pattern, tree);
pattern.push('$');
Regex::new(&pattern).map_err(|error| match error {
RegexError::CompiledTooBig(_) => CompileError {
Expand All @@ -154,22 +154,24 @@ where
})
}

// TODO: Implement this iteratively.
// TODO: Some versions of `const_format` in `^0.2.0` fail this lint in `formatcp`. See
// https://github.com/rodrimati1992/const_format_crates/issues/38
#[allow(clippy::double_parens)]
fn encode<'t, A, T>(
fn encode<'t, T>(
grouping: Grouping,
superposition: Option<Position>,
pattern: &mut String,
tokens: impl IntoIterator<Item = T>,
tree: impl Borrow<T>,
) where
T: Borrow<Token<'t, A>>,
T: ConcatenationTree<'t>,
{
use itertools::Position::{First, Last, Middle, Only};

use crate::token::Archetype::{Character, Range};
use crate::token::BranchKind::{Alternation, Concatenation, Repetition};
use crate::token::Evaluation::{Eager, Lazy};
use crate::token::TokenKind::{Alternative, Class, Literal, Repetition, Separator, Wildcard};
use crate::token::LeafKind::{Class, Literal, Separator, Wildcard};
use crate::token::Wildcard::{One, Tree, ZeroOrMore};

fn encode_intermediate_tree(grouping: Grouping, pattern: &mut String) {
Expand All @@ -180,135 +182,140 @@ fn encode<'t, A, T>(

// TODO: Use `Grouping` everywhere a group is encoded. For invariant groups that ignore
// `grouping`, construct a local `Grouping` instead.
for (position, token) in tokens.into_iter().with_position() {
match (position, token.borrow().kind()) {
(_, Literal(literal)) => {
// TODO: Only encode changes to casing flags.
// TODO: Should Unicode support also be toggled by casing flags?
if literal.is_case_insensitive() {
pattern.push_str("(?i)");
}
else {
pattern.push_str("(?-i)");
}
pattern.push_str(&literal.text().escaped());
},
(_, Separator(_)) => pattern.push_str(sepexpr!("{0}")),
(position, Alternative(alternative)) => {
let encodings: Vec<_> = alternative
.branches()
.iter()
.map(|tokens| {
let mut pattern = String::new();
pattern.push_str("(?:");
encode(
Grouping::NonCapture,
superposition.or(Some(position)),
&mut pattern,
tokens.iter(),
);
pattern.push(')');
pattern
})
.collect();
grouping.push_str(pattern, &encodings.join("|"));
},
(position, Repetition(repetition)) => {
let encoding = {
let cardinality = repetition.cardinality();
let mut pattern = String::new();
pattern.push_str("(?:");
encode(
Grouping::NonCapture,
superposition.or(Some(position)),
&mut pattern,
repetition.tokens().iter(),
);
pattern.push_str(&if let Bound::Bounded(upper) = cardinality().upper() {
format!("){{{},{}}}", cardinality.lower(), upper)
for (position, token) in tree.borrow().concatenation().iter().with_position() {
match token.topology() {
TokenTopology::Leaf(leaf) => match (position, leaf) {
(_, Literal(literal)) => {
// TODO: Only encode changes to casing flags.
// TODO: Should Unicode support also be toggled by casing flags?
if literal.is_case_insensitive() {
pattern.push_str("(?i)");
}
else {
format!("){{{},}}", cardinality.lower())
});
pattern
};
grouping.push_str(pattern, &encoding);
},
(_, Class(class)) => {
grouping.push_with(pattern, || {
use crate::token::Class as ClassToken;
pattern.push_str("(?-i)");
}
pattern.push_str(&literal.text().escaped());
},
(_, Separator(_)) => pattern.push_str(sepexpr!("{0}")),
(_, Class(class)) => {
grouping.push_with(pattern, || {
use crate::token::Class as ClassToken;

fn encode_class_archetypes(class: &ClassToken, pattern: &mut String) {
for archetype in class.archetypes() {
match archetype {
Character(literal) => pattern.push_str(&literal.escaped()),
Range(left, right) => {
pattern.push_str(&left.escaped());
pattern.push('-');
pattern.push_str(&right.escaped());
},
fn encode_class_archetypes(class: &ClassToken, pattern: &mut String) {
for archetype in class.archetypes() {
match archetype {
Character(literal) => pattern.push_str(&literal.escaped()),
Range(left, right) => {
pattern.push_str(&left.escaped());
pattern.push('-');
pattern.push_str(&right.escaped());
},
}
}
}
}

let mut pattern = String::new();
pattern.push('[');
if class.is_negated() {
pattern.push('^');
encode_class_archetypes(class, &mut pattern);
pattern.push_str(SEPARATOR_CLASS_EXPRESSION);
let mut pattern = String::new();
pattern.push('[');
if class.is_negated() {
pattern.push('^');
encode_class_archetypes(class, &mut pattern);
pattern.push_str(SEPARATOR_CLASS_EXPRESSION);
}
else {
encode_class_archetypes(class, &mut pattern);
pattern.push_str(nsepexpr!("&&{0}"));
}
pattern.push(']');
// TODO: The compiled `Regex` is discarded. Is there a way to check the
// correctness of the expression but do less work (i.e., don't build a
// complete `Regex`)?
// Compile the character class sub-expression. This may fail if the subtraction
// of the separator pattern yields an empty character class (meaning that the
// glob expression matches only separator characters on the target platform).
if Regex::new(&pattern).is_ok() {
pattern.into()
}
else {
// If compilation fails, then use `NEVER_EXPRESSION`, which matches
// nothing.
NEVER_EXPRESSION.into()
}
});
},
(_, Wildcard(One)) => grouping.push_str(pattern, nsepexpr!("{0}")),
(_, Wildcard(ZeroOrMore(Eager))) => grouping.push_str(pattern, nsepexpr!("{0}*")),
(_, Wildcard(ZeroOrMore(Lazy))) => grouping.push_str(pattern, nsepexpr!("{0}*?")),
(First, Wildcard(Tree { has_root })) => {
if let Some(Middle | Last) = superposition {
encode_intermediate_tree(grouping, pattern);
}
else if *has_root {
grouping.push_str(pattern, sepexpr!("{0}.*{0}?"));
}
else {
encode_class_archetypes(class, &mut pattern);
pattern.push_str(nsepexpr!("&&{0}"));
pattern.push_str(sepexpr!("(?:{0}?|"));
grouping.push_str(pattern, sepexpr!(".*{0}"));
pattern.push(')');
}
pattern.push(']');
// TODO: The compiled `Regex` is discarded. Is there a way to check the
// correctness of the expression but do less work (i.e., don't build a
// complete `Regex`)?
// Compile the character class sub-expression. This may fail if the subtraction
// of the separator pattern yields an empty character class (meaning that the
// glob expression matches only separator characters on the target platform).
if Regex::new(&pattern).is_ok() {
pattern.into()
},
(Middle, Wildcard(Tree { .. })) => {
encode_intermediate_tree(grouping, pattern);
},
(Last, Wildcard(Tree { .. })) => {
if let Some(First | Middle) = superposition {
encode_intermediate_tree(grouping, pattern);
}
else {
// If compilation fails, then use `NEVER_EXPRESSION`, which matches
// nothing.
NEVER_EXPRESSION.into()
pattern.push_str(sepexpr!("(?:{0}?|{0}"));
grouping.push_str(pattern, ".*");
pattern.push(')');
}
});
},
(_, Wildcard(One)) => grouping.push_str(pattern, nsepexpr!("{0}")),
(_, Wildcard(ZeroOrMore(Eager))) => grouping.push_str(pattern, nsepexpr!("{0}*")),
(_, Wildcard(ZeroOrMore(Lazy))) => grouping.push_str(pattern, nsepexpr!("{0}*?")),
(First, Wildcard(Tree { has_root })) => {
if let Some(Middle | Last) = superposition {
encode_intermediate_tree(grouping, pattern);
}
else if *has_root {
grouping.push_str(pattern, sepexpr!("{0}.*{0}?"));
}
else {
pattern.push_str(sepexpr!("(?:{0}?|"));
grouping.push_str(pattern, sepexpr!(".*{0}"));
pattern.push(')');
}
},
(Middle, Wildcard(Tree { .. })) => {
encode_intermediate_tree(grouping, pattern);
},
(Only, Wildcard(Tree { .. })) => grouping.push_str(pattern, ".*"),
},
(Last, Wildcard(Tree { .. })) => {
if let Some(First | Middle) = superposition {
encode_intermediate_tree(grouping, pattern);
}
else {
pattern.push_str(sepexpr!("(?:{0}?|{0}"));
grouping.push_str(pattern, ".*");
pattern.push(')');
}
TokenTopology::Branch(branch) => match branch {
Alternation(alternation) => {
let encodings: Vec<_> = alternation
.tokens()
.iter()
.map(|token| {
let mut pattern = String::new();
pattern.push_str("(?:");
encode::<Token<_>>(
Grouping::NonCapture,
superposition.or(Some(position)),
&mut pattern,
token,
);
pattern.push(')');
pattern
})
.collect();
grouping.push_str(pattern, &encodings.join("|"));
},
Concatenation(_) => unreachable!(),
Repetition(repetition) => {
let encoding = {
let cardinality = repetition.cardinality();
let mut pattern = String::new();
pattern.push_str("(?:");
encode::<Token<_>>(
Grouping::NonCapture,
superposition.or(Some(position)),
&mut pattern,
repetition.token(),
);
pattern.push_str(&if let Bound::Bounded(upper) = cardinality.upper() {
format!("){{{},{}}}", cardinality.lower(), upper)
}
else {
format!("){{{},}}", cardinality.lower())
});
pattern
};
grouping.push_str(pattern, &encoding);
},
},
(Only, Wildcard(Tree { .. })) => grouping.push_str(pattern, ".*"),
}
}
}
Expand Down

0 comments on commit 3e8b2a2

Please sign in to comment.