From 2b1b06b169ed186714ff71935ea418e2397c5de2 Mon Sep 17 00:00:00 2001
From: Shneor <770elmo@gmail.com>
Date: Fri, 6 Oct 2023 01:19:09 +0200
Subject: [PATCH 1/5] Added split_inclusive()
---
regex-automata/src/meta/regex.rs | 103 +++++++++++++++++++++++++++++++
1 file changed, 103 insertions(+)
diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index ce3bae0fa..4a04fa948 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -892,6 +892,14 @@ impl Regex {
) -> SplitN<'r, 'h> {
SplitN { splits: self.split(input), limit }
}
+
+ /// Returns an iterator of spans that splits the haystack around every
+ /// match of this regex while also yielding the matches themselves.
+ ///
+ /// The iterator alternates between the (possibly empty) span preceding a
+ /// match and the span of the match itself, and finishes with the
+ /// (possibly empty) span following the last match. The yielded spans are
+ /// contiguous, so slicing the haystack with each span and concatenating
+ /// the pieces reproduces the haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let hay = "1x2";
+ /// let re = Regex::new(r"x")?;
+ /// let pieces: Vec<&str> =
+ /// re.split_inclusive(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(pieces, vec!["1", "x", "2"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn split_inclusive<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ ) -> SplitInclusive<'r, 'h> {
+ SplitInclusive { finder: self.find_iter(input), last: 0, span_to_yield: None }
+ }
}
/// Lower level search routines that give more control.
@@ -2278,6 +2286,58 @@ impl<'r, 'h> Iterator for SplitN<'r, 'h> {
impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
+/// An iterator over the pieces of a haystack delimited by the matches of a
+/// regex, where the matches themselves are also yielded.
+///
+/// Items are `Span`s into the haystack. They alternate between the text
+/// preceding a match and the match itself, followed by the text after the
+/// last match.
+///
+/// The lifetime `'r` is that of the regex being searched with and `'h` is
+/// that of the haystack.
+///
+/// This iterator is created by `Regex::split_inclusive`.
+#[derive(Debug)]
+pub struct SplitInclusive<'r, 'h> {
+ /// The underlying match iterator, created by `Regex::find_iter`.
+ finder: FindMatches<'r, 'h>,
+ /// Offset at which the next non-match piece begins (the end of the
+ /// previous match). Set to one past the haystack length once the final
+ /// piece has been yielded.
+ last: usize,
+ /// Span of the most recent match, held back so that the following call
+ /// to `next` yields it.
+ span_to_yield: Option<Span>,
+}
+
+impl<'r, 'h> SplitInclusive<'r, 'h> {
+ /// Returns the current `Input` associated with this iterator.
+ ///
+ /// The `start` position on the given `Input` may change during iteration,
+ /// but all other values are guaranteed to remain invariant.
+ ///
+ /// This forwards to the underlying match iterator (`finder`).
+ #[inline]
+ pub fn input<'s>(&'s self) -> &'s Input<'h> {
+ self.finder.input()
+ }
+}
+
+impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> {
+ type Item = Span;
+
+ /// Yields the next piece of the haystack.
+ ///
+ /// Each match found produces two items over two consecutive calls: first
+ /// the span between the previous match and this one, then the span of the
+ /// match itself. Once there are no more matches, the (possibly empty)
+ /// remainder of the haystack is yielded exactly once.
+ fn next(&mut self) -> Option<Span> {
+ if let Some(span) = self.span_to_yield {
+ self.span_to_yield = None;
+ return Some(span)
+ }
+
+ match self.finder.next() {
+ None => {
+ // No more matches: emit the tail once. `last` is then moved
+ // past the end of the haystack so later calls take the
+ // `None` path below.
+ let len = self.input().haystack().len();
+ if self.last > len {
+ None
+ } else {
+ let span = Span::from(self.last..len);
+ self.last = len + 1; // Next call will return None
+ Some(span)
+ }
+ },
+ Some(m) => {
+ let span = Span::from(self.last..m.start()); // return this right now
+ self.span_to_yield = Some(Span::from(m.start()..m.end())); // next call will return this
+
+ self.last = m.end();
+ Some(span)
+ }
+ }
+ }
+}
+
+// After the trailing piece is yielded, `last` exceeds the haystack length,
+// so `next` keeps returning `None` for as long as `finder` does.
+// NOTE(review): this relies on `FindMatches` itself being fused - confirm.
+impl<'r, 'h> core::iter::FusedIterator for SplitInclusive<'r, 'h> {}
+
/// Represents mutable scratch space used by regex engines during a search.
///
/// Most of the regex engines in this crate require some kind of
@@ -3646,4 +3706,47 @@ mod tests {
let re = Regex::new(r"[a-zA-Z]+ing").unwrap();
assert_eq!(1, re.find_iter("tingling").count());
}
+
+ // Table-driven test for `Regex::split_inclusive`. Each case is a
+ // `(pattern, haystack, expected pieces)` tuple; the expected pieces
+ // alternate between non-matching text (possibly empty) and matches.
+ #[test]
+ fn split_inclusive() {
+ let arr = [
+ (
+ r"(x)",
+ "1x2",
+ vec!["1", "x", "2"],
+ ),
+ (
+ r"([^\d]+)",
+ "1-2",
+ vec!["1", "-", "2"],
+ ),
+ (
+ r"(\s+)",
+ "this is a \n\ntest",
+ vec!["this", " ", "is", " ", "a", " \n\n", "test"],
+ ),
+ (
+ r"([^0-9a-zA-Z_])",
+ " C# is great! (not actually) :)",
+ vec!["", " ", "C", "#", "", " ", "is", " ", "great", "!", "", " ", "", "(", "not", " ", "actually", ")", "", " ", "", ":", "", ")", ""],
+ ),
+ (
+ r"([a-zA-Z](?:[a-zA-Z']*[a-zA-Z])?)",
+ r#"He said, "I'd like to eat cake!""#,
+ vec!["", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ", "to", " ", "eat", " ", "cake", r#"!""#],
+ ),
+ ];
+ for (pattern, text, expected_output) in arr {
+ let mut out: Vec<_> = Regex::new(pattern).unwrap()
+ .split_inclusive(text)
+ .map(|span| &text[span])
+ .collect();
+
+ assert_eq!(out, expected_output, "Regex: {}, Input: {}", pattern, text);
+
+ // make sure that we can get the original string, from the splitted string by
+ // concatenating it
+ assert_eq!(out.join(""), text);
+ }
+ }
}
From 1d00c121fd32ae83d772627c195ccdf06f1045a9 Mon Sep 17 00:00:00 2001
From: Shneor <770elmo@gmail.com>
Date: Fri, 6 Oct 2023 02:24:13 +0200
Subject: [PATCH 2/5] Fixed 'mut' warning
---
regex-automata/src/meta/regex.rs | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index 4a04fa948..bfca1e27b 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -3737,9 +3737,9 @@ mod tests {
),
];
for (pattern, text, expected_output) in arr {
- let mut out: Vec<_> = Regex::new(pattern).unwrap()
+ let out: Vec<_> = Regex::new(pattern).unwrap()
.split_inclusive(text)
- .map(|span| &text[span])
+ .map(|sp| &text[sp])
.collect();
assert_eq!(out, expected_output, "Regex: {}, Input: {}", pattern, text);
From 075c62c104faa1e9ab25be307e50d8dd6794d91b Mon Sep 17 00:00:00 2001
From: Shneor <770elmo@gmail.com>
Date: Fri, 6 Oct 2023 11:44:48 +0200
Subject: [PATCH 3/5] cargo fmt
---
regex-automata/src/meta/regex.rs | 46 +++++++++++++++++++-------------
1 file changed, 27 insertions(+), 19 deletions(-)
diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index bfca1e27b..4a49b0328 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -898,7 +898,11 @@ impl Regex {
&'r self,
input: I,
) -> SplitInclusive<'r, 'h> {
- SplitInclusive { finder: self.find_iter(input), last: 0, span_to_yield: None }
+ SplitInclusive {
+ finder: self.find_iter(input),
+ last: 0,
+ span_to_yield: None,
+ }
}
}
@@ -2311,7 +2315,7 @@ impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> {
fn next(&mut self) -> Option<Span> {
if let Some(span) = self.span_to_yield {
self.span_to_yield = None;
- return Some(span)
+ return Some(span);
}
match self.finder.next() {
@@ -2324,10 +2328,10 @@ impl<'r, 'h> Iterator for SplitInclusive<'r, 'h> {
self.last = len + 1; // Next call will return None
Some(span)
}
- },
+ }
Some(m) => {
- let span = Span::from(self.last..m.start()); // return this right now
- self.span_to_yield = Some(Span::from(m.start()..m.end())); // next call will return this
+ let span = Span::from(self.last..m.start()); // return this right now
+ self.span_to_yield = Some(Span::from(m.start()..m.end())); // next call will return this
self.last = m.end();
Some(span)
@@ -3710,16 +3714,8 @@ mod tests {
#[test]
fn split_inclusive() {
let arr = [
- (
- r"(x)",
- "1x2",
- vec!["1", "x", "2"],
- ),
- (
- r"([^\d]+)",
- "1-2",
- vec!["1", "-", "2"],
- ),
+ (r"(x)", "1x2", vec!["1", "x", "2"]),
+ (r"([^\d]+)", "1-2", vec!["1", "-", "2"]),
(
r"(\s+)",
"this is a \n\ntest",
@@ -3728,21 +3724,33 @@ mod tests {
(
r"([^0-9a-zA-Z_])",
" C# is great! (not actually) :)",
- vec!["", " ", "C", "#", "", " ", "is", " ", "great", "!", "", " ", "", "(", "not", " ", "actually", ")", "", " ", "", ":", "", ")", ""],
+ vec![
+ "", " ", "C", "#", "", " ", "is", " ", "great", "!", "",
+ " ", "", "(", "not", " ", "actually", ")", "", " ", "",
+ ":", "", ")", "",
+ ],
),
(
r"([a-zA-Z](?:[a-zA-Z']*[a-zA-Z])?)",
r#"He said, "I'd like to eat cake!""#,
- vec!["", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ", "to", " ", "eat", " ", "cake", r#"!""#],
+ vec![
+ "", "He", " ", "said", r#", ""#, "I'd", " ", "like", " ",
+ "to", " ", "eat", " ", "cake", r#"!""#,
+ ],
),
];
for (pattern, text, expected_output) in arr {
- let out: Vec<_> = Regex::new(pattern).unwrap()
+ let out: Vec<_> = Regex::new(pattern)
+ .unwrap()
.split_inclusive(text)
.map(|sp| &text[sp])
.collect();
- assert_eq!(out, expected_output, "Regex: {}, Input: {}", pattern, text);
+ assert_eq!(
+ out, expected_output,
+ "Regex: {}, Input: {}",
+ pattern, text
+ );
// make sure that we can get the original string, from the splitted string by
// concatenating it
From 755a48bdf755ea27724172facda769845386d3df Mon Sep 17 00:00:00 2001
From: Shneor <770elmo@gmail.com>
Date: Fri, 6 Oct 2023 11:46:45 +0200
Subject: [PATCH 4/5] Replaced \d with 0-9 in regex
---
regex-automata/src/meta/regex.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index 4a49b0328..c6e9b8b96 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -3715,7 +3715,7 @@ mod tests {
fn split_inclusive() {
let arr = [
(r"(x)", "1x2", vec!["1", "x", "2"]),
- (r"([^\d]+)", "1-2", vec!["1", "-", "2"]),
+ (r"([^0-9]+)", "1-2", vec!["1", "-", "2"]),
(
r"(\s+)",
"this is a \n\ntest",
From 77371e597d46b91073f94644a15eb1fda3e0c383 Mon Sep 17 00:00:00 2001
From: Shneor <770elmo@gmail.com>
Date: Fri, 6 Oct 2023 12:25:32 +0200
Subject: [PATCH 5/5] Fixed '\s' regex
---
regex-automata/src/meta/regex.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index c6e9b8b96..93fe6c0bb 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -3717,7 +3717,7 @@ mod tests {
(r"(x)", "1x2", vec!["1", "x", "2"]),
(r"([^0-9]+)", "1-2", vec!["1", "-", "2"]),
(
- r"(\s+)",
+ r"([\r\n\t\f\v ]+)", // ASCII-only equivalent of r"(\s+)"
"this is a \n\ntest",
vec!["this", " ", "is", " ", "a", " \n\n", "test"],
),