diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 404c47721..052d59ef8 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -412,6 +412,9 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { let this = if group.is_capturing() { 1 } else { 0 }; this + count_capturing_groups_ast(&*group.ast) } + Ast::LookAround(ref lookaround) => { + count_capturing_groups_ast(&lookaround.ast) + } Ast::Alternation(ref alt) => { alt.asts.iter().map(count_capturing_groups_ast).sum() } diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 0eca1d4db..25f3b9280 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -144,6 +144,10 @@ pub enum ErrorKind { /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, + /// An unclosed look-around, e.g., `(? write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), + LookAroundUnclosed => write!(f, "unclosed look-around"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, @@ -526,7 +531,7 @@ impl Ast { } /// Create a "look-around" AST item. - pub fn look_around(e: LookAround) -> Ast { + pub fn lookaround(e: LookAround) -> Ast { Ast::LookAround(Box::new(e)) } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 012185537..5883a0dd4 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -159,6 +159,7 @@ impl ParserBuilder { stack_class: RefCell::new(vec![]), capture_names: RefCell::new(vec![]), scratch: RefCell::new(String::new()), + lookaround_depth: Cell::new(0), } } @@ -280,6 +281,9 @@ pub struct Parser { /// A scratch buffer used in various places. Mostly this is used to /// accumulate relevant characters from parts of a pattern. 
scratch: RefCell, + /// The current look-around nesting depth (0 when outside any). Used to + /// detect capture groups within look-arounds, which are not supported. + lookaround_depth: Cell, } /// ParserI is the internal parser implementation. @@ -299,9 +303,9 @@ struct ParserI<'s, P> { pattern: &'s str, } -/// GroupState represents a single stack frame while parsing nested groups -/// and alternations. Each frame records the state up to an opening parenthesis -/// or a alternating bracket `|`. +/// GroupState represents a single stack frame while parsing nested groups, +/// look-arounds and alternations. Each frame records the state up to an opening +/// parenthesis or an alternation bar `|`. #[derive(Clone, Debug)] enum GroupState { /// This state is pushed whenever an opening group is found. @@ -313,6 +317,13 @@ enum GroupState { /// Whether this group has the `x` flag enabled or not. ignore_whitespace: bool, }, + /// This state is pushed whenever an opening look-around is found. + LookAround { + /// The concatenation immediately preceding the opening look-around. + concat: ast::Concat, + /// The look-around that has been opened. Its sub-AST is always empty. + lookaround: ast::LookAround, + }, /// This state is pushed whenever a new alternation branch is found. If /// an alternation branch is found and this state is at the top of the /// stack, then this state should be modified to include the new @@ -385,6 +396,7 @@ impl Parser { self.comments.borrow_mut().clear(); self.stack_group.borrow_mut().clear(); self.stack_class.borrow_mut().clear(); + self.lookaround_depth.set(0); } } @@ -470,6 +482,11 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.get() } + /// Return whether the parser is currently in a look-around. + fn in_lookaround(&self) -> bool { + self.parser().lookaround_depth.get() != 0 + } + /// Return the character at the current position of the parser. /// /// This panics if the current position does not point to a valid char. 
@@ -521,18 +538,15 @@ impl<'s, P: Borrow> ParserI<'s, P> { } } - /// Returns true if and only if the parser is positioned at a look-around + /// Returns true if and only if the parser is positioned at a look-ahead /// prefix. The conditions under which this returns true must always /// correspond to a regular expression that would otherwise be consider /// invalid. /// /// This should only be called immediately after parsing the opening of /// a group or a set of flags. - fn is_lookaround_prefix(&self) -> bool { - self.bump_if("?=") - || self.bump_if("?!") - || self.bump_if("?<=") - || self.bump_if("? bool { + self.bump_if("?=") || self.bump_if("?!") } /// Bump the parser, and if the `x` flag is enabled, bump through any @@ -686,9 +700,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { })); } - /// Parse and push a group AST (and its parent concatenation) on to the - /// parser's internal stack. Return a fresh concatenation corresponding - /// to the group's sub-AST. + /// Parse and push a group or look-around AST (and its parent + /// concatenation) on to the parser's internal stack. Return a fresh + /// concatenation corresponding to the grouping's sub-AST. /// /// If a set of flags was found (with no group), then the concatenation /// is returned with that set of flags added. @@ -697,12 +711,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// parenthesis. It advances the parser to the character at the start /// of the sub-expression (or adjoining expression). /// - /// If there was a problem parsing the start of the group, then an error - /// is returned. + /// If there was a problem parsing the start of the grouping, then an + /// error is returned. #[inline(never)] - fn push_group(&self, mut concat: ast::Concat) -> Result { + fn push_grouping(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); - match self.parse_group()? { + match self.parse_grouping()? 
{ Either::Left(set) => { let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); if let Some(v) = ignore { @@ -712,7 +726,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { concat.asts.push(Ast::flags(set)); Ok(concat) } - Either::Right(group) => { + Either::Right(Either::Left(group)) => { let old_ignore_whitespace = self.ignore_whitespace(); let new_ignore_whitespace = group .flags() @@ -728,61 +742,124 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(new_ignore_whitespace); Ok(ast::Concat { span: self.span(), asts: vec![] }) } + Either::Right(Either::Right(lookaround)) => { + self.parser() + .stack_group + .borrow_mut() + .push(GroupState::LookAround { concat, lookaround }); + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() + 1); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } } } - /// Pop a group AST from the parser's internal stack and set the group's - /// AST to the given concatenation. Return the concatenation containing - /// the group. + /// Pop a group or look-around AST from the parser's internal stack and + /// set the grouping's AST to the given concatenation. Return the + /// concatenation containing the grouping. /// /// This assumes that the parser is currently positioned on the closing /// parenthesis and advances the parser to the character following the `)`. /// - /// If no such group could be popped, then an unopened group error is + /// If no such grouping could be popped, then an unopened group error is /// returned. + /// + /// If a look-behind contains a capture group, then an error is returned. 
#[inline(never)] - fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + fn pop_grouping( + &self, + mut grouping_concat: ast::Concat, + ) -> Result { use self::GroupState::*; assert_eq!(self.char(), ')'); let mut stack = self.parser().stack_group.borrow_mut(); - let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack - .pop() - { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, None) - } - Some(Alternation(alt)) => match stack.pop() { + let (mut prior_concat, mut grouping, ignore_whitespace, alt) = + match stack.pop() { Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, Some(alt)) + (concat, Either::Left(group), ignore_whitespace, None) } - None | Some(Alternation(_)) => { + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.ignore_whitespace(), + None, + ), + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => ( + concat, + Either::Left(group), + ignore_whitespace, + Some(alt), + ), + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.ignore_whitespace(), + Some(alt), + ), + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { return Err(self.error( self.span_char(), ast::ErrorKind::GroupUnopened, )); } - }, - None => { - return Err(self - .error(self.span_char(), ast::ErrorKind::GroupUnopened)); - } - }; + }; self.parser().ignore_whitespace.set(ignore_whitespace); - group_concat.span.end = self.pos(); + grouping_concat.span.end = self.pos(); self.bump(); - group.span.end = self.pos(); + match &mut grouping { + Either::Left(group) => group.span.end = self.pos(), + Either::Right(lookaround) => lookaround.span.end = self.pos(), + } match alt { Some(mut alt) => { - alt.span.end = group_concat.span.end; - alt.asts.push(group_concat.into_ast()); - 
group.ast = Box::new(alt.into_ast()); - } - None => { - group.ast = Box::new(group_concat.into_ast()); + alt.span.end = grouping_concat.span.end; + alt.asts.push(grouping_concat.into_ast()); + match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(alt.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(alt.into_ast()) + } + } } + None => match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(grouping_concat.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(grouping_concat.into_ast()) + } + }, } - prior_concat.asts.push(Ast::group(group)); + prior_concat.asts.push(match grouping { + Either::Left(group) => { + if group.is_capturing() && self.in_lookaround() { + return Err(self.error( + group.span, + ast::ErrorKind::UnsupportedCaptureInLookBehind, + )); + } + + Ast::group(group) + } + Either::Right(lookaround) => { + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() - 1); + Ast::lookaround(lookaround) + } + }); Ok(prior_concat) } @@ -793,7 +870,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// This assumes that the parser has advanced to the end. #[inline(never)] - fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + fn pop_grouping_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); let ast = match stack.pop() { @@ -808,6 +885,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(group.span, ast::ErrorKind::GroupUnclosed) ); } + Some(GroupState::LookAround { lookaround, .. }) => { + return Err(self.error( + lookaround.span, + ast::ErrorKind::LookAroundUnclosed, + )); + } }; // If we try to pop again, there should be nothing. match stack.pop() { @@ -824,6 +907,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Group { group, .. }) => { Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) } + Some(GroupState::LookAround { lookaround, .. 
}) => Err(self + .error(lookaround.span, ast::ErrorKind::LookAroundUnclosed)), } } @@ -989,8 +1074,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { break; } match self.char() { - '(' => concat = self.push_group(concat)?, - ')' => concat = self.pop_group(concat)?, + '(' => concat = self.push_grouping(concat)?, + ')' => concat = self.pop_grouping(concat)?, '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; @@ -1020,7 +1105,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { _ => concat.asts.push(self.parse_primitive()?.into_ast()), } } - let ast = self.pop_group_end(concat)?; + let ast = self.pop_grouping_end(concat)?; NestLimiter::new(self).check(&ast)?; Ok(ast::WithComments { ast, @@ -1205,16 +1290,17 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(concat) } - /// Parse a group (which contains a sub-expression) or a set of flags. + /// Parse a group or look-around (which contain a sub-expression), or a + /// set of flags. /// - /// If a group was found, then it is returned with an empty AST. If a set - /// of flags is found, then that set is returned. + /// If a group or look-around was found, then it is returned with an + /// empty AST. If a set of flags is found, then that set is returned. /// /// The parser should be positioned at the opening parenthesis. /// /// This advances the parser to the character before the start of the - /// sub-expression (in the case of a group) or to the closing parenthesis - /// immediately following the set of flags. + /// sub-expression (in the case of a group or look-around) or to the + /// closing parenthesis immediately following the set of flags. /// /// # Errors /// @@ -1223,19 +1309,38 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If a capture name is given and it is incorrectly specified, then a /// corresponding error is returned. + /// + /// If a look-ahead is given (which is currently unsupported), then an + /// error is returned. 
#[inline(never)] - fn parse_group(&self) -> Result> { + fn parse_grouping( + &self, + ) -> Result>> + { assert_eq!(self.char(), '('); let open_span = self.span_char(); self.bump(); self.bump_space(); - if self.is_lookaround_prefix() { + if self.is_lookahead_prefix() { return Err(self.error( Span::new(open_span.start, self.span().end), ast::ErrorKind::UnsupportedLookAhead, )); } let inner_span = self.span(); + + let mut lookaround_kind = ast::LookAroundKind::PositiveLookBehind; + if self.bump_if("?<=") || { + lookaround_kind = ast::LookAroundKind::NegativeLookBehind; + self.bump_if("?> ParserI<'s, P> { } { let capture_index = self.next_capture_index(open_span)?; let name = self.parse_capture_name(capture_index)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::empty(self.span())), - })) + }))) } else if self.bump_if("?") { if self.is_eof() { return Err( @@ -1272,19 +1377,19 @@ impl<'s, P: Borrow> ParserI<'s, P> { })) } else { assert_eq!(char_end, ':'); - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } else { let capture_index = self.next_capture_index(open_span)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } @@ -3756,25 +3861,185 @@ bar } #[test] - #[ignore = "Missing parser support for lookaround"] fn parse_lookbehinds() { - todo!("write tests for lookbehinds"); + assert_eq!( + parser(r"(?<=)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..5), + ast: Box::new(Ast::empty(span(4..4))), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=(?<=))(a)").parse(), + Ok(concat( + 0..13, + vec![ + 
Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::lookaround(ast::LookAround { + span: span(4..9), + ast: Box::new(Ast::empty(span(8..8))), + kind: ast::LookAroundKind::PositiveLookBehind + })), + kind: ast::LookAroundKind::PositiveLookBehind + }), + group(10..13, 1, lit('a', 11)), + ] + )) + ); + assert_eq!( + parser(r"(?<=a)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..6), + ast: Box::new(lit('a', 4)), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=(?:a))").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::group(ast::Group { + span: span(4..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(6..6), + items: vec![], + }), + ast: Box::new(lit('a', 7)), + })), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?a))").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, + } + ); + assert_eq!( + parser(r"(?a)|b)").parse().unwrap_err(), + TestError { + span: span(6..16), kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 112c0bda1..0e87599d2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -526,9 +526,8 @@ mod tests { } #[test] - #[ignore = "Missing parser support for lookaround"] fn print_lookaround() { - roundtrip("(? &Hir { match self { Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { @@ -1828,7 +1828,7 @@ impl LookAround { } } - /// Returns a new lookaround of the same kind, but with its + /// Returns a new look-around of the same kind, but with its /// sub-expression replaced with the one given. 
pub fn with(&self, sub: Hir) -> LookAround { match self { diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 86e0018c6..4b032fae4 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -485,7 +485,6 @@ mod tests { } #[test] - #[ignore = "Missing parser support for lookaround"] fn print_look_around() { roundtrip("(?<=)", "(?<=(?:))"); roundtrip("(? Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::LookAround(ref x) => { - todo!("translation from AST to HIR"); - } Ast::Concat(_) => { self.push(HirFrame::Concat); } @@ -449,8 +446,16 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - Ast::LookAround(_) => { - todo!("translation from AST to HIR"); + Ast::LookAround(ref x) => { + let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.push(HirFrame::Expr(Hir::lookaround(match x.kind { + ast::LookAroundKind::PositiveLookBehind => { + hir::LookAround::PositiveLookBehind(expr) + } + ast::LookAroundKind::NegativeLookBehind => { + hir::LookAround::NegativeLookBehind(expr) + } + }))); } Ast::Concat(_) => { let mut exprs = vec![];