From 787de6d0a0c9d4e6331351a471da6790bfa99e49 Mon Sep 17 00:00:00 2001 From: Greg Brown Date: Mon, 9 Nov 2020 14:40:14 +0000 Subject: Parser with some tests --- src/nibble/mod.rs | 720 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 720 insertions(+) create mode 100644 src/nibble/mod.rs (limited to 'src/nibble/mod.rs') diff --git a/src/nibble/mod.rs b/src/nibble/mod.rs new file mode 100644 index 0000000..9c41344 --- /dev/null +++ b/src/nibble/mod.rs @@ -0,0 +1,720 @@ +pub mod parse; + +use self::parse::{ + iter::{Peek, PeekExt, PeekableTakeWhile}, + requires, Parse, ParseError, +}; +use std::convert::Infallible; +use std::str::FromStr; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Ident { + name: String, +} + +impl Parse for Ident { + type Err = Infallible; + + fn parse>(mut iter: I) -> Result> { + // rustc can't predict type, but rust-analyzer can... + let name: String = PeekableTakeWhile::new(&mut iter, |c| c.is_alphabetic()).collect(); + if name.is_empty() { + if let Some(c) = iter.peek() { + Err(ParseError::InvalidCharacter(*c)) + } else { + Err(ParseError::EndOfFile) + } + } else { + Ok(Self { name }) + } + } +} + +impl FromStr for Ident { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Ident::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Literal { + contents: String, +} + +impl Parse for Literal { + type Err = LiteralError; + + fn parse>(mut iter: I) -> Result> { + fn parse_digit>( + mut iter: I, + radix: u32, + ) -> Result> { + Ok(iter + .next_if(|c| c.is_digit(radix)) + .ok_or(ParseError::EndOfFile)? + .map_err(|i| ParseError::InvalidCharacter(*i.peek().unwrap()))? + .to_digit(radix) + .unwrap()) + } + + /// Parse full character escape. + fn parse_escape>( + mut iter: I, + ) -> Result> { + requires(&mut iter, '\\')?; + match iter.peek().ok_or(ParseError::EndOfFile)? { + '\'' | '\"' | '\\' => Ok(iter.next().unwrap()), + 'n' => { + requires(iter, 'n')?; + Ok('\n') + } + 'r' => { + requires(iter, 'r')?; + Ok('\r') + } + 't' => { + requires(iter, 't')?; + Ok('\t') + } + '0' => { + requires(iter, '0')?; + Ok('\0') + } + 'x' => { + requires(&mut iter, 'x')?; + let upper = parse_digit(&mut iter, 8)?; + let ord = 16 * upper + parse_digit(&mut iter, 16)?; + std::char::from_u32(ord) + .ok_or(ParseError::Other(LiteralError::InvalidCharacterCode(ord))) + } + 'u' => { + requires(&mut iter, 'u')?; + requires(&mut iter, '{')?; + let mut ord = 0; + for _ in 1..=6 { + ord *= 16; + ord += parse_digit(&mut iter, 16)?; + iter.advance_while(|&c| c == '_'); + + if iter.peek() == Some(&'}') { + break; + } + } + requires(&mut iter, '}')?; + + std::char::from_u32(ord) + .ok_or(ParseError::Other(LiteralError::InvalidCharacterCode(ord))) + } + &c => Err(ParseError::InvalidCharacter(c)), + } + } + + requires(&mut iter, '\'')?; + + let mut s = String::new(); + + loop { + match iter.peek().ok_or(ParseError::EndOfFile)? { + '\'' => { + iter.next(); + return if s.is_empty() { + Err(ParseError::Other(LiteralError::EmptyLiteral)) + } else { + Ok(Literal { contents: s }) + }; + } + &c @ '\n' | &c @ '\r' | &c @ '\t' => return Err(ParseError::InvalidCharacter(c)), + '\\' => s.push(parse_escape(&mut iter)?), + _ => s.push(iter.next().unwrap()), + } + } + } +} + +impl FromStr for Literal { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Literal::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum LiteralError { + EmptyLiteral, + InvalidCharacterCode(u32), +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Call { + name: Ident, + args: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Fix { + arg: Ident, + expr: Expression, +} + +impl Parse for Fix { + type Err = FixError; + + fn parse>(mut iter: I) -> Result> { + requires(&mut iter, '[')?; + let arg = Ident::parse(&mut iter).map_err(ParseError::map)?; + requires(&mut iter, ']')?; + requires(&mut iter, '(')?; + iter.advance_while(|c| c.is_whitespace()); + let expr = Expression::parse(Box::new(&mut iter as &mut dyn Peek)) + .map_err(ParseError::map)?; + requires(&mut iter, ')')?; + Ok(Self { arg, expr }) + } +} + +impl FromStr for Fix { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Fix::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct FixError(Box); + +impl From for FixError { + fn from(other: Infallible) -> Self { + match other {} + } +} + +impl From for FixError { + fn from(other: ExpressionError) -> Self { + Self(Box::new(other)) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Term { + Epsilon, + Ident(Ident), + Literal(Literal), + Call { name: Ident, args: Vec }, + Fix(Fix), + Parens(Expression), +} + +impl Parse for Term { + type Err = TermError; + + fn parse>(mut iter: I) -> Result> { + match iter.peek().ok_or(ParseError::EndOfFile)? { + '(' => { + iter.next(); + + if iter.consume_if_next(&')') { + Ok(Self::Epsilon) + } else { + let expr = Expression::parse(Box::new(&mut iter as &mut dyn Peek)) + .map_err(ParseError::map)?; + requires(iter, ')').map(|_| Self::Parens(expr)) + } + } + '[' => Fix::parse(iter).map(Self::Fix).map_err(ParseError::map), + '\'' => Literal::parse(iter) + .map(Self::Literal) + .map_err(ParseError::map), + c if c.is_alphabetic() => { + let ident = Ident::parse(&mut iter).map_err(ParseError::map)?; + + if iter.consume_if_next(&'(') { + iter.advance_while(|c| c.is_whitespace()); + let mut args = Vec::new(); + loop { + args.push( + Expression::parse(Box::new(&mut iter as &mut dyn Peek)) + .map_err(ParseError::::map) + .map_err(ParseError::map)?, + ); + + if iter.consume_if_next(&')') { + return Ok(Self::Call { name: ident, args }); + } + + requires(&mut iter, ',')?; + iter.advance_while(|c| c.is_whitespace()); + } + } else { + Ok(Self::Ident(ident)) + } + } + &c => Err(ParseError::InvalidCharacter(c)), + } + } +} + +impl FromStr for Term { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Term::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CallError(Box); + +impl From for CallError { + fn from(other: ExpressionError) -> Self { + Self(Box::new(other)) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum TermError { + Literal(LiteralError), + Call(CallError), + Fix(FixError), + Parens(Box), +} + +impl From for TermError { + fn from(other: Infallible) -> Self { + match other {} + } +} + +impl From for TermError { + fn from(other: ExpressionError) -> Self { + Self::Parens(Box::new(other)) + } +} + +impl From for TermError { + fn from(other: FixError) -> Self { + Self::Fix(other) + } +} + +impl From for TermError { + fn from(other: CallError) -> Self { + Self::Call(other) + } +} + +impl From for TermError { + fn from(other: LiteralError) -> Self { + Self::Literal(other) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Seq { + terms: Vec, +} + +impl Parse for Seq { + type Err = SeqError; + + fn parse>(mut iter: I) -> Result> { + let mut terms = Vec::new(); + terms.push(Term::parse(&mut iter).map_err(ParseError::map)?); + iter.advance_while(|c| c.is_whitespace()); + + while iter.consume_if_next(&'.') { + iter.advance_while(|c| c.is_whitespace()); + terms.push(Term::parse(&mut iter).map_err(ParseError::map)?); + iter.advance_while(|c| c.is_whitespace()); + } + + Ok(Self { terms }) + } +} + +impl FromStr for Seq { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Seq::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SeqError(TermError); + +impl From for SeqError { + fn from(other: TermError) -> Self { + Self(other) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Alt { + seqs: Vec, +} + +impl Parse for Alt { + type Err = AltError; + + fn parse>(mut iter: I) -> Result> { + let mut seqs = Vec::new(); + seqs.push(Seq::parse(&mut iter).map_err(ParseError::map)?); + + while iter.consume_if_next(&'|') { + iter.advance_while(|c| c.is_whitespace()); + seqs.push(Seq::parse(&mut iter).map_err(ParseError::map)?); + } + + Ok(Self { seqs }) + } +} + +impl FromStr for Alt { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Alt::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct AltError(SeqError); + +impl From for AltError { + fn from(other: SeqError) -> Self { + Self(other) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Expression { + alt: Alt, +} + +impl Parse for Expression { + type Err = ExpressionError; + + fn parse>(iter: I) -> Result> { + Ok(Self { + alt: Alt::parse(iter).map_err(ParseError::map)?, + }) + } +} + +impl FromStr for Expression { + type Err = ParseError<::Err>; + + fn from_str(s: &str) -> Result { + Expression::parse(s.chars().peekable()) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ExpressionError(AltError); + +impl From for ExpressionError { + fn from(other: AltError) -> Self { + Self(other) + } +} + +#[cfg(test)] +mod tests { + use super::Alt; + use super::Expression; + use super::Fix; + use super::Seq; + use super::Term; + use super::{ + parse::{Parse, ParseError}, + Ident, Literal, LiteralError, + }; + use std::iter; + + macro_rules! parse_indent { + ($s:literal) => { + assert_eq!( + $s.parse(), + Ok(Ident { + name: $s.to_owned() + }) + ) + }; + } + + #[test] + fn parse_ident_simple() { + parse_indent!("x"); + parse_indent!("variable"); + } + + #[test] + fn parse_indent_unicode() { + parse_indent!("ℕꥇℜ"); + } + + #[test] + fn parse_indent_stops() { + let s = "variable then more text"; + let mut iter = s.chars().peekable(); + assert_eq!( + Ident::parse(&mut iter), + Ok(Ident { + name: "variable".to_owned() + }) + ); + assert_eq!(iter.collect::(), " then more text"); + } + + #[test] + fn parse_indent_not_empty() { + assert_eq!("".parse::(), Err(ParseError::EndOfFile)); + } + + #[test] + fn parse_indent_alphabetic_only() { + assert_eq!( + "123".parse::(), + Err(ParseError::InvalidCharacter('1')) + ); + } + + macro_rules! parse_lit { + ($str:literal) => { + assert_eq!( + Literal::parse( + iter::once('\'') + .chain($str.escape_debug()) + .chain(iter::once('\''),) + .peekable() + ), + Ok(Literal { + contents: $str.to_owned() + }), + ) + }; + } + + #[test] + fn parse_literal_basic() { + parse_lit!("hello"); + parse_lit!("this has whitespace"); + } + + #[test] + fn parse_literal_quotes() { + parse_lit!(r#"quote ' " and backslash \ quoting"#); + } + + #[test] + fn parse_literal_ascii_escape() { + parse_lit!("newline \n tab \t carriage return \r"); + + assert_eq!( + r#"'\0'"#.parse(), + Ok(Literal { + contents: "\0".to_owned() + }) + ); + } + + #[test] + fn parse_literal_hex_escape() { + assert_eq!( + r#"'\x20'"#.parse(), + Ok(Literal { + contents: " ".to_owned() + }) + ); + } + + #[test] + fn parse_literal_unicode_escape() { + parse_lit!("emoji ❤ escape \u{200c}"); + } + + #[test] + fn parse_literal_stops() { + let mut iter = "'hi there' Not quoted".chars().peekable(); + assert_eq!( + Literal::parse(&mut iter), + Ok(Literal { + contents: "hi there".to_owned() + }) + ); + assert_eq!(iter.collect::(), " Not quoted"); + } + + #[test] + fn parse_literal_needs_quotes() { + assert_eq!( + "hello".parse::(), + Err(ParseError::InvalidCharacter('h')) + ); + assert_eq!("'hello".parse::(), Err(ParseError::EndOfFile)); + } + + #[test] + fn parse_literal_not_empty() { + assert_eq!( + "''".parse::(), + Err(ParseError::Other(LiteralError::EmptyLiteral)) + ); + } + + #[test] + fn parse_literal_no_forbidden_whitespace() { + assert_eq!( + "'\n".parse::(), + Err(ParseError::InvalidCharacter('\n')) + ); + assert_eq!( + "'\r".parse::(), + Err(ParseError::InvalidCharacter('\r')) + ); + assert_eq!( + "'\t".parse::(), + Err(ParseError::InvalidCharacter('\t')) + ); + } + + #[test] + fn parse_literal_invalid_escape() { + assert_eq!( + r#"'\F"#.parse::(), + Err(ParseError::InvalidCharacter('F')) + ); + } + + #[test] + fn parse_literal_bad_hex_escape() { + assert_eq!( + r#"'\xF"#.parse::(), + Err(ParseError::InvalidCharacter('F')) + ); + assert_eq!( + r#"'\x0z"#.parse::(), + Err(ParseError::InvalidCharacter('z')) + ); + } + + #[test] + fn parse_literal_bad_unicode_escape() { + // Negative tests + assert_eq!( + r#"'\u0"#.parse::(), + Err(ParseError::InvalidCharacter('0')) + ); + assert_eq!( + r#"'\u{}"#.parse::(), + Err(ParseError::InvalidCharacter('}')) + ); + assert_eq!( + r#"'\u{z"#.parse::(), + Err(ParseError::InvalidCharacter('z')) + ); + assert_eq!( + r#"'\u{_"#.parse::(), + Err(ParseError::InvalidCharacter('_')) + ); + assert_eq!( + r#"'\u{0___h"#.parse::(), + Err(ParseError::InvalidCharacter('h')) + ); + assert_eq!( + r#"'\u{D800}"#.parse::(), + Err(ParseError::Other(LiteralError::InvalidCharacterCode( + 0xd800 + ))) + ); + assert_eq!( + r#"'\u{0000000"#.parse::(), + Err(ParseError::InvalidCharacter('0')) + ); + } + + #[test] + fn parse_fix_basic() { + let expr = Expression { + alt: Alt { + seqs: vec![Seq { + terms: vec![Term::Epsilon], + }], + }, + }; + assert_eq!( + "[x](())".parse(), + Ok(Fix { + arg: Ident { + name: "x".to_owned() + }, + expr: expr.clone() + }) + ); + assert_eq!( + "[x](() )".parse(), + Ok(Fix { + arg: Ident { + name: "x".to_owned() + }, + expr: expr.clone() + }) + ); + assert_eq!( + "[x]( ())".parse(), + Ok(Fix { + arg: Ident { + name: "x".to_owned() + }, + expr + }) + ); + } + + #[test] + fn parse_fix_needs_arg() { + assert_eq!( + "x]()".parse::(), + Err(ParseError::InvalidCharacter('x')) + ); + assert_eq!( + "[]()".parse::(), + Err(ParseError::InvalidCharacter(']')) + ); + } + + #[test] + fn parse_fix_needs_parens() { + assert_eq!( + "[x]x".parse::(), + Err(ParseError::InvalidCharacter('x')) + ); + assert_eq!("[x](x".parse::(), Err(ParseError::EndOfFile)) + } + + #[test] + fn parse_fix_stops() { + let mut iter = "[x](()) Not in fix".chars().peekable(); + assert_eq!( + Fix::parse(&mut iter), + Ok(Fix { + arg: Ident { + name: "x".to_owned() + }, + expr: Expression { + alt: Alt { + seqs: vec![Seq { + terms: vec![Term::Epsilon], + }], + }, + }, + }) + ); + + assert_eq!(iter.collect::(), " Not in fix"); + } +} -- cgit v1.2.3