diff options
author | Greg Brown <gmb60@cam.ac.uk> | 2021-03-01 13:36:55 +0000 |
---|---|---|
committer | Greg Brown <gmb60@cam.ac.uk> | 2021-03-01 13:36:55 +0000 |
commit | 7ba3f96d49489aec9ff4a536c2df416287e49aca (patch) | |
tree | 02e5334a179b2103063d91ed8d1ef8ce658427b8 | |
parent | 7934aa9af22e8fa3c33a45bc08e732a73c0cabf5 (diff) |
Initial work on charsetscharset
-rw-r--r-- | src/chomp/set.rs | 165 | ||||
-rw-r--r-- | src/lower/mod.rs | 56 |
2 files changed, 176 insertions, 45 deletions
diff --git a/src/chomp/set.rs b/src/chomp/set.rs index 0661ab6..8f97e8a 100644 --- a/src/chomp/set.rs +++ b/src/chomp/set.rs @@ -1,8 +1,31 @@ -use std::collections::BTreeSet; +use std::{collections::BTreeSet, convert::TryInto}; + +// From 1.50 std::core::iter::range lines 425--450 +fn step_char_forward(c : char) -> Option<char> { + let start = u32::from(c); + let mut res = start.checked_add(1)?; + + if start < 0xD800 && 0xD800 <= res { + res = start.checked_add(0x800)?; + } + + res.try_into().ok() +} + +fn step_char_backward(c : char) -> Option<char> { + let start = u32::from(c); + let mut res = start.checked_sub(1)?; + + if start >= 0xE000 && 0xE000 > res { + res = start.checked_sub(0x800)?; + } + + res.try_into().ok() +} #[derive(Clone, Debug, Default, Eq, PartialEq, Hash)] pub struct FirstSet { - inner: BTreeSet<char>, + inner: Vec<(char, char)>, } impl FirstSet { @@ -11,32 +34,108 @@ impl FirstSet { } pub fn of_str(s: &str) -> Self { - let mut inner = BTreeSet::new(); - s.chars().next().map(|c| inner.insert(c)); - - Self { inner } + Self { + inner: s.chars().next().into_iter().map(|c| (c, c)).collect(), + } } pub fn is_empty(&self) -> bool { self.inner.is_empty() } - pub fn union(mut self, mut other: Self) -> Self { - self.inner.append(&mut other.inner); - self + pub fn union(self, mut other: Self) -> Self { + let mut out = Vec::new(); + let mut iters = [self.into_iter().peekable(), other.into_iter().peekable()]; + + 'outer: loop { + // Look at the next two ranges, and find the minimum next character + let (min, mut max) = match (iters[0].peek(), iters[1].peek()) { + // Both inputs exhausted + (None, None) => break, + // Add remainder to output + (None, Some(_)) => { + out.extend(iters[1]); + break + } + (Some(_), None) => { + out.extend(iters[0]); + break + } + // The interesting case + (Some((left, left_max)), Some((right, right_max))) => { + if left > right { + iters.swap(0, 1); + (*right, *right_max) + } else { + (*left, *left_max) + } + } + }; + + max = match step_char_forward(max) { + Some(c) => c, + None => { + // assert!(max == char::MAX) + // This swallows all other ranges + out.push((min, max)); + break; + } + }; + + // There are a few cases: + // - No overlap: + // ---- + // ---- + // - Extend: + // ---- + // ---- + // - Swallow: + // --------- + // ---- + // We loop until there is no overlap. + // For extend, we step the left and swap. + // For swallow, we step the right only + while let Some((other_min, other_max)) = iters[1].peek() { + if *other_min > max { + // No overlap + break; + } + + // max is one past range, other_max is in range, so we use >= + if *other_max >= max { + // Extend + iters[0].next(); + iters.swap(0, 1); + max = match step_char_forward(*other_max) { + Some(c) => c, + None => { out.push((min, max)); break 'outer } + }; + } else { + // Swallow + iters[1].next(); + } + } + + let end = step_char_backward(max).expect("char + 1 - 1 != char"); + out.push((min, end)); + iters[0].next(); + } + + Self {inner: out} } pub fn intersect(&self, other: &Self) -> Self { - Self { - inner: self.inner.intersection(&other.inner).copied().collect(), - } + todo!() + // Self { + // inner: self.inner.intersection(&other.inner).copied().collect(), + // } } } impl IntoIterator for FirstSet { - type Item = char; + type Item = (char, char); - type IntoIter = <BTreeSet<char> as IntoIterator>::IntoIter; + type IntoIter = <Vec<(char, char)> as IntoIterator>::IntoIter; fn into_iter(self) -> Self::IntoIter { self.inner.into_iter() @@ -58,8 +157,9 @@ impl FlastSet { } pub fn union_first(mut self, mut other: FirstSet) -> Self { - self.inner.append(&mut other.inner); - self + // self.inner.append(&mut other.inner); + // self + todo!() } pub fn union(mut self, mut other: Self) -> Self { @@ -68,9 +168,10 @@ impl FlastSet { } pub fn intersect_first(&self, other: &FirstSet) -> Self { - Self { - inner: self.inner.intersection(&other.inner).copied().collect(), - } + // Self { + // inner: self.inner.intersection(&other.inner).copied().collect(), + // } + todo!() } pub fn intersect(&self, other: &Self) -> Self { @@ -89,3 +190,29 @@ impl IntoIterator for FlastSet { self.inner.into_iter() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_first_union_none() { + let left = FirstSet { inner: vec![('A', 'Z'), ('a', 'z')]}; + let right = FirstSet {inner: vec![]}; + assert_eq!(left.union(right), left) + } + + #[test] + fn test_first_union_shallow() { + let left = FirstSet { inner: vec![('A', 'Z'), ('a', 'z')]}; + let right = FirstSet {inner: vec![('b', 'y')]}; + assert_eq!(left.union(right), left) + } + + #[test] + fn test_first_union_extend() { + let left = FirstSet { inner: vec![('A', 'Z'), ('a', 'z')]}; + let right = FirstSet {inner: vec![('[', '`')]}; + assert_eq!(left.union(right), FirstSet {inner: vec![('A', 'z')]}) + } +} diff --git a/src/lower/mod.rs b/src/lower/mod.rs index dcd0f1f..45be7aa 100644 --- a/src/lower/mod.rs +++ b/src/lower/mod.rs @@ -300,17 +300,19 @@ impl Backend for RustBackend { .enumerate() .find(|(_, ty)| ty.nullable()) .map(|(idx, _)| name_parts[idx].clone()); - let (first_alts, firsts): (Vec<_>, Vec<_>) = tys - .iter() - .map(Type::first_set) - .cloned() - .enumerate() - .filter(|(_, fs)| !fs.is_empty()) - .map(|(idx, fs)| { - let fs = fs.into_iter(); - (name_parts[idx].clone(), quote! {#(Some(#fs))|*}) - }) - .unzip(); + todo!(); + // let (first_alts, firsts): (Vec<_>, Vec<_>) = tys + // .iter() + // .map(Type::first_set) + // .cloned() + // .enumerate() + // .filter(|(_, fs)| !fs.is_empty()) + // .map(|(idx, fs)| { + // let fs = fs.into_iter(); + // // (name_parts[idx].clone(), quote! {#(Some(#fs))|*}) + // todo!() + // }) + // .unzip(); let all_firsts = tys .iter() .map(Type::first_set) @@ -336,25 +338,27 @@ impl Backend for RustBackend { quote_spanned! {span=> impl Parse for #name { fn take<P: Parser + ?Sized>(input: &mut P) -> Result<Self, TakeError> { - match input.peek() { - #(#firsts => Ok(Self::#first_alts(input.take()?)),)* - _ => Ok(Self::#nullable(input.take()?)) - } + todo!() + // match input.peek() { + // #(#firsts => Ok(Self::#first_alts(input.take()?)),)* + // _ => Ok(Self::#nullable(input.take()?)) + // } } } } } else { - quote_spanned! {span=> - impl Parse for #name { - fn take<P: Parser + ?Sized>(input: &mut P) -> Result<Self, TakeError> { - match input.peek() { - #(#firsts => Ok(Self::#first_alts(input.take()?)),)* - Some(c) => Err(TakeError::BadBranch(input.pos(), c, &[#(#all_firsts),*])), - None => Err(TakeError::EndOfStream(input.pos())) - } - } - } - } + todo!() + // quote_spanned! {span=> + // impl Parse for #name { + // fn take<P: Parser + ?Sized>(input: &mut P) -> Result<Self, TakeError> { + // match input.peek() { + // #(#firsts => Ok(Self::#first_alts(input.take()?)),)* + // Some(c) => Err(TakeError::BadBranch(input.pos(), c, &[#(#all_firsts),*])), + // None => Err(TakeError::EndOfStream(input.pos())) + // } + // } + // } + // } }); if ids.iter().all(|id| self.can_char.contains(id)) { |