// hyperactor/reference/parse.rs

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
8
//! A simple parser combinator library intended to parse (syntactically
//! simple) Hyperactor identifiers.
11
12use std::collections::VecDeque;
13
/// ParseError represents an error that occurred during parsing.
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ParseError {
    /// The token stream did not match any of the provided productions.
    /// Carries the expected pattern(s) and the tokens actually seen.
    #[error("expected '{0}', got '{1}'")]
    Expected(String, String),

    /// The input contained a token that is not valid in any identifier.
    #[error("invalid token '{0}'")]
    InvalidToken(String),
}
23
/// Valid tokens in Hyperactor identifiers.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// "["
    LeftBracket,
    /// "]"
    RightBracket,
    /// "<"
    LessThan,
    /// ">"
    GreaterThan,
    /// A decimal unsigned integer.
    Uint(usize),
    /// An element is any valid name.
    Elem(&'a str),
    /// "@"
    At,
    /// "."
    Dot,
    /// ","
    Comma,

    // Special token to denote an invalid element. It is used to poison
    // the parser: a stream containing it cannot match any production
    // built from the valid tokens above, so parsing fails.
    InvalidElem(&'a str),
}
50
/// Lexer that produces a stream of [`Token`]s, represented by
/// the lexer's iterator implementation.
pub struct Lexer<'a> {
    // Raw string fragments (delimiters and elements), produced by `chop`,
    // still to be converted into `Token`s; consumed from the front.
    tokens: VecDeque<&'a str>,
}
56
57impl<'a> Lexer<'a> {
58    /// Create a new lexer over the provided input.
59    pub fn new(input: &'a str) -> Self {
60        Self {
61            // TODO: compose iterators directly; would be simpler with
62            // existential type support.
63            tokens: chop(input, &["[", "]", "<", ">", ".", "@", ","]).collect(),
64        }
65    }
66}
67
68impl<'a> Iterator for Lexer<'a> {
69    type Item = Token<'a>;
70
71    fn next(&mut self) -> Option<Token<'a>> {
72        match self.tokens.pop_front() {
73            None => None,
74            Some("[") => Some(Token::LeftBracket),
75            Some("]") => Some(Token::RightBracket),
76            Some("<") => Some(Token::LessThan),
77            Some(">") => Some(Token::GreaterThan),
78            Some("@") => Some(Token::At),
79            Some(".") => Some(Token::Dot),
80            Some(",") => Some(Token::Comma),
81            Some(elem) => Some({
82                if let Ok(uint) = elem.parse::<usize>() {
83                    Token::Uint(uint)
84                } else if is_valid_elem(elem) {
85                    Token::Elem(elem)
86                } else {
87                    Token::InvalidElem(elem)
88                }
89            }),
90        }
91    }
92}
93
/// A macro to parse tokens by pattern matching. The macro takes as input
/// a set of productions in the form of one or more space-separated,
/// token-typed patterns, which are matched against the token stream. Each
/// rule must have a production clause. For example:
///
/// ```ignore
/// parse! {
///     token_stream;
///     Token::Dot Token::Dot Token::Elem(ellipsis) => ellipsis
/// }
/// ```
///
/// Parses streams of the form "..blah", returning the string "blah".
/// The entire stream must match one of the rules; otherwise a
/// [`ParseError::Expected`] listing every rule's patterns is returned.
// TODO: improve error reporting here
macro_rules! parse {
    (
        // An expression yielding the token iterator, followed by one or
        // more `pattern... => production` rules (trailing comma allowed).
        $token_stream:expr;
        $(
            $($pattern:pat_param)* => $constructor:expr
        ),* $(,)?
    ) => {
        {
            // Drain the stream so the tokens can be matched as a slice
            // (and re-printed in the error message on failure).
            let tokens = $token_stream.collect::<Vec<_>>();
            let result = match tokens[..] {
                // Each rule becomes one slice pattern. Note the `+`
                // repetition here: a rule must contain at least one
                // token pattern to expand.
                $([$( $pattern ),+] => Some($constructor),)*
                _ => None,
            };
            result.ok_or_else(|| {
                // No rule matched: report all expected patterns against
                // the tokens actually observed.
                let all_patterns = vec![$(stringify!($($pattern)*),)*];
                ParseError::Expected(
                    all_patterns.join(" or "),
                    tokens.iter().map(|tok| format!("{:?}", tok)).collect::<Vec<_>>().join(" "))
            })
        }
    };
}
pub(crate) use parse;
131
/// Chop implements a simple lexer on a fixed set of delimiters.
///
/// The returned iterator yields, in order, every delimiter occurrence
/// verbatim and the whitespace-trimmed text between delimiters. An empty
/// input yields nothing. NOTE(review): a span consisting only of
/// whitespace (e.g. the gap in "foo [1]") yields an empty token, which
/// downstream lexing treats as invalid — confirm this is intended.
fn chop<'a>(mut s: &'a str, delims: &'a [&'a str]) -> impl Iterator<Item = &'a str> + 'a {
    std::iter::from_fn(move || {
        if s.is_empty() {
            return None;
        }

        // Locate the earliest delimiter occurrence in the remaining
        // input. (`filter_map` rather than `flat_map` over an `Option`:
        // same behavior, idiomatic form — see clippy `flat_map_option`.)
        match delims
            .iter()
            .enumerate()
            .filter_map(|(index, d)| s.find(d).map(|pos| (index, pos)))
            .min_by_key(|&(_, pos)| pos)
        {
            // Input starts with a delimiter: emit it and advance past it.
            Some((index, 0)) => {
                let delim = delims[index];
                s = &s[delim.len()..];
                Some(delim)
            }
            // Text precedes the next delimiter: emit the trimmed text,
            // leaving the delimiter for the next call.
            Some((_, pos)) => {
                let token = &s[..pos];
                s = &s[pos..];
                Some(token.trim())
            }
            // No delimiter remains: emit the trimmed tail and finish.
            None => {
                let token = s;
                s = "";
                Some(token.trim())
            }
        }
    })
}
163
164pub fn is_valid(token: &str, is_continue: fn(char) -> bool) -> bool {
165    // Disallow raw identifiers;
166    if token.starts_with("r#") || token.is_empty() {
167        return false;
168    }
169    let mut chars = token.chars();
170    let mut first = true;
171    while let Some(ch) = chars.next() {
172        let valid = if ch == ':' {
173            chars.next() == Some(':')
174        } else if first {
175            ch == '_' || unicode_ident::is_xid_start(ch)
176        } else {
177            is_continue(ch)
178        };
179        if !valid {
180            return false;
181        }
182        first = false;
183    }
184    true
185}
186
187/// Determines whether the provided token is a valid hyperactor identifier.
188///
189/// Valid hyperactor identifiers are
190/// [Rust identifier](https://doc.rust-lang.org/reference/identifiers.html),
191/// excluding raw identifiers. Additionally, we allow double colon ("::")
192/// to appear anywhere.
193pub fn is_valid_ident(token: &str) -> bool {
194    is_valid(token, unicode_ident::is_xid_continue)
195}
196
197/// Determines whether the provided token is a valid hyperactor element.
198/// Like [`is_valid_ident`], but additionally allows '-' as a continuation
199/// character.
200fn is_valid_elem(token: &str) -> bool {
201    fn is_continue(ch: char) -> bool {
202        ch == '-' || unicode_ident::is_xid_continue(ch)
203    }
204    is_valid(token, is_continue)
205}
206
#[cfg(test)]
mod tests {
    use std::assert_matches::assert_matches;

    use super::*;

    // The lexer splits on delimiters and classifies each fragment.
    #[test]
    fn test_lexer() {
        let tokens = Lexer::new("foo.bar[123],baz");
        assert_eq!(
            tokens.collect::<Vec<Token>>(),
            vec![
                Token::Elem("foo"),
                Token::Dot,
                Token::Elem("bar"),
                Token::LeftBracket,
                Token::Uint(123),
                Token::RightBracket,
                Token::Comma,
                Token::Elem("baz"),
            ]
        )
    }

    // Valid identifiers (including "::" and '-') lex to a single
    // `Token::Elem`.
    #[test]
    fn test_valid_idents() {
        let idents = vec![
            "foo", "foo_bar", "東京", "_foo", "foo-bar", "::foo", "foo::bar",
        ];

        for ident in idents {
            let tokens = Lexer::new(ident);

            assert_matches!(tokens.collect::<Vec<Token>>()[..], [Token::Elem(ident_)] if ident_ == ident,);
        }
    }

    // Invalid identifiers lex to the poison token `Token::InvalidElem`.
    #[test]
    fn test_invalid_idents() {
        let idents = vec!["-bar", "foo/bar", "r#true"];

        for ident in idents {
            let tokens = Lexer::new(ident);
            assert_matches!(
                &tokens.collect::<Vec<Token>>()[..],
                [Token::InvalidElem(ident_)] if *ident_ == ident,
            );
        }
    }

    // `parse!` binds pattern variables and returns the production value.
    #[test]
    fn test_parse() {
        let tokens = Lexer::new("foo.bar[123]");
        let parsed = parse!(
            tokens;
            Token::Elem(first) Token::Dot Token::Elem(second)
              Token::LeftBracket Token::Uint(num) Token::RightBracket => (first, second, num)
        );
        assert_eq!(parsed.unwrap(), ("foo", "bar", 123usize));
    }

    // A token stream that matches no production yields
    // `ParseError::Expected`.
    #[test]
    fn test_parse_failure() {
        let tokens = Lexer::new("foo.bar[123]");
        let parsed = parse!(
            tokens;
            Token::Elem(first) Token::Elem(second)
              Token::LeftBracket Token::Uint(num) Token::RightBracket => (first, second, num)
        );
        assert!(parsed.is_err())
    }
}