hyperactor/reference/
lex.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! This module contains a lexer for Hyperactor identifiers.
10
11use std::iter::Peekable;
12use std::str::Chars;
13use std::sync::LazyLock;
14
15use crate::reference::name::FLICKR_BASE_58;
16use crate::reference::name::Ident;
17use crate::reference::name::Uid;
18
19/// Precomputed character ordinals for the alphabet.
20static FLICKR_BASE_58_ORD: LazyLock<[Option<usize>; 256]> = LazyLock::new(|| {
21    let mut table = [None; 256];
22    for (i, c) in FLICKR_BASE_58.chars().enumerate() {
23        table[c as usize] = Some(i);
24    }
25    table
26});
27
28/// The tyep of error that occurs while parsing a hyperactor identifier.
29#[derive(Debug, thiserror::Error, PartialEq, Eq)]
30pub enum ParseError {
31    /// Todo
32    #[error("expected '{0}', got '{1}'")]
33    Expected(Token, Token),
34}
35
36/// An identifier token.
37#[derive(Debug, PartialEq, Eq)]
38pub enum Token {
39    /// "["
40    LeftBracket,
41    /// "]"
42    RightBracket,
43    /// A decimal unsigned integer, can appear in brackets
44    Uint(usize),
45    /// "."
46    Dot,
47    /// "-uid" suffixes, FLICKR58 format
48    Uid(Uid),
49    /// ":", can appear in brackets
50    Colon,
51    /// "//"
52    DoubleSlash,
53    /// Typename, within a bracket. These are rust identifiers, plus
54    /// ':' characters.
55    BracketTypename(String),
56    /// An identifier, following rust rules.
57    Ident(Ident),
58
59    /// Special token to denote a lexer error. It contains the unlexed
60    /// remainder of the input. The error occured on the first character.
61    Error(String),
62
63    /// No more tokens.
64    Eof,
65}
66
67impl Token {
68    /// Return the token as a left bracket, otherwise a parse error.
69    pub fn into_left_bracket(self) -> Result<(), ParseError> {
70        match self {
71            Token::LeftBracket => Ok(()),
72            other => Err(ParseError::Expected(Token::LeftBracket, other)),
73        }
74    }
75
76    /// Return the token as a right bracket, otherwise a parse error.
77    pub fn into_right_bracket(self) -> Result<(), ParseError> {
78        match self {
79            Token::RightBracket => Ok(()),
80            other => Err(ParseError::Expected(Token::RightBracket, other)),
81        }
82    }
83
84    /// Return the token as a uint, otherwise a parse error.
85    pub fn into_uint(self) -> Result<usize, ParseError> {
86        match self {
87            Token::Uint(value) => Ok(value),
88            other => Err(ParseError::Expected(Token::Uint(0), other)),
89        }
90    }
91
92    /// Return the token as a dot, otherwise a parse error.
93    pub fn into_dot(self) -> Result<(), ParseError> {
94        match self {
95            Token::Dot => Ok(()),
96            other => Err(ParseError::Expected(Token::Dot, other)),
97        }
98    }
99
100    /// Return the token as a Uid, otherwise a parse error.
101    pub fn into_uid(self) -> Result<Uid, ParseError> {
102        match self {
103            Token::Uid(uid) => Ok(uid),
104            other => Err(ParseError::Expected(Token::Uid(Uid::zero()), other)),
105        }
106    }
107
108    /// Return the token as a colon, otherwise a parse error.
109    pub fn into_colon(self) -> Result<(), ParseError> {
110        match self {
111            Token::Colon => Ok(()),
112            other => Err(ParseError::Expected(Token::Colon, other)),
113        }
114    }
115
116    /// Return the token as a double slash, otherwise a parse error.
117    pub fn into_double_slash(self) -> Result<(), ParseError> {
118        match self {
119            Token::DoubleSlash => Ok(()),
120            other => Err(ParseError::Expected(Token::DoubleSlash, other)),
121        }
122    }
123
124    /// Return the token as a bracket typename, otherwise a parse error.
125    pub fn into_bracket_typename(self) -> Result<String, ParseError> {
126        match self {
127            Token::BracketTypename(value) => Ok(value),
128            other => Err(ParseError::Expected(
129                Token::BracketTypename(String::new()),
130                other,
131            )),
132        }
133    }
134
135    /// Return the token as an ident, otherwise a parse error.
136    pub fn into_ident(self) -> Result<Ident, ParseError> {
137        match self {
138            Token::Ident(value) => Ok(value),
139            other => Err(ParseError::Expected(
140                Token::Ident("ident".parse().unwrap()),
141                other,
142            )),
143        }
144    }
145
146    /// Return the token as an error, otherwise a parse error.
147    pub fn into_error(self) -> Result<String, ParseError> {
148        match self {
149            Token::Error(value) => Ok(value),
150            other => Err(ParseError::Expected(Token::Error(String::new()), other)),
151        }
152    }
153}
154
155impl std::fmt::Display for Token {
156    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
157        match self {
158            Token::LeftBracket => write!(f, "["),
159            Token::RightBracket => write!(f, "]"),
160            Token::Uint(value) => write!(f, "{}", value),
161            Token::Dot => write!(f, "."),
162            Token::Uid(uid) => write!(f, "-{}", uid),
163            Token::Colon => write!(f, ":"),
164            Token::DoubleSlash => write!(f, "//"),
165            Token::BracketTypename(value) => write!(f, "{}", value),
166            Token::Ident(value) => write!(f, "{}", value),
167            Token::Error(value) => write!(f, "<error {}>", value),
168            Token::Eof => write!(f, "<eof>"),
169        }
170    }
171}
172
173/// A lexer is an iterator over [`Token`].
174#[derive(Default, Debug)]
175pub(crate) enum Lexer<'a> {
176    Next(Peekable<Chars<'a>>),
177    Inbracket(Peekable<Chars<'a>>),
178    #[default]
179    Invalid,
180}
181
182impl<'a> Lexer<'a> {
183    /// Create a new lexer over the provided input.
184    pub(crate) fn new(input: &'a str) -> Self {
185        Lexer::Next(input.chars().peekable())
186    }
187
188    /// Consume and return the next token, returning [`Token::Eof`]
189    /// ("fused") when there are no more tokens available.
190    pub(crate) fn next_or_eof(&mut self) -> Token {
191        self.next().unwrap_or(Token::Eof)
192    }
193
194    /// Expect the provided token, or return a parse error.
195    pub(crate) fn expect(&mut self, token: Token) -> Result<(), ParseError> {
196        let next = self.next_or_eof();
197        if next != token {
198            return Err(ParseError::Expected(token, next));
199        }
200        Ok(())
201    }
202
203    fn parse_uint(iter: &mut Peekable<Chars<'a>>) -> usize {
204        let mut value = iter.next().unwrap().to_digit(10).unwrap() as usize;
205        while let Some(&ch) = iter.peek() {
206            if let Some(d) = ch.to_digit(10) {
207                value = value * 10 + d as usize;
208                iter.next();
209            } else {
210                break;
211            }
212        }
213        value
214    }
215
216    fn parse_ident(iter: &mut Peekable<Chars<'a>>) -> Ident {
217        let mut ident = String::new();
218
219        let ch = iter.next().unwrap();
220        assert!(unicode_ident::is_xid_start(ch) || ch == '_');
221        ident.push(ch);
222
223        while let Some(&ch) = iter.peek()
224            && unicode_ident::is_xid_continue(ch)
225        {
226            ident.push(iter.next().unwrap());
227        }
228        Ident::new(ident).unwrap()
229    }
230
231    fn parse_bracket_typename(iter: &mut Peekable<Chars<'a>>) -> String {
232        let mut ident = String::new();
233
234        let ch = iter.next().unwrap();
235        assert!(unicode_ident::is_xid_start(ch) || ch == '_');
236        ident.push(ch);
237
238        while let Some(&ch) = iter.peek()
239            && (unicode_ident::is_xid_continue(ch) || ch == ':')
240        {
241            ident.push(iter.next().unwrap());
242        }
243        ident
244    }
245
246    fn parse_uid(iter: &mut Peekable<Chars<'a>>) -> Option<Uid> {
247        let base = FLICKR_BASE_58.len() as u64;
248        let mut num = 0u64;
249
250        for _i in 0..12 {
251            let &ch = iter.peek()?;
252            let pos = FLICKR_BASE_58_ORD[ch as usize]?;
253            let _ = iter.next();
254            num *= base;
255            num += pos as u64;
256        }
257
258        Some(num.into())
259    }
260}
261
262impl<'a> Iterator for Lexer<'a> {
263    type Item = Token;
264
265    fn next(&mut self) -> Option<Self::Item> {
266        let (new_state, token) = match std::mem::take(self) {
267            Lexer::Next(mut iter) => match iter.peek()? {
268                '[' => {
269                    let _ = iter.next();
270                    (Lexer::Inbracket(iter), Some(Token::LeftBracket))
271                }
272                '.' => {
273                    let _ = iter.next();
274                    (Lexer::Next(iter), Some(Token::Dot))
275                }
276                '/' => {
277                    let _ = iter.next();
278                    match iter.next() {
279                        Some('/') => (Lexer::Next(iter), Some(Token::DoubleSlash)),
280                        _ => (Lexer::Invalid, Some(Token::Error(iter.collect()))),
281                    }
282                }
283                ':' => {
284                    let _ = iter.next();
285                    (Lexer::Next(iter), Some(Token::Colon))
286                }
287                '-' => {
288                    let _ = iter.next();
289                    match Lexer::parse_uid(&mut iter) {
290                        Some(uid) => (Lexer::Next(iter), Some(Token::Uid(uid))),
291                        None => (Lexer::Invalid, Some(Token::Error(iter.collect()))),
292                    }
293                }
294                // TODO: support hexadecimal
295                '0'..='9' => {
296                    let uint = Lexer::parse_uint(&mut iter);
297                    (Lexer::Next(iter), Some(Token::Uint(uint)))
298                }
299                ch if unicode_ident::is_xid_start(*ch) || *ch == '_' => {
300                    let ident = Lexer::parse_ident(&mut iter);
301                    (Lexer::Next(iter), Some(Token::Ident(ident)))
302                }
303                _ => (Lexer::Invalid, Some(Token::Error(iter.collect()))),
304            },
305            Lexer::Inbracket(mut iter) => match iter.peek()? {
306                '0'..='9' => {
307                    let uint = Lexer::parse_uint(&mut iter);
308                    (Lexer::Inbracket(iter), Some(Token::Uint(uint)))
309                }
310                ch if unicode_ident::is_xid_start(*ch) => {
311                    let typename = Lexer::parse_bracket_typename(&mut iter);
312                    (
313                        Lexer::Inbracket(iter),
314                        Some(Token::BracketTypename(typename)),
315                    )
316                }
317                ':' => {
318                    let _ = iter.next();
319                    (Lexer::Inbracket(iter), Some(Token::Colon))
320                }
321                ']' => {
322                    let _ = iter.next();
323                    (Lexer::Next(iter), Some(Token::RightBracket))
324                }
325                _ => (Lexer::Invalid, Some(Token::Error(iter.collect()))),
326            },
327            Lexer::Invalid => (Lexer::Invalid, None),
328        };
329
330        *self = new_state;
331        token
332    }
333}
334
335#[cfg(test)]
336mod tests {
337    use Token::*;
338
339    use super::*;
340
341    #[test]
342    fn test_basic() {
343        assert_eq!(
344            Lexer::new("foo.bar[123:foo]//blah").collect::<Vec<_>>(),
345            vec![
346                Ident("foo".parse().unwrap()),
347                Dot,
348                Ident("bar".parse().unwrap()),
349                LeftBracket,
350                Uint(123),
351                Colon,
352                BracketTypename("foo".to_string()),
353                RightBracket,
354                DoubleSlash,
355                Ident("blah".parse().unwrap())
356            ]
357        );
358        assert_eq!(
359            Lexer::new("foo.ba)r[123:foo]//blah").collect::<Vec<_>>(),
360            vec![
361                Ident("foo".parse().unwrap()),
362                Dot,
363                Ident("ba".parse().unwrap()),
364                Error(")r[123:foo]//blah".to_string())
365            ]
366        );
367    }
368}