iota/client_ptb/lexer.rs

// Copyright (c) Mysten Labs, Inc.
// Modifications Copyright (c) 2024 IOTA Stiftung
// SPDX-License-Identifier: Apache-2.0

use super::{
    error::{Span, Spanned},
    token::{Lexeme, Token as T},
};
use crate::sp;

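/// Lexer over a stream of shell-split tokens, producing spanned [`Lexeme`]s.
///
/// A minimal usage sketch (illustrative only; it mirrors the `lex` helper in
/// the tests below):
///
/// ```ignore
/// let mut lexer = Lexer::new(vec!["vector[1,2,3]"].into_iter()).unwrap();
/// let lexemes: Vec<_> = (&mut lexer)
///     .take_while(|sp!(_, lex)| !lex.is_terminal())
///     .collect();
/// ```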
pub struct Lexer<'l, I: Iterator<Item = &'l str>> {
    /// Unconsumed remainder of the current shell token.
    pub buf: &'l str,
    /// Remaining shell tokens, consumed after `buf` is exhausted.
    pub tokens: I,
    /// Byte offset of the front of `buf` within the overall input.
    pub offset: usize,
    /// Set to the terminal lexeme once lexing has finished or failed; the
    /// iterator then repeatedly yields this value.
    pub done: Option<Spanned<Lexeme<'l>>>,
}

impl<'l, I: Iterator<Item = &'l str>> Lexer<'l, I> {
    pub fn new(mut tokens: I) -> Option<Self> {
        Some(Self {
            buf: tokens.next()?,
            tokens,
            offset: 0,
            done: None,
        })
    }

    /// Returns the next character in the current shell token, along with the
    /// byte offset it ends at, or `None` if the current shell token is empty.
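    /// The returned offset is the byte index one past the character, i.e. the
    /// next character boundary in `buf`.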
    fn next_char_boundary(&self) -> Option<(usize, char)> {
        let mut chars = self.buf.char_indices();
        let (_, c) = chars.next()?;
        let ix = chars.next().map_or(self.buf.len(), |(ix, _)| ix);
        Some((ix, c))
    }

    /// Repeatedly consume whitespace, stopping when a non-whitespace
    /// character is reached or the shell token stream is exhausted.
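    /// Crossing a shell token boundary counts as a single space for offset
    /// and span purposes.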
    fn eat_whitespace(&mut self) {
        loop {
            if let Some((ix, c)) = self.next_char_boundary() {
                if c.is_whitespace() {
                    self.buf = &self.buf[ix..];
                    self.offset += ix;
                } else {
                    break;
                }
            } else if let Some(next) = self.tokens.next() {
                self.offset += 1; // +1 for the space between tokens
                self.buf = next;
            } else {
                break;
            };
        }
    }

    /// Checks whether the current shell token starts with the prefix `patt`,
    /// and consumes it if so, returning a spanned slice of the consumed
    /// prefix.
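    /// Returns `None`, consuming nothing, if the prefix does not match.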
    fn eat_prefix(&mut self, patt: &str) -> Option<Spanned<&'l str>> {
        let start = self.offset;

        let rest = self.buf.strip_prefix(patt)?;

        let len = self.buf.len() - rest.len();
        let value = &self.buf[..len];
        self.offset += len;
        self.buf = rest;

        let span = Span {
            start,
            end: self.offset,
        };
        Some(Spanned { span, value })
    }

    /// Checks whether the current shell token starts with at least one
    /// character that satisfies `pred`. Consumes all such characters from
    /// the front of the shell token, returning a spanned slice of the
    /// consumed prefix.
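    /// Returns `None`, consuming nothing, if the first character does not
    /// satisfy `pred`.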
    fn eat_while(&mut self, pred: impl FnMut(char) -> bool) -> Option<Spanned<&'l str>> {
        let start = self.offset;

        let rest = self.buf.trim_start_matches(pred);
        if self.buf == rest {
            return None;
        };

        let len = self.buf.len() - rest.len();
        let value = &self.buf[..len];
        self.offset += len;
        self.buf = rest;

        let span = Span {
            start,
            end: self.offset,
        };
        Some(Spanned { span, value })
    }

    /// Consume the whole next shell token (assumes the current shell token has
    /// already been consumed).
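    /// The resulting span starts just past the implicit space separating
    /// shell tokens.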
    fn eat_token(&mut self) -> Option<Spanned<&'l str>> {
        debug_assert!(self.buf.is_empty());
        let start = self.offset + 1;
        let value = self.tokens.next()?;
        self.offset += value.len() + 1; // +1 for the space between tokens

        let span = Span {
            start,
            end: self.offset,
        };
        Some(Spanned { span, value })
    }

    /// Look at the next character in the current shell token without consuming
    /// it, if it exists.
    fn peek(&self) -> Option<Spanned<&'l str>> {
        let start = self.offset;
        let ix = self.next_char_boundary()?.0;

        let value = &self.buf[..ix];
        let span = Span {
            start,
            end: start + ix,
        };
        Some(Spanned { span, value })
    }

    /// Consume the next character in the current shell token, if there is one
    /// (no-op otherwise).
    fn bump(&mut self) {
        if let Some((ix, _)) = self.next_char_boundary() {
            self.buf = &self.buf[ix..];
            self.offset += ix;
        }
    }

    /// Tokenize a string at the prefix of the current shell token. `start` is
    /// the spanned slice containing the initial quote character, which also
    /// specifies the terminating quote character.
    ///
    /// A string that is not terminated in the same shell token it was started
    /// in is tokenized as an `UnfinishedString`, even if it would have been
    /// terminated in a following shell token.
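    ///
    /// For example, the shell token `'foo` lexes as an `UnfinishedString`,
    /// even if the next shell token is `bar'`.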
    fn string(&mut self, start: Spanned<&'l str>) -> Spanned<Lexeme<'l>> {
        self.bump();
        let sp!(sp, quote) = start;

        // Consume characters until an unescaped terminating quote, treating a
        // backslash as escaping whatever character follows it.
        let mut escaped = false;
        let content = self
            .eat_while(|c| {
                if escaped {
                    escaped = false;
                    true
                } else if c == '\\' {
                    escaped = true;
                    true
                } else {
                    !quote.starts_with(c)
                }
            })
            .unwrap_or(Spanned {
                span: Span {
                    start: sp.end,
                    end: sp.end,
                },
                value: "",
            })
            .widen(start);

        let Some(end) = self.eat_prefix(quote) else {
            let error = content.map(|src| Lexeme(T::UnfinishedString, src));
            self.done = Some(error);
            return error;
        };

        content.widen(end).map(|src| Lexeme(T::String, src))
    }

    /// Signal that `c` is an unexpected token, and set the lexer's `done`
    /// flag to prevent further iteration.
    fn unexpected(&mut self, c: Spanned<&'l str>) -> Spanned<Lexeme<'l>> {
        let error = c.map(|src| Lexeme(T::Unexpected, src));
        self.done = Some(error);
        error
    }

    /// Signal that the lexer has reached the end of input (expected or not),
    /// emitting `token` as the terminal lexeme and setting the lexer's `done`
    /// flag to prevent further iteration.
    fn done(&mut self, token: T) -> Spanned<Lexeme<'l>> {
        let error = self.offset().wrap(Lexeme(token, ""));
        self.done = Some(error);
        error
    }

    /// Span pointing to the current offset in the input.
    fn offset(&self) -> Span {
        Span {
            start: self.offset,
            end: self.offset,
        }
    }
}

impl<'l, I: Iterator<Item = &'l str>> Iterator for Lexer<'l, I> {
    type Item = Spanned<Lexeme<'l>>;

    fn next(&mut self) -> Option<Self::Item> {
        // Lexer has been exhausted; repeatedly return the terminal token.
        if let Some(done) = self.done {
            return Some(done);
        }

        self.eat_whitespace();

        let Some(c) = self.peek() else {
            return Some(self.done(T::Eof));
        };

        // Consume the single character `c` and wrap it in the lexeme `$t`.
        macro_rules! token {
            ($t:expr) => {{
                self.bump();
                c.map(|src| Lexeme($t, src))
            }};
        }

        Some(match c {
            // Single character tokens
            sp!(_, ",") => token!(T::Comma),
            sp!(_, "[") => token!(T::LBracket),
            sp!(_, "]") => token!(T::RBracket),
            sp!(_, "(") => token!(T::LParen),
            sp!(_, ")") => token!(T::RParen),
            sp!(_, "<") => token!(T::LAngle),
            sp!(_, ">") => token!(T::RAngle),
            sp!(_, "@") => token!(T::At),
            sp!(_, ".") => token!(T::Dot),

            sp!(_, "'" | "\"") => self.string(c),

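            // A lone ':' is an error; only the '::' path separator is
            // accepted.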
            sp!(_, ":") => 'colon: {
                let Some(sp) = self.eat_prefix("::") else {
                    break 'colon self.unexpected(c);
                };

                sp.map(|src| Lexeme(T::ColonColon, src))
            }

            sp!(_, c) if c.chars().next().is_some_and(is_ident_start) => {
                let Some(ident) = self.eat_while(is_ident_continue) else {
                    unreachable!("is_ident_start implies is_ident_continue");
                };

                ident.map(|src| Lexeme(T::Ident, src))
            }

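            // A leading "0" may begin a hex literal; "0x" without any hex
            // digits is an error.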
            sp!(_, "0") => 'zero: {
                let Some(prefix) = self.eat_prefix("0x") else {
                    break 'zero token!(T::Number);
                };

                let Some(digits) = self.eat_while(is_hex_continue) else {
                    break 'zero self.unexpected(prefix);
                };

                digits.widen(prefix).map(|src| Lexeme(T::HexNumber, src))
            }

            sp!(_, n) if n.chars().next().is_some_and(is_number_start) => {
                let Some(num) = self.eat_while(is_number_continue) else {
                    unreachable!("is_number_start implies is_number_continue");
                };

                num.map(|src| Lexeme(T::Number, src))
            }

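            // '-' introduces either a short flag ("-x") or a long command
            // ("--name"). "--publish" and "--upgrade" additionally consume
            // the following shell token, whole, as a file path.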
            sp!(_, "-") => 'command: {
                self.bump();
                let Some(next) = self.peek() else {
                    break 'command self.unexpected(c);
                };

                match next {
                    sp!(_, "-") => {
                        self.bump();
                    }
                    sp!(_, flag) if is_flag(flag.chars().next().unwrap()) => {
                        self.bump();
                        break 'command next.widen(c).map(|src| Lexeme(T::Flag, src));
                    }
                    sp!(_, _) => break 'command self.unexpected(next),
                }

                let Some(ident) = self.eat_while(is_ident_continue) else {
                    break 'command self.unexpected(c);
                };

                match ident {
                    // --publish and --upgrade expect the file path to be the
                    // whole next shell token.
                    sp!(_, "publish") => {
                        if let Some(next) = self.peek() {
                            break 'command self.unexpected(next);
                        }

                        let Some(file) = self.eat_token() else {
                            break 'command self.done(T::EarlyEof);
                        };

                        file.widen(c).map(|src| Lexeme(T::Publish, src))
                    }

                    sp!(_, "upgrade") => {
                        if let Some(next) = self.peek() {
                            break 'command self.unexpected(next);
                        }

                        let Some(file) = self.eat_token() else {
                            break 'command self.done(T::EarlyEof);
                        };

                        file.widen(c).map(|src| Lexeme(T::Upgrade, src))
                    }

                    sp!(_, _) => ident.widen(c).map(|src| Lexeme(T::Command, src)),
                }
            }

            sp!(_, _) => self.unexpected(c),
        })
    }
}

fn is_flag(c: char) -> bool {
    c.is_ascii_alphanumeric()
}

fn is_ident_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}

fn is_ident_continue(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '-'
}

fn is_number_start(c: char) -> bool {
    c.is_ascii_digit()
}

fn is_number_continue(c: char) -> bool {
    c.is_ascii_digit() || c == '_'
}

fn is_hex_continue(c: char) -> bool {
    c.is_ascii_hexdigit() || c == '_'
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize the input up to and including the first terminal token.
    fn lex(input: Vec<&str>) -> Vec<Spanned<Lexeme>> {
        let mut lexer = Lexer::new(input.into_iter()).unwrap();
        let mut lexemes: Vec<_> = (&mut lexer)
            .take_while(|sp!(_, lex)| !lex.is_terminal())
            .collect();
        lexemes.push(lexer.next().unwrap());
        lexemes
    }

    #[test]
    fn tokenize_vector() {
        let vecs = vec![
            "vector[1,2,3]",
            "vector[1, 2, 3]",
            "vector[]",
            "vector[1]",
            "vector[1,]",
        ];

        insta::assert_debug_snapshot!(lex(vecs));
    }

    #[test]
    fn tokenize_array() {
        let arrays = vec!["[1,2,3]", "[1, 2, 3]", "[]", "[1]", "[1,]"];
        insta::assert_debug_snapshot!(lex(arrays));
    }

    #[test]
    fn tokenize_num() {
        let nums = vec![
            "1",
            "1_000",
            "100_000_000",
            "100_000u64",
            "1u8",
            "1_u128",
            "0x1",
            "0x1_000",
            "0x100_000_000",
            "0x100_000u64",
            "0x1u8",
            "0x1_u128",
        ];

        insta::assert_debug_snapshot!(lex(nums));
    }

    #[test]
    fn tokenize_address() {
        let addrs = vec![
            "@0x1",
            "@0x1_000",
            "@0x100_000_000",
            "@0x100_000u64",
            "@0x1u8",
            "@0x1_u128",
        ];

        insta::assert_debug_snapshot!(lex(addrs));
    }

    #[test]
    fn tokenize_commands() {
        let cmds = vec!["--f00", "--Bar_baz", "--qux-quy"];

        insta::assert_debug_snapshot!(lex(cmds));
    }

    #[test]
    fn tokenize_flags() {
        let flags = vec!["-h", "-a", "-Z", "-1"];

        insta::assert_debug_snapshot!(lex(flags));
    }

    #[test]
    fn tokenize_args() {
        let args = vec![
            "@0x1 1 1u8 1_u128 1_000 100_000_000 100_000u64 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] \
             vector[] vector[1,2,3] vector[1]",
            "some(@0x1) none some(vector[1,2,3]) --assign --transfer-objects --split-coins \
             --merge-coins --make-move-vec --move-call --preview --warn-shadows --pick-gas-budget \
             --gas-budget --summary",
            "--publish",
            "package-a",
            "--upgrade",
            "package-b",
        ];

        insta::assert_debug_snapshot!(lex(args));
    }

    #[test]
    fn dotted_idents() {
        let idents = vec!["a", "a.b", "a.b.c", "a.b.c.d", "a.b.c.d.e"];
        insta::assert_debug_snapshot!(lex(idents));
    }

    #[test]
    fn gas() {
        let gas = vec!["gas"];
        insta::assert_debug_snapshot!(lex(gas));
    }

    #[test]
    fn functions() {
        let funs = vec![
            "0x2::transfer::public_transfer<0x42::foo::Bar>",
            "std::option::is_none<u64>",
            "0x1::option::is_some <u64>",
            "0x1::option::is_none",
            "<u64>",
        ];

        insta::assert_debug_snapshot!(lex(funs));
    }

    #[test]
    fn unexpected_colon() {
        let unexpected = vec!["hello: world"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_0x() {
        let unexpected = vec!["0x forgot my train of thought"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_dash() {
        let unexpected = vec!["-"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_dash_dash() {
        let unexpected = vec!["--"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_publish_trailing() {
        let unexpected = vec!["--publish needs a token break"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_upgrade_eof() {
        let unexpected = vec!["--upgrade"]; // needs a next token
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_random_chars() {
        let unexpected = vec!["4 * 5"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }
}
521}