1use super::{
6 error::{Span, Spanned},
7 token::{Lexeme, Token as T},
8};
9use crate::sp;
10
/// Streaming lexer over a sequence of pre-split tokens (e.g. shell words).
pub struct Lexer<'l, I: Iterator<Item = &'l str>> {
    /// Unconsumed remainder of the token currently being lexed.
    pub buf: &'l str,
    /// Tokens not yet pulled into `buf`.
    pub tokens: I,
    /// Absolute offset used to build spans; advances by one extra unit when
    /// hopping between tokens, treating the gap as an implicit one-character
    /// separator (see `eat_whitespace` / `eat_token`).
    pub offset: usize,
    /// Terminal lexeme (Eof or an error). Once set, `next()` yields it forever.
    pub done: Option<Spanned<Lexeme<'l>>>,
}
17
18impl<'l, I: Iterator<Item = &'l str>> Lexer<'l, I> {
19 pub fn new(mut tokens: I) -> Option<Self> {
20 Some(Self {
21 buf: tokens.next()?,
22 tokens,
23 offset: 0,
24 done: None,
25 })
26 }
27
28 fn next_char_boundary(&self) -> Option<(usize, char)> {
31 let mut chars = self.buf.char_indices();
32 let (_, c) = chars.next()?;
33 let ix = chars.next().map_or(self.buf.len(), |(ix, _)| ix);
34 Some((ix, c))
35 }
36
37 fn eat_whitespace(&mut self) {
40 loop {
41 if let Some((ix, c)) = self.next_char_boundary() {
42 if c.is_whitespace() {
43 self.buf = &self.buf[ix..];
44 self.offset += ix;
45 } else {
46 break;
47 }
48 } else if let Some(next) = self.tokens.next() {
49 self.offset += 1; self.buf = next;
51 } else {
52 break;
53 };
54 }
55 }
56
57 fn eat_prefix(&mut self, patt: &str) -> Option<Spanned<&'l str>> {
61 let start = self.offset;
62
63 let rest = self.buf.strip_prefix(patt)?;
64
65 let len = self.buf.len() - rest.len();
66 let value = &self.buf[..len];
67 self.offset += len;
68 self.buf = rest;
69
70 let span = Span {
71 start,
72 end: self.offset,
73 };
74 Some(Spanned { span, value })
75 }
76
77 fn eat_while(&mut self, pred: impl FnMut(char) -> bool) -> Option<Spanned<&'l str>> {
82 let start = self.offset;
83
84 let rest = self.buf.trim_start_matches(pred);
85 if self.buf == rest {
86 return None;
87 };
88
89 let len = self.buf.len() - rest.len();
90 let value = &self.buf[..len];
91 self.offset += len;
92 self.buf = rest;
93
94 let span = Span {
95 start,
96 end: self.offset,
97 };
98 Some(Spanned { span, value })
99 }
100
101 fn eat_token(&mut self) -> Option<Spanned<&'l str>> {
104 debug_assert!(self.buf.is_empty());
105 let start = self.offset + 1;
106 let value = self.tokens.next()?;
107 self.offset += value.len() + 1;
108
109 let span = Span {
110 start,
111 end: self.offset,
112 };
113 Some(Spanned { span, value })
114 }
115
116 fn peek(&self) -> Option<Spanned<&'l str>> {
119 let start = self.offset;
120 let ix = self.next_char_boundary()?.0;
121
122 let value = &self.buf[..ix];
123 let span = Span {
124 start,
125 end: start + ix,
126 };
127 Some(Spanned { span, value })
128 }
129
130 fn bump(&mut self) {
133 if let Some((ix, _)) = self.next_char_boundary() {
134 self.buf = &self.buf[ix..];
135 self.offset += ix;
136 }
137 }
138
139 fn string(&mut self, start: Spanned<&'l str>) -> Spanned<Lexeme<'l>> {
147 self.bump();
148 let sp!(sp, quote) = start;
149
150 let mut escaped = false;
151 let content = self
152 .eat_while(|c| {
153 if escaped {
154 escaped = false;
155 true
156 } else if c == '\\' {
157 escaped = true;
158 true
159 } else {
160 !quote.starts_with(c)
161 }
162 })
163 .unwrap_or(Spanned {
164 span: Span {
165 start: sp.end,
166 end: sp.end,
167 },
168 value: "",
169 })
170 .widen(start);
171
172 let Some(end) = self.eat_prefix(quote) else {
173 let error = content.map(|src| Lexeme(T::UnfinishedString, src));
174 self.done = Some(error);
175 return error;
176 };
177
178 content.widen(end).map(|src| Lexeme(T::String, src))
179 }
180
181 fn unexpected(&mut self, c: Spanned<&'l str>) -> Spanned<Lexeme<'l>> {
184 let error = c.map(|src| Lexeme(T::Unexpected, src));
185 self.done = Some(error);
186 error
187 }
188
189 fn done(&mut self, token: T) -> Spanned<Lexeme<'l>> {
192 let error = self.offset().wrap(Lexeme(token, ""));
193 self.done = Some(error);
194 error
195 }
196
197 fn offset(&self) -> Span {
199 Span {
200 start: self.offset,
201 end: self.offset,
202 }
203 }
204}
205
impl<'l, I: Iterator<Item = &'l str>> Iterator for Lexer<'l, I> {
    type Item = Spanned<Lexeme<'l>>;

    /// Produces the next lexeme. Once a terminal lexeme has been recorded in
    /// `self.done` (Eof or an error), it is yielded on every subsequent call.
    fn next(&mut self) -> Option<Self::Item> {
        // Latch on the terminal lexeme, if any.
        if let Some(done) = self.done {
            return Some(done);
        }

        self.eat_whitespace();

        // No chars left anywhere: report (and latch) Eof.
        let Some(c) = self.peek() else {
            return Some(self.done(T::Eof));
        };

        // Consume the single peeked char `c` and wrap it as token `$t`.
        macro_rules! token {
            ($t:expr) => {{
                self.bump();
                c.map(|src| Lexeme($t, src))
            }};
        }

        Some(match c {
            sp!(_, ",") => token!(T::Comma),
            sp!(_, "[") => token!(T::LBracket),
            sp!(_, "]") => token!(T::RBracket),
            sp!(_, "(") => token!(T::LParen),
            sp!(_, ")") => token!(T::RParen),
            sp!(_, "<") => token!(T::LAngle),
            sp!(_, ">") => token!(T::RAngle),
            sp!(_, "@") => token!(T::At),
            sp!(_, ".") => token!(T::Dot),

            // `string` consumes the opening quote itself; `c` is still unbumped.
            sp!(_, "'" | "\"") => self.string(c),

            // Only "::" is valid; a lone ':' is an error.
            sp!(_, ":") => 'colon: {
                let Some(sp) = self.eat_prefix("::") else {
                    break 'colon self.unexpected(c);
                };

                sp.map(|src| Lexeme(T::ColonColon, src))
            }

            sp!(_, c) if c.chars().next().is_some_and(is_ident_start) => {
                let Some(ident) = self.eat_while(is_ident_continue) else {
                    unreachable!("is_ident_start implies is_ident_continue");
                };

                ident.map(|src| Lexeme(T::Ident, src))
            }

            // '0' may start a hex literal ("0x…"); otherwise fall back to a
            // plain number token for the single '0'.
            sp!(_, "0") => 'zero: {
                let Some(prefix) = self.eat_prefix("0x") else {
                    break 'zero token!(T::Number);
                };

                // "0x" with no hex digits after it is an error.
                let Some(digits) = self.eat_while(is_hex_continue) else {
                    break 'zero self.unexpected(prefix);
                };

                digits.widen(prefix).map(|src| Lexeme(T::HexNumber, src))
            }

            sp!(_, n) if n.chars().next().is_some_and(is_number_start) => {
                let Some(num) = self.eat_while(is_number_continue) else {
                    unreachable!("is_number_start implies is_number_continue");
                };

                num.map(|src| Lexeme(T::Number, src))
            }

            // '-' starts either a short flag ("-x") or a long command ("--cmd").
            sp!(_, "-") => 'command: {
                self.bump();
                // A bare trailing '-' is an error.
                let Some(next) = self.peek() else {
                    break 'command self.unexpected(c);
                };

                match next {
                    sp!(_, "-") => {
                        // Second '-': long command, handled below.
                        self.bump();
                    }
                    sp!(_, flag) if is_flag(flag.chars().next().unwrap()) => {
                        self.bump();
                        break 'command next.widen(c).map(|src| Lexeme(T::Flag, src));
                    }
                    sp!(_, _) => break 'command self.unexpected(next),
                }

                // "--" with no identifier after it is an error.
                let Some(ident) = self.eat_while(is_ident_continue) else {
                    break 'command self.unexpected(c);
                };

                match ident {
                    // "--publish" must end its token; its argument is the
                    // entire next token (e.g. a file path).
                    sp!(_, "publish") => {
                        if let Some(next) = self.peek() {
                            break 'command self.unexpected(next);
                        }

                        let Some(file) = self.eat_token() else {
                            break 'command self.done(T::EarlyEof);
                        };

                        file.widen(c).map(|src| Lexeme(T::Publish, src))
                    }

                    // "--upgrade" mirrors "--publish".
                    sp!(_, "upgrade") => {
                        if let Some(next) = self.peek() {
                            break 'command self.unexpected(next);
                        }

                        let Some(file) = self.eat_token() else {
                            break 'command self.done(T::EarlyEof);
                        };

                        file.widen(c).map(|src| Lexeme(T::Upgrade, src))
                    }

                    sp!(_, _) => ident.widen(c).map(|src| Lexeme(T::Command, src)),
                }
            }

            sp!(_, _) => self.unexpected(c),
        })
    }
}
332
/// A short flag char is an ASCII letter or digit (e.g. `-h`, `-1`).
fn is_flag(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z')
}
336
/// Identifiers start with an ASCII letter or underscore.
fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
340
/// Identifiers continue with ASCII alphanumerics, underscores, or dashes.
fn is_ident_continue(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-')
}
344
/// Numbers start with an ASCII decimal digit.
fn is_number_start(c: char) -> bool {
    matches!(c, '0'..='9')
}
348
/// Numbers continue with ASCII decimal digits or `_` separators.
fn is_number_continue(c: char) -> bool {
    matches!(c, '0'..='9' | '_')
}
352
/// Hex numbers continue with ASCII hex digits (either case) or `_` separators.
fn is_hex_continue(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' | '_')
}
356
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes every input token and collects lexemes up to and including the
    /// first terminal one (Eof or an error).
    fn lex(input: Vec<&str>) -> Vec<Spanned<Lexeme>> {
        let mut lexer = Lexer::new(input.into_iter()).unwrap();
        let mut lexemes: Vec<_> = (&mut lexer)
            .take_while(|sp!(_, lex)| !lex.is_terminal())
            .collect();
        // `take_while` drops the terminal lexeme, but `next()` latches on it,
        // so one more call recovers it.
        lexemes.push(lexer.next().unwrap());
        lexemes
    }

    #[test]
    fn tokenize_vector() {
        let vecs = vec![
            "vector[1,2,3]",
            "vector[1, 2, 3]",
            "vector[]",
            "vector[1]",
            "vector[1,]",
        ];

        insta::assert_debug_snapshot!(lex(vecs));
    }

    #[test]
    fn tokenize_array() {
        let arrays = vec!["[1,2,3]", "[1, 2, 3]", "[]", "[1]", "[1,]"];
        insta::assert_debug_snapshot!(lex(arrays));
    }

    #[test]
    fn tokenize_num() {
        let nums = vec![
            "1",
            "1_000",
            "100_000_000",
            "100_000u64",
            "1u8",
            "1_u128",
            "0x1",
            "0x1_000",
            "0x100_000_000",
            "0x100_000u64",
            "0x1u8",
            "0x1_u128",
        ];

        insta::assert_debug_snapshot!(lex(nums));
    }

    #[test]
    fn tokenize_address() {
        let addrs = vec![
            "@0x1",
            "@0x1_000",
            "@0x100_000_000",
            "@0x100_000u64",
            "@0x1u8",
            "@0x1_u128",
        ];

        insta::assert_debug_snapshot!(lex(addrs));
    }

    #[test]
    fn tokenize_commands() {
        let cmds = vec!["--f00", "--Bar_baz", "--qux-quy"];

        insta::assert_debug_snapshot!(lex(cmds));
    }

    #[test]
    fn tokenize_flags() {
        let flags = vec!["-h", "-a", "-Z", "-1"];

        insta::assert_debug_snapshot!(lex(flags));
    }

    #[test]
    fn tokenize_args() {
        let args = vec![
            "@0x1 1 1u8 1_u128 1_000 100_000_000 100_000u64 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] \
             vector[] vector[1,2,3] vector[1]",
            "some(@0x1) none some(vector[1,2,3]) --assign --transfer-objects --split-coins \
             --merge-coins --make-move-vec --move-call --preview --warn-shadows --pick-gas-budget \
             --gas-budget --summary",
            "--publish",
            "package-a",
            "--upgrade",
            "package-b",
        ];

        insta::assert_debug_snapshot!(lex(args));
    }

    #[test]
    fn dotted_idents() {
        let idents = vec!["a", "a.b", "a.b.c", "a.b.c.d", "a.b.c.d.e"];
        insta::assert_debug_snapshot!(lex(idents));
    }

    #[test]
    fn gas() {
        let gas = vec!["gas"];
        insta::assert_debug_snapshot!(lex(gas));
    }

    #[test]
    fn functions() {
        let funs = vec![
            "0x2::transfer::public_transfer<0x42::foo::Bar>",
            "std::option::is_none<u64>",
            "0x1::option::is_some <u64>",
            "0x1::option::is_none",
            "<u64>",
        ];

        insta::assert_debug_snapshot!(lex(funs));
    }

    #[test]
    fn unexpected_colon() {
        let unexpected = vec!["hello: world"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_0x() {
        let unexpected = vec!["0x forgot my train of thought"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_dash() {
        let unexpected = vec!["-"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_dash_dash() {
        let unexpected = vec!["--"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_publish_trailing() {
        let unexpected = vec!["--publish needs a token break"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_upgrade_eof() {
        let unexpected = vec!["--upgrade"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }

    #[test]
    fn unexpected_random_chars() {
        let unexpected = vec!["4 * 5"];
        insta::assert_debug_snapshot!(lex(unexpected));
    }
}