1use logicaffeine_base::Interner;
38use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
39use crate::token::{BlockType, FocusKind, MeasureKind, Span, Token, TokenType};
40
/// A coarse, line-level token produced before word-level lexing.
///
/// `LineLexer` reports indentation structure Python-style: one `Indent` when a
/// line is deeper than the previous level, one `Dedent` per level closed, and
/// `Content` for each non-blank line's trimmed text with its byte range.
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    /// Indentation increased relative to the top of the indent stack.
    Indent,
    /// One indentation level was closed.
    Dedent,
    /// A line break (not currently emitted by `LineLexer` itself).
    Newline,
    /// A non-blank line: trailing-trimmed text plus byte offsets into the source.
    Content { text: String, start: usize, end: usize },
}

/// Iterator over [`LineToken`]s for a source string.
///
/// Tabs count as 4 columns, spaces as 1. Blank (whitespace-only) lines are
/// skipped entirely and never affect indentation.
pub struct LineLexer<'a> {
    source: &'a str,
    bytes: &'a [u8],
    /// Stack of open indentation widths; always starts with a base level of 0.
    indent_stack: Vec<usize>,
    /// Dedents queued for emission, one per call to `next`.
    pending_dedents: usize,
    /// Byte offset of the next unread line.
    position: usize,
    /// A content line read ahead of its emission: (text, start, end).
    pending_content: Option<(String, usize, usize)>,
    /// Set once the end of input has been reached.
    finished_lines: bool,
}

impl<'a> LineLexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            bytes: source.as_bytes(),
            indent_stack: vec![0],
            pending_dedents: 0,
            position: 0,
            pending_content: None,
            finished_lines: false,
        }
    }

    /// Returns `(indent_width, offset_of_first_content_byte)` for the line
    /// beginning at `line_start`. Spaces count 1, tabs count 4.
    fn measure_indent(&self, line_start: usize) -> (usize, usize) {
        let mut width = 0;
        let mut pos = line_start;
        loop {
            match self.bytes.get(pos).copied() {
                Some(b' ') => width += 1,
                Some(b'\t') => width += 4,
                _ => return (width, pos),
            }
            pos += 1;
        }
    }

    /// Reads one line starting at `content_start`.
    ///
    /// Returns `(trimmed_text, start, end, next_line_start)`, where `end` is
    /// the pre-trim offset of the newline (or end of input) and
    /// `next_line_start` points just past the newline when one exists.
    fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
        let rest = &self.bytes[content_start..];
        let content_end =
            content_start + rest.iter().position(|&b| b == b'\n').unwrap_or(rest.len());
        let text = self.source[content_start..content_end].trim_end().to_string();
        // Skip the newline byte when present; at end of input stay put.
        let next_line_start = if content_end < self.bytes.len() {
            content_end + 1
        } else {
            content_end
        };
        (text, content_start, content_end, next_line_start)
    }

    /// True when the line starting at `line_start` holds only spaces/tabs
    /// (up to a newline or end of input).
    fn is_blank_line(&self, line_start: usize) -> bool {
        self.bytes[line_start..]
            .iter()
            .find(|&&b| b != b' ' && b != b'\t')
            .map_or(true, |&b| b == b'\n')
    }

    /// Advances to the next significant line, updating the indent stack.
    ///
    /// Returns `true` when there is something to emit (queued content and/or
    /// dedents); `false` only at clean end of input with nothing pending.
    fn process_next_line(&mut self) -> bool {
        loop {
            // Skip fully blank lines; they never open or close a level.
            while self.position < self.bytes.len() && self.is_blank_line(self.position) {
                while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
                    self.position += 1;
                }
                if self.position < self.bytes.len() {
                    self.position += 1;
                }
            }

            if self.position >= self.bytes.len() {
                // End of input: close every still-open level.
                self.finished_lines = true;
                if self.indent_stack.len() > 1 {
                    self.pending_dedents = self.indent_stack.len() - 1;
                    self.indent_stack.truncate(1);
                }
                return self.pending_dedents > 0;
            }

            let (line_indent, content_start) = self.measure_indent(self.position);
            let (text, start, end, next_pos) = self.read_line_content(content_start);

            if text.is_empty() {
                // Trimmed to nothing (e.g. a lone '\r'); treat as blank.
                self.position = next_pos;
                continue;
            }

            let current_indent = *self.indent_stack.last().unwrap();
            if line_indent > current_indent {
                self.indent_stack.push(line_indent);
            } else if line_indent < current_indent {
                // Pop every level deeper than this line, queueing a dedent each.
                while self.indent_stack.len() > 1
                    && line_indent < *self.indent_stack.last().unwrap()
                {
                    self.indent_stack.pop();
                    self.pending_dedents += 1;
                }
            }

            self.pending_content = Some((text, start, end));
            self.position = next_pos;
            return true;
        }
    }
}

impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    fn next(&mut self) -> Option<LineToken> {
        // Drain queued dedents first, then any buffered content line.
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }
        if let Some((text, start, end)) = self.pending_content.take() {
            return Some(LineToken::Content { text, start, end });
        }

        if !self.finished_lines {
            let depth_before = self.indent_stack.len();
            // An indent is emitted immediately; dedents/content queued by
            // process_next_line are drained below and on subsequent calls.
            if self.process_next_line() && self.indent_stack.len() > depth_before {
                return Some(LineToken::Indent);
            }
        }

        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }
        if let Some((text, start, end)) = self.pending_content.take() {
            return Some(LineToken::Content { text, start, end });
        }
        None
    }
}
297
/// Classification mode for the word lexer.
///
/// Block headers switch the mode (see `classify_with_lookahead`): `##Main` and
/// `##To …` function blocks select [`LexerMode::Imperative`]; every other block
/// selects [`LexerMode::Declarative`], which is also the starting mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    /// Declarative/logical prose (the default).
    #[default]
    Declarative,
    /// Imperative program text; enables command keywords like `set`, `listen`.
    Imperative,
}
308
/// Word-level lexer: turns pre-split words into classified [`Token`]s.
pub struct Lexer<'a> {
    // Words (with trailing-punctuation info) produced by `split_into_words`.
    words: Vec<WordItem>,
    // Index of the next entry in `words` to consume.
    pos: usize,
    // Static word lists used for classification (verbs, nouns, ...).
    lexicon: Lexicon,
    // Shared string interner; all lexemes become interned symbols.
    interner: &'a mut Interner,
    // Byte length of the original input; used for the EOF token's span.
    input_len: usize,
    // True between a `let` keyword and the following `be`/sentence end;
    // cleared when a '.' or '?' is tokenized.
    in_let_context: bool,
    // Current declarative/imperative mode, switched by block headers.
    mode: LexerMode,
    // Owned copy of the input, used by the indentation post-pass.
    source: String,
}
319
/// One pre-split word plus at most one trailing punctuation mark.
struct WordItem {
    // The word text; empty when the item carries only punctuation.
    // String/char literals are smuggled as "\x00STR:…" / "\x00CHAR:…".
    word: String,
    // Punctuation character immediately following the word, if any.
    trailing_punct: Option<char>,
    // Byte offset where the word starts in the input.
    start: usize,
    // Byte offset just past the word (exclusive).
    end: usize,
    // Byte offset of `trailing_punct` in the input, when present.
    punct_pos: Option<usize>,
}
327
328impl<'a> Lexer<'a> {
329 pub fn new(input: &str, interner: &'a mut Interner) -> Self {
353 let words = Self::split_into_words(input);
354 let input_len = input.len();
355
356 Lexer {
357 words,
358 pos: 0,
359 lexicon: Lexicon::new(),
360 interner,
361 input_len,
362 in_let_context: false,
363 mode: LexerMode::Declarative,
364 source: input.to_string(),
365 }
366 }
367
    /// Splits raw input into [`WordItem`]s, the units the classifier works on.
    ///
    /// Handles in a single pass:
    /// * whitespace-separated words with an optional trailing punctuation mark,
    /// * decimal points inside numbers (`3.14` stays one word),
    /// * `##Name` block headers and `#` line comments,
    /// * `"…"` string and `` `c` `` char literals (escapes resolved here; the
    ///   payload is smuggled in a `\x00STR:` / `\x00CHAR:` prefixed word),
    /// * two-char operators `->`, `<=`, `>=`, `==`, `!=`,
    /// * single-char punctuation/operators,
    /// * apostrophe contractions (`don't` -> `do not`, `won't` -> `will not`,
    ///   `can't` -> `cannot`) and the possessive apostrophe.
    ///
    /// NOTE(review): `char_idx`, `j` and `look_ahead` are *char* indices while
    /// `i`, `start`, `end` are *byte* offsets; several arms mix the two (e.g.
    /// `end: header_start + (j - char_idx)`), which is only consistent for
    /// ASCII input — confirm that is the intended input domain.
    fn split_into_words(input: &str) -> Vec<WordItem> {
        let mut items = Vec::new();
        let mut current_word = String::new();
        let mut word_start = 0;
        // Parallel char view used for lookahead; `char_idx` tracks the char
        // position corresponding to byte offset `i` in the loop below.
        let chars: Vec<char> = input.chars().collect();
        let mut char_idx = 0;
        // Number of chars already consumed by a lookahead arm; the main loop
        // skips exactly that many iterations.
        let mut skip_count = 0;

        for (i, c) in input.char_indices() {
            if skip_count > 0 {
                skip_count -= 1;
                char_idx += 1;
                continue;
            }
            let next_pos = i + c.len_utf8();
            match c {
                // Whitespace terminates the current word.
                ' ' | '\t' | '\n' | '\r' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    word_start = next_pos;
                }
                // A dot between two digits stays inside the number; any other
                // dot is punctuation attached to the preceding word (or alone).
                '.' => {
                    let prev_is_digit = !current_word.is_empty()
                        && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
                    let next_is_digit = char_idx + 1 < chars.len()
                        && chars[char_idx + 1].is_ascii_digit();

                    if prev_is_digit && next_is_digit {
                        current_word.push(c);
                    } else {
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: Some(c),
                                start: word_start,
                                end: i,
                                punct_pos: Some(i),
                            });
                        } else {
                            // Bare dot: empty word carrying only punctuation.
                            items.push(WordItem {
                                word: String::new(),
                                trailing_punct: Some(c),
                                start: i,
                                end: next_pos,
                                punct_pos: Some(i),
                            });
                        }
                        word_start = next_pos;
                    }
                }
                '#' => {
                    if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
                        // `##Name` block header: flush the current word, then
                        // collect `##` + the alphabetic name as one item.
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                        }
                        let header_start = i;
                        let mut j = char_idx + 2;
                        // Allow spaces/tabs between `##` and the name.
                        while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
                            j += 1;
                        }
                        let mut block_word = String::from("##");
                        while j < chars.len() && chars[j].is_alphabetic() {
                            block_word.push(chars[j]);
                            j += 1;
                        }
                        // Only emit when a name actually followed the `##`.
                        if block_word.len() > 2 {
                            items.push(WordItem {
                                word: block_word,
                                trailing_punct: None,
                                start: header_start,
                                end: header_start + (j - char_idx),
                                punct_pos: None,
                            });
                        }
                        skip_count = j - char_idx - 1;
                        word_start = header_start + (j - char_idx);
                    } else {
                        // Single `#`: line comment — consume to end of line.
                        let mut look_ahead = char_idx + 1;
                        while look_ahead < chars.len() && chars[look_ahead] != '\n' {
                            skip_count += 1;
                            look_ahead += 1;
                        }
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                        }
                        word_start = look_ahead + 1;
                    }
                }
                '"' => {
                    // String literal. Backslash escapes keep the escaped char
                    // verbatim (no translation here, unlike char literals).
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }

                    let string_start = i;
                    let mut j = char_idx + 1;
                    let mut string_content = String::new();
                    while j < chars.len() && chars[j] != '"' {
                        if chars[j] == '\\' && j + 1 < chars.len() {
                            j += 1;
                            if j < chars.len() {
                                string_content.push(chars[j]);
                            }
                        } else {
                            string_content.push(chars[j]);
                        }
                        j += 1;
                    }

                    // Payload marker consumed later by `tokenize`.
                    items.push(WordItem {
                        word: format!("\x00STR:{}", string_content),
                        trailing_punct: None,
                        start: string_start,
                        end: if j < chars.len() { j + 1 } else { j },
                        punct_pos: None,
                    });

                    // Skip past the closing quote when there was one
                    // (unterminated strings run to end of input).
                    if j < chars.len() {
                        skip_count = j - char_idx;
                    } else {
                        skip_count = j - char_idx - 1;
                    }
                    word_start = if j < chars.len() { j + 1 } else { j };
                }
                '`' => {
                    // Char literal: at most one (possibly escaped) character
                    // between backticks; escapes are translated here.
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }

                    let char_start = i;
                    let mut j = char_idx + 1;
                    let mut char_content = String::new();

                    if j < chars.len() {
                        if chars[j] == '\\' && j + 1 < chars.len() {
                            j += 1;
                            let escaped_char = match chars[j] {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '\\' => '\\',
                                '`' => '`',
                                '0' => '\0',
                                c => c,
                            };
                            char_content.push(escaped_char);
                            j += 1;
                        } else if chars[j] != '`' {
                            char_content.push(chars[j]);
                            j += 1;
                        }
                    }

                    // Consume the closing backtick when present.
                    if j < chars.len() && chars[j] == '`' {
                        j += 1;
                    }

                    items.push(WordItem {
                        word: format!("\x00CHAR:{}", char_content),
                        trailing_punct: None,
                        start: char_start,
                        end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
                        punct_pos: None,
                    });

                    if j > char_idx + 1 {
                        skip_count = j - char_idx - 1;
                    }
                    word_start = char_start + (j - char_idx);
                }
                // Two-character operators, each flushing any pending word.
                '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "->".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "<=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: ">=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "==".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "!=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                // Single-character punctuation/operators: attach to the word
                // just finished, or stand alone as an empty-word item.
                '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: Some(c),
                            start: word_start,
                            end: i,
                            punct_pos: Some(i),
                        });
                    } else {
                        items.push(WordItem {
                            word: String::new(),
                            trailing_punct: Some(c),
                            start: i,
                            end: next_pos,
                            punct_pos: Some(i),
                        });
                    }
                    word_start = next_pos;
                }
                '\'' => {
                    // Apostrophe: detect "n't"-style contractions, otherwise
                    // treat as possessive/quote punctuation on the word.
                    let remaining: String = chars[char_idx + 1..].iter().collect();
                    let remaining_lower = remaining.to_lowercase();

                    // True when the apostrophe is followed by a lone `t`
                    // (i.e. a contraction like don't / won't / can't).
                    if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
                        remaining_lower.starts_with("t,") || remaining_lower == "t" ||
                        (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
                            (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
                        let word_lower = current_word.to_lowercase();
                        if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
                            // Expand to "<base> not".
                            let base = if word_lower == "don" { "do" }
                                else if word_lower == "doesn" { "does" }
                                else { "did" };
                            items.push(WordItem {
                                word: base.to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                            items.push(WordItem {
                                word: "not".to_string(),
                                trailing_punct: None,
                                start: i,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else if word_lower == "won" {
                            // won't -> "will not".
                            items.push(WordItem {
                                word: "will".to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                            items.push(WordItem {
                                word: "not".to_string(),
                                trailing_punct: None,
                                start: i,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else if word_lower == "can" {
                            // can't -> single word "cannot".
                            items.push(WordItem {
                                word: "cannot".to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else {
                            // Unknown "…n't"-lookalike: plain apostrophe punct.
                            if !current_word.is_empty() {
                                items.push(WordItem {
                                    word: std::mem::take(&mut current_word),
                                    trailing_punct: Some('\''),
                                    start: word_start,
                                    end: i,
                                    punct_pos: Some(i),
                                });
                            }
                            word_start = next_pos;
                        }
                    } else {
                        // Possessive or quote: apostrophe rides as punct.
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: Some('\''),
                                start: word_start,
                                end: i,
                                punct_pos: Some(i),
                            });
                        }
                        word_start = next_pos;
                    }
                }
                // Word characters accumulate. NOTE(review): the `c == '.'`
                // sub-condition is unreachable here — '.' is caught by its own
                // arm above.
                c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
                    if current_word.is_empty() {
                        word_start = i;
                    }
                    current_word.push(c);
                }
                // Anything else is silently dropped.
                _ => {
                    word_start = next_pos;
                }
            }
            char_idx += 1;
        }

        // Flush a word left open at end of input.
        if !current_word.is_empty() {
            items.push(WordItem {
                word: current_word,
                trailing_punct: None,
                start: word_start,
                end: input.len(),
                punct_pos: None,
            });
        }

        items
    }
836
837 fn peek_word(&self, offset: usize) -> Option<&str> {
838 self.words.get(self.pos + offset).map(|w| w.word.as_str())
839 }
840
841 fn peek_sequence(&self, expected: &[&str]) -> bool {
842 for (i, &exp) in expected.iter().enumerate() {
843 match self.peek_word(i + 1) {
844 Some(w) if w.to_lowercase() == exp => continue,
845 _ => return false,
846 }
847 }
848 true
849 }
850
    /// Advances the cursor past `count` extra words; used after a multi-word
    /// lookahead (e.g. "at least two") matched.
    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }
854
    /// Converts the pre-split words into the final token stream.
    ///
    /// Walks `self.words`, classifying each word (with lookahead), expanding
    /// trailing punctuation into its own token, merging `'` + `s` into a
    /// single [`TokenType::Possessive`], and unpacking the `\x00STR:` /
    /// `\x00CHAR:` literal markers produced by `split_into_words`. Appends an
    /// EOF token at `input_len`, then runs the indentation post-pass.
    ///
    /// NOTE(review): the punctuation -> TokenType mapping appears three times
    /// in this function; a shared helper would remove the duplication, but the
    /// `'.' | '?'` arms differ (two of them clear `in_let_context`).
    pub fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while self.pos < self.words.len() {
            let item = &self.words[self.pos];
            let word = item.word.clone();
            let trailing_punct = item.trailing_punct;
            let word_start = item.start;
            let word_end = item.end;
            let punct_pos = item.punct_pos;

            // Punctuation-only item (empty word).
            if word.is_empty() {
                if let Some(punct) = trailing_punct {
                    let kind = match punct {
                        '(' => TokenType::LParen,
                        ')' => TokenType::RParen,
                        '[' => TokenType::LBracket,
                        ']' => TokenType::RBracket,
                        ',' => TokenType::Comma,
                        ':' => TokenType::Colon,
                        '.' | '?' => {
                            // Sentence end closes any open `let` context.
                            self.in_let_context = false;
                            TokenType::Period
                        }
                        '!' => TokenType::Exclamation,
                        '+' => TokenType::Plus,
                        '-' => TokenType::Minus,
                        '*' => TokenType::Star,
                        '/' => TokenType::Slash,
                        '%' => TokenType::Percent,
                        '<' => TokenType::Lt,
                        '>' => TokenType::Gt,
                        // Unknown punctuation is dropped.
                        _ => {
                            self.pos += 1;
                            continue;
                        }
                    };
                    let lexeme = self.interner.intern(&punct.to_string());
                    let span = Span::new(word_start, word_end);
                    tokens.push(Token::new(kind, lexeme, span));
                }
                self.pos += 1;
                continue;
            }

            // String literal smuggled by split_into_words ("\x00STR:" is 5 chars).
            if word.starts_with("\x00STR:") {
                let content = &word[5..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            // Char literal marker ("\x00CHAR:" is 6 chars).
            if word.starts_with("\x00CHAR:") {
                let content = &word[6..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            // Normal word: classify (possibly consuming lookahead words).
            let kind = self.classify_with_lookahead(&word);
            let lexeme = self.interner.intern(&word);
            let span = Span::new(word_start, word_end);
            tokens.push(Token::new(kind, lexeme, span));

            if let Some(punct) = trailing_punct {
                if punct == '\'' {
                    // `word's` -> word + Possessive (+ any punct after the s).
                    if let Some(next_item) = self.words.get(self.pos + 1) {
                        if next_item.word.to_lowercase() == "s" {
                            let poss_lexeme = self.interner.intern("'s");
                            let poss_start = punct_pos.unwrap_or(word_end);
                            let poss_end = next_item.end;
                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
                            self.pos += 1;
                            if let Some(s_punct) = next_item.trailing_punct {
                                let kind = match s_punct {
                                    '(' => TokenType::LParen,
                                    ')' => TokenType::RParen,
                                    '[' => TokenType::LBracket,
                                    ']' => TokenType::RBracket,
                                    ',' => TokenType::Comma,
                                    ':' => TokenType::Colon,
                                    // Does NOT clear in_let_context here.
                                    '.' | '?' => TokenType::Period,
                                    '!' => TokenType::Exclamation,
                                    '+' => TokenType::Plus,
                                    '-' => TokenType::Minus,
                                    '*' => TokenType::Star,
                                    '/' => TokenType::Slash,
                                    '%' => TokenType::Percent,
                                    '<' => TokenType::Lt,
                                    '>' => TokenType::Gt,
                                    _ => {
                                        self.pos += 1;
                                        continue;
                                    }
                                };
                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
                                let lexeme = self.interner.intern(&s_punct.to_string());
                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
                            }
                            self.pos += 1;
                            continue;
                        }
                    }
                    // Lone apostrophe (no following `s`): dropped.
                    self.pos += 1;
                    continue;
                }

                // Ordinary trailing punctuation after a word.
                let kind = match punct {
                    '(' => TokenType::LParen,
                    ')' => TokenType::RParen,
                    '[' => TokenType::LBracket,
                    ']' => TokenType::RBracket,
                    ',' => TokenType::Comma,
                    ':' => TokenType::Colon,
                    '.' | '?' => {
                        self.in_let_context = false;
                        TokenType::Period
                    }
                    '!' => TokenType::Exclamation,
                    '+' => TokenType::Plus,
                    '-' => TokenType::Minus,
                    '*' => TokenType::Star,
                    '/' => TokenType::Slash,
                    '%' => TokenType::Percent,
                    '<' => TokenType::Lt,
                    '>' => TokenType::Gt,
                    _ => {
                        self.pos += 1;
                        continue;
                    }
                };
                let p_start = punct_pos.unwrap_or(word_end);
                let lexeme = self.interner.intern(&punct.to_string());
                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
            }

            self.pos += 1;
        }

        // Synthetic EOF at the end of the input.
        let eof_lexeme = self.interner.intern("");
        let eof_span = Span::new(self.input_len, self.input_len);
        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));

        self.insert_indentation_tokens(tokens)
    }
1016
    /// Post-pass: weaves Indent/Dedent tokens (from a fresh [`LineLexer`] run
    /// over the source) into the word-token stream by byte position.
    ///
    /// Indent tokens that follow a line-ending `:` are anchored at that colon
    /// so the parser can associate the block with its header. The EOF token is
    /// moved back to the very end after insertion.
    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
        let mut result = Vec::new();
        let empty_sym = self.interner.intern("");

        let line_lexer = LineLexer::new(&self.source);
        let line_tokens: Vec<LineToken> = line_lexer.collect();

        // (byte position, is_indent) pairs. Indents/dedents are batched until
        // the next Content line so they anchor at that line's start.
        let mut structural_events: Vec<(usize, bool)> = Vec::new();
        let mut pending_indents = 0usize;
        let mut pending_dedents = 0usize;

        for line_token in &line_tokens {
            match line_token {
                LineToken::Indent => {
                    pending_indents += 1;
                }
                LineToken::Dedent => {
                    pending_dedents += 1;
                }
                LineToken::Content { start, .. } => {
                    // Dedents are flushed before indents at the same anchor.
                    for _ in 0..pending_dedents {
                        structural_events.push((*start, false));
                    }
                    pending_dedents = 0;

                    for _ in 0..pending_indents {
                        structural_events.push((*start, true));
                    }
                    pending_indents = 0;
                }
                LineToken::Newline => {}
            }
        }

        // Dedents left at end of input anchor at the EOF position.
        for _ in 0..pending_dedents {
            structural_events.push((self.input_len, false));
        }

        // Sort by position; at equal positions dedents (false) sort before
        // indents (true), closing inner blocks before opening new ones.
        structural_events.sort_by(|a, b| {
            if a.0 != b.0 {
                a.0.cmp(&b.0)
            } else {
                a.1.cmp(&b.1)
            }
        });

        let mut event_idx = 0;
        // Position of the most recent block-introducing colon (a ':' that ends
        // its line); used as the span anchor for the following Indent.
        let mut last_colon_pos: Option<usize> = None;

        for token in tokens.iter() {
            let token_start = token.span.start;

            // Emit every structural event positioned at or before this token.
            while event_idx < structural_events.len() {
                let (event_pos, is_indent) = structural_events[event_idx];

                if event_pos <= token_start {
                    let span = if is_indent {
                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
                    } else {
                        Span::new(event_pos, event_pos)
                    };
                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
                    result.push(Token::new(kind, empty_sym, span));
                    event_idx += 1;
                } else {
                    break;
                }
            }

            result.push(token.clone());

            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
                last_colon_pos = Some(token.span.end);
            }
        }

        // Flush events positioned after the last word token.
        while event_idx < structural_events.len() {
            let (event_pos, is_indent) = structural_events[event_idx];
            let span = Span::new(event_pos, event_pos);
            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
            result.push(Token::new(kind, empty_sym, span));
            event_idx += 1;
        }

        // Keep EOF as the final token even if trailing dedents were appended.
        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
        if let Some(pos) = eof_pos {
            let eof = result.remove(pos);
            result.push(eof);
        }

        result
    }
1131
1132 fn is_end_of_line(&self, from_pos: usize) -> bool {
1134 let bytes = self.source.as_bytes();
1135 let mut pos = from_pos;
1136 while pos < bytes.len() {
1137 match bytes[pos] {
1138 b' ' | b'\t' => pos += 1,
1139 b'\n' => return true,
1140 _ => return false,
1141 }
1142 }
1143 true }
1145
1146 fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
1147 let bytes = self.source.as_bytes();
1148 let mut pos = from_pos;
1149
1150 while pos < bytes.len() && bytes[pos] != b'\n' {
1151 pos += 1;
1152 }
1153
1154 if pos >= bytes.len() {
1155 return None;
1156 }
1157
1158 pos += 1;
1159
1160 let mut indent = 0;
1161 while pos < bytes.len() {
1162 match bytes[pos] {
1163 b' ' => indent += 1,
1164 b'\t' => indent += 4,
1165 b'\n' => {
1166 indent = 0;
1167 }
1168 _ => break,
1169 }
1170 pos += 1;
1171 }
1172
1173 if pos >= bytes.len() {
1174 return None;
1175 }
1176
1177 Some(indent)
1178 }
1179
1180 fn word_to_number(word: &str) -> Option<u32> {
1181 lexicon::word_to_number(&word.to_lowercase())
1182 }
1183
1184 fn is_numeric_literal(word: &str) -> bool {
1185 if word.is_empty() {
1186 return false;
1187 }
1188 let chars: Vec<char> = word.chars().collect();
1189 let first = chars[0];
1190 if first.is_ascii_digit() {
1191 return true;
1193 }
1194 if let Some(underscore_pos) = word.rfind('_') {
1197 let before_underscore = &word[..underscore_pos];
1198 let after_underscore = &word[underscore_pos + 1..];
1199 let is_math_symbol = matches!(
1201 before_underscore.to_lowercase().as_str(),
1202 "aleph" | "omega" | "beth"
1203 );
1204 if is_math_symbol
1205 && !after_underscore.is_empty()
1206 && after_underscore.chars().all(|c| c.is_ascii_digit())
1207 {
1208 return true;
1209 }
1210 }
1211 false
1212 }
1213
    /// Classifies one word, peeking ahead (and consuming lookahead words) to
    /// resolve multi-word constructs and ambiguous forms.
    ///
    /// Handles: `##Block` headers (also switching `self.mode`), "each other",
    /// infinitive vs. prepositional "to", "at least/most N", number words,
    /// numeric literals, "if and only if", "is equal/identical to", and the
    /// capitalized "A"/"An" article-vs-proper-name ambiguity. Everything else
    /// falls through to `classify_word`.
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        // Block headers come through from the splitter as "##name".
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,
                "a" | "an" => BlockType::TypeDef,
                "policy" => BlockType::Policy,
                // Unknown headers degrade to Note.
                _ => BlockType::Note,
            };

            // Main and function bodies lex imperatively; all else declarative.
            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        // "each other" -> a single reciprocal token.
        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        // "to <verb>" is the infinitive marker; otherwise a preposition.
        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        // "at least N" / "at most N" cardinality quantifiers.
        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        // Spelled-out numbers ("two") before digit literals.
        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        // "if and only if" -> biconditional.
        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        // "is equal to" / "is identical to" -> identity.
        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        // Capitalized "A"/"An": indefinite article at sentence start vs. a
        // proper name. Decided from the following word(s).
        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                // "A and B", "A implies ..." — connective follows: a name.
                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // "A runs." — an unambiguous finite verb follows: a name.
                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // "A man is ..." — copula two words out: article reading.
                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                // A following content word (or lowercase word) -> article.
                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            // No evidence for an article: treat as a proper name.
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        self.classify_word(word)
    }
1355
1356 fn is_noun_like(&self, word: &str) -> bool {
1357 if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
1358 return true;
1359 }
1360 if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
1361 return true;
1362 }
1363 false
1364 }
1365
    /// True when the lexicon lists `word` as an adjective, either plain or
    /// non-intersective.
    fn is_adjective_like(&self, word: &str) -> bool {
        lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
    }
1369
1370 fn classify_word(&mut self, word: &str) -> TokenType {
1371 let lower = word.to_lowercase();
1372 let first_char = word.chars().next().unwrap();
1373
1374 if lower == "that" {
1377 if let Some(next) = self.peek_word(1) {
1378 let next_lower = next.to_lowercase();
1379 if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
1380 return TokenType::Article(Definiteness::Distal);
1381 }
1382 }
1383 }
1384
1385 if word == "->" {
1387 return TokenType::Arrow;
1388 }
1389
1390 if word == "<=" {
1392 return TokenType::LtEq;
1393 }
1394 if word == ">=" {
1395 return TokenType::GtEq;
1396 }
1397 if word == "==" {
1398 return TokenType::EqEq;
1399 }
1400 if word == "!=" {
1401 return TokenType::NotEq;
1402 }
1403 if word == "<" {
1404 return TokenType::Lt;
1405 }
1406 if word == ">" {
1407 return TokenType::Gt;
1408 }
1409
1410 if let Some(kind) = lexicon::lookup_keyword(&lower) {
1411 return kind;
1412 }
1413
1414 if let Some(kind) = lexicon::lookup_pronoun(&lower) {
1415 return kind;
1416 }
1417
1418 if let Some(def) = lexicon::lookup_article(&lower) {
1419 return TokenType::Article(def);
1420 }
1421
1422 if let Some(time) = lexicon::lookup_auxiliary(&lower) {
1423 return TokenType::Auxiliary(time);
1424 }
1425
1426 match lower.as_str() {
1428 "call" => return TokenType::Call,
1429 "in" if self.mode == LexerMode::Imperative => return TokenType::In,
1430 "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
1432 "at" if self.mode == LexerMode::Imperative => return TokenType::At,
1434 "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
1436 _ => {}
1437 }
1438
1439 if lexicon::is_preposition(&lower) {
1440 let sym = self.interner.intern(&lower);
1441 return TokenType::Preposition(sym);
1442 }
1443
1444 match lower.as_str() {
1445 "equals" => return TokenType::Equals,
1446 "item" => return TokenType::Item,
1447 "items" => return TokenType::Items,
1448 "let" => {
1449 self.in_let_context = true;
1450 return TokenType::Let;
1451 }
1452 "set" => {
1453 if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
1456 } else if self.mode == LexerMode::Imperative {
1458 return TokenType::Set;
1460 } else {
1461 for offset in 2..=5 {
1464 if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
1465 return TokenType::Set;
1466 }
1467 }
1468 }
1469 }
1470 "return" => return TokenType::Return,
1471 "be" if self.in_let_context => {
1472 self.in_let_context = false;
1473 return TokenType::Be;
1474 }
1475 "while" => return TokenType::While,
1476 "assert" => return TokenType::Assert,
1477 "trust" => return TokenType::Trust,
1478 "check" => return TokenType::Check,
1479 "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
1481 "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
1482 "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
1483 "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
1485 "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
1486 "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
1487 "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
1489 "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
1491 "persistent" => return TokenType::Persistent, "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
1493 "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
1497 "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
1498 "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
1499 "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
1500 "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
1501 "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
1502 "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
1503 "native" => return TokenType::Native,
1504 "from" => return TokenType::From,
1505 "otherwise" => return TokenType::Otherwise,
1506 "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
1508 "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
1510 "new" if self.mode == LexerMode::Imperative => return TokenType::New,
1512 "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
1515 "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
1516 "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
1518 "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
1519 "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
1520 "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
1521 "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
1522 "at" if self.mode == LexerMode::Imperative => return TokenType::At,
1523 "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
1525 "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
1526 "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
1527 "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
1528 "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
1529 "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
1531 "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
1532 "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
1533 "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
1534 "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
1535 "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
1537 "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
1538 "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
1539 "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
1541 "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
1542 "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
1543 "file" if self.mode == LexerMode::Imperative => return TokenType::File,
1544 "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
1546 "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
1547 "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
1548 "portable" => return TokenType::Portable,
1550 "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
1552 "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
1553 "shared" => return TokenType::Shared, "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
1556 "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
1557 "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
1559 "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
1560 "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
1561 "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
1562 "tally" => return TokenType::Tally,
1564 "sharedset" => return TokenType::SharedSet,
1565 "sharedsequence" => return TokenType::SharedSequence,
1566 "collaborativesequence" => return TokenType::CollaborativeSequence,
1567 "sharedmap" => return TokenType::SharedMap,
1568 "divergent" => return TokenType::Divergent,
1569 "removewins" => return TokenType::RemoveWins,
1570 "addwins" => return TokenType::AddWins,
1571 "yata" => return TokenType::YATA,
1572 "if" => return TokenType::If,
1573 "only" => return TokenType::Focus(FocusKind::Only),
1574 "even" => return TokenType::Focus(FocusKind::Even),
1575 "just" if self.peek_word(1).map_or(false, |w| {
1576 !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
1577 }) => return TokenType::Focus(FocusKind::Just),
1578 "much" => return TokenType::Measure(MeasureKind::Much),
1579 "little" => return TokenType::Measure(MeasureKind::Little),
1580 _ => {}
1581 }
1582
1583 if lexicon::is_scopal_adverb(&lower) {
1584 let sym = self.interner.intern(&Self::capitalize(&lower));
1585 return TokenType::ScopalAdverb(sym);
1586 }
1587
1588 if lexicon::is_temporal_adverb(&lower) {
1589 let sym = self.interner.intern(&Self::capitalize(&lower));
1590 return TokenType::TemporalAdverb(sym);
1591 }
1592
1593 if lexicon::is_non_intersective(&lower) {
1594 let sym = self.interner.intern(&Self::capitalize(&lower));
1595 return TokenType::NonIntersectiveAdjective(sym);
1596 }
1597
1598 if lexicon::is_adverb(&lower) {
1599 let sym = self.interner.intern(&Self::capitalize(&lower));
1600 return TokenType::Adverb(sym);
1601 }
1602 if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
1603 let sym = self.interner.intern(&Self::capitalize(&lower));
1604 return TokenType::Adverb(sym);
1605 }
1606
1607 if let Some(base) = self.try_parse_superlative(&lower) {
1608 let sym = self.interner.intern(&base);
1609 return TokenType::Superlative(sym);
1610 }
1611
1612 let irregular_comparative = match lower.as_str() {
1614 "less" => Some("Little"),
1615 "more" => Some("Much"),
1616 "better" => Some("Good"),
1617 "worse" => Some("Bad"),
1618 _ => None,
1619 };
1620 if let Some(base) = irregular_comparative {
1621 let sym = self.interner.intern(base);
1622 return TokenType::Comparative(sym);
1623 }
1624
1625 if let Some(base) = self.try_parse_comparative(&lower) {
1626 let sym = self.interner.intern(&base);
1627 return TokenType::Comparative(sym);
1628 }
1629
1630 if lexicon::is_performative(&lower) {
1631 let sym = self.interner.intern(&Self::capitalize(&lower));
1632 return TokenType::Performative(sym);
1633 }
1634
1635 if lexicon::is_base_verb_early(&lower) {
1636 let sym = self.interner.intern(&Self::capitalize(&lower));
1637 let class = lexicon::lookup_verb_class(&lower);
1638 return TokenType::Verb {
1639 lemma: sym,
1640 time: Time::Present,
1641 aspect: Aspect::Simple,
1642 class,
1643 };
1644 }
1645
1646 if lower.ends_with("ing") && lower.len() > 4 {
1649 if let Some(entry) = self.lexicon.lookup_verb(&lower) {
1650 let sym = self.interner.intern(&entry.lemma);
1651 return TokenType::Verb {
1652 lemma: sym,
1653 time: entry.time,
1654 aspect: entry.aspect,
1655 class: entry.class,
1656 };
1657 }
1658 }
1659
1660 if first_char.is_uppercase() {
1661 if let Some(next) = self.peek_word(1) {
1668 let next_lower = next.to_lowercase();
1669 let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
1671 || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");
1672
1673 if is_followed_by_verb {
1674 if let Some(analysis) = lexicon::analyze_word(&lower) {
1676 match analysis {
1677 lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
1678 let sym = self.interner.intern(&lower);
1680 return TokenType::Noun(sym);
1681 }
1682 lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
1683 let sym = self.interner.intern(&lower);
1685 return TokenType::Noun(sym);
1686 }
1687 _ => {
1688 }
1691 }
1692 }
1693 }
1694 }
1695
1696 let sym = self.interner.intern(word);
1697 return TokenType::ProperName(sym);
1698 }
1699
1700 let verb_entry = self.lexicon.lookup_verb(&lower);
1701 let is_noun = lexicon::is_common_noun(&lower);
1702 let is_adj = self.is_adjective_like(&lower);
1703 let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);
1704
1705 if verb_entry.is_some() && (is_noun || is_adj) && !is_disambiguated {
1707 let entry = verb_entry.unwrap();
1708 let verb_token = TokenType::Verb {
1709 lemma: self.interner.intern(&entry.lemma),
1710 time: entry.time,
1711 aspect: entry.aspect,
1712 class: entry.class,
1713 };
1714
1715 let mut alternatives = Vec::new();
1716 if is_noun {
1717 alternatives.push(TokenType::Noun(self.interner.intern(word)));
1718 }
1719 if is_adj {
1720 alternatives.push(TokenType::Adjective(self.interner.intern(word)));
1721 }
1722
1723 return TokenType::Ambiguous {
1724 primary: Box::new(verb_token),
1725 alternatives,
1726 };
1727 }
1728
1729 if let Some(_) = &verb_entry {
1731 if is_disambiguated {
1732 let sym = self.interner.intern(word);
1733 if is_noun {
1734 return TokenType::Noun(sym);
1735 }
1736 return TokenType::Adjective(sym);
1737 }
1738 }
1739
1740 if let Some(entry) = verb_entry {
1742 let sym = self.interner.intern(&entry.lemma);
1743 return TokenType::Verb {
1744 lemma: sym,
1745 time: entry.time,
1746 aspect: entry.aspect,
1747 class: entry.class,
1748 };
1749 }
1750
1751 if is_noun {
1753 let sym = self.interner.intern(word);
1754 return TokenType::Noun(sym);
1755 }
1756
1757 if lexicon::is_base_verb(&lower) {
1758 let sym = self.interner.intern(&Self::capitalize(&lower));
1759 let class = lexicon::lookup_verb_class(&lower);
1760 return TokenType::Verb {
1761 lemma: sym,
1762 time: Time::Present,
1763 aspect: Aspect::Simple,
1764 class,
1765 };
1766 }
1767
1768 if lower.ends_with("ian")
1769 || lower.ends_with("er")
1770 || lower == "logic"
1771 || lower == "time"
1772 || lower == "men"
1773 || lower == "book"
1774 || lower == "house"
1775 || lower == "code"
1776 || lower == "user"
1777 {
1778 let sym = self.interner.intern(word);
1779 return TokenType::Noun(sym);
1780 }
1781
1782 if lexicon::is_particle(&lower) {
1783 let sym = self.interner.intern(&lower);
1784 return TokenType::Particle(sym);
1785 }
1786
1787 let sym = self.interner.intern(word);
1788 TokenType::Adjective(sym)
1789 }
1790
1791 fn capitalize(s: &str) -> String {
1792 let mut chars = s.chars();
1793 match chars.next() {
1794 None => String::new(),
1795 Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
1796 }
1797 }
1798
1799 pub fn is_collective_verb(lemma: &str) -> bool {
1800 lexicon::is_collective_verb(&lemma.to_lowercase())
1801 }
1802
1803 pub fn is_mixed_verb(lemma: &str) -> bool {
1804 lexicon::is_mixed_verb(&lemma.to_lowercase())
1805 }
1806
1807 pub fn is_distributive_verb(lemma: &str) -> bool {
1808 lexicon::is_distributive_verb(&lemma.to_lowercase())
1809 }
1810
1811 pub fn is_intensional_predicate(lemma: &str) -> bool {
1812 lexicon::is_intensional_predicate(&lemma.to_lowercase())
1813 }
1814
1815 pub fn is_opaque_verb(lemma: &str) -> bool {
1816 lexicon::is_opaque_verb(&lemma.to_lowercase())
1817 }
1818
1819 pub fn is_ditransitive_verb(lemma: &str) -> bool {
1820 lexicon::is_ditransitive_verb(&lemma.to_lowercase())
1821 }
1822
1823 fn is_verb_like(&self, word: &str) -> bool {
1824 let lower = word.to_lowercase();
1825 if lexicon::is_infinitive_verb(&lower) {
1826 return true;
1827 }
1828 if let Some(entry) = self.lexicon.lookup_verb(&lower) {
1829 return entry.lemma.len() > 0;
1830 }
1831 false
1832 }
1833
1834 pub fn is_subject_control_verb(lemma: &str) -> bool {
1835 lexicon::is_subject_control_verb(&lemma.to_lowercase())
1836 }
1837
1838 pub fn is_raising_verb(lemma: &str) -> bool {
1839 lexicon::is_raising_verb(&lemma.to_lowercase())
1840 }
1841
1842 pub fn is_object_control_verb(lemma: &str) -> bool {
1843 lexicon::is_object_control_verb(&lemma.to_lowercase())
1844 }
1845
1846 pub fn is_weather_verb(lemma: &str) -> bool {
1847 matches!(
1848 lemma.to_lowercase().as_str(),
1849 "rain" | "snow" | "hail" | "thunder" | "pour"
1850 )
1851 }
1852
1853 fn try_parse_superlative(&self, word: &str) -> Option<String> {
1854 if !word.ends_with("est") || word.len() < 5 {
1855 return None;
1856 }
1857
1858 let base = &word[..word.len() - 3];
1859
1860 if base.len() >= 2 {
1861 let chars: Vec<char> = base.chars().collect();
1862 let last = chars[chars.len() - 1];
1863 let second_last = chars[chars.len() - 2];
1864 if last == second_last && !"aeiou".contains(last) {
1865 let stem = &base[..base.len() - 1];
1866 if lexicon::is_gradable_adjective(stem) {
1867 return Some(Self::capitalize(stem));
1868 }
1869 }
1870 }
1871
1872 if base.ends_with("i") {
1873 let stem = format!("{}y", &base[..base.len() - 1]);
1874 if lexicon::is_gradable_adjective(&stem) {
1875 return Some(Self::capitalize(&stem));
1876 }
1877 }
1878
1879 if lexicon::is_gradable_adjective(base) {
1880 return Some(Self::capitalize(base));
1881 }
1882
1883 None
1884 }
1885
1886 fn try_parse_comparative(&self, word: &str) -> Option<String> {
1887 if !word.ends_with("er") || word.len() < 4 {
1888 return None;
1889 }
1890
1891 let base = &word[..word.len() - 2];
1892
1893 if base.len() >= 2 {
1894 let chars: Vec<char> = base.chars().collect();
1895 let last = chars[chars.len() - 1];
1896 let second_last = chars[chars.len() - 2];
1897 if last == second_last && !"aeiou".contains(last) {
1898 let stem = &base[..base.len() - 1];
1899 if lexicon::is_gradable_adjective(stem) {
1900 return Some(Self::capitalize(stem));
1901 }
1902 }
1903 }
1904
1905 if base.ends_with("i") {
1906 let stem = format!("{}y", &base[..base.len() - 1]);
1907 if lexicon::is_gradable_adjective(&stem) {
1908 return Some(Self::capitalize(&stem));
1909 }
1910 }
1911
1912 if lexicon::is_gradable_adjective(base) {
1913 return Some(Self::capitalize(base));
1914 }
1915
1916 None
1917 }
1918}
1919
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // "ring" is on the disambiguation list: noun reading must win.
    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // "that" before a verb is the complementizer/relative token,
    // not the distal article.
    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        let that_token = tokens
            .iter()
            .find(|t| interner.resolve(t.lexeme) == "that")
            .expect("No 'that' token found");
        assert!(
            matches!(that_token.kind, TokenType::That),
            "'that' should be TokenType::That, got {:?}",
            that_token.kind
        );
    }

    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    // "open" is verb, adjective, and noun-ish: must come out Ambiguous with
    // the verb reading primary.
    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                    "Should have Adjective alternative");
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }

    #[test]
    fn iff_tokenizes_as_single_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("A if and only if B", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Iff),
            "should contain Iff token: got {:?}",
            tokens
        );
    }

    #[test]
    fn is_equal_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    #[test]
    fn is_identical_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    #[test]
    fn itself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John loves itself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    #[test]
    fn himself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John sees himself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    #[test]
    fn to_stay_tokenizes_correctly() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("to stay", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::To),
            "should contain To token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
            "should contain Verb token for stay: got {:?}",
            tokens
        );
    }

    #[test]
    fn possessive_apostrophe_s() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John's dog", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Possessive),
            "should contain Possessive token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
            "should have John as proper name: got {:?}",
            tokens
        );
    }

    // Each token's span must index back into the source text exactly.
    #[test]
    fn lexer_produces_valid_spans() {
        let input = "All men are mortal.";
        let mut interner = Interner::new();
        let mut lexer = Lexer::new(input, &mut interner);
        let tokens = lexer.tokenize();

        assert_eq!(tokens[0].span.start, 0);
        assert_eq!(tokens[0].span.end, 3);
        assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");

        assert_eq!(tokens[1].span.start, 4);
        assert_eq!(tokens[1].span.end, 7);
        assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");

        assert_eq!(tokens[2].span.start, 8);
        assert_eq!(tokens[2].span.end, 11);
        assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");

        assert_eq!(tokens[3].span.start, 12);
        assert_eq!(tokens[3].span.end, 18);
        assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");

        // The period.
        assert_eq!(tokens[4].span.start, 18);
        assert_eq!(tokens[4].span.end, 19);

        // EOF sits at the end of input.
        assert_eq!(tokens[5].span.start, input.len());
        assert_eq!(tokens[5].kind, TokenType::EOF);
    }
}