use logicaffeine_base::Interner;

use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
use crate::token::{BlockType, CalendarUnit, FocusKind, MeasureKind, Span, Token, TokenType};

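/// Output of the line-level pass: indentation increases/decreases, line
/// breaks, and the trimmed text of each non-blank line with its byte range.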
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    Indent,
    Dedent,
    Newline,
    Content { text: String, start: usize, end: usize },
}

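/// Line-level lexer. Walks the source line by line, maintaining a stack of
/// indentation widths (a space counts as one column, a tab as four) and
/// yielding `LineToken`s via its `Iterator` impl.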
pub struct LineLexer<'a> {
    source: &'a str,
    bytes: &'a [u8],
    indent_stack: Vec<usize>,
    pending_dedents: usize,
    position: usize,
    has_pending_content: bool,
    pending_content_start: usize,
    pending_content_end: usize,
    pending_content_text: String,
    finished_lines: bool,
    emitted_indent: bool,
    escape_body_ranges: Vec<(usize, usize)>,
}

impl<'a> LineLexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            bytes: source.as_bytes(),
            indent_stack: vec![0],
            pending_dedents: 0,
            position: 0,
            has_pending_content: false,
            pending_content_start: 0,
            pending_content_end: 0,
            pending_content_text: String::new(),
            finished_lines: false,
            emitted_indent: false,
            escape_body_ranges: Vec::new(),
        }
    }

    pub fn with_escape_ranges(source: &'a str, escape_body_ranges: Vec<(usize, usize)>) -> Self {
        Self {
            source,
            bytes: source.as_bytes(),
            indent_stack: vec![0],
            pending_dedents: 0,
            position: 0,
            has_pending_content: false,
            pending_content_start: 0,
            pending_content_end: 0,
            pending_content_text: String::new(),
            finished_lines: false,
            emitted_indent: false,
            escape_body_ranges,
        }
    }

    fn is_in_escape_body(&self, pos: usize) -> bool {
        self.escape_body_ranges.iter().any(|(start, end)| pos >= *start && pos < *end)
    }

    fn measure_indent(&self, line_start: usize) -> (usize, usize) {
        let mut indent = 0;
        let mut pos = line_start;

        while pos < self.bytes.len() {
            match self.bytes[pos] {
                b' ' => {
                    indent += 1;
                    pos += 1;
                }
                b'\t' => {
                    indent += 4;
                    pos += 1;
                }
                _ => break,
            }
        }

        (indent, pos)
    }

    fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
        let mut pos = content_start;

        while pos < self.bytes.len() && self.bytes[pos] != b'\n' {
            pos += 1;
        }

        let content_end = pos;
        let text = self.source[content_start..content_end].trim_end().to_string();

        let next_line_start = if pos < self.bytes.len() && self.bytes[pos] == b'\n' {
            pos + 1
        } else {
            pos
        };

        (text, content_start, content_end, next_line_start)
    }

    fn is_blank_line(&self, line_start: usize) -> bool {
        let mut pos = line_start;
        while pos < self.bytes.len() {
            match self.bytes[pos] {
                b' ' | b'\t' => pos += 1,
                b'\n' => return true,
                _ => return false,
            }
        }
        true
    }

    fn process_next_line(&mut self) -> bool {
        while self.position < self.bytes.len() && self.is_blank_line(self.position) {
            while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
                self.position += 1;
            }
            if self.position < self.bytes.len() {
                self.position += 1;
            }
        }

        if self.position >= self.bytes.len() {
            self.finished_lines = true;
            if self.indent_stack.len() > 1 {
                self.pending_dedents = self.indent_stack.len() - 1;
                self.indent_stack.truncate(1);
            }
            return self.pending_dedents > 0;
        }

        let (line_indent, content_start) = self.measure_indent(self.position);

        let (text, start, end, next_pos) = self.read_line_content(content_start);

        if text.is_empty() {
            self.position = next_pos;
            return self.process_next_line();
        }

        let current_indent = *self.indent_stack.last().unwrap();

        if line_indent > current_indent {
            self.indent_stack.push(line_indent);
            self.emitted_indent = true;
            self.has_pending_content = true;
            self.pending_content_text = text;
            self.pending_content_start = start;
            self.pending_content_end = end;
            self.position = next_pos;
            return true;
        } else if line_indent < current_indent {
            while self.indent_stack.len() > 1 {
                let top = *self.indent_stack.last().unwrap();
                if line_indent < top {
                    self.indent_stack.pop();
                    self.pending_dedents += 1;
                } else {
                    break;
                }
            }
            self.has_pending_content = true;
            self.pending_content_text = text;
            self.pending_content_start = start;
            self.pending_content_end = end;
            self.position = next_pos;
            return true;
        } else {
            self.has_pending_content = true;
            self.pending_content_text = text;
            self.pending_content_start = start;
            self.pending_content_end = end;
            self.position = next_pos;
            return true;
        }
    }
}

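/// Yields at most one token per call: queued dedents first, then buffered
/// line content, then whatever `process_next_line` produces next.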
impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    fn next(&mut self) -> Option<LineToken> {
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        if self.has_pending_content {
            self.has_pending_content = false;
            let text = std::mem::take(&mut self.pending_content_text);
            let start = self.pending_content_start;
            let end = self.pending_content_end;
            return Some(LineToken::Content { text, start, end });
        }

        if !self.finished_lines {
            let had_indent = self.indent_stack.len();
            if self.process_next_line() {
                if self.indent_stack.len() > had_indent {
                    return Some(LineToken::Indent);
                }
                if self.pending_dedents > 0 {
                    self.pending_dedents -= 1;
                    return Some(LineToken::Dedent);
                }
                if self.has_pending_content {
                    self.has_pending_content = false;
                    let text = std::mem::take(&mut self.pending_content_text);
                    let start = self.pending_content_start;
                    let end = self.pending_content_end;
                    return Some(LineToken::Content { text, start, end });
                }
            } else if self.pending_dedents > 0 {
                self.pending_dedents -= 1;
                return Some(LineToken::Dedent);
            }
        }

        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        None
    }
}

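/// Lexing mode set by the enclosing block header: `##main` and `##to`
/// (function) blocks switch the lexer to `Imperative`; everything else is
/// lexed as `Declarative`.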
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    #[default]
    Declarative,
    Imperative,
}

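/// Word-level lexer. The input is pre-split into `WordItem`s (a word plus
/// any trailing punctuation), which `tokenize` then classifies into
/// `Token`s using the lexicon and limited lookahead.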
pub struct Lexer<'a> {
    words: Vec<WordItem>,
    pos: usize,
    lexicon: Lexicon,
    interner: &'a mut Interner,
    input_len: usize,
    in_let_context: bool,
    mode: LexerMode,
    source: String,
    escape_body_ranges: Vec<(usize, usize)>,
}

struct WordItem {
    word: String,
    trailing_punct: Option<char>,
    start: usize,
    end: usize,
    punct_pos: Option<usize>,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &str, interner: &'a mut Interner) -> Self {
        let escape_ranges = Self::find_escape_block_ranges(input);
        let escape_body_ranges: Vec<(usize, usize)> = escape_ranges.iter()
            .map(|(_, end, content_start, _)| (*content_start, *end))
            .collect();
        let words = Self::split_into_words(input, &escape_ranges);
        let input_len = input.len();

        Lexer {
            words,
            pos: 0,
            lexicon: Lexicon::new(),
            interner,
            input_len,
            in_let_context: false,
            mode: LexerMode::Declarative,
            source: input.to_string(),
            escape_body_ranges,
        }
    }

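    /// Finds `escape to rust:`-style blocks. For each one, returns
    /// `(body_start, body_end, content_start, dedented_code)` in byte
    /// offsets, where the body is every following line indented deeper than
    /// the header (trailing blank lines dropped).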
    fn find_escape_block_ranges(source: &str) -> Vec<(usize, usize, usize, String)> {
        let mut ranges = Vec::new();
        let lines: Vec<&str> = source.split('\n').collect();
        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
        let mut pos = 0;
        for line in &lines {
            line_starts.push(pos);
            pos += line.len() + 1;
        }

        let mut i = 0;
        while i < lines.len() {
            let trimmed = lines[i].trim();
            let lower = trimmed.to_lowercase();
            if lower == "escape to rust:" ||
                lower.ends_with(" escape to rust:") ||
                (lower.starts_with("escape to ") && lower.ends_with(':'))
            {
                let header_indent = Self::measure_indent_static(lines[i]);
                i += 1;

                let mut body_start_line = i;
                while body_start_line < lines.len() && lines[body_start_line].trim().is_empty() {
                    body_start_line += 1;
                }

                if body_start_line >= lines.len() {
                    continue;
                }

                let base_indent = Self::measure_indent_static(lines[body_start_line]);
                if base_indent <= header_indent {
                    continue;
                }

                let body_byte_start = line_starts[body_start_line];
                let mut body_end_line = body_start_line;
                let mut code_lines: Vec<String> = Vec::new();

                let mut j = body_start_line;
                while j < lines.len() {
                    let line = lines[j];
                    if line.trim().is_empty() {
                        code_lines.push(String::new());
                        body_end_line = j;
                        j += 1;
                        continue;
                    }
                    let line_indent = Self::measure_indent_static(line);
                    if line_indent < base_indent {
                        break;
                    }
                    let stripped = Self::strip_indent(line, base_indent);
                    code_lines.push(stripped);
                    body_end_line = j;
                    j += 1;
                }

                while code_lines.last().map_or(false, |l| l.is_empty()) {
                    code_lines.pop();
                }

                if !code_lines.is_empty() {
                    let body_byte_end = if body_end_line + 1 < lines.len() {
                        line_starts[body_end_line + 1]
                    } else {
                        source.len()
                    };
                    let content_start = body_byte_start + Self::leading_whitespace_bytes(lines[body_start_line]);
                    let raw_code = code_lines.join("\n");
                    ranges.push((body_byte_start, body_byte_end, content_start, raw_code));
                }

                i = j;
            } else {
                i += 1;
            }
        }

        ranges
    }

    fn leading_whitespace_bytes(line: &str) -> usize {
        let mut count = 0;
        for c in line.chars() {
            match c {
                ' ' | '\t' => count += c.len_utf8(),
                _ => break,
            }
        }
        count
    }

    fn measure_indent_static(line: &str) -> usize {
        let mut indent = 0;
        for c in line.chars() {
            match c {
                ' ' => indent += 1,
                '\t' => indent += 4,
                _ => break,
            }
        }
        indent
    }

    fn strip_indent(line: &str, count: usize) -> String {
        let mut stripped = 0;
        let mut byte_pos = 0;
        for (i, c) in line.char_indices() {
            if stripped >= count {
                byte_pos = i;
                break;
            }
            match c {
                ' ' => { stripped += 1; byte_pos = i + 1; }
                '\t' => { stripped += 4; byte_pos = i + 1; }
                _ => { byte_pos = i; break; }
            }
        }
        if stripped < count {
            byte_pos = line.len();
        }
        line[byte_pos..].to_string()
    }

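    /// Splits the raw input into `WordItem`s in a single pass: escape-block
    /// bodies become `\x00ESC:` sentinels, `##` headers one block word,
    /// `#` line comments are skipped, string/char literals become
    /// `\x00STR:` / `\x00CHAR:` sentinels, two-character operators are
    /// joined, negative contractions are expanded ("don't" -> "do not"),
    /// and hyphens/colons/signs inside date, time, and exponent literals
    /// are kept in the current word.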
    fn split_into_words(input: &str, escape_ranges: &[(usize, usize, usize, String)]) -> Vec<WordItem> {
        let mut items = Vec::new();
        let mut current_word = String::new();
        let mut word_start = 0;
        let chars: Vec<char> = input.chars().collect();
        let mut char_idx = 0;
        let mut skip_count = 0;
        let mut skip_to_byte: Option<usize> = None;

        for (i, c) in input.char_indices() {
            if skip_count > 0 {
                skip_count -= 1;
                char_idx += 1;
                continue;
            }
            if let Some(end) = skip_to_byte {
                if i < end {
                    char_idx += 1;
                    continue;
                }
                skip_to_byte = None;
                word_start = i;
            }
            if let Some((_, end, content_start, raw_code)) = escape_ranges.iter().find(|(s, _, _, _)| i == *s) {
                if !current_word.is_empty() {
                    items.push(WordItem {
                        word: std::mem::take(&mut current_word),
                        trailing_punct: None,
                        start: word_start,
                        end: i,
                        punct_pos: None,
                    });
                }
                items.push(WordItem {
                    word: format!("\x00ESC:{}", raw_code),
                    trailing_punct: None,
                    start: *content_start,
                    end: *end,
                    punct_pos: None,
                });
                skip_to_byte = Some(*end);
                word_start = *end;
                char_idx += 1;
                continue;
            }
            let next_pos = i + c.len_utf8();
            match c {
                ' ' | '\t' | '\n' | '\r' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    word_start = next_pos;
                }
                '.' => {
                    let prev_is_digit = !current_word.is_empty()
                        && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
                    let next_is_digit = char_idx + 1 < chars.len()
                        && chars[char_idx + 1].is_ascii_digit();

                    if prev_is_digit && next_is_digit {
                        current_word.push(c);
                    } else {
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: Some(c),
                                start: word_start,
                                end: i,
                                punct_pos: Some(i),
                            });
                        } else {
                            items.push(WordItem {
                                word: String::new(),
                                trailing_punct: Some(c),
                                start: i,
                                end: next_pos,
                                punct_pos: Some(i),
                            });
                        }
                        word_start = next_pos;
                    }
                }
                '#' => {
                    if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                        }
                        let header_start = i;
                        let mut j = char_idx + 2;
                        while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
                            j += 1;
                        }
                        let mut block_word = String::from("##");
                        while j < chars.len() && chars[j].is_alphabetic() {
                            block_word.push(chars[j]);
                            j += 1;
                        }
                        if block_word.len() > 2 {
                            items.push(WordItem {
                                word: block_word,
                                trailing_punct: None,
                                start: header_start,
                                end: header_start + (j - char_idx),
                                punct_pos: None,
                            });
                        }
                        skip_count = j - char_idx - 1;
                        word_start = header_start + (j - char_idx);
                    } else {
                        let mut look_ahead = char_idx + 1;
                        while look_ahead < chars.len() && chars[look_ahead] != '\n' {
                            skip_count += 1;
                            look_ahead += 1;
                        }
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                        }
                        word_start = look_ahead + 1;
                    }
                }
                '"' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }

                    if char_idx + 2 < chars.len() && chars[char_idx + 1] == '"' && chars[char_idx + 2] == '"' {
                        let string_start = i;
                        let mut j = char_idx + 3;
                        if j < chars.len() && chars[j] == '\n' {
                            j += 1;
                        }
                        let mut raw_content = String::new();
                        while j < chars.len() {
                            if j + 2 < chars.len() && chars[j] == '"' && chars[j + 1] == '"' && chars[j + 2] == '"' {
                                break;
                            }
                            raw_content.push(chars[j]);
                            j += 1;
                        }
                        if raw_content.ends_with('\n') {
                            raw_content.pop();
                        }
                        let dedented = Self::dedent_triple_quote(&raw_content);
                        let end_pos = if j + 2 < chars.len() { j + 3 } else { chars.len() };
                        items.push(WordItem {
                            word: format!("\x00STR:{}", dedented),
                            trailing_punct: None,
                            start: string_start,
                            end: end_pos,
                            punct_pos: None,
                        });
                        if j + 2 < chars.len() {
                            skip_count = (j + 2) - char_idx;
                        } else {
                            skip_count = chars.len() - 1 - char_idx;
                        }
                        word_start = end_pos;
                    } else {
                        let string_start = i;
                        let mut j = char_idx + 1;
                        let mut string_content = String::new();
                        while j < chars.len() && chars[j] != '"' {
                            if chars[j] == '\\' && j + 1 < chars.len() {
                                j += 1;
                                if j < chars.len() {
                                    string_content.push(chars[j]);
                                }
                            } else {
                                string_content.push(chars[j]);
                            }
                            j += 1;
                        }

                        items.push(WordItem {
                            word: format!("\x00STR:{}", string_content),
                            trailing_punct: None,
                            start: string_start,
                            end: if j < chars.len() { j + 1 } else { j },
                            punct_pos: None,
                        });

                        if j < chars.len() {
                            skip_count = j - char_idx;
                        } else {
                            skip_count = j - char_idx - 1;
                        }
                        word_start = if j < chars.len() { j + 1 } else { j };
                    }
                }
                '`' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }

                    let char_start = i;
                    let mut j = char_idx + 1;
                    let mut char_content = String::new();

                    if j < chars.len() {
                        if chars[j] == '\\' && j + 1 < chars.len() {
                            j += 1;
                            let escaped_char = match chars[j] {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '\\' => '\\',
                                '`' => '`',
                                '0' => '\0',
                                c => c,
                            };
                            char_content.push(escaped_char);
                            j += 1;
                        } else if chars[j] != '`' {
                            char_content.push(chars[j]);
                            j += 1;
                        }
                    }

                    if j < chars.len() && chars[j] == '`' {
                        j += 1;
                    }

                    items.push(WordItem {
                        word: format!("\x00CHAR:{}", char_content),
                        trailing_punct: None,
                        start: char_start,
                        end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
                        punct_pos: None,
                    });

                    if j > char_idx + 1 {
                        skip_count = j - char_idx - 1;
                    }
                    word_start = char_start + (j - char_idx);
                }
                '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "->".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "<=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: ">=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "==".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "!=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '-' if Self::is_date_hyphen(&current_word, &chars, char_idx) => {
                    current_word.push(c);
                }
                ':' if Self::is_time_colon(&current_word, &chars, char_idx) => {
                    current_word.push(c);
                }
                '+' | '-' if Self::is_exponent_sign(&current_word, &chars, char_idx) => {
                    current_word.push(c);
                }
                '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: Some(c),
                            start: word_start,
                            end: i,
                            punct_pos: Some(i),
                        });
                    } else {
                        items.push(WordItem {
                            word: String::new(),
                            trailing_punct: Some(c),
                            start: i,
                            end: next_pos,
                            punct_pos: Some(i),
                        });
                    }
                    word_start = next_pos;
                }
                '\'' => {
                    let remaining: String = chars[char_idx + 1..].iter().collect();
                    let remaining_lower = remaining.to_lowercase();

                    if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
                        remaining_lower.starts_with("t,") || remaining_lower == "t" ||
                        (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
                            (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
                        let word_lower = current_word.to_lowercase();
                        if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
                            let base = if word_lower == "don" { "do" }
                                else if word_lower == "doesn" { "does" }
                                else { "did" };
                            items.push(WordItem {
                                word: base.to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                            items.push(WordItem {
                                word: "not".to_string(),
                                trailing_punct: None,
                                start: i,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else if word_lower == "won" {
                            items.push(WordItem {
                                word: "will".to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                            items.push(WordItem {
                                word: "not".to_string(),
                                trailing_punct: None,
                                start: i,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else if word_lower == "can" {
                            items.push(WordItem {
                                word: "cannot".to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else {
                            if !current_word.is_empty() {
                                items.push(WordItem {
                                    word: std::mem::take(&mut current_word),
                                    trailing_punct: Some('\''),
                                    start: word_start,
                                    end: i,
                                    punct_pos: Some(i),
                                });
                            }
                            word_start = next_pos;
                        }
                    } else {
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: Some('\''),
                                start: word_start,
                                end: i,
                                punct_pos: Some(i),
                            });
                        }
                        word_start = next_pos;
                    }
                }
                c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
                    if current_word.is_empty() {
                        word_start = i;
                    }
                    current_word.push(c);
                }
                _ => {
                    word_start = next_pos;
                }
            }
            char_idx += 1;
        }

        if !current_word.is_empty() {
            items.push(WordItem {
                word: current_word,
                trailing_punct: None,
                start: word_start,
                end: input.len(),
                punct_pos: None,
            });
        }

        items
    }

    fn peek_word(&self, offset: usize) -> Option<&str> {
        self.words.get(self.pos + offset).map(|w| w.word.as_str())
    }

    fn prev_token_is_determiner(&self) -> bool {
        if self.pos == 0 { return false; }
        if let Some(prev) = self.words.get(self.pos - 1) {
            matches!(prev.word.to_lowercase().as_str(),
                "every" | "each" | "some" | "all" | "any" | "no" | "the" | "a" | "an")
        } else {
            false
        }
    }

    fn next_token_is_copula(&self) -> bool {
        if let Some(next) = self.peek_word(1) {
            matches!(next.to_lowercase().as_str(), "is" | "are" | "was" | "were")
        } else {
            false
        }
    }

    fn peek_sequence(&self, expected: &[&str]) -> bool {
        for (i, &exp) in expected.iter().enumerate() {
            match self.peek_word(i + 1) {
                Some(w) if w.to_lowercase() == exp => continue,
                _ => return false,
            }
        }
        true
    }

    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }

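    /// Main entry point: classifies every `WordItem` into `Token`s
    /// (sentinel words become string/char/escape-block tokens, trailing
    /// punctuation becomes its own token, `'s` becomes `Possessive`),
    /// appends EOF, then splices in Indent/Dedent tokens.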
    pub fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while self.pos < self.words.len() {
            let item = &self.words[self.pos];
            let word = item.word.clone();
            let trailing_punct = item.trailing_punct;
            let word_start = item.start;
            let word_end = item.end;
            let punct_pos = item.punct_pos;

            if word.is_empty() {
                if let Some(punct) = trailing_punct {
                    let kind = match punct {
                        '(' => TokenType::LParen,
                        ')' => TokenType::RParen,
                        '[' => TokenType::LBracket,
                        ']' => TokenType::RBracket,
                        ',' => TokenType::Comma,
                        ':' => TokenType::Colon,
                        '.' | '?' => {
                            self.in_let_context = false;
                            TokenType::Period
                        }
                        '!' => TokenType::Exclamation,
                        '+' => TokenType::Plus,
                        '-' => TokenType::Minus,
                        '*' => TokenType::Star,
                        '/' => TokenType::Slash,
                        '%' => TokenType::Percent,
                        '<' => TokenType::Lt,
                        '>' => TokenType::Gt,
                        '=' => TokenType::Assign,
                        _ => {
                            self.pos += 1;
                            continue;
                        }
                    };
                    let lexeme = self.interner.intern(&punct.to_string());
                    let span = Span::new(word_start, word_end);
                    tokens.push(Token::new(kind, lexeme, span));
                }
                self.pos += 1;
                continue;
            }

            if word.starts_with("\x00STR:") {
                let content = &word[5..];
                let span = Span::new(word_start, word_end);
                if Self::has_unescaped_brace(content) {
                    let sym = self.interner.intern(content);
                    tokens.push(Token::new(TokenType::InterpolatedString(sym), sym, span));
                } else {
                    let normalized = content.replace("{{", "{").replace("}}", "}");
                    let sym = self.interner.intern(&normalized);
                    tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
                }
                self.pos += 1;
                continue;
            }

            if word.starts_with("\x00CHAR:") {
                let content = &word[6..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            if word.starts_with("\x00ESC:") {
                let content = &word[5..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::EscapeBlock(sym), sym, span));
                self.pos += 1;
                continue;
            }

            let kind = self.classify_with_lookahead(&word);
            let lexeme = self.interner.intern(&word);
            let span = Span::new(word_start, word_end);
            tokens.push(Token::new(kind, lexeme, span));

            if let Some(punct) = trailing_punct {
                if punct == '\'' {
                    if let Some(next_item) = self.words.get(self.pos + 1) {
                        if next_item.word.to_lowercase() == "s" {
                            let poss_lexeme = self.interner.intern("'s");
                            let poss_start = punct_pos.unwrap_or(word_end);
                            let poss_end = next_item.end;
                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
                            self.pos += 1;
                            if let Some(s_punct) = next_item.trailing_punct {
                                let kind = match s_punct {
                                    '(' => TokenType::LParen,
                                    ')' => TokenType::RParen,
                                    '[' => TokenType::LBracket,
                                    ']' => TokenType::RBracket,
                                    ',' => TokenType::Comma,
                                    ':' => TokenType::Colon,
                                    '.' | '?' => TokenType::Period,
                                    '!' => TokenType::Exclamation,
                                    '+' => TokenType::Plus,
                                    '-' => TokenType::Minus,
                                    '*' => TokenType::Star,
                                    '/' => TokenType::Slash,
                                    '%' => TokenType::Percent,
                                    '<' => TokenType::Lt,
                                    '>' => TokenType::Gt,
                                    '=' => TokenType::Assign,
                                    _ => {
                                        self.pos += 1;
                                        continue;
                                    }
                                };
                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
                                let lexeme = self.interner.intern(&s_punct.to_string());
                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
                            }
                            self.pos += 1;
                            continue;
                        }
                    }
                    self.pos += 1;
                    continue;
                }

                let kind = match punct {
                    '(' => TokenType::LParen,
                    ')' => TokenType::RParen,
                    '[' => TokenType::LBracket,
                    ']' => TokenType::RBracket,
                    ',' => TokenType::Comma,
                    ':' => TokenType::Colon,
                    '.' | '?' => {
                        self.in_let_context = false;
                        TokenType::Period
                    }
                    '!' => TokenType::Exclamation,
                    '+' => TokenType::Plus,
                    '-' => TokenType::Minus,
                    '*' => TokenType::Star,
                    '/' => TokenType::Slash,
                    '%' => TokenType::Percent,
                    '<' => TokenType::Lt,
                    '>' => TokenType::Gt,
                    '=' => TokenType::Assign,
                    _ => {
                        self.pos += 1;
                        continue;
                    }
                };
                let p_start = punct_pos.unwrap_or(word_end);
                let lexeme = self.interner.intern(&punct.to_string());
                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
            }

            self.pos += 1;
        }

        let eof_lexeme = self.interner.intern("");
        let eof_span = Span::new(self.input_len, self.input_len);
        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));

        self.insert_indentation_tokens(tokens)
    }

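    /// Derives Indent/Dedent events by re-running a `LineLexer` over the
    /// source, drops events that fall inside escape-block bodies or
    /// multi-line string literals, and merges the survivors into the token
    /// stream by byte position, keeping EOF last. An Indent is anchored to
    /// the most recent line-ending colon when one was seen.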
    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
        let mut result = Vec::new();
        let empty_sym = self.interner.intern("");

        let line_lexer = LineLexer::new(&self.source);
        let line_tokens: Vec<LineToken> = line_lexer.collect();

        let mut structural_events: Vec<(usize, bool)> = Vec::new();
        let mut pending_indents = 0usize;
        let mut pending_dedents = 0usize;

        for line_token in &line_tokens {
            match line_token {
                LineToken::Indent => {
                    pending_indents += 1;
                }
                LineToken::Dedent => {
                    pending_dedents += 1;
                }
                LineToken::Content { start, .. } => {
                    for _ in 0..pending_dedents {
                        structural_events.push((*start, false));
                    }
                    pending_dedents = 0;

                    for _ in 0..pending_indents {
                        structural_events.push((*start, true));
                    }
                    pending_indents = 0;
                }
                LineToken::Newline => {}
            }
        }

        for _ in 0..pending_dedents {
            structural_events.push((self.input_len, false));
        }

        if !self.escape_body_ranges.is_empty() {
            let mut filtered = Vec::new();
            for &(pos, is_indent) in &structural_events {
                let is_inside_escape_body = self.escape_body_ranges.iter().any(|(start, end)| {
                    pos > *start && pos < *end
                });
                if !is_inside_escape_body {
                    filtered.push((pos, is_indent));
                }
            }
            structural_events = filtered;
        }

        {
            let string_spans: Vec<(usize, usize)> = tokens.iter()
                .filter(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)))
                .filter(|t| t.span.end - t.span.start > 6)
                .map(|t| (t.span.start, t.span.end))
                .collect();
            if !string_spans.is_empty() {
                structural_events.retain(|&(pos, _)| {
                    !string_spans.iter().any(|(start, end)| pos > *start && pos < *end)
                });
            }
        }

        structural_events.sort_by(|a, b| {
            if a.0 != b.0 {
                a.0.cmp(&b.0)
            } else {
                a.1.cmp(&b.1)
            }
        });

        let mut event_idx = 0;
        let mut last_colon_pos: Option<usize> = None;

        for token in tokens.iter() {
            let token_start = token.span.start;

            while event_idx < structural_events.len() {
                let (event_pos, is_indent) = structural_events[event_idx];

                if event_pos <= token_start {
                    let span = if is_indent {
                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
                    } else {
                        Span::new(event_pos, event_pos)
                    };
                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
                    result.push(Token::new(kind, empty_sym, span));
                    event_idx += 1;
                } else {
                    break;
                }
            }

            result.push(token.clone());

            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
                last_colon_pos = Some(token.span.end);
            }
        }

        while event_idx < structural_events.len() {
            let (event_pos, is_indent) = structural_events[event_idx];
            let span = Span::new(event_pos, event_pos);
            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
            result.push(Token::new(kind, empty_sym, span));
            event_idx += 1;
        }

        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
        if let Some(pos) = eof_pos {
            let eof = result.remove(pos);
            result.push(eof);
        }

        result
    }

    fn is_end_of_line(&self, from_pos: usize) -> bool {
        let bytes = self.source.as_bytes();
        let mut pos = from_pos;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' | b'\t' => pos += 1,
                b'\n' => return true,
                _ => return false,
            }
        }
        true
    }

    fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
        let bytes = self.source.as_bytes();
        let mut pos = from_pos;

        while pos < bytes.len() && bytes[pos] != b'\n' {
            pos += 1;
        }

        if pos >= bytes.len() {
            return None;
        }

        pos += 1;

        let mut indent = 0;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' => indent += 1,
                b'\t' => indent += 4,
                b'\n' => {
                    indent = 0;
                }
                _ => break,
            }
            pos += 1;
        }

        if pos >= bytes.len() {
            return None;
        }

        Some(indent)
    }

    fn word_to_number(word: &str) -> Option<u32> {
        lexicon::word_to_number(&word.to_lowercase())
    }

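    /// True when a `-` extends an ISO-8601 date under construction: `YYYY`
    /// followed by `MM-DD`, or `YYYY-MM` followed by exactly two digits.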
    fn is_date_hyphen(current_word: &str, chars: &[char], char_idx: usize) -> bool {
        let word_chars: Vec<char> = current_word.chars().collect();

        if word_chars.len() == 4 && word_chars.iter().all(|c| c.is_ascii_digit()) {
            if char_idx + 5 < chars.len()
                && chars[char_idx + 1].is_ascii_digit()
                && chars[char_idx + 2].is_ascii_digit()
                && chars[char_idx + 3] == '-'
                && chars[char_idx + 4].is_ascii_digit()
                && chars[char_idx + 5].is_ascii_digit()
            {
                return true;
            }
        }

        if word_chars.len() == 7
            && word_chars[0..4].iter().all(|c| c.is_ascii_digit())
            && word_chars[4] == '-'
            && word_chars[5..7].iter().all(|c| c.is_ascii_digit())
        {
            if char_idx + 2 < chars.len()
                && chars[char_idx + 1].is_ascii_digit()
                && chars[char_idx + 2].is_ascii_digit()
            {
                let next_not_digit = char_idx + 3 >= chars.len()
                    || !chars[char_idx + 3].is_ascii_digit();
                if next_not_digit {
                    return true;
                }
            }
        }

        false
    }

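    /// True when a `:` belongs to a clock time: one or two digits so far,
    /// two digits after the colon, then an `am`/`pm` suffix.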
    fn is_time_colon(current_word: &str, chars: &[char], char_idx: usize) -> bool {
        let word_chars: Vec<char> = current_word.chars().collect();
        if word_chars.is_empty() || word_chars.len() > 2 {
            return false;
        }
        if !word_chars.iter().all(|c| c.is_ascii_digit()) {
            return false;
        }

        if char_idx + 4 < chars.len()
            && chars[char_idx + 1].is_ascii_digit()
            && chars[char_idx + 2].is_ascii_digit()
        {
            let next_two: String = chars[char_idx + 3..char_idx + 5].iter().collect();
            let lower = next_two.to_lowercase();
            if lower == "am" || lower == "pm" {
                let after_suffix = char_idx + 5 >= chars.len()
                    || !chars[char_idx + 5].is_alphabetic();
                if after_suffix {
                    return true;
                }
            }
        }

        false
    }

    fn has_unescaped_brace(content: &str) -> bool {
        let bytes = content.as_bytes();
        let mut i = 0;
        while i < bytes.len() {
            if bytes[i] == b'{' {
                if i + 1 < bytes.len() && bytes[i + 1] == b'{' {
                    i += 2;
                } else {
                    return true;
                }
            } else {
                i += 1;
            }
        }
        false
    }

    fn is_exponent_sign(current_word: &str, chars: &[char], char_idx: usize) -> bool {
        if !current_word.ends_with('e') && !current_word.ends_with('E') {
            return false;
        }
        let before_e = &current_word[..current_word.len() - 1];
        if before_e.is_empty() || !before_e.chars().next().unwrap().is_ascii_digit() {
            return false;
        }
        char_idx + 1 < chars.len() && chars[char_idx + 1].is_ascii_digit()
    }

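    /// Removes the indentation shared by all non-blank lines of a
    /// triple-quoted string body, preserving relative indentation.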
    fn dedent_triple_quote(raw: &str) -> String {
        let lines: Vec<&str> = raw.lines().collect();
        if lines.is_empty() {
            return String::new();
        }
        let min_indent = lines.iter()
            .filter(|l| !l.trim().is_empty())
            .map(|l| l.len() - l.trim_start().len())
            .min()
            .unwrap_or(0);
        lines.iter()
            .map(|l| {
                if l.len() >= min_indent {
                    &l[min_indent..]
                } else {
                    l.trim()
                }
            })
            .collect::<Vec<_>>()
            .join("\n")
    }

    fn is_numeric_literal(word: &str) -> bool {
        if word.is_empty() {
            return false;
        }
        let chars: Vec<char> = word.chars().collect();
        let first = chars[0];
        if first.is_ascii_digit() {
            return true;
        }
        if let Some(underscore_pos) = word.rfind('_') {
            let before_underscore = &word[..underscore_pos];
            let after_underscore = &word[underscore_pos + 1..];
            let is_math_symbol = matches!(
                before_underscore.to_lowercase().as_str(),
                "aleph" | "omega" | "beth"
            );
            if is_math_symbol
                && !after_underscore.is_empty()
                && after_underscore.chars().all(|c| c.is_ascii_digit())
            {
                return true;
            }
        }
        false
    }

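    /// Parses a duration literal (`250ms`, `2min`, `1_000ns`) into
    /// nanoseconds plus the matched suffix, e.g. `"250ms"` ->
    /// `Some((250_000_000, "ms"))`. Underscores in the numeric part are
    /// ignored; multiplication saturates on overflow.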
    fn parse_duration_literal(word: &str) -> Option<(i64, &str)> {
        if word.is_empty() || !word.chars().next()?.is_ascii_digit() {
            return None;
        }

        const SUFFIXES: &[(&str, i64)] = &[
            ("ns", 1),
            ("μs", 1_000),
            ("us", 1_000),
            ("ms", 1_000_000),
            ("sec", 1_000_000_000),
            ("s", 1_000_000_000),
            ("min", 60_000_000_000),
            ("hr", 3_600_000_000_000),
            ("h", 3_600_000_000_000),
        ];

        for (suffix, multiplier) in SUFFIXES {
            if word.ends_with(suffix) {
                let num_part = &word[..word.len() - suffix.len()];
                let cleaned: String = num_part.chars().filter(|c| *c != '_').collect();
                if let Ok(n) = cleaned.parse::<i64>() {
                    return Some((n.saturating_mul(*multiplier), *suffix));
                }
            }
        }

        None
    }

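    /// Parses a `YYYY-MM-DD` literal into days since the Unix epoch using
    /// the standard civil-calendar day count (146097 days per 400-year era;
    /// 719468 days from 0000-03-01 to 1970-01-01), e.g. `"2000-03-01"` ->
    /// `Some(11017)`.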
    fn parse_date_literal(word: &str) -> Option<i32> {
        if word.len() != 10 {
            return None;
        }

        let bytes = word.as_bytes();

        if bytes[4] != b'-' || bytes[7] != b'-' {
            return None;
        }

        let year: i32 = word[0..4].parse().ok()?;
        let month: u32 = word[5..7].parse().ok()?;
        let day: u32 = word[8..10].parse().ok()?;

        if month < 1 || month > 12 || day < 1 || day > 31 {
            return None;
        }

        let y = if month <= 2 { year - 1 } else { year };
        let era = if y >= 0 { y / 400 } else { (y - 399) / 400 };
        let yoe = (y - era * 400) as u32;
        let m = month;
        let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) + 2) / 5 + day - 1;
        let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
        let days = era * 146097 + doe as i32 - 719468;

        Some(days)
    }

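    /// Parses a 12-hour clock literal (`7pm`, `7:30am`, `noon`, `midnight`)
    /// into nanoseconds from midnight, e.g. `"7:30pm"` is 19h30m ->
    /// `Some(70_200_000_000_000)`.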
    fn parse_time_literal(word: &str) -> Option<i64> {
        let lower = word.to_lowercase();

        if lower == "noon" {
            return Some(12i64 * 3600 * 1_000_000_000);
        }
        if lower == "midnight" {
            return Some(0);
        }

        let is_pm = lower.ends_with("pm");
        let is_am = lower.ends_with("am");

        if !is_pm && !is_am {
            return None;
        }

        let time_part = &lower[..lower.len() - 2];

        let (hour, minute): (i64, i64) = if let Some(colon_idx) = time_part.find(':') {
            let hour_str = &time_part[..colon_idx];
            let min_str = &time_part[colon_idx + 1..];
            let h: i64 = hour_str.parse().ok()?;
            let m: i64 = min_str.parse().ok()?;
            (h, m)
        } else {
            let h: i64 = time_part.parse().ok()?;
            (h, 0)
        };

        if hour < 1 || hour > 12 || minute < 0 || minute > 59 {
            return None;
        }

        let hour_24 = if is_am {
            if hour == 12 { 0 } else { hour }
        } else {
            if hour == 12 { 12 } else { hour + 12 }
        };

        let nanos = (hour_24 * 3600 + minute * 60) * 1_000_000_000;
        Some(nanos)
    }

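    /// Classifies a word using multi-word lookahead: `##` headers set the
    /// lexer mode; fixed phrases ("each other", "if and only if",
    /// "at least N", "is equal to") collapse into single tokens; number,
    /// duration, date, and time literals are recognized; and a capitalized
    /// "A"/"An" is disambiguated between article and proper name.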
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,
                "a" | "an" => BlockType::TypeDef,
                "policy" => BlockType::Policy,
                "requires" => BlockType::Requires,
                "hardware" => BlockType::Hardware,
                "property" => BlockType::Property,
                "no" => BlockType::No,
                _ => BlockType::Note,
            };

            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        if lower == "exactly" {
            if let Some(num_word) = self.peek_word(1) {
                if let Some(n) = Self::word_to_number(num_word) {
                    self.consume_words(1);
                    return TokenType::Cardinal(n);
                }
            }
        }

        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        if let Some((nanos, unit)) = Self::parse_duration_literal(word) {
            let unit_sym = self.interner.intern(unit);
            return TokenType::DurationLiteral {
                nanos,
                original_unit: unit_sym,
            };
        }

        if let Some(days) = Self::parse_date_literal(word) {
            return TokenType::DateLiteral { days };
        }

        if let Some(nanos_from_midnight) = Self::parse_time_literal(word) {
            return TokenType::TimeLiteral { nanos_from_midnight };
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        self.classify_word(word)
    }

    fn is_noun_like(&self, word: &str) -> bool {
        if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
            return true;
        }
        if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
            return true;
        }
        false
    }

    fn is_adjective_like(&self, word: &str) -> bool {
        lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
    }

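    /// Single-word fallback classification, tried in priority order:
    /// operators, lexicon keywords/pronouns/articles/auxiliaries,
    /// mode-gated imperative keywords, prepositions, adverbs, comparatives
    /// and superlatives, verbs (with noun/adjective ambiguity handling),
    /// nouns, and finally adjective as the default.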
2037 fn classify_word(&mut self, word: &str) -> TokenType {
2038 let lower = word.to_lowercase();
2039 let first_char = word.chars().next().unwrap();
2040
2041 if lower == "that" {
2044 if let Some(next) = self.peek_word(1) {
2045 let next_lower = next.to_lowercase();
2046 if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
2047 return TokenType::Article(Definiteness::Distal);
2048 }
2049 }
2050 }
2051
2052 if word == "->" {
2054 return TokenType::Arrow;
2055 }
2056
2057 if word == "<=" {
2059 return TokenType::LtEq;
2060 }
2061 if word == ">=" {
2062 return TokenType::GtEq;
2063 }
2064 if word == "==" {
2065 return TokenType::EqEq;
2066 }
2067 if word == "!=" {
2068 return TokenType::NotEq;
2069 }
2070 if word == "<" {
2071 return TokenType::Lt;
2072 }
2073 if word == ">" {
2074 return TokenType::Gt;
2075 }
2076 if word == "=" {
2078 return TokenType::Assign;
2079 }
2080
2081 if let Some(kind) = lexicon::lookup_keyword(&lower) {
2082 return kind;
2083 }
2084
2085 if let Some(kind) = lexicon::lookup_pronoun(&lower) {
2086 return kind;
2087 }
2088
2089 if let Some(def) = lexicon::lookup_article(&lower) {
2090 return TokenType::Article(def);
2091 }
2092
2093 if let Some(time) = lexicon::lookup_auxiliary(&lower) {
2094 return TokenType::Auxiliary(time);
2095 }
2096
2097 match lower.as_str() {
2099 "call" => return TokenType::Call,
2100 "in" if self.mode == LexerMode::Imperative => return TokenType::In,
2101 "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
2103 "at" if self.mode == LexerMode::Imperative => return TokenType::At,
2105 "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
2107 "before" => return TokenType::Before,
2109 _ => {}
2110 }
2111
2112 if lexicon::is_preposition(&lower) {
2113 let sym = self.interner.intern(&lower);
2114 return TokenType::Preposition(sym);
2115 }
2116
2117 match lower.as_str() {
2118 "equals" => return TokenType::Equals,
2119 "item" => return TokenType::Item,
2120 "items" => return TokenType::Items,
2121 "mut" if self.mode == LexerMode::Imperative => return TokenType::Mut,
2123 "let" => {
2124 self.in_let_context = true;
2125 return TokenType::Let;
2126 }
2127 "set" => {
2128 if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
2131 } else if self.mode == LexerMode::Imperative {
2133 return TokenType::Set;
2135 } else {
2136 for offset in 2..=5 {
2139 if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
2140 return TokenType::Set;
2141 }
2142 }
2143 }
2144 }
2145 "return" => return TokenType::Return,
2146 "break" => return TokenType::Break,
2147 "xor" => return TokenType::Xor,
2148 "shifted" => return TokenType::Shifted,
2149 "be" if self.in_let_context => {
2150 self.in_let_context = false;
2151 return TokenType::Be;
2152 }
2153 "while" => return TokenType::While,
2154 "assert" => return TokenType::Assert,
2155 "trust" => return TokenType::Trust,
2156 "check" => return TokenType::Check,
2157 "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
2159 "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
2160 "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
2161 "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
2163 "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
2164 "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
2165 "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
2167 "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
2169 "persistent" => return TokenType::Persistent, "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
2171 "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
2175 "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
2176 "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
2177 "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
2178 "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
2179 "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
2180 "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
2181 "native" => return TokenType::Native,
2182 "escape" if self.mode == LexerMode::Imperative => return TokenType::Escape,
2183 "from" => return TokenType::From,
2184 "otherwise" => return TokenType::Otherwise,
2185 "else" => return TokenType::Else,
2187 "elif" => return TokenType::Elif,
2188 "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
2190 "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
2192 "new" if self.mode == LexerMode::Imperative => return TokenType::New,
2194 "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
2197 "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
2198 "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
2200 "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
2201 "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
2202 "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
2203 "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
2204 "at" if self.mode == LexerMode::Imperative => return TokenType::At,
2205 "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
2207 "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
2208 "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
2209 "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
2210 "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
2211 "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
2213 "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
2214 "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
2215 "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
2216 "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
2217 "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
2219 "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
2220 "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
2221 "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
2223 "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
2224 "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
2225 "file" if self.mode == LexerMode::Imperative => return TokenType::File,
2226 "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
2228 "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
2229 "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
2230 "portable" => return TokenType::Portable,
2232 "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
2234 "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
2235 "shared" => return TokenType::Shared, "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
2238 "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
2239 "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
2241 "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
2242 "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
2243 "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
2244 "tally" => return TokenType::Tally,
2246 "sharedset" => return TokenType::SharedSet,
2247 "sharedsequence" => return TokenType::SharedSequence,
2248 "collaborativesequence" => return TokenType::CollaborativeSequence,
2249 "sharedmap" => return TokenType::SharedMap,
2250 "divergent" => return TokenType::Divergent,
2251 "removewins" => return TokenType::RemoveWins,
2252 "addwins" => return TokenType::AddWins,
2253 "yata" => return TokenType::YATA,
2254 "day" | "days" => return TokenType::CalendarUnit(CalendarUnit::Day),
2256 "week" | "weeks" => return TokenType::CalendarUnit(CalendarUnit::Week),
2257 "month" | "months" => return TokenType::CalendarUnit(CalendarUnit::Month),
2258 "year" | "years" => return TokenType::CalendarUnit(CalendarUnit::Year),
2259 "ago" => return TokenType::Ago,
2261 "hence" => return TokenType::Hence,
2262 "if" => return TokenType::If,
2263 "only" => return TokenType::Focus(FocusKind::Only),
2264 "even" => return TokenType::Focus(FocusKind::Even),
2265 "just" if self.peek_word(1).map_or(false, |w| {
2266 !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
2267 }) => return TokenType::Focus(FocusKind::Just),
2268 "much" => return TokenType::Measure(MeasureKind::Much),
2269 "little" => return TokenType::Measure(MeasureKind::Little),
2270 _ => {}
2271 }
2272
2273 if lexicon::is_scopal_adverb(&lower) {
2274 let sym = self.interner.intern(&Self::capitalize(&lower));
2275 return TokenType::ScopalAdverb(sym);
2276 }
2277
2278 if lexicon::is_temporal_adverb(&lower) {
2279 let sym = self.interner.intern(&Self::capitalize(&lower));
2280 return TokenType::TemporalAdverb(sym);
2281 }
2282
2283 if lexicon::is_non_intersective(&lower) {
2284 let sym = self.interner.intern(&Self::capitalize(&lower));
2285 return TokenType::NonIntersectiveAdjective(sym);
2286 }
2287
2288 if lexicon::is_adverb(&lower) {
2289 let sym = self.interner.intern(&Self::capitalize(&lower));
2290 return TokenType::Adverb(sym);
2291 }
2292 if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
2293 let sym = self.interner.intern(&Self::capitalize(&lower));
2294 return TokenType::Adverb(sym);
2295 }
2296
2297 if let Some(base) = self.try_parse_superlative(&lower) {
2298 let sym = self.interner.intern(&base);
2299 return TokenType::Superlative(sym);
2300 }
2301
2302 let irregular_comparative = match lower.as_str() {
2304 "less" => Some("Little"),
2305 "more" => Some("Much"),
2306 "better" => Some("Good"),
2307 "worse" => Some("Bad"),
2308 _ => None,
2309 };
2310 if let Some(base) = irregular_comparative {
2311 let sym = self.interner.intern(base);
2312 return TokenType::Comparative(sym);
2313 }
2314
2315 if let Some(base) = self.try_parse_comparative(&lower) {
2316 let sym = self.interner.intern(&base);
2317 return TokenType::Comparative(sym);
2318 }
2319
2320 if lexicon::is_performative(&lower) {
2321 let after_determiner = self.prev_token_is_determiner();
2327 let before_copula = self.next_token_is_copula();
2328 if !lexicon::is_common_noun(&lower) || (!after_determiner && !before_copula) {
2329 let sym = self.interner.intern(&Self::capitalize(&lower));
2330 return TokenType::Performative(sym);
2331 }
2332 }
2334
2335 if lexicon::is_base_verb_early(&lower) {
2336 let after_determiner = self.prev_token_is_determiner();
2341 let before_copula = self.next_token_is_copula();
2342 if !lexicon::is_common_noun(&lower) || (!after_determiner && !before_copula) {
2343 let sym = self.interner.intern(&Self::capitalize(&lower));
2344 let class = lexicon::lookup_verb_class(&lower);
2345 return TokenType::Verb {
2346 lemma: sym,
2347 time: Time::Present,
2348 aspect: Aspect::Simple,
2349 class,
2350 };
2351 }
2352 }
2354
        // Progressive "-ing" forms: defer to the lexicon so the lemma, tense,
        // and aspect come from its entry. The length guard skips short words
        // like "ring" or "king" that merely end in "ing".
        if lower.ends_with("ing") && lower.len() > 4 {
            if let Some(entry) = self.lexicon.lookup_verb(&lower) {
                let sym = self.interner.intern(&entry.lemma);
                return TokenType::Verb {
                    lemma: sym,
                    time: entry.time,
                    aspect: entry.aspect,
                    class: entry.class,
                };
            }
        }

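        // A capitalized word is usually a proper name, but sentence-initial
        // capitalization is ambiguous: in "Dogs bark." the subject is a bare
        // plural noun. If the next word is a verb (or a copula/auxiliary) and
        // the lexicon analyzes the lowercase form as a plural noun, prefer the
        // Noun reading. (The "Dogs bark." example is illustrative; it assumes
        // the lexicon analyzes "dogs" as a plural of "dog".)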
        if first_char.is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");

                if is_followed_by_verb {
                    if let Some(analysis) = lexicon::analyze_word(&lower) {
                        match analysis {
                            lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
                                let sym = self.interner.intern(&lower);
                                return TokenType::Noun(sym);
                            }
                            lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
                                let sym = self.interner.intern(&lower);
                                return TokenType::Noun(sym);
                            }
                            _ => {}
                        }
                    }
                }
            }

            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        let verb_entry = self.lexicon.lookup_verb(&lower);
        let is_noun = lexicon::is_common_noun(&lower);
        let is_adj = self.is_adjective_like(&lower);
        let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);

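        // Words like "open" are verb/adjective (sometimes verb/noun) homographs.
        // Rather than committing here, emit an Ambiguous token with the verb as
        // the primary reading and the noun/adjective readings as alternatives,
        // leaving the choice to the parser. ("open" is the case pinned down by
        // the open_is_ambiguous test below.)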
        if let Some(entry) = &verb_entry {
            if (is_noun || is_adj) && !is_disambiguated {
                let verb_token = TokenType::Verb {
                    lemma: self.interner.intern(&entry.lemma),
                    time: entry.time,
                    aspect: entry.aspect,
                    class: entry.class,
                };

                let mut alternatives = Vec::new();
                if is_noun {
                    alternatives.push(TokenType::Noun(self.interner.intern(word)));
                }
                if is_adj {
                    alternatives.push(TokenType::Adjective(self.interner.intern(word)));
                }

                return TokenType::Ambiguous {
                    primary: Box::new(verb_token),
                    alternatives,
                };
            }
        }

        // Words the lexicon explicitly disambiguates away from the verb
        // reading fall back to their noun reading first, then adjective.
        if verb_entry.is_some() && is_disambiguated {
            let sym = self.interner.intern(word);
            if is_noun {
                return TokenType::Noun(sym);
            }
            return TokenType::Adjective(sym);
        }

        if let Some(entry) = verb_entry {
            let sym = self.interner.intern(&entry.lemma);
            return TokenType::Verb {
                lemma: sym,
                time: entry.time,
                aspect: entry.aspect,
                class: entry.class,
            };
        }

        if is_noun {
            let sym = self.interner.intern(word);
            return TokenType::Noun(sym);
        }

        if lexicon::is_base_verb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            let class = lexicon::lookup_verb_class(&lower);
            return TokenType::Verb {
                lemma: sym,
                time: Time::Present,
                aspect: Aspect::Simple,
                class,
            };
        }

        // Last-resort noun heuristic: common derivational suffixes plus a
        // small allowlist of frequent bare nouns.
        if lower.ends_with("ian")
            || lower.ends_with("er")
            || lower == "logic"
            || lower == "time"
            || lower == "men"
            || lower == "book"
            || lower == "house"
            || lower == "code"
            || lower == "user"
        {
            let sym = self.interner.intern(word);
            return TokenType::Noun(sym);
        }

        if lexicon::is_particle(&lower) {
            let sym = self.interner.intern(&lower);
            return TokenType::Particle(sym);
        }

        // Nothing matched: treat the word as an adjective by default.
        let sym = self.interner.intern(word);
        TokenType::Adjective(sym)
    }

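    /// Uppercases the first character, leaving the rest of the string as-is:
    /// capitalize("mortal") == "Mortal", capitalize("") == "". Built on
    /// char::to_uppercase, so a first character with a multi-character
    /// uppercase mapping expands accordingly.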
    fn capitalize(s: &str) -> String {
        let mut chars = s.chars();
        match chars.next() {
            None => String::new(),
            Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
        }
    }

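    // The pub predicates below are thin case-insensitive wrappers over the
    // lexicon's verb-class tables, so callers can pass lemmas in any casing,
    // e.g. Lexer::is_collective_verb("Gather"). (Whether a given lemma is in a
    // class depends on the lexicon's tables; "Gather" is illustrative.)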
    pub fn is_collective_verb(lemma: &str) -> bool {
        lexicon::is_collective_verb(&lemma.to_lowercase())
    }

    pub fn is_mixed_verb(lemma: &str) -> bool {
        lexicon::is_mixed_verb(&lemma.to_lowercase())
    }

    pub fn is_distributive_verb(lemma: &str) -> bool {
        lexicon::is_distributive_verb(&lemma.to_lowercase())
    }

    pub fn is_intensional_predicate(lemma: &str) -> bool {
        lexicon::is_intensional_predicate(&lemma.to_lowercase())
    }

    pub fn is_opaque_verb(lemma: &str) -> bool {
        lexicon::is_opaque_verb(&lemma.to_lowercase())
    }

    pub fn is_ditransitive_verb(lemma: &str) -> bool {
        lexicon::is_ditransitive_verb(&lemma.to_lowercase())
    }

    fn is_verb_like(&self, word: &str) -> bool {
        let lower = word.to_lowercase();
        if lexicon::is_infinitive_verb(&lower) {
            return true;
        }
        if let Some(entry) = self.lexicon.lookup_verb(&lower) {
            return !entry.lemma.is_empty();
        }
        false
    }

    pub fn is_subject_control_verb(lemma: &str) -> bool {
        lexicon::is_subject_control_verb(&lemma.to_lowercase())
    }

    pub fn is_raising_verb(lemma: &str) -> bool {
        lexicon::is_raising_verb(&lemma.to_lowercase())
    }

    pub fn is_object_control_verb(lemma: &str) -> bool {
        lexicon::is_object_control_verb(&lemma.to_lowercase())
    }

    pub fn is_weather_verb(lemma: &str) -> bool {
        matches!(
            lemma.to_lowercase().as_str(),
            "rain" | "snow" | "hail" | "thunder" | "pour"
        )
    }

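    // try_parse_superlative/try_parse_comparative strip the suffix and undo
    // the two regular English spelling changes, checking each candidate stem
    // against the gradable-adjective lexicon:
    //   doubled final consonant: "biggest" -> "bigg"  -> stem "big"   -> "Big"
    //   y -> i mutation:         "happier" -> "happi" -> stem "happy" -> "Happy"
    //   plain suffix strip:      "taller"  -> "tall"  -> "Tall"
    // (These worked examples assume "big", "happy", and "tall" are listed as
    // gradable adjectives in the lexicon.)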
    fn try_parse_superlative(&self, word: &str) -> Option<String> {
        if !word.ends_with("est") || word.len() < 5 {
            return None;
        }

        let base = &word[..word.len() - 3];

        if base.len() >= 2 {
            let chars: Vec<char> = base.chars().collect();
            let last = chars[chars.len() - 1];
            let second_last = chars[chars.len() - 2];
            if last == second_last && !"aeiou".contains(last) {
                let stem = &base[..base.len() - 1];
                if lexicon::is_gradable_adjective(stem) {
                    return Some(Self::capitalize(stem));
                }
            }
        }

        if base.ends_with('i') {
            let stem = format!("{}y", &base[..base.len() - 1]);
            if lexicon::is_gradable_adjective(&stem) {
                return Some(Self::capitalize(&stem));
            }
        }

        if lexicon::is_gradable_adjective(base) {
            return Some(Self::capitalize(base));
        }

        None
    }

    fn try_parse_comparative(&self, word: &str) -> Option<String> {
        if !word.ends_with("er") || word.len() < 4 {
            return None;
        }

        let base = &word[..word.len() - 2];

        if base.len() >= 2 {
            let chars: Vec<char> = base.chars().collect();
            let last = chars[chars.len() - 1];
            let second_last = chars[chars.len() - 2];
            if last == second_last && !"aeiou".contains(last) {
                let stem = &base[..base.len() - 1];
                if lexicon::is_gradable_adjective(stem) {
                    return Some(Self::capitalize(stem));
                }
            }
        }

        if base.ends_with('i') {
            let stem = format!("{}y", &base[..base.len() - 1]);
            if lexicon::is_gradable_adjective(&stem) {
                return Some(Self::capitalize(&stem));
            }
        }

        if lexicon::is_gradable_adjective(base) {
            return Some(Self::capitalize(base));
        }

        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        let that_token = tokens.iter().find(|t| interner.resolve(t.lexeme) == "that");
        if let Some(t) = that_token {
            assert!(matches!(t.kind, TokenType::That), "'that' should be TokenType::That, got {:?}", t.kind);
        } else {
            panic!("No 'that' token found");
        }
    }

    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(
                alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                "Should have Adjective alternative"
            );
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }

    #[test]
    fn iff_tokenizes_as_single_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("A if and only if B", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Iff),
            "should contain Iff token: got {:?}",
            tokens
        );
    }

    #[test]
    fn is_equal_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    #[test]
    fn is_identical_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    #[test]
    fn itself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John loves itself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    #[test]
    fn himself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John sees himself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    #[test]
    fn to_stay_tokenizes_correctly() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("to stay", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::To),
            "should contain To token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
            "should contain Verb token for stay: got {:?}",
            tokens
        );
    }

    #[test]
    fn possessive_apostrophe_s() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John's dog", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Possessive),
            "should contain Possessive token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
            "should have John as proper name: got {:?}",
            tokens
        );
    }

    #[test]
    fn lexer_produces_valid_spans() {
        let input = "All men are mortal.";
        let mut interner = Interner::new();
        let mut lexer = Lexer::new(input, &mut interner);
        let tokens = lexer.tokenize();

        // "All" -> bytes 0..3
        assert_eq!(tokens[0].span.start, 0);
        assert_eq!(tokens[0].span.end, 3);
        assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");

        // "men" -> bytes 4..7
        assert_eq!(tokens[1].span.start, 4);
        assert_eq!(tokens[1].span.end, 7);
        assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");

        // "are" -> bytes 8..11
        assert_eq!(tokens[2].span.start, 8);
        assert_eq!(tokens[2].span.end, 11);
        assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");

        // "mortal" -> bytes 12..18
        assert_eq!(tokens[3].span.start, 12);
        assert_eq!(tokens[3].span.end, 18);
        assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");

        // "." -> bytes 18..19
        assert_eq!(tokens[4].span.start, 18);
        assert_eq!(tokens[4].span.end, 19);

        // EOF sits at the end of the input.
        assert_eq!(tokens[5].span.start, input.len());
        assert_eq!(tokens[5].kind, TokenType::EOF);
    }

    #[test]
    fn triple_quote_produces_string_token() {
        let mut interner = Interner::new();
        let source = "## Main\nLet msg be \"\"\"\n Hello\n World\n\"\"\".\nShow msg.";
        let mut lexer = Lexer::new(source, &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} lex={:?} span={}..{}", i, t.kind, lex, t.span.start, t.span.end);
        }
        let str_token = tokens.iter().find(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)));
        assert!(str_token.is_some(), "Should have a string token. Tokens: {:?}", tokens.iter().map(|t| format!("{:?}", t.kind)).collect::<Vec<_>>());
        if let Some(tok) = str_token {
            let content = interner.resolve(tok.lexeme);
            eprintln!("Triple-quote content: {:?}", content);
            assert!(content.contains("Hello"), "Should contain Hello, got: {:?}", content);
        }
    }
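
    // Illustrative checks for two pure helpers. Neither capitalize nor
    // is_weather_verb consults the lexicon, so these assertions follow
    // directly from their definitions above.
    #[test]
    fn capitalize_uppercases_first_char() {
        assert_eq!(Lexer::capitalize("mortal"), "Mortal");
        assert_eq!(Lexer::capitalize(""), "");
    }

    #[test]
    fn weather_verbs_match_case_insensitively() {
        assert!(Lexer::is_weather_verb("Rain"));
        assert!(Lexer::is_weather_verb("snow"));
        assert!(!Lexer::is_weather_verb("run"));
    }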
}